From 5d513616803019b3b6712ccb8d298f1e68a1ddba Mon Sep 17 00:00:00 2001 From: Faraz Shahsavan Date: Tue, 22 Oct 2024 15:49:18 +0000 Subject: [PATCH 01/92] Add cutlass 2:4 infrastructure --- .../semi_structured_benchmarks.py | 373 ++++++ csrc/ops.h | 3 + csrc/semi_structured/cusparselt/binding.py | 47 + .../cusparselt/cusparselt_mm.cu | 1077 +++++++++++++++++ .../cusparselt/cusparselt_mm_entry.cu | 135 +++ csrc/semi_structured/cutlass/common.hpp | 27 + .../cutlass/semi_structured_mm_c3x.cu | 223 ++++ .../cutlass/semi_structured_mm_entry.cu | 54 + csrc/torch_bindings.cpp | 7 + vllm/_custom_ops.py | 15 + 10 files changed, 1961 insertions(+) create mode 100644 benchmarks/cutlass_benchmarks/semi_structured_benchmarks.py create mode 100644 csrc/semi_structured/cusparselt/binding.py create mode 100644 csrc/semi_structured/cusparselt/cusparselt_mm.cu create mode 100644 csrc/semi_structured/cusparselt/cusparselt_mm_entry.cu create mode 100644 csrc/semi_structured/cutlass/common.hpp create mode 100644 csrc/semi_structured/cutlass/semi_structured_mm_c3x.cu create mode 100644 csrc/semi_structured/cutlass/semi_structured_mm_entry.cu diff --git a/benchmarks/cutlass_benchmarks/semi_structured_benchmarks.py b/benchmarks/cutlass_benchmarks/semi_structured_benchmarks.py new file mode 100644 index 0000000000000..61eed3da41458 --- /dev/null +++ b/benchmarks/cutlass_benchmarks/semi_structured_benchmarks.py @@ -0,0 +1,373 @@ +import argparse +import copy +import itertools +import pickle as pkl +import time +from typing import Callable, Iterable, List, Tuple + +import torch +import torch.utils.benchmark as TBenchmark +from torch.utils.benchmark import Measurement as TMeasurement +from weight_shapes import WEIGHT_SHAPES + +from vllm import _custom_ops as ops +from vllm.utils import FlexibleArgumentParser + +DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) +DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] +DEFAULT_TP_SIZES = [1] + +# helpers + + +def to_fp8(tensor: torch.Tensor) -> torch.Tensor: + finfo = torch.finfo(torch.float8_e4m3fn) + return torch.round(tensor.clamp( + min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) + + +def to_int8(tensor: torch.Tensor) -> torch.Tensor: + return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) + + +def make_rand_tensors(dtype: torch.dtype, m: int, n: int, + k: int) -> Tuple[torch.Tensor, torch.Tensor]: + a = torch.randn((m, k), device='cuda') * 5 + b = torch.randn((n, k), device='cuda').t() * 5 + + if dtype == torch.int8: + return to_int8(a), to_int8(b) + if dtype == torch.float8_e4m3fn: + return to_fp8(a), to_fp8(b) + + raise ValueError("unsupported dtype") + + +# bench +def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args, + **kwargs) -> TMeasurement: + min_run_time = 1 + + globals = { + "args": args, + "kwargs": kwargs, + "fn": fn, + } + return TBenchmark.Timer( + stmt="fn(*args, **kwargs)", + globals=globals, + label=label, + sub_label=sub_label, + description=description, + ).blocked_autorange(min_run_time=min_run_time) + + +def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str, + sub_label: str) -> Iterable[TMeasurement]: + assert dtype == torch.int8 + a, b = make_rand_tensors(torch.int8, m, n, k) + scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) + scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) + bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16) + azp = torch.zeros((m, ), device="cuda", dtype=torch.int32) + azp_adj = torch.zeros((n, ), device="cuda", 
dtype=torch.int32) + + timers = [] + # pytorch impl - bfloat16 + timers.append( + bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul", + torch.mm, a.to(dtype=torch.bfloat16), + b.to(dtype=torch.bfloat16))) + + # pytorch impl - float16 + timers.append( + bench_fn(label, sub_label, + "pytorch_fp16_fp16_fp16_matmul", torch.mm, + a.to(dtype=torch.float16), b.to(dtype=torch.float16))) + + # cutlass impl - bfloat16 + timers.append( + bench_fn(label, sub_label, "cutlass_bf16_bf16_bf16_semi_structured_mm", + torch.mm, a.to(dtype=torch.bfloat16), + b.to(dtype=torch.bfloat16))) + + # cutlass impl - float16 + timers.append( + bench_fn(label, sub_label, + "cutlass_fp16_fp16_fp16_semi_structured_mm", + torch.mm, a.to(dtype=torch.float16), + b.to(dtype=torch.float16))) + + # cutlass impl + timers.append( + bench_fn(label, sub_label, "cutlass_i8_i8_bf16_semi_structured_mm", + ops.cutlass_semi_structured_mm, a, b, scale_a, scale_b, + torch.bfloat16)) + + return timers + + +def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str, + sub_label: str) -> Iterable[TMeasurement]: + assert dtype == torch.float8_e4m3fn + a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k) + scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) + scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) + bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16) + + timers = [] + + # pytorch impl w. bf16 + timers.append( + bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales", + torch.mm, a.to(dtype=torch.bfloat16, device="cuda"), + b.to(dtype=torch.bfloat16, device="cuda"))) + + # # pytorch impl: bf16 output, without fp8 fast accum + # timers.append( + # bench_fn(label, + # sub_label, + # "pytorch_fp8_fp8_bf16_semi_structured_mm", + # torch._semi_structured_mm, + # a, + # b, + # scale_a=scale_a, + # scale_b=scale_b, + # out_dtype=torch.bfloat16)) + + # # pytorch impl: bf16 output, with fp8 fast accum + # timers.append( + # bench_fn(label, + # sub_label, + # "pytorch_fp8_fp8_bf16_semi_structured_mm_fast_accum", + # torch._semi_structured_mm, + # a, + # b, + # scale_a=scale_a, + # scale_b=scale_b, + # out_dtype=torch.bfloat16, + # use_fast_accum=True)) + + # # pytorch impl: fp16 output, without fp8 fast accum + # timers.append( + # bench_fn(label, + # sub_label, + # "pytorch_fp8_fp8_fp16_semi_structured_mm", + # torch._semi_structured_mm, + # a, + # b, + # scale_a=scale_a, + # scale_b=scale_b, + # out_dtype=torch.float16)) + + # # pytorch impl: fp16 output, with fp8 fast accum + # timers.append( + # bench_fn(label, + # sub_label, + # "pytorch_fp8_fp8_fp16_semi_structured_mm_fast_accum", + # torch._semi_structured_mm, + # a, + # b, + # scale_a=scale_a, + # scale_b=scale_b, + # out_dtype=torch.float16, + # use_fast_accum=True)) + + # cutlass impl: bf16 output + timers.append( + bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_semi_structured_mm", + ops.cutlass_semi_structured_mm, a, b, + torch.bfloat16)) + # cutlass impl: fp16 output + timers.append( + bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_semi_structured_mm", + ops.cutlass_semi_structured_mm, a, b, + torch.float16)) + + # # cutlass impl: bf16 output, with bias + # timers.append( + # bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_semi_structured_mm_bias", + # ops.cutlass_semi_structured_mm, a, b, scale_a, scale_b, + # torch.bfloat16, bias)) + + # # cutlass impl: fp16 output, with bias + # timers.append( + # bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_semi_structured_mm_bias", + # 
ops.cutlass_semi_structured_mm, a, b, scale_a, scale_b, + # torch.float16, bias.to(dtype=torch.float16))) + + return timers + + +def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str, + sub_label: str) -> Iterable[TMeasurement]: + if dtype == torch.int8: + return bench_int8(dtype, m, k, n, label, sub_label) + if dtype == torch.float8_e4m3fn: + return bench_fp8(dtype, m, k, n, label, sub_label) + raise ValueError("unsupported type") + + +# runner +def print_timers(timers: Iterable[TMeasurement]): + compare = TBenchmark.Compare(timers) + compare.print() + + +def run(dtype: torch.dtype, + MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]: + results = [] + for m, k, n in MKNs: + timers = bench(dtype, m, k, n, f"semi_structured-{dtype}-gemm", + f"MKN=({m}x{k}x{n})") + print_timers(timers) + results.extend(timers) + + return results + + +# output makers +def make_output(data: Iterable[TMeasurement], + MKNs: Iterable[Tuple[int, int, int]], + base_description: str, + timestamp=None): + print(f"== All Results {base_description} ====") + print_timers(data) + + # pickle all the results + timestamp = int(time.time()) if timestamp is None else timestamp + with open(f"{base_description}-{timestamp}.pkl", "wb") as f: + pkl.dump(data, f) + + +# argparse runners + + +def run_square_bench(args): + dim_sizes = list( + range(args.dim_start, args.dim_end + 1, args.dim_increment)) + MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes)) + data = run(args.dtype, MKNs) + + make_output(data, MKNs, f"square_bench-{args.dtype}") + + +def run_range_bench(args): + dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment)) + n = len(dim_sizes) + Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes + Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes + Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes + MKNs = list(zip(Ms, Ks, Ns)) + data = run(args.dtype, MKNs) + + make_output(data, MKNs, f"range_bench-{args.dtype}") + + +def run_model_bench(args): + print("Benchmarking models:") + for i, model in enumerate(args.models): + print(f"[{i}] {model}") + + def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]: + KNs = [] + for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]): + KN[tp_split_dim] = KN[tp_split_dim] // tp_size + KNs.append(KN) + return KNs + + model_bench_data = [] + models_tps = list(itertools.product(args.models, args.tp_sizes)) + for model, tp_size in models_tps: + Ms = args.batch_sizes + KNs = model_shapes(model, tp_size) + MKNs = [] + for m in Ms: + for k, n in KNs: + MKNs.append((m, k, n)) + + data = run(args.dtype, MKNs) + model_bench_data.append(data) + + # Print all results + for data, model_tp in zip(model_bench_data, models_tps): + model, tp_size = model_tp + print(f"== Results {args.dtype} {model}-TP{tp_size} ====") + print_timers(data) + + timestamp = int(time.time()) + + all_data = [] + for d in model_bench_data: + all_data.extend(d) + # pickle all data + with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f: + pkl.dump(all_data, f) + + +if __name__ == '__main__': + + def to_torch_dtype(dt): + if dt == "int8": + return torch.int8 + if dt == "fp8": + return torch.float8_e4m3fn + raise ValueError("unsupported dtype") + + parser = FlexibleArgumentParser( + description=""" +Benchmark Cutlass GEMM. 
+ + To run square GEMMs: + python3 ./benchmarks/cutlass_benchmarks/semi_structured_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64 + + To run constant N and K and sweep M: + python3 ./benchmarks/cutlass_benchmarks/semi_structured_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384 + + To run dimensions from a model: + python3 ./benchmarks/cutlass_benchmarks/semi_structured_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1 + + Output: + - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs. + """, # noqa: E501 + formatter_class=argparse.RawTextHelpFormatter) + + parser.add_argument("--dtype", + type=to_torch_dtype, + required=True, + help="Available options are ['int8', 'fp8']") + subparsers = parser.add_subparsers(dest="cmd") + + square_parser = subparsers.add_parser("square_bench") + square_parser.add_argument("--dim-start", type=int, required=True) + square_parser.add_argument("--dim-end", type=int, required=True) + square_parser.add_argument("--dim-increment", type=int, required=True) + square_parser.set_defaults(func=run_square_bench) + + range_parser = subparsers.add_parser("range_bench") + range_parser.add_argument("--dim-start", type=int, required=True) + range_parser.add_argument("--dim-end", type=int, required=True) + range_parser.add_argument("--dim-increment", type=int, required=True) + range_parser.add_argument("--m-constant", type=int, default=None) + range_parser.add_argument("--n-constant", type=int, default=None) + range_parser.add_argument("--k-constant", type=int, default=None) + range_parser.set_defaults(func=run_range_bench) + + model_parser = subparsers.add_parser("model_bench") + model_parser.add_argument("--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES.keys()) + model_parser.add_argument("--tp-sizes", + nargs="+", + type=int, + default=DEFAULT_TP_SIZES) + model_parser.add_argument("--batch-sizes", + nargs="+", + type=int, + default=DEFAULT_BATCH_SIZES) + model_parser.set_defaults(func=run_model_bench) + + args = parser.parse_args() + args.func(args) diff --git a/csrc/ops.h b/csrc/ops.h index c10c34e085750..c0b4fa7f5d15e 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -115,6 +115,9 @@ void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& azp_adj, c10::optional const& azp, c10::optional const& bias); + +void cutlass_semi_structured_mm(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b); #endif void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, diff --git a/csrc/semi_structured/cusparselt/binding.py b/csrc/semi_structured/cusparselt/binding.py new file mode 100644 index 0000000000000..035c18abd312a --- /dev/null +++ b/csrc/semi_structured/cusparselt/binding.py @@ -0,0 +1,47 @@ +from torch.utils.cpp_extension import load +import os +import torch + +base_path = __file__.replace("spmm.py", "") + +if not os.path.exists(f"{base_path}/build"): + os.makedirs(f"{base_path}/build") + +if not os.path.exists(base_path + "/libcusparse_lt"): + os.system( + "wget https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.5.1.1-archive.tar.xz") + os.system("tar -xf libcusparse_lt-linux-x86_64-0.5.1.1-archive.tar.xz") + os.system(f"mv 
libcusparse_lt-linux-x86_64-0.5.1.1-archive {base_path}/libcusparse_lt") + os.system("rm libcusparse_lt-linux-x86_64-0.5.1.1-archive.tar.xz") + +pruner = load(name='pruner', + sources=[f'{base_path}/spmm_backend.cpp', + f'{base_path}/spmm_backend.cu', + ], + extra_cflags=[ + f'-L{base_path}/libcusparse_lt/lib', + '-lcusparse', + '-lcusparseLt', + '-ldl' + ], + extra_cuda_cflags=[ + f'-L{base_path}/libcusparse_lt/lib', + '-lcusparse', + '-lcusparseLt', + '-ldl' + ], + extra_ldflags=[ + f'-L{base_path}/libcusparse_lt/lib', + '-lcusparse', + '-lcusparseLt', + '-ldl' + ], + extra_include_paths=[ + base_path + '/libcusparse_lt/include' + ], + build_directory=f'{base_path}/build', + with_cuda=True, + verbose=False) + +init_flag = pruner.init_cusparse_lt() +assert init_flag == 0, "Failed to initialize CuSparseLT" \ No newline at end of file diff --git a/csrc/semi_structured/cusparselt/cusparselt_mm.cu b/csrc/semi_structured/cusparselt/cusparselt_mm.cu new file mode 100644 index 0000000000000..0e088b35c7b87 --- /dev/null +++ b/csrc/semi_structured/cusparselt/cusparselt_mm.cu @@ -0,0 +1,1077 @@ +/* + * Copyright 1993-2023 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. 
+ * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ +#include // cudaMalloc, cudaMemcpy, etc. +#include // cusparseLt header +#include // printf +#include // std::rand +#include // std::vector +#include +#include + + +#define INT8_OUTPUT_TYPE int32_t //at::Half //int8_t +#define INT8_OUTPUT_TYPE_CUDA CUDA_R_8I //CUDA_R_32I +#define INT8_OUTPUT_TYPE_TORCH torch::kInt32 //torch::kInt32 + + +#define MAX(a, b) ((abs(a) > abs(b) ? (a) : (b))) +#define MIN(a, b) ((abs(a) < abs(b) ? (a) : (b))) + + +#define CHECK_CUDA(func) \ +{ \ + cudaError_t status = (func); \ + if (status != cudaSuccess) { \ + printf("CUDA API failed at line %d with error: %s (%d)\n", \ + __LINE__, cudaGetErrorString(status), status); \ + return EXIT_FAILURE; \ + } \ +} + + +#define CHECK_CUDA_TORCH(func) \ +{ \ + cudaError_t status = (func); \ + if (status != cudaSuccess) { \ + printf("CUDA API failed at line %d with error: %s (%d)\n", \ + __LINE__, cudaGetErrorString(status), status); \ + return torch::ones(1); \ + } \ +} + + +#define CHECK_CUSPARSE(func) \ +{ \ + cusparseStatus_t status = (func); \ + if (status != CUSPARSE_STATUS_SUCCESS) { \ + printf("CUSPARSE API failed at line %d with error: %s (%d)\n", \ + __LINE__, cusparseGetErrorString(status), status); \ + return EXIT_FAILURE; \ + } \ +} + + +#define CHECK_CUSPARSE_TORCH(func) \ +{ \ + cusparseStatus_t status = (func); \ + if (status != CUSPARSE_STATUS_SUCCESS) { \ + printf("CUSPARSE API failed at line %d with error: %s (%d)\n", \ + __LINE__, cusparseGetErrorString(status), status); \ + return torch::ones(1); \ + } \ +} + +constexpr int EXIT_UNSUPPORTED = 2; + +cusparseLtHandle_t handle; + +float alpha = 1.0; +float beta = 0.0; + + +typedef struct { + at::Half data; + int index; +} indexed_half; + + +int init_cusparse_lt_cuda() +{ + int major_cc, minor_cc; + CHECK_CUDA( cudaDeviceGetAttribute(&major_cc, + cudaDevAttrComputeCapabilityMajor, 0) ) + CHECK_CUDA( cudaDeviceGetAttribute(&minor_cc, + cudaDevAttrComputeCapabilityMinor, 0) ) + if (!(major_cc == 8 && minor_cc == 0) && + !(major_cc == 8 && minor_cc == 6) && + !(major_cc == 8 && minor_cc == 9)) { + std::printf("\ncusparseLt is supported only on GPU devices with" + " compute capability == 8.0, 8.6, 8.9 current: %d.%d\n\n", + major_cc, minor_cc); + return EXIT_UNSUPPORTED; + } + CHECK_CUSPARSE( cusparseLtInit(&handle) ) + + return EXIT_SUCCESS; +} + + +typedef struct cusparseLtMatmulArgs_t { + cusparseLtMatmulPlan_t* plan; + cusparseLtMatmulDescriptor_t* matmul; + cusparseLtMatmulAlgSelection_t* alg_sel; + cudaStream_t* streams; + int num_streams; + cudaStream_t stream; + size_t workspace_size; +// void* d_workspace; + void *dCompressed; + int m; + int n; +// torch::Tensor grad; + + cusparseLtMatmulArgs_t() + { + plan = new cusparseLtMatmulPlan_t; + matmul = new cusparseLtMatmulDescriptor_t; + alg_sel = new cusparseLtMatmulAlgSelection_t; + streams = nullptr; + num_streams = 0; + stream = nullptr; + m = 0; + n = 0; + dCompressed = nullptr; + } + + ~cusparseLtMatmulArgs_t() + { + cusparseLtMatmulPlanDestroy(plan); +// cudaFree(d_workspace); + } +} cusparseLtMatmulArgs ; + + +std::vector matmul_args; + + +template +int setup_prune_matmul( const int m, + const int n, + const int k, + T *dSparse, + T *dDense, + int *index, + const bool transpose_A=false, + const bool transpose_B=false, + const bool sparseA=true, + const bool 
transposable_mask=false, + const bool is_sparse_pruned=false, + const bool check_sparsity=false, + cudaDataType_t input_type=CUDA_R_16F, + cudaDataType_t output_type=CUDA_R_16F, + cusparseComputeType compute_type=CUSPARSE_COMPUTE_16F) +{ + matmul_args.push_back(new cusparseLtMatmulArgs_t); + *index = matmul_args.size() - 1; + + auto args = matmul_args.back(); + args->m = m; + args->n = n; + + // Host problem definition, row-major order + // bigger sizes may require dynamic allocations + auto order = CUSPARSE_ORDER_ROW; + auto opA = transpose_A ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; + auto opB = transpose_B ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; + + bool is_rowmajor = (order == CUSPARSE_ORDER_ROW); + bool isA_transposed = (opA != CUSPARSE_OPERATION_NON_TRANSPOSE); + bool isB_transposed = (opB != CUSPARSE_OPERATION_NON_TRANSPOSE); + auto num_A_rows = (isA_transposed) ? k : m; + auto num_A_cols = (isA_transposed) ? m : k; + auto num_B_rows = (isB_transposed) ? n : k; + auto num_B_cols = (isB_transposed) ? k : n; + auto num_C_rows = m; + auto num_C_cols = n; + unsigned alignment = 16; + auto lda = (is_rowmajor) ? num_A_cols : num_A_rows; + auto ldb = (is_rowmajor) ? num_B_cols : num_B_rows; + auto ldc = (is_rowmajor) ? num_C_cols : num_C_rows; + auto C_height = (is_rowmajor) ? num_C_rows : num_C_cols; + auto C_size = C_height * ldc * sizeof(V); + + + cusparseLtMatDescriptor_t* matA; + cusparseLtMatDescriptor_t* matB; + cusparseLtMatDescriptor_t* matC; + matA = new cusparseLtMatDescriptor_t; + matB = new cusparseLtMatDescriptor_t; + matC = new cusparseLtMatDescriptor_t; + + V *dC, *dD; + CHECK_CUDA( cudaMalloc((void**) &dC, C_size) ) + dD = dC; + + int *d_valid; + CHECK_CUDA( cudaMalloc((void**) &d_valid, sizeof(int)) ) + + // matrix descriptor initialization + if(sparseA) + { + CHECK_CUSPARSE( cusparseLtStructuredDescriptorInit( + &handle, matA, num_A_rows, + num_A_cols, lda, alignment, + input_type, order, + CUSPARSELT_SPARSITY_50_PERCENT) ) + + CHECK_CUSPARSE( cusparseLtDenseDescriptorInit( + &handle, matB, num_B_rows, + num_B_cols, ldb, alignment, + input_type, order) ) + } + else + { + CHECK_CUSPARSE( cusparseLtStructuredDescriptorInit( + &handle, matB, num_B_rows, + num_B_cols, ldb, alignment, + input_type, order, + CUSPARSELT_SPARSITY_50_PERCENT) ) + + CHECK_CUSPARSE( cusparseLtDenseDescriptorInit( + &handle, matA, num_A_rows, + num_A_cols, lda, alignment, + input_type, order) ) + } + CHECK_CUSPARSE( cusparseLtDenseDescriptorInit( + &handle, matC, num_C_rows, + num_C_cols, ldc, alignment, + output_type, order) ) + + // matmul, algorithm selection, and plan initialization + CHECK_CUSPARSE( cusparseLtMatmulDescriptorInit( + &handle, args->matmul, opA, opB, + matA, matB, matC, matC, + compute_type) ) + + CHECK_CUSPARSE( cusparseLtMatmulAlgSelectionInit( + &handle, args->alg_sel, args->matmul, + CUSPARSELT_MATMUL_ALG_DEFAULT) ) + + CHECK_CUSPARSE( cusparseLtMatmulPlanInit(&handle, args->plan, args->matmul, args->alg_sel)) + + //-------------------------------------------------------------------------- + // Prune the A matrix (in-place) and check the correctness + if (!is_sparse_pruned){ + cusparseLtPruneAlg_t prune_alg = transposable_mask ? 
CUSPARSELT_PRUNE_SPMMA_TILE : CUSPARSELT_PRUNE_SPMMA_STRIP; + CHECK_CUSPARSE( cusparseLtSpMMAPrune(&handle, args->matmul, dSparse, dSparse, + prune_alg, args->stream) ) + } + if (check_sparsity) + { + CHECK_CUSPARSE( cusparseLtSpMMAPruneCheck(&handle, args->matmul, dSparse, d_valid, args->stream) ) + int is_valid; + CHECK_CUDA( cudaMemcpyAsync(&is_valid, d_valid, sizeof(int), cudaMemcpyDeviceToHost, args->stream) ) + CHECK_CUDA( cudaStreamSynchronize(args->stream) ) + if (is_valid != 0) { + std::printf("!!!! The matrix does not conform to the SpMMA sparsity pattern. " + "cusparseLtMatmul does not provide correct results\n"); + return EXIT_FAILURE; + } + } + + +// int *d_valid; +// CHECK_CUDA( cudaMalloc((void**) &d_valid, sizeof(int)) ) +// CHECK_CUSPARSE( cusparseLtSpMMAPruneCheck2( &handle, +// sparseA ? matA : matB, +// sparseA, +// sparseA ? opA : opB, +// dSparse, +// d_valid, +// args->stream) ) + +// int is_valid; +// CHECK_CUDA( cudaMemcpyAsync(&is_valid, d_valid, sizeof(int), +// cudaMemcpyDeviceToHost, args->stream) ) +// CHECK_CUDA( cudaStreamSynchronize(args->stream) ) +// if (is_valid != 0) { +// std::printf("!!!! The matrix has been pruned in a wrong way. " +// "cusparseLtMatmul will not provide correct results\n"); +// return EXIT_FAILURE; +// } + CHECK_CUDA( cudaFree(d_valid) ) + + //-------------------------------------------------------------------------- + // Compress the A matrix + size_t compressed_size, compressed_buffer_size; + void* dCompressedBuffer; + CHECK_CUSPARSE( cusparseLtSpMMACompressedSize(&handle, + args->plan, + &compressed_size, + &compressed_buffer_size) ) + + CHECK_CUDA( cudaMalloc((void**) &args->dCompressed, compressed_size) ) + CHECK_CUDA( cudaMalloc((void**) &dCompressedBuffer, + compressed_buffer_size) ) + + CHECK_CUSPARSE( cusparseLtSpMMACompress(&handle, + args->plan, + dSparse, + (T *) args->dCompressed, + dCompressedBuffer, + args->stream) ) + CHECK_CUDA( cudaFree(dCompressedBuffer) ) + + //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // Search the best kernel + if(sparseA) + { +// printf("%f, %f, %f, %f, %f, %f\n", alpha, beta, *dDense,0.,0.,0.);// , dDense[0], beta, dC[0], dD[0]); + CHECK_CUSPARSE( cusparseLtMatmulSearch(&handle, args->plan, &alpha, + (T*) args->dCompressed, dDense, &beta, + dC, dD, nullptr, + args->streams, args->num_streams) ) + } else { + CHECK_CUSPARSE( cusparseLtMatmulSearch(&handle, args->plan, &alpha, + dDense, (T*) args->dCompressed, &beta, + dC, dD, nullptr, + args->streams, args->num_streams) ) + } +// // otherwise, it is possible to set it directly: +// int alg = 0; +// CHECK_CUSPARSE( cusparseLtMatmulAlgSetAttribute( +// &handle, args->alg_sel, +// CUSPARSELT_MATMUL_ALG_CONFIG_ID, +// &alg, sizeof(alg))) + + + //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + CHECK_CUSPARSE( cusparseLtMatmulPlanInit(&handle, args->plan, args->matmul, args->alg_sel)) + + CHECK_CUSPARSE( cusparseLtMatmulGetWorkspace(&handle, args->plan, + &args->workspace_size)) + +// printf("workspace_size: %lu (MB)\n", args->workspace_size / 1024 / 1024); + CHECK_CUDA( cudaFree(dC) ) + cusparseLtMatDescriptorDestroy(matA); + cusparseLtMatDescriptorDestroy(matB); + cusparseLtMatDescriptorDestroy(matC); + + return EXIT_SUCCESS; +} + +int destroy_cusparse_matmul_cuda(int index){ + if (index > matmul_args.size() - 1) + throw std::runtime_error("Index out of range of matmul_args"); + + auto args = matmul_args[index]; + cusparseLtMatmulPlanDestroy(args->plan); + 
CHECK_CUDA(cudaFree(args->streams)); + CHECK_CUDA(cudaFree(args->dCompressed)); + matmul_args.erase(matmul_args.begin() + index); + + return EXIT_SUCCESS; +} + +torch::Tensor setup_spmatmul_cuda(torch::Tensor A, + torch::Tensor B, + const bool transpose_A=false, + const bool transpose_B=false, + const bool sparseA=true, + const bool transposable_mask=false, + const bool is_sparse_pruned=false, + const bool check_sparsity=false) { + auto index = torch::zeros({1}, torch::kInt32); + int result; + int m, k, n; + if(transpose_A && transpose_B) + { + m = A.size(1); + k = A.size(0); + n = B.size(0); + } else if(transpose_A) + { + m = A.size(1); + k = A.size(0); + n = B.size(1); + } else if(transpose_B) + { + m = A.size(0); + k = A.size(1); + n = B.size(0); + } else { + m = A.size(0); + k = A.size(1); + n = B.size(1); + } + switch (A.type().scalarType()) { + case torch::ScalarType::Half: + { + auto sparse_mat = sparseA ? A.data_ptr() : B.data_ptr(); + auto dense_mat = sparseA ? B.data_ptr() : A.data_ptr(); + at::Half *dCompressed; + result = setup_prune_matmul( m, + n, + k, + sparse_mat, + dense_mat, + index.data_ptr(), + transpose_A, + transpose_B, + sparseA, + transposable_mask, + is_sparse_pruned, + check_sparsity, + CUDA_R_16F, + CUDA_R_16F, + CUSPARSE_COMPUTE_16F); + break; + } + case torch::ScalarType::Char: + { + auto sparse_mat = sparseA ? A.data_ptr() : B.data_ptr(); + auto dense_mat = sparseA ? B.data_ptr() : A.data_ptr(); + int8_t *dCompressed; + result = setup_prune_matmul( m, + n, + k, + sparse_mat, + dense_mat, + index.data_ptr(), + transpose_A, + transpose_B, + sparseA, + transposable_mask, + is_sparse_pruned, + check_sparsity, + CUDA_R_8I, + INT8_OUTPUT_TYPE_CUDA, + CUSPARSE_COMPUTE_32I); + break;} + default: + { + std::cout << A.type().scalarType() << std::endl; + throw std::runtime_error("Unsupported data type"); + } + } + if(result == EXIT_SUCCESS) { + return index; + } else { + return -torch::ones({1}, torch::kInt32); + } +} + + +template +torch::Tensor matmul( T* dDense, + int index, + bool sparseA, + int m, + torch::TensorOptions options=torch::TensorOptions() + ) +{ + auto args = matmul_args[index]; + + torch::Tensor C = torch::zeros({m, args->n}, options); + auto dC = C.data_ptr(); + auto dD = dC; + auto dA = sparseA ? (T*) args->dCompressed : dDense; + auto dB = sparseA ? 
dDense : (T*) args->dCompressed; + void *d_workspace; + CHECK_CUDA_TORCH( cudaMalloc((void**) &d_workspace, args->workspace_size) ) + // Perform the matrix multiplication + CHECK_CUSPARSE_TORCH( cusparseLtMatmul(&handle, args->plan, &alpha, dA, dB, + &beta, dC, dD, d_workspace, args->streams, + args->num_streams) ) + CHECK_CUDA_TORCH( cudaFree(d_workspace) ) + return C; +} + + +torch::Tensor spmatmul_cuda(torch::Tensor Dense, + int index, + bool sparseA) +{ + switch (Dense.type().scalarType()) { + case torch::ScalarType::Half: { + auto options = torch::TensorOptions().dtype(torch::kHalf).device(torch::kCUDA); + return matmul(Dense.data_ptr(), index, sparseA, Dense.size(0), options); + } + case torch::ScalarType::Char: { + auto options = torch::TensorOptions().dtype(INT8_OUTPUT_TYPE_TORCH).device(torch::kCUDA); + return matmul(Dense.data_ptr(), index, sparseA, Dense.size(0), options); + } + default: + { + throw std::runtime_error("Unsupported data type"); + } + } +} + + +void save_grad_cuda(torch::Tensor grad, int index) +{ + auto args = matmul_args[index]; +// args->grad = grad.clone().detach(); +} + + +__global__ void prune_kernel( + const float* __restrict__ input, + float* __restrict__ output, + bool* __restrict__ mask, + size_t row_size) { + const int column = 4 * (blockIdx.x * blockDim.x + threadIdx.x); + const int index = blockIdx.y * row_size + column; + if (column < row_size) { + reinterpret_cast(&output[index])[0] = reinterpret_cast(&input[index])[0]; + if(abs(output[index]) > abs(output[index + 1])){ + output[index + 1] = 0.; + mask[index + 1] = true; + } else { + output[index] = 0.; + mask[index] = true; + } + if(abs(output[index + 2]) > abs(output[index + 3])){ + output[index + 3] = 0.; + mask[index + 3] = true; + } else { + output[index + 2] = 0.; + mask[index + 2] = true; + } + } +} + + +__global__ void prune_kernel( + const at::Half* __restrict__ input, + at::Half* __restrict__ output, + bool* __restrict__ mask, + size_t row_size) { + const int column = 8 * (blockIdx.x * blockDim.x + threadIdx.x); + const int index = blockIdx.y * row_size + column; + if (column < row_size) { + reinterpret_cast(&output[index])[0] = reinterpret_cast(&input[index])[0]; + at::Half min1, min2; + int min_idx1, min_idx2; + min1 = output[index]; + min_idx1 = index; + if(MIN(min1, output[index + 1]) == output[index + 1]){ + min1 = output[index + 1]; + min_idx1 = index + 1; + } + if(MIN(min1, output[index + 2]) == output[index + 2]){ + min1 = output[index + 2]; + min_idx1 = index + 2; + } + if(MIN(min1, output[index + 3]) == output[index + 3]){ + min1 = output[index + 3]; + min_idx1 = index + 3; + } + min2 = min_idx1 == index ? output[index + 1] : output[index]; + min_idx2 = min_idx1 == index ? 
index + 1 : index; + if((MIN(min2, output[index + 1]) == output[index + 1]) && min_idx1 != index + 1){ + min2 = output[index + 1]; + min_idx2 = index + 1; + } + if((MIN(min2, output[index + 2]) == output[index + 2]) && min_idx1 != index + 2){ + min2 = output[index + 2]; + min_idx2 = index + 2; + } + if((MIN(min2, output[index + 3]) == output[index + 3]) && min_idx1 != index + 3){ + min2 = output[index + 3]; + min_idx2 = index + 3; + } + output[min_idx1] = 0.; mask[min_idx1] = true; + output[min_idx2] = 0.; mask[min_idx2] = true; + + min1 = output[index + 4]; + min_idx1 = index + 4; + if(MIN(min1, output[index + 5]) == output[index + 5]){ + min1 = output[index + 5]; + min_idx1 = index + 5; + } + if(MIN(min1, output[index + 6]) == output[index + 6]){ + min1 = output[index + 6]; + min_idx1 = index + 6; + } + if(MIN(min1, output[index + 7]) == output[index + 7]){ + min1 = output[index + 7]; + min_idx1 = index + 7; + } + min2 = min_idx1 == index + 4 ? output[index + 5] : output[index + 4]; + min_idx2 = min_idx1 == index + 4 ? index + 5 : index + 4; + if((MIN(min2, output[index + 5]) == output[index + 5]) && min_idx1 != index + 5){ + min2 = output[index + 5]; + min_idx2 = index + 5; + } + if((MIN(min2, output[index + 6]) == output[index + 6]) && min_idx1 != index + 6){ + min2 = output[index + 6]; + min_idx2 = index + 6; + } + if((MIN(min2, output[index + 7]) == output[index + 7]) && min_idx1 != index + 7){ + min2 = output[index + 7]; + min_idx2 = index + 7; + } + + output[min_idx1] = 0.; mask[min_idx1] = true; + output[min_idx2] = 0.; mask[min_idx2] = true; + } +} + + +template +__device__ void find_kth_smallest( + int *smallest_idx, + const T* __restrict__ input, + const int k, + const int M, int index) { + int min_idx = 0; + T min = 6.0e4; + + for(int i = 0; i < M; i++) + { + bool ignore = false; + for(int j = 0; j < k; j++) + { + if(smallest_idx[j] == i) + { + ignore = true; + } + } + if(ignore) + { + continue; + } + if(MIN(min, input[i]) == input[i]){ + min = input[i]; + min_idx = i; + } + } + smallest_idx[k] = min_idx; +} + + +__global__ void prune_kernel( + const at::Half* __restrict__ input, + at::Half* __restrict__ output, + bool* __restrict__ mask, + size_t row_size, + const int N, + const int M) { + + const int column = M * (blockIdx.x * blockDim.x + threadIdx.x); + const int index = blockIdx.y * row_size + column; + if (column < row_size) { + for(int i = 0; i < M / 8; i++) + { + reinterpret_cast(&output[index + 8 * i])[0] = reinterpret_cast(&input[index + 8 * i])[0]; + } + + int min_idx_list[16]; + for(int k = 0; k < (M - N); k++) + { + find_kth_smallest(min_idx_list, &input[index], k, M, index); + } + + for(int i = 0; i < (M - N); i++) + { + output[min_idx_list[i] + index] = 0.; mask[min_idx_list[i] + index] = true; + } + } +} + + +__global__ void prune_kernel( + const float* __restrict__ input, + float* __restrict__ output, + bool* __restrict__ mask, + size_t row_size, + const int N, + const int M) { + + const int column = M * (blockIdx.x * blockDim.x + threadIdx.x); + const int index = blockIdx.y * row_size + column; + if (column < row_size) { + for(int i = 0; i < M / 4; i++) + { + reinterpret_cast(&output[index + 4 * i])[0] = reinterpret_cast(&input[index + 4 * i])[0]; + } + + int *min_idx_list; + min_idx_list = (int*)malloc((M - N) * sizeof(int)); + for(int k = 0; k < (M - N); k++) + { + find_kth_smallest(min_idx_list, &input[index], k, M, index); + } + + for(int i = 0; i < (M - N); i++) + { + output[min_idx_list[i] + index] = 0.; mask[min_idx_list[i] + index] = true; + } + } +} 
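+
+// Note on the prune_kernel variants in this file: each kernel walks a row in
+// groups of M consecutive elements (1:2, 2:4, 2:8, 2:16, or a runtime N:M),
+// zeroes the (M - N) entries with the smallest absolute value in each group,
+// and records the zeroed positions in `mask`. Magnitude comparisons go through
+// the MIN/MAX macros defined at the top of this file, which compare by abs().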
+ + +template +__global__ void prune_kernel( + const float* __restrict__ input, + float* __restrict__ output, + bool* __restrict__ mask, + size_t row_size) { + + const int column = M * (blockIdx.x * blockDim.x + threadIdx.x); + const int index = blockIdx.y * row_size + column; + if (column < row_size) { + for(int i = 0; i < M / 4; i++) + { + reinterpret_cast(&output[index + 4 * i])[0] = reinterpret_cast(&input[index + 4 * i])[0]; + } + + int min_idx_list[M - N]; + for(int k = 0; k < (M - N); k++) + { + find_kth_smallest(min_idx_list, &input[index], k, M, index); + } + + for(int i = 0; i < (M - N); i++) + { + output[min_idx_list[i] + index] = 0.; mask[min_idx_list[i] + index] = true; + } + } +} + + +template +__global__ void prune_kernel( + const at::Half* __restrict__ input, + at::Half* __restrict__ output, + bool* __restrict__ mask, + size_t row_size) { + + const int column = M * (blockIdx.x * blockDim.x + threadIdx.x); + const int index = blockIdx.y * row_size + column; + if (column < row_size) { + for(int i = 0; i < M / 8; i++) + { + reinterpret_cast(&output[index + 8 * i])[0] = reinterpret_cast(&input[index + 8 * i])[0]; + } + + int min_idx_list[M - N]; + for(int k = 0; k < (M - N); k++) + { + find_kth_smallest(min_idx_list, &input[index], k, M, index); + } + + for(int i = 0; i < (M - N); i++) + { + output[min_idx_list[i] + index] = 0.; mask[min_idx_list[i] + index] = true; + } + } +} + + +std::vector prune_cuda( + torch::Tensor input, const int N, const int M) { + + auto output = torch::zeros_like(input); + auto options = torch::TensorOptions().dtype(torch::kBool); + auto mask = torch::zeros_like(input, options); + + const auto batch_size = input.size(0); + const auto row_size = input.size(1); + + const int threads = 1024; + + if(N == 1 && M == 2) { + switch (input.type().scalarType()) { + case torch::ScalarType::Float: { + const dim3 blocks(((row_size / 4) + threads - 1) / threads, batch_size); + prune_kernel<<>>( + input.data(), + output.data(), + mask.data(), + row_size); + break; + } + case torch::ScalarType::Half: { + throw std::runtime_error("Half precision not supported for N=1, M=2"); + } + } + } + else if(N == 2 && M == 4) + { + switch (input.type().scalarType()) { + case torch::ScalarType::Float: { + throw std::runtime_error("Full precision not supported for N=2, M=4"); + break; + } + case torch::ScalarType::Half: { + const dim3 blocks(((row_size / 8) + threads - 1) / threads, batch_size); + prune_kernel<<>>( + input.data(), + output.data(), + mask.data(), + row_size); + } + } + } + else if((N == 2 && M == 8)) + { + switch (input.type().scalarType()){ + case torch::ScalarType::Float: { + const dim3 blocks(((row_size / M) + threads - 1) / threads, batch_size); + prune_kernel<2, 8><<>>( + input.data(), + output.data(), + mask.data(), + row_size); + break; + } + case torch::ScalarType::Half: { + const dim3 blocks(((row_size / M) + threads - 1) / threads, batch_size); + prune_kernel<2, 8><<>>( + input.data(), + output.data(), + mask.data(), + row_size); + } + } + } + else if((N == 2 && M == 16)) + { + switch (input.type().scalarType()){ + case torch::ScalarType::Float: { + const dim3 blocks(((row_size / M) + threads - 1) / threads, batch_size); + prune_kernel<2, 16><<>>( + input.data(), + output.data(), + mask.data(), + row_size); + break; + } + case torch::ScalarType::Half: { + const dim3 blocks(((row_size / M) + threads - 1) / threads, batch_size); + prune_kernel<2, 16><<>>( + input.data(), + output.data(), + mask.data(), + row_size); + } + } + } + else + { + if(M < 8 || M % 8 
!= 0) + { + throw std::runtime_error("M must be a multiple of 8"); + } + switch (input.type().scalarType()) { + case torch::ScalarType::Float: + { + const dim3 blocks(((row_size / M) + threads - 1) / threads, batch_size); + prune_kernel<<>>( + input.data(), + output.data(), + mask.data(), + row_size, + N, + M); + break; + } + case torch::ScalarType::Half: + { + const dim3 blocks(((row_size / M) + threads - 1) / threads, batch_size); + prune_kernel<<>>( + input.data(), + output.data(), + mask.data(), + row_size, + N, + M); + } + } + } + return {output, mask}; +} + + +__global__ void prune_and_compress_kernel( + const at::Half* __restrict__ input, + at::Half* __restrict__ output, + bool* __restrict__ mask, + size_t row_size) { + const int input_column = 16 * (blockIdx.x * blockDim.x + threadIdx.x); + const int output_column = 8 * (blockIdx.x * blockDim.x + threadIdx.x); + const int input_row = blockIdx.y * row_size; + const int output_row = blockIdx.y * (row_size / 2); + const int input_index = input_row + input_column; + const int output_index = output_row + output_column; + if (input_column < row_size) { + bool local_mask[16]; + reinterpret_cast(local_mask)[0] = reinterpret_cast(&mask[input_index])[0]; + + int local_index = 0; + #pragma unroll (2) + for(int i = 0; i < 2; i++) + { + at::Half local_data[8]; + reinterpret_cast(local_data)[0] = reinterpret_cast(&input[input_index + 8 * i])[0]; + #pragma unroll (8) + for(int j = 0; j < 8; j++) + { + if(local_mask[8 * i + j]) + { + output[local_index + output_index] = local_data[j]; + local_index++; + } + } + } + } +} + + +torch::Tensor prune_and_compress_cuda(torch::Tensor dense, torch::Tensor mask) +{ + auto row_size = dense.size(1); + auto batch_size = dense.size(0); + if(row_size % 16 != 0) + { + throw std::runtime_error("Pruning dimension should be a multiple of 128."); + } + auto options = torch::TensorOptions().dtype(torch::kHalf).device(torch::kCUDA); + torch::Tensor result = torch::zeros({dense.size(0), dense.size(1) / 2}, options); + const int threads = 1024; + switch (dense.type().scalarType()) { + case torch::ScalarType::Float: + { + throw std::runtime_error("Full precision not supported for prune_and_compress"); + } + case torch::ScalarType::Half: + { + const dim3 blocks(((row_size / 16) + threads - 1) / threads, batch_size); + prune_and_compress_kernel<<>>( + dense.data(), + result.data(), + mask.data(), + row_size); + } + } + return result; +} + + +__global__ void sparse_add_kernel( + const at::Half* __restrict__ mat1, + const at::Half* __restrict__ mat2, + const at::Half alpha, + const at::Half beta, + at::Half* __restrict__ output, + size_t row_size) { + const int column = 8 * (blockIdx.x * blockDim.x + threadIdx.x); + const int index = blockIdx.y * row_size + column; + if (column < row_size) { + at::Half mat1_local[8], mat2_local[8]; + reinterpret_cast(&mat1_local)[0] = reinterpret_cast(&mat1[index])[0]; + reinterpret_cast(&mat2_local)[0] = reinterpret_cast(&mat2[index])[0]; + #pragma unroll (8) + for(int i = 0; i < 8; i++) + { + output[index + i] = alpha * mat1_local[i] + beta * mat2_local[i]; + } + } + +} + + +torch::Tensor sparse_add_cuda(torch::Tensor dense, torch::Tensor sparse_index, torch::Tensor alpha, torch::Tensor beta) +{ + int row_size = dense.size(1); + int batch_size = dense.size(0); + if(row_size % 8 != 0) + { + throw std::runtime_error("Pruning dimension should be a multiple of 8."); + } + int index = sparse_index.item(); + auto args = matmul_args[index]; + torch::Tensor result = torch::zeros_like(dense); + 
const int threads = 1024; + switch (dense.type().scalarType()) { + case torch::ScalarType::Float: + { + throw std::runtime_error("Full precision not supported for prune_and_compress"); + } + case torch::ScalarType::Half: + { + const dim3 blocks(((row_size / 8) + threads - 1) / threads, batch_size); + sparse_add_kernel<<>>( + dense.data(), + (at::Half*) args->dCompressed, + alpha.item(), + beta.item(), + result.data(), + row_size); + } + } + return result; +} + + +__global__ void update_sparse_matrix_kernel( + const at::Half* __restrict__ new_data, + at::Half* __restrict__ output, + size_t row_size) { + const int column = 8 * (blockIdx.x * blockDim.x + threadIdx.x); + const int index = blockIdx.y * row_size + column; + if (column < row_size) { + reinterpret_cast(&output[index])[0] = reinterpret_cast(&new_data[index])[0]; + } +} + + +void update_sparse_matrix_cuda(torch::Tensor new_data, torch::Tensor sparse_idx) +{ + auto args = matmul_args[sparse_idx.item()]; + const int threads = 1024; + switch (new_data.type().scalarType()) { + case torch::ScalarType::Float: + { + throw std::runtime_error("Full precision not supported for prune_and_compress"); + } + case torch::ScalarType::Half: + { + cudaMemcpy(args->dCompressed, new_data.data(), new_data.size(0) * new_data.size(1) * sizeof(at::Half), cudaMemcpyDeviceToDevice); + } + } +} + + +// sparse = prune_and_compress(dense, mask) +// result = add_sparse_dense(sparse_idx, dense, alpha, beta) +// update_sparse(data, sparse_idx, sparse_transpose_idx) diff --git a/csrc/semi_structured/cusparselt/cusparselt_mm_entry.cu b/csrc/semi_structured/cusparselt/cusparselt_mm_entry.cu new file mode 100644 index 0000000000000..ddc5ef090ec2b --- /dev/null +++ b/csrc/semi_structured/cusparselt/cusparselt_mm_entry.cu @@ -0,0 +1,135 @@ +#include +#include // cusparseLt header +#include + +#define CHECK_CUDA_DEVICE(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) CHECK_CUDA_DEVICE(x); CHECK_CONTIGUOUS(x) + +int init_cusparse_lt_cuda(); +torch::Tensor setup_spmatmul_cuda(torch::Tensor A, + torch::Tensor B, + const bool transpose_A=false, + const bool transpose_B=false, + const bool sparseA=true, + const bool transposable_mask=false, + const bool is_sparse_pruned=false, + const bool check_sparsity=false); + + +torch::Tensor spmatmul_cuda(torch::Tensor Dense, + int index, + bool sparseA); + +int destroy_cusparse_matmul_cuda(int index); + +void save_grad_cuda(torch::Tensor grad, int index); + + +torch::Tensor init_cusparse_lt() { + int result = init_cusparse_lt_cuda(); + if(result == EXIT_SUCCESS) { + return torch::zeros({1}, torch::kInt32); + } else { + return torch::ones({1}, torch::kInt32); + } +} + + +torch::Tensor setup_spmatmul(torch::Tensor A, + torch::Tensor B, + const bool transpose_A=false, + const bool transpose_B=false, + const bool sparseA=true, + const bool transposable_mask=false, + const bool is_sparse_pruned=false, + const bool check_sparsity=false) { + + CHECK_INPUT(A); + CHECK_INPUT(B); + return setup_spmatmul_cuda(A, + B, + transpose_A, + transpose_B, + sparseA, + transposable_mask, + is_sparse_pruned, + check_sparsity); +} + + +torch::Tensor spmatmul( torch::Tensor Dense, + torch::Tensor index, + const bool sparseA=true) { + CHECK_INPUT(Dense); +// std::cout << Dense.data_ptr()[0] << std::endl; + auto result = spmatmul_cuda( Dense, + *index.data_ptr(), + sparseA); + return result; +} + +int destroy_cusparse_matmul(int index){ 
+ return destroy_cusparse_matmul_cuda(index); +} + +torch::Tensor save_grad(torch::Tensor input, torch::Tensor index) { + CHECK_INPUT(input); + save_grad_cuda(input, *index.data_ptr()); +} + + +std::vector prune_cuda(torch::Tensor input, const int N, const int M); + + +std::vector prune( + torch::Tensor input, const int N, const int M) { + CHECK_INPUT(input); + return prune_cuda(input, N, M); +} + + +torch::Tensor prune_and_compress_cuda(torch::Tensor input, torch::Tensor mask); + + +torch::Tensor prune_and_compress( + torch::Tensor input, torch::Tensor mask) { + CHECK_INPUT(input); + return prune_and_compress_cuda(input, mask); +} + + +torch::Tensor sparse_add_cuda(torch::Tensor dense, torch::Tensor sparse_index, torch::Tensor alpha, torch::Tensor beta); + + +torch::Tensor sparse_add( + torch::Tensor dense, torch::Tensor sparse_index, torch::Tensor alpha, torch::Tensor beta) { + CHECK_INPUT(dense); + return sparse_add_cuda(dense, sparse_index, alpha, beta); +} + + +void update_sparse_matrix_cuda(torch::Tensor new_data, torch::Tensor sparse_idx); + + +void update_sparse_matrix( + torch::Tensor new_data, torch::Tensor sparse_idx) { + CHECK_INPUT(new_data); + update_sparse_matrix_cuda(new_data, sparse_idx); +} + + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("init_cusparse_lt", &init_cusparse_lt, "Initialize CUSPARSE LT"); + m.def("setup_spmatmul", &setup_spmatmul, "Setup Sparse Matrix Multiplication"); + m.def("destroy_cusparse_matmul", &destroy_cusparse_matmul, "Destroy matmul arguments"); + m.def("spmatmul", &spmatmul, "Sparse Matrix Multiplication"); + m.def("save_grad", &save_grad, "Save Gradient"); + m.def("prune", &prune, "N:M Prune (CUDA)"); + m.def("prune_and_compress", &prune_and_compress, "Prune the dense matrix using the mask and store it in a " + "compressed tensor (CUDA)"); + m.def("sparse_add", &sparse_add, "Add the sparse matrix to the dense matrix and return a " + "compressed dense matrix(CUDA)"); + m.def("update_sparse_matrix", &update_sparse_matrix, "Update the sparse matrix with the new dense matrix " + "data (CUDA)"); +} diff --git a/csrc/semi_structured/cutlass/common.hpp b/csrc/semi_structured/cutlass/common.hpp new file mode 100644 index 0000000000000..bf04bb400790f --- /dev/null +++ b/csrc/semi_structured/cutlass/common.hpp @@ -0,0 +1,27 @@ +#pragma once + +#include "cutlass/cutlass.h" +#include + +/** + * Helper function for checking CUTLASS errors + */ +#define CUTLASS_CHECK(status) \ + { \ + TORCH_CHECK(status == cutlass::Status::kSuccess, \ + cutlassGetStatusString(status)) \ + } + +inline uint32_t next_pow_2(uint32_t const num) { + if (num <= 1) return num; + return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); +} + +inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) { + int max_shared_mem_per_block_opt_in = 0; + cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in, + cudaDevAttrMaxSharedMemoryPerBlockOptin, + device); + return max_shared_mem_per_block_opt_in; +} + diff --git a/csrc/semi_structured/cutlass/semi_structured_mm_c3x.cu b/csrc/semi_structured/cutlass/semi_structured_mm_c3x.cu new file mode 100644 index 0000000000000..794d325b36eba --- /dev/null +++ b/csrc/semi_structured/cutlass/semi_structured_mm_c3x.cu @@ -0,0 +1,223 @@ +// clang-format will break include orders +// clang-format off +#include + +#if defined CUDA_VERSION && CUDA_VERSION >= 12000 + +#include + +#include + +#include +#include +#include + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" +#include "cute/atom/mma_atom.hpp" +#include 
"cutlass/numeric_types.h" + +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" + +#include "broadcast_load_epilogue_c3x.hpp" +#include "common.hpp" +// clang-format on + +using namespace cute; + +/* + This file defines quantized GEMM operations using the CUTLASS 3.x API, for + NVIDIA GPUs with sm90a (Hopper) or later. + + Epilogue functions can be defined to post-process the output before it is + written to GPU memory. + Epilogues must contain a public type named EVTCompute of type Sm90EVT, + as well as a static prepare_args function that constructs an + EVTCompute::Arguments struct. +*/ + +namespace { + +template +struct cutlass_3x_sparse_gemm { + using ElementAB = ElementAB_; + using ElementD = ElementD_; + // using ElementAcc = + // typename std::conditional, int32_t, + // float>::type; + using ElementAcc = ElementD; + + using StrideD = Stride, Int<0>>; + using ElementC = void; + constexpr int AlignmentC = 128 / cutlass::sizeof_bits::value; + using LayoutTagC = cutlass::layout::ColumnMajor; + using StrideC = StrideD; + + constexpr int AlignmentAB = 128 / cutlass::sizeof_bits::value; + + // using CollectiveEpilogue = + // typename cutlass::epilogue::collective::CollectiveBuilder< + // cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, TileShape, + // ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto, + // ElementAcc, float, ElementC, StrideC, 4, ElementD, StrideD, 4, + // EpilogueSchedule, EVTCompute>::CollectiveOp; + + using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + TileShape, ClusterShape, + cutlass::epilogue::collective::EpilogueTileAuto, + ElementAcc, ElementAcc, + ElementC, LayoutTagC, AlignmentC, + ElementD, LayoutTagC, AlignmentC, + EpilogueSchedule + >::CollectiveOp; + + // static constexpr size_t CEStorageSize = + // sizeof(typename CollectiveEpilogue::SharedStorage); + // using Stages = typename cutlass::gemm::collective::StageCountAutoCarveout< + // static_cast(CEStorageSize)>; + + // using CollectiveMainloop = + // typename cutlass::gemm::collective::CollectiveBuilder< + // cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + // ElementAB, cutlass::layout::RowMajor, 16, + // ElementAB, cutlass::layout::ColumnMajor, 16, + // ElementAcc, TileShape, ClusterShape, + // Stages, + // KernelSchedule>::CollectiveOp; + + using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder< + // cutlass::arch::Sm90, cutlass::arch::OpClassSparseTensorOp, + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + ElementAB, cutlass::layout::RowMajor, AlignmentAB, + ElementAB, cutlass::layout::ColumnMajor, AlignmentAB, + ElementAcc, + TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(sizeof(typename CollectiveEpilogue::SharedStorage)) + >, + KernelSchedule + >::CollectiveOp; + + // using KernelType = enable_sm90_or_later, CollectiveMainloop, CollectiveEpilogue, + // cutlass::gemm::PersistentScheduler>>; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + cute::Shape, + CollectiveMainloop, + CollectiveEpilogue +>; + + struct GemmKernel : public KernelType {}; +}; + +template +void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b) { + using ElementAB = typename Gemm::ElementAB; + using ElementD = typename 
Gemm::ElementD; + + int32_t m = a.size(0); + int32_t n = b.size(1); + int32_t k = a.size(1); + + int64_t lda = a.stride(0); + int64_t ldb = b.stride(1); + int64_t ldc = out.stride(0); + + using StrideA = Stride, int64_t>; + using StrideB = Stride, int64_t>; + using StrideC = typename Gemm::StrideC; + + StrideA a_stride{lda, Int<1>{}, 0}; + StrideB b_stride{ldb, Int<1>{}, 0}; + StrideC c_stride{ldc, Int<1>{}, Int<0>{}}; + + using GemmKernel = typename Gemm::GemmKernel; + typename GemmKernel::ProblemShape prob_shape{m, n, k, 1}; + + auto a_ptr = static_cast(a.data_ptr()); + auto b_ptr = static_cast(b.data_ptr()); + typename GemmKernel::MainloopArguments mainloop_args{a_ptr, a_stride, b_ptr, + b_stride}; + + auto c_ptr = static_cast(out.data_ptr()); + // typename GemmKernel::EpilogueArguments epilogue_args{ + // Gemm::Epilogue::prepare_args( + // std::forward(epilogue_params)...), + // c_ptr, c_stride, c_ptr, c_stride}; + + typename GemmKernel::Arguments args{cutlass::gemm::GemmUniversalMode::kGemm, + prob_shape, mainloop_args, epilogue_args}; + + // Launch the CUTLASS GEMM kernel. + using GemmOp = cutlass::gemm::device::GemmUniversalAdapter; + GemmOp gemm_op; + CUTLASS_CHECK(gemm_op.can_implement(args)); + + size_t workspace_size = gemm_op.get_workspace_size(args); + auto const workspace_options = + torch::TensorOptions().dtype(torch::kUInt8).device(a.device()); + auto workspace = torch::empty(workspace_size, workspace_options); + + auto stream = at::cuda::getCurrentCUDAStream(a.get_device()); + + cutlass::Status status = gemm_op.run(args, workspace.data_ptr(), stream); + CUTLASS_CHECK(status); +} + +template +struct sm90_fp8_config_default { + // M in (128, inf) + static_assert(std::is_same()); + using KernelSchedule = + cutlass::gemm::KernelTmaWarpSpecialized; + using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_128, _128, _128>; + using ClusterShape = Shape<_1, _2, _1>; + using Cutlass3xGemm = + cutlass_3x_sparse_gemm; +}; + +} // namespace + +template +void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b) { + static_assert(std::is_same()); + TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn); + TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn); + + using Cutlass3xGemmDefault = + typename sm90_fp8_config_default::Cutlass3xGemm; + + return cutlass_gemm_caller(out, a, b); +} + +void cutlass_semi_structured_mm_sm90(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b) { + TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn); + TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn); + + if (out.dtype() == torch::kBFloat16) { + return cutlass_gemm_sm90_fp8_dispatch( + out, a, b); + } else { + TORCH_CHECK(out.dtype() == torch::kFloat16); + return cutlass_gemm_sm90_fp8_dispatch( + out, a, b); + } + // TODO: Add other data types +} + +#endif diff --git a/csrc/semi_structured/cutlass/semi_structured_mm_entry.cu b/csrc/semi_structured/cutlass/semi_structured_mm_entry.cu new file mode 100644 index 0000000000000..0d570a48b39ac --- /dev/null +++ b/csrc/semi_structured/cutlass/semi_structured_mm_entry.cu @@ -0,0 +1,54 @@ +#include + +#include +#include + +#if defined ENABLE_SCALED_MM_C3X && ENABLE_SCALED_MM_C3X +void cutlass_semi_structured_mm_sm90(torch::Tensor& c, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + c10::optional const& bias); +#endif + +int32_t get_sm_version_num() { + int32_t major_capability, minor_capability; + 
cudaDeviceGetAttribute(&major_capability, cudaDevAttrComputeCapabilityMajor,
+                         0);
+  cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor,
+                         0);
+  int32_t version_num = major_capability * 10 + minor_capability;
+  return version_num;
+}
+
+void cutlass_semi_structured_mm(torch::Tensor& c, torch::Tensor const& a,
+                                torch::Tensor const& b) {
+  // Checks for conformality
+  TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
+  TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&
+              b.size(1) == c.size(1));
+
+  // Check for strides and alignment
+  TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1);  // Row-major
+  TORCH_CHECK(b.stride(0) == 1);                      // Column-major
+  TORCH_CHECK(c.stride(0) % 16 == 0 &&
+              b.stride(1) % 16 == 0);  // 16 Byte Alignment
+
+  at::cuda::OptionalCUDAGuard const device_guard(device_of(a));
+  int32_t version_num = get_sm_version_num();
+  // Hopper
+
+  // TODO: Guard against compilation issues for sm90 kernels
+// #if defined ENABLE_SCALED_MM_C3X && ENABLE_SCALED_MM_C3X
+  if (version_num >= 90) {
+    cutlass_semi_structured_mm_sm90(c, a, b);
+    return;
+  }
+// #endif
+
+  TORCH_CHECK_NOT_IMPLEMENTED(
+      false,
+      "No compiled cutlass_semi_structured_mm for a compute capability less than "
+      "CUDA device capability: ",
+      version_num);
+}
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index b999028fe06a9..ff14f7fb97b52 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -264,6 +264,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.def("cutlass_scaled_mm_supports_fp8(int cuda_device_capability) -> bool");
   ops.impl("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8);
 
+  // CUTLASS sparse GEMM, supporting semi-structured sparsity
+  ops.def(
+      "cutlass_semi_structured_mm(Tensor! out, Tensor a,"
+      "                           Tensor b) -> ()");
+  ops.impl("cutlass_semi_structured_mm", torch::kCUDA,
+           &cutlass_semi_structured_mm);
+
   // Mamba selective scan kernel
   ops.def(
       "selective_scan_fwd(Tensor! u, Tensor! 
delta," diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index a25f7abca5498..e8efd5f339ced 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -509,6 +509,21 @@ def cutlass_scaled_mm_azp(a: torch.Tensor, return out +def cutlass_semi_structured_mm(a: torch.Tensor, + b: torch.Tensor, + out_dtype: torch.dtype) -> torch.Tensor: + assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0) + assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16) + + m = a.shape[0] + n = b.shape[1] + out = torch.empty((m, n), dtype=out_dtype, device=a.device) + + torch.ops._C.cutlass_semi_structured_mm(out, a, b) + + return out + + # aqlm def aqlm_gemm(input: torch.Tensor, codes: torch.Tensor, codebooks: torch.Tensor, scales: torch.Tensor, From 17f5b963d30eff3d74b1627b5489f3a35e9cfcb2 Mon Sep 17 00:00:00 2001 From: Faraz Shahsavan Date: Mon, 28 Oct 2024 02:39:35 +0000 Subject: [PATCH 02/92] Update with test code --- CMakeLists.txt | 93 +- .../semi_structured_benchmarks.py | 171 +- .../cutlass_benchmarks/test_benchmarks.py | 367 +++ csrc/ops.h | 11 + .../broadcast_load_epilogue_c3x.hpp | 447 ++++ csrc/quantization/cutlass_test/common.hpp | 27 + .../quantization/cutlass_test/common_gemm.cuh | 568 +++++ .../quantization/cutlass_test/device_memory.h | 377 +++ .../example/62_hopper_sparse_gemm.cu | 596 +++++ .../cutlass_test/example/Makefile | 68 + .../cutlass_test/example/util/command_line.h | 313 +++ .../cutlass_test/example/util/distribution.h | 154 ++ .../example/util/gather_tensor.hpp | 215 ++ .../cutlass_test/example/util/helper.h | 108 + .../cutlass_test/example/util/host_tensor.h | 541 +++++ .../example/util/packed_stride.hpp | 570 +++++ .../util/reference/detail/inner_product.h | 135 ++ .../reference/detail/linear_to_coordinate.h | 94 + .../util/reference/device/convolution.h | 1549 ++++++++++++ .../example/util/reference/device/gemm.h | 385 +++ .../util/reference/device/gemm_complex.h | 350 +++ .../reference/device/gemm_planar_complex.h | 311 +++ .../example/util/reference/device/gett.hpp | 146 ++ .../util/reference/device/kernel/gemm.h | 162 ++ .../device/kernel/tensor_elementwise.h | 168 ++ .../reference/device/kernel/tensor_foreach.h | 159 ++ .../util/reference/device/rank_2k_complex.h | 355 +++ .../util/reference/device/tensor_compare.h | 246 ++ .../util/reference/device/tensor_fill.h | 2077 +++++++++++++++++ .../util/reference/device/tensor_foreach.h | 144 ++ .../util/reference/device/tensor_reduce.h | 510 ++++ .../util/reference/device/tensor_relu.h | 141 ++ .../util/reference/device/thread/gemm.h | 186 ++ .../example/util/reference/host/conv.hpp | 698 ++++++ .../example/util/reference/host/convolution.h | 802 +++++++ .../util/reference/host/error_metrics.h | 66 + .../example/util/reference/host/gemm.h | 531 +++++ .../util/reference/host/gemm_complex.h | 210 ++ .../util/reference/host/gemm_planar_complex.h | 228 ++ .../example/util/reference/host/gett.hpp | 538 +++++ .../example/util/reference/host/rank_2k.h | 261 +++ .../util/reference/host/rank_2k_complex.h | 318 +++ .../util/reference/host/rank_k_complex.h | 234 ++ .../example/util/reference/host/symm.h | 285 +++ .../util/reference/host/symm_complex.h | 319 +++ .../util/reference/host/tensor_compare.h | 423 ++++ .../util/reference/host/tensor_compare.hpp | 101 + .../example/util/reference/host/tensor_copy.h | 256 ++ .../util/reference/host/tensor_elementwise.h | 341 +++ .../example/util/reference/host/tensor_fill.h | 1718 ++++++++++++++ .../util/reference/host/tensor_fill.hpp | 432 ++++ 
.../util/reference/host/tensor_foreach.h | 134 ++ .../example/util/reference/host/tensor_norm.h | 42 + .../util/reference/host/tensor_reduce.h | 203 ++ .../util/reference/host/tensor_reduce.hpp | 203 ++ .../example/util/reference/host/trmm.h | 215 ++ .../util/reference/host/trmm_complex.h | 262 +++ .../example/util/tensor_view_io.h | 270 +++ csrc/quantization/cutlass_test/exceptions.h | 69 + csrc/quantization/cutlass_test/helper.h | 94 + csrc/quantization/cutlass_test/host_tensor.h | 541 +++++ .../cutlass_test/packed_stride.hpp | 570 +++++ csrc/quantization/cutlass_test/test_mm_c3x.cu | 205 ++ .../cutlass_test/test_mm_entry.cu | 82 + csrc/quantization/cutlass_test/test_util.cu | 199 ++ .../cutlass_w8a8/scaled_mm_c3x.cu | 4 +- .../cutlass/semi_structured_mm_c3x.cu | 231 +- .../cutlass/semi_structured_mm_entry.cu | 16 +- csrc/torch_bindings.cpp | 17 + sane_cute_errors.py | 119 + vllm/_custom_ops.py | 42 +- 71 files changed, 22778 insertions(+), 245 deletions(-) create mode 100644 benchmarks/cutlass_benchmarks/test_benchmarks.py create mode 100644 csrc/quantization/cutlass_test/broadcast_load_epilogue_c3x.hpp create mode 100644 csrc/quantization/cutlass_test/common.hpp create mode 100644 csrc/quantization/cutlass_test/common_gemm.cuh create mode 100644 csrc/quantization/cutlass_test/device_memory.h create mode 100644 csrc/quantization/cutlass_test/example/62_hopper_sparse_gemm.cu create mode 100644 csrc/quantization/cutlass_test/example/Makefile create mode 100644 csrc/quantization/cutlass_test/example/util/command_line.h create mode 100644 csrc/quantization/cutlass_test/example/util/distribution.h create mode 100644 csrc/quantization/cutlass_test/example/util/gather_tensor.hpp create mode 100644 csrc/quantization/cutlass_test/example/util/helper.h create mode 100644 csrc/quantization/cutlass_test/example/util/host_tensor.h create mode 100644 csrc/quantization/cutlass_test/example/util/packed_stride.hpp create mode 100644 csrc/quantization/cutlass_test/example/util/reference/detail/inner_product.h create mode 100644 csrc/quantization/cutlass_test/example/util/reference/detail/linear_to_coordinate.h create mode 100644 csrc/quantization/cutlass_test/example/util/reference/device/convolution.h create mode 100644 csrc/quantization/cutlass_test/example/util/reference/device/gemm.h create mode 100644 csrc/quantization/cutlass_test/example/util/reference/device/gemm_complex.h create mode 100644 csrc/quantization/cutlass_test/example/util/reference/device/gemm_planar_complex.h create mode 100644 csrc/quantization/cutlass_test/example/util/reference/device/gett.hpp create mode 100644 csrc/quantization/cutlass_test/example/util/reference/device/kernel/gemm.h create mode 100644 csrc/quantization/cutlass_test/example/util/reference/device/kernel/tensor_elementwise.h create mode 100644 csrc/quantization/cutlass_test/example/util/reference/device/kernel/tensor_foreach.h create mode 100644 csrc/quantization/cutlass_test/example/util/reference/device/rank_2k_complex.h create mode 100644 csrc/quantization/cutlass_test/example/util/reference/device/tensor_compare.h create mode 100644 csrc/quantization/cutlass_test/example/util/reference/device/tensor_fill.h create mode 100644 csrc/quantization/cutlass_test/example/util/reference/device/tensor_foreach.h create mode 100644 csrc/quantization/cutlass_test/example/util/reference/device/tensor_reduce.h create mode 100644 csrc/quantization/cutlass_test/example/util/reference/device/tensor_relu.h create mode 100644 
csrc/quantization/cutlass_test/example/util/reference/device/thread/gemm.h create mode 100644 csrc/quantization/cutlass_test/example/util/reference/host/conv.hpp create mode 100644 csrc/quantization/cutlass_test/example/util/reference/host/convolution.h create mode 100644 csrc/quantization/cutlass_test/example/util/reference/host/error_metrics.h create mode 100644 csrc/quantization/cutlass_test/example/util/reference/host/gemm.h create mode 100644 csrc/quantization/cutlass_test/example/util/reference/host/gemm_complex.h create mode 100644 csrc/quantization/cutlass_test/example/util/reference/host/gemm_planar_complex.h create mode 100644 csrc/quantization/cutlass_test/example/util/reference/host/gett.hpp create mode 100644 csrc/quantization/cutlass_test/example/util/reference/host/rank_2k.h create mode 100644 csrc/quantization/cutlass_test/example/util/reference/host/rank_2k_complex.h create mode 100644 csrc/quantization/cutlass_test/example/util/reference/host/rank_k_complex.h create mode 100644 csrc/quantization/cutlass_test/example/util/reference/host/symm.h create mode 100644 csrc/quantization/cutlass_test/example/util/reference/host/symm_complex.h create mode 100644 csrc/quantization/cutlass_test/example/util/reference/host/tensor_compare.h create mode 100644 csrc/quantization/cutlass_test/example/util/reference/host/tensor_compare.hpp create mode 100644 csrc/quantization/cutlass_test/example/util/reference/host/tensor_copy.h create mode 100644 csrc/quantization/cutlass_test/example/util/reference/host/tensor_elementwise.h create mode 100644 csrc/quantization/cutlass_test/example/util/reference/host/tensor_fill.h create mode 100644 csrc/quantization/cutlass_test/example/util/reference/host/tensor_fill.hpp create mode 100644 csrc/quantization/cutlass_test/example/util/reference/host/tensor_foreach.h create mode 100644 csrc/quantization/cutlass_test/example/util/reference/host/tensor_norm.h create mode 100644 csrc/quantization/cutlass_test/example/util/reference/host/tensor_reduce.h create mode 100644 csrc/quantization/cutlass_test/example/util/reference/host/tensor_reduce.hpp create mode 100644 csrc/quantization/cutlass_test/example/util/reference/host/trmm.h create mode 100644 csrc/quantization/cutlass_test/example/util/reference/host/trmm_complex.h create mode 100644 csrc/quantization/cutlass_test/example/util/tensor_view_io.h create mode 100644 csrc/quantization/cutlass_test/exceptions.h create mode 100644 csrc/quantization/cutlass_test/helper.h create mode 100644 csrc/quantization/cutlass_test/host_tensor.h create mode 100644 csrc/quantization/cutlass_test/packed_stride.hpp create mode 100644 csrc/quantization/cutlass_test/test_mm_c3x.cu create mode 100644 csrc/quantization/cutlass_test/test_mm_entry.cu create mode 100644 csrc/quantization/cutlass_test/test_util.cu create mode 100644 sane_cute_errors.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 7f6d1c66b2cf7..a13a1e8065e21 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -203,12 +203,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case. 
- set(CUTLASS_REVISION "v3.5.1" CACHE STRING "CUTLASS revision to use") + set(CUTLASS_REVISION "v3.6.0" CACHE STRING "CUTLASS revision to use") FetchContent_Declare( cutlass GIT_REPOSITORY https://github.com/nvidia/cutlass.git - GIT_TAG v3.5.1 + GIT_TAG be692b48b01620eedabeef8325df5d4eeed6c2ae GIT_PROGRESS TRUE # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history. @@ -226,7 +226,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "csrc/quantization/gguf/gguf_kernel.cu" "csrc/custom_all_reduce.cu" "csrc/permute_cols.cu" - "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu") + "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" + "csrc/quantization/cutlass_test/test_mm_entry.cu" + "csrc/quantization/cutlass_test/test_util.cu" + "csrc/semi_structured/cutlass/semi_structured_mm_entry.cu") set_gencode_flags_for_srcs( SRCS "${VLLM_EXT_SRC}" @@ -283,6 +286,90 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") set(SCALED_MM_3X_ARCHS) endif() + # + # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require + # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now). + cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS) + set(SRCS "csrc/quantization/cutlass_test/test_util.cu") + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${SCALED_MM_3X_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1") + message(STATUS "Building test_util for archs: ${SCALED_MM_3X_ARCHS}") + else() + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS) + message(STATUS "Not building test_util as CUDA Compiler version is " + "not >= 12.0, we recommend upgrading to CUDA 12.0 or " + "later if you intend on running FP8 quantized models on " + "Hopper.") + else() + message(STATUS "Not building test_util as no compatible archs found " + "in CUDA target architectures") + endif() + + # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't + # build any 3x kernels + set(SCALED_MM_3X_ARCHS) + endif() + + # + # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require + # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now). + cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS) + set(SRCS "csrc/quantization/cutlass_test/test_mm_c3x.cu") + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${SCALED_MM_3X_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1") + message(STATUS "Building test_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}") + else() + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS) + message(STATUS "Not building test_mm_c3x as CUDA Compiler version is " + "not >= 12.0, we recommend upgrading to CUDA 12.0 or " + "later if you intend on running FP8 quantized models on " + "Hopper.") + else() + message(STATUS "Not building test_mm_c3x as no compatible archs found " + "in CUDA target architectures") + endif() + + # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't + # build any 3x kernels + set(SCALED_MM_3X_ARCHS) + endif() + + # + # The cutlass_semi_structured_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require + # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now). 
+ cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS) + set(SRCS "csrc/semi_structured/cutlass/semi_structured_mm_c3x.cu") + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${SCALED_MM_3X_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1") + message(STATUS "Building semi_structured_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}") + else() + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS) + message(STATUS "Not building semi_structured_mm_c3x as CUDA Compiler version is " + "not >= 12.0, we recommend upgrading to CUDA 12.0 or " + "later if you intend on running FP8 quantized models on " + "Hopper.") + else() + message(STATUS "Not building scaled_mm_c3x as no compatible archs found " + "in CUDA target architectures") + endif() + + # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't + # build any 3x kernels + set(SCALED_MM_3X_ARCHS) + endif() + # # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x) # kernels for the remaining archs that are not already built for 3x. diff --git a/benchmarks/cutlass_benchmarks/semi_structured_benchmarks.py b/benchmarks/cutlass_benchmarks/semi_structured_benchmarks.py index 61eed3da41458..ebe6668a89e43 100644 --- a/benchmarks/cutlass_benchmarks/semi_structured_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/semi_structured_benchmarks.py @@ -30,6 +30,18 @@ def to_int8(tensor: torch.Tensor) -> torch.Tensor: return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) +def to_fp16(tensor: torch.Tensor) -> torch.Tensor: + finfo = torch.finfo(torch.float16) + return torch.round(tensor.clamp( + min=finfo.min, max=finfo.max)).to(dtype=torch.float16) + + +def to_fp32(tensor: torch.Tensor) -> torch.Tensor: + finfo = torch.finfo(torch.float) + return torch.round(tensor.clamp( + min=finfo.min, max=finfo.max)).to(dtype=torch.float) + + def make_rand_tensors(dtype: torch.dtype, m: int, n: int, k: int) -> Tuple[torch.Tensor, torch.Tensor]: a = torch.randn((m, k), device='cuda') * 5 @@ -39,6 +51,10 @@ def make_rand_tensors(dtype: torch.dtype, m: int, n: int, return to_int8(a), to_int8(b) if dtype == torch.float8_e4m3fn: return to_fp8(a), to_fp8(b) + if dtype == torch.float16: + return to_fp16(a), to_fp16(b) + if dtype == torch.float: + return to_fp32(a), to_fp32(b) raise ValueError("unsupported dtype") @@ -61,150 +77,35 @@ def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args, description=description, ).blocked_autorange(min_run_time=min_run_time) - -def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str, - sub_label: str) -> Iterable[TMeasurement]: - assert dtype == torch.int8 - a, b = make_rand_tensors(torch.int8, m, n, k) - scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) - scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) - bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16) - azp = torch.zeros((m, ), device="cuda", dtype=torch.int32) - azp_adj = torch.zeros((n, ), device="cuda", dtype=torch.int32) - - timers = [] - # pytorch impl - bfloat16 - timers.append( - bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul", - torch.mm, a.to(dtype=torch.bfloat16), - b.to(dtype=torch.bfloat16))) - - # pytorch impl - float16 - timers.append( - bench_fn(label, sub_label, - "pytorch_fp16_fp16_fp16_matmul", torch.mm, - a.to(dtype=torch.float16), 
b.to(dtype=torch.float16))) - - # cutlass impl - bfloat16 - timers.append( - bench_fn(label, sub_label, "cutlass_bf16_bf16_bf16_semi_structured_mm", - torch.mm, a.to(dtype=torch.bfloat16), - b.to(dtype=torch.bfloat16))) - - # cutlass impl - float16 - timers.append( - bench_fn(label, sub_label, - "cutlass_fp16_fp16_fp16_semi_structured_mm", - torch.mm, a.to(dtype=torch.float16), - b.to(dtype=torch.float16))) - - # cutlass impl - timers.append( - bench_fn(label, sub_label, "cutlass_i8_i8_bf16_semi_structured_mm", - ops.cutlass_semi_structured_mm, a, b, scale_a, scale_b, - torch.bfloat16)) - - return timers - - -def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str, +def bench_fp32(dtype: torch.dtype, m: int, k: int, n: int, label: str, sub_label: str) -> Iterable[TMeasurement]: - assert dtype == torch.float8_e4m3fn - a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k) - scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) - scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) - bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16) + assert dtype == torch.float + a, b = make_rand_tensors(torch.float, m, n, k) timers = [] - # pytorch impl w. bf16 + # pytorch impl w. fp32 timers.append( - bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales", - torch.mm, a.to(dtype=torch.bfloat16, device="cuda"), - b.to(dtype=torch.bfloat16, device="cuda"))) - - # # pytorch impl: bf16 output, without fp8 fast accum - # timers.append( - # bench_fn(label, - # sub_label, - # "pytorch_fp8_fp8_bf16_semi_structured_mm", - # torch._semi_structured_mm, - # a, - # b, - # scale_a=scale_a, - # scale_b=scale_b, - # out_dtype=torch.bfloat16)) - - # # pytorch impl: bf16 output, with fp8 fast accum - # timers.append( - # bench_fn(label, - # sub_label, - # "pytorch_fp8_fp8_bf16_semi_structured_mm_fast_accum", - # torch._semi_structured_mm, - # a, - # b, - # scale_a=scale_a, - # scale_b=scale_b, - # out_dtype=torch.bfloat16, - # use_fast_accum=True)) - - # # pytorch impl: fp16 output, without fp8 fast accum - # timers.append( - # bench_fn(label, - # sub_label, - # "pytorch_fp8_fp8_fp16_semi_structured_mm", - # torch._semi_structured_mm, - # a, - # b, - # scale_a=scale_a, - # scale_b=scale_b, - # out_dtype=torch.float16)) - - # # pytorch impl: fp16 output, with fp8 fast accum - # timers.append( - # bench_fn(label, - # sub_label, - # "pytorch_fp8_fp8_fp16_semi_structured_mm_fast_accum", - # torch._semi_structured_mm, - # a, - # b, - # scale_a=scale_a, - # scale_b=scale_b, - # out_dtype=torch.float16, - # use_fast_accum=True)) - - # cutlass impl: bf16 output - timers.append( - bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_semi_structured_mm", - ops.cutlass_semi_structured_mm, a, b, - torch.bfloat16)) - # cutlass impl: fp16 output + bench_fn(label, sub_label, "pytorch_f32_f32_f32_matmul-no-scales", + torch.mm, a.to(dtype=torch.float, device="cuda"), + b.to(dtype=torch.float, device="cuda"))) + + # cutlass impl: fp32 timers.append( - bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_semi_structured_mm", + bench_fn(label, sub_label, "cutlass_fp32_fp32_fp32_semi_structured_mm", ops.cutlass_semi_structured_mm, a, b, - torch.float16)) - - # # cutlass impl: bf16 output, with bias - # timers.append( - # bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_semi_structured_mm_bias", - # ops.cutlass_semi_structured_mm, a, b, scale_a, scale_b, - # torch.bfloat16, bias)) - - # # cutlass impl: fp16 output, with bias - # timers.append( - # bench_fn(label, sub_label, 
"cutlass_fp8_fp8_fp16_semi_structured_mm_bias", - # ops.cutlass_semi_structured_mm, a, b, scale_a, scale_b, - # torch.float16, bias.to(dtype=torch.float16))) - + torch.float)) + return timers def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str, sub_label: str) -> Iterable[TMeasurement]: - if dtype == torch.int8: - return bench_int8(dtype, m, k, n, label, sub_label) - if dtype == torch.float8_e4m3fn: - return bench_fp8(dtype, m, k, n, label, sub_label) + return bench_fp32(torch.float, m, k, n, label, sub_label) + # if dtype == torch.int8: + # return bench_int8(dtype, m, k, n, label, sub_label) + # if dtype == torch.float8_e4m3fn: + # return bench_fp8(dtype, m, k, n, label, sub_label) raise ValueError("unsupported type") @@ -312,6 +213,10 @@ def to_torch_dtype(dt): return torch.int8 if dt == "fp8": return torch.float8_e4m3fn + if dt == "fp16": + return torch.float16 + if dt == "fp32": + return torch.float raise ValueError("unsupported dtype") parser = FlexibleArgumentParser( @@ -335,7 +240,7 @@ def to_torch_dtype(dt): parser.add_argument("--dtype", type=to_torch_dtype, required=True, - help="Available options are ['int8', 'fp8']") + help="Available options are ['int8', 'fp8', 'fp16', 'fp32']") subparsers = parser.add_subparsers(dest="cmd") square_parser = subparsers.add_parser("square_bench") diff --git a/benchmarks/cutlass_benchmarks/test_benchmarks.py b/benchmarks/cutlass_benchmarks/test_benchmarks.py new file mode 100644 index 0000000000000..4d1884dcd2135 --- /dev/null +++ b/benchmarks/cutlass_benchmarks/test_benchmarks.py @@ -0,0 +1,367 @@ +import argparse +import copy +import itertools +import pickle as pkl +import time +from typing import Callable, Iterable, List, Tuple + +import torch +import torch.utils.benchmark as TBenchmark +from torch.utils.benchmark import Measurement as TMeasurement +from weight_shapes import WEIGHT_SHAPES + +from vllm import _custom_ops as ops +from vllm.utils import FlexibleArgumentParser + +DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) +DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] +DEFAULT_TP_SIZES = [1] + +# helpers + + +def to_fp8(tensor: torch.Tensor) -> torch.Tensor: + finfo = torch.finfo(torch.float8_e4m3fn) + return torch.round(tensor.clamp( + min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) + + +def to_int8(tensor: torch.Tensor) -> torch.Tensor: + return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) + + +def make_rand_tensors(dtype: torch.dtype, m: int, n: int, + k: int) -> Tuple[torch.Tensor, torch.Tensor]: + a = torch.randn((m, k), device='cuda') * 5 + b = torch.randn((n, k), device='cuda').t() * 5 + + if dtype == torch.int8: + return to_int8(a), to_int8(b) + if dtype == torch.float8_e4m3fn: + return to_fp8(a), to_fp8(b) + + raise ValueError("unsupported dtype") + + +# bench +def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args, + **kwargs) -> TMeasurement: + min_run_time = 1 + + globals = { + "args": args, + "kwargs": kwargs, + "fn": fn, + } + return TBenchmark.Timer( + stmt="fn(*args, **kwargs)", + globals=globals, + label=label, + sub_label=sub_label, + description=description, + ).blocked_autorange(min_run_time=min_run_time) + + +def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str, + sub_label: str) -> Iterable[TMeasurement]: + assert dtype == torch.int8 + a, b = make_rand_tensors(torch.int8, m, n, k) + a_compressed, e = cutlass_sparsify_and_compress_entry(a) + scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) + scale_b = 
torch.tensor(1.0, device="cuda", dtype=torch.float32) + bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16) + azp = torch.zeros((m, ), device="cuda", dtype=torch.int32) + azp_adj = torch.zeros((n, ), device="cuda", dtype=torch.int32) + + timers = [] + # pytorch impl - bfloat16 + timers.append( + bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales", + torch.mm, a.to(dtype=torch.bfloat16), + b.to(dtype=torch.bfloat16))) + + # pytorch impl - float16 + timers.append( + bench_fn(label, sub_label, + "pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm, + a.to(dtype=torch.float16), b.to(dtype=torch.float16))) + + # cutlass impl + timers.append( + bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm", + ops.cutlass_scaled_test_mm, a_compressed, e, b, scale_a, scale_b, + torch.bfloat16)) + + # cutlass with bias + timers.append( + bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_bias", + ops.cutlass_scaled_test_mm, a_compressed, e, b, scale_a, scale_b, torch.bfloat16, + bias)) + + return timers + + +def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str, + sub_label: str) -> Iterable[TMeasurement]: + assert dtype == torch.float8_e4m3fn + a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k) + a_compressed, e = cutlass_sparsify_and_compress_entry(a) + scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) + scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) + bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16) + + timers = [] + + # pytorch impl w. bf16 + timers.append( + bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales", + torch.mm, a.to(dtype=torch.bfloat16, device="cuda"), + b.to(dtype=torch.bfloat16, device="cuda"))) + + # pytorch impl: bf16 output, without fp8 fast accum + timers.append( + bench_fn(label, + sub_label, + "pytorch_fp8_fp8_bf16_scaled_mm", + torch._scaled_mm, + a, + b, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=torch.bfloat16)) + + # pytorch impl: bf16 output, with fp8 fast accum + timers.append( + bench_fn(label, + sub_label, + "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum", + torch._scaled_mm, + a, + b, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=torch.bfloat16, + use_fast_accum=True)) + + # pytorch impl: fp16 output, without fp8 fast accum + timers.append( + bench_fn(label, + sub_label, + "pytorch_fp8_fp8_fp16_scaled_mm", + torch._scaled_mm, + a, + b, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=torch.float16)) + + # pytorch impl: fp16 output, with fp8 fast accum + timers.append( + bench_fn(label, + sub_label, + "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum", + torch._scaled_mm, + a, + b, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=torch.float16, + use_fast_accum=True)) + + # cutlass impl: bf16 output + timers.append( + bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm", + ops.cutlass_scaled_test_mm, a_compressed, e, b, scale_a, scale_b, + torch.bfloat16)) + # cutlass impl: fp16 output + timers.append( + bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm", + ops.cutlass_scaled_test_mm, a_compressed, e, b, scale_a, scale_b, torch.float16)) + + # cutlass impl: bf16 output, with bias + timers.append( + bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm_bias", + ops.cutlass_scaled_test_mm, a_compressed, e, b, scale_a, scale_b, torch.bfloat16, + bias)) + + # cutlass impl: fp16 output, with bias + timers.append( + bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm_bias", + ops.cutlass_scaled_test_mm, a_compressed, e, b, scale_a, 
scale_b, torch.float16, + bias.to(dtype=torch.float16))) + + return timers + + +def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str, + sub_label: str) -> Iterable[TMeasurement]: + if dtype == torch.int8: + return bench_int8(dtype, m, k, n, label, sub_label) + if dtype == torch.float8_e4m3fn: + return bench_fp8(dtype, m, k, n, label, sub_label) + raise ValueError("unsupported type") + + +# runner +def print_timers(timers: Iterable[TMeasurement]): + compare = TBenchmark.Compare(timers) + compare.print() + + +def run(dtype: torch.dtype, + MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]: + results = [] + for m, k, n in MKNs: + timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm", + f"MKN=({m}x{k}x{n})") + print_timers(timers) + results.extend(timers) + + return results + + +# output makers +def make_output(data: Iterable[TMeasurement], + MKNs: Iterable[Tuple[int, int, int]], + base_description: str, + timestamp=None): + print(f"== All Results {base_description} ====") + print_timers(data) + + # pickle all the results + timestamp = int(time.time()) if timestamp is None else timestamp + with open(f"{base_description}-{timestamp}.pkl", "wb") as f: + pkl.dump(data, f) + + +# argparse runners + + +def run_square_bench(args): + dim_sizes = list( + range(args.dim_start, args.dim_end + 1, args.dim_increment)) + MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes)) + data = run(args.dtype, MKNs) + + make_output(data, MKNs, f"square_bench-{args.dtype}") + + +def run_range_bench(args): + dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment)) + n = len(dim_sizes) + Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes + Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes + Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes + MKNs = list(zip(Ms, Ks, Ns)) + data = run(args.dtype, MKNs) + + make_output(data, MKNs, f"range_bench-{args.dtype}") + + +def run_model_bench(args): + print("Benchmarking models:") + for i, model in enumerate(args.models): + print(f"[{i}] {model}") + + def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]: + KNs = [] + for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]): + KN[tp_split_dim] = KN[tp_split_dim] // tp_size + KNs.append(KN) + return KNs + + model_bench_data = [] + models_tps = list(itertools.product(args.models, args.tp_sizes)) + for model, tp_size in models_tps: + Ms = args.batch_sizes + KNs = model_shapes(model, tp_size) + MKNs = [] + for m in Ms: + for k, n in KNs: + MKNs.append((m, k, n)) + + data = run(args.dtype, MKNs) + model_bench_data.append(data) + + # Print all results + for data, model_tp in zip(model_bench_data, models_tps): + model, tp_size = model_tp + print(f"== Results {args.dtype} {model}-TP{tp_size} ====") + print_timers(data) + + timestamp = int(time.time()) + + all_data = [] + for d in model_bench_data: + all_data.extend(d) + # pickle all data + with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f: + pkl.dump(all_data, f) + + +if __name__ == '__main__': + + def to_torch_dtype(dt): + if dt == "int8": + return torch.int8 + if dt == "fp8": + return torch.float8_e4m3fn + raise ValueError("unsupported dtype") + + parser = FlexibleArgumentParser( + description=""" +Benchmark Cutlass GEMM. 
+ + To run square GEMMs: + python3 ./benchmarks/cutlass_benchmarks/test_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64 + + To run constant N and K and sweep M: + python3 ./benchmarks/cutlass_benchmarks/test_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384 + + To run dimensions from a model: + python3 ./benchmarks/cutlass_benchmarks/test_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1 + + Output: + - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs. + """, # noqa: E501 + formatter_class=argparse.RawTextHelpFormatter) + + parser.add_argument("--dtype", + type=to_torch_dtype, + required=True, + help="Available options are ['int8', 'fp8']") + subparsers = parser.add_subparsers(dest="cmd") + + square_parser = subparsers.add_parser("square_bench") + square_parser.add_argument("--dim-start", type=int, required=True) + square_parser.add_argument("--dim-end", type=int, required=True) + square_parser.add_argument("--dim-increment", type=int, required=True) + square_parser.set_defaults(func=run_square_bench) + + range_parser = subparsers.add_parser("range_bench") + range_parser.add_argument("--dim-start", type=int, required=True) + range_parser.add_argument("--dim-end", type=int, required=True) + range_parser.add_argument("--dim-increment", type=int, required=True) + range_parser.add_argument("--m-constant", type=int, default=None) + range_parser.add_argument("--n-constant", type=int, default=None) + range_parser.add_argument("--k-constant", type=int, default=None) + range_parser.set_defaults(func=run_range_bench) + + model_parser = subparsers.add_parser("model_bench") + model_parser.add_argument("--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES.keys()) + model_parser.add_argument("--tp-sizes", + nargs="+", + type=int, + default=DEFAULT_TP_SIZES) + model_parser.add_argument("--batch-sizes", + nargs="+", + type=int, + default=DEFAULT_BATCH_SIZES) + model_parser.set_defaults(func=run_model_bench) + + args = parser.parse_args() + args.func(args) diff --git a/csrc/ops.h b/csrc/ops.h index c0b4fa7f5d15e..e5d798cc832dd 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -116,6 +116,17 @@ void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a, c10::optional const& azp, c10::optional const& bias); +bool cutlass_scaled_test_mm_supports_fp8(int64_t cuda_device_capability); + +void cutlass_scaled_test_mm(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& e, + torch::Tensor const& b, torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + c10::optional const& bias); + +bool cutlass_sparsify_and_compress_entry(torch::Tensor& a_compressed, torch::Tensor& e, + torch::Tensor const& a); + void cutlass_semi_structured_mm(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b); #endif diff --git a/csrc/quantization/cutlass_test/broadcast_load_epilogue_c3x.hpp b/csrc/quantization/cutlass_test/broadcast_load_epilogue_c3x.hpp new file mode 100644 index 0000000000000..58b1e8ff159fb --- /dev/null +++ b/csrc/quantization/cutlass_test/broadcast_load_epilogue_c3x.hpp @@ -0,0 +1,447 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. 
SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +// +// This file is a modified excerpt of +// include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp +// from https://github.com/NVIDIA/cutlass v3.5.0 +// It has been modified to support either row/column or scalar broadcasting +// where the tensor being loaded from is always passed in via a device pointer. +// This lets one compiled kernel handle all cases of per-tensor or +// per-channel/per-token quantization. +// +// This interface also allows the scales to be passed in as tensors that +// consistently reside on the device, which avoids an issue with a previous +// implementation where scalars needed to be on the CPU since they +// were passed in via float values. This created a potential performance hazard +// if scales were initially on the device, and caused torch.compile graphs +// breaks when moving scales to the CPU. 
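+//
+// Concretely, a per-tensor scale is passed as a one-element device tensor
+// (the broadcast flag is derived from numel() != 1), while per-token scales
+// use a column vector of length m and per-channel scales use a row vector of
+// length n, so a single compiled kernel serves all three layouts.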
+// +#pragma once + +// Turn off clang-format for the entire file to keep it close to upstream +// clang-format off + +#include "cutlass/cutlass.h" +#include "cutlass/arch/barrier.h" + +#include "cute/tensor.hpp" +#include "cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp" + +namespace cutlass::epilogue::fusion { + +using namespace cute; +using namespace detail; + +// Row vector broadcast +template< + int Stages, + class CtaTileShapeMNK, + class Element, + class StrideMNL = Stride<_0,_1,_0>, + int Alignment = 128 / sizeof_bits_v +> +struct Sm90RowOrScalarBroadcast { + static_assert(Stages == 0, "Row broadcast doesn't support smem usage"); + static_assert(is_static_v(StrideMNL{}))>); // batch stride can be dynamic or static + static_assert(take<0,2>(StrideMNL{}) == Stride<_0,_1>{}); + + struct SharedStorage { + array_aligned(CtaTileShapeMNK{})> smem; + }; + + // This struct has been modified to have a bool indicating that ptr_row is a + // scalar that must be broadcast, instead of containing a scalar that is + // valid if ptr_row is null. + struct Arguments { + Element const* ptr_row = nullptr; + bool row_broadcast = true; + StrideMNL dRow = {}; + }; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_HOST_DEVICE + Sm90RowOrScalarBroadcast() { } + + CUTLASS_HOST_DEVICE + Sm90RowOrScalarBroadcast(Params const& params, SharedStorage const& shared_storage) + : params(params) + , smem(const_cast(shared_storage.smem.data())) { } + + Params params; + Element *smem = nullptr; + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_zero() const { + return (!params.row_broadcast && *(params.ptr_row) == Element(0)); + } + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks( + GS_GTensor tGS_gRow_, GS_STensor tGS_sRow_, + GS_CTensor tGS_cRow_, Tiled_G2S tiled_g2s_, + SR_STensor tSR_sRow_, SR_RTensor tSR_rRow_, + CTensor tCcRow_, ThrResidue residue_tCcRow_, ThrNum thr_num_, Params const& params_) + : tGS_gRow(tGS_gRow_) + , tGS_sRow(tGS_sRow_) + , tGS_cRow(tGS_cRow_) + , tiled_G2S(tiled_g2s_) + , tSR_sRow(tSR_sRow_) + , tSR_rRow(tSR_rRow_) + , tCcRow(tCcRow_) + , residue_tCcRow(residue_tCcRow_) + , params(params_) {} + + GS_GTensor tGS_gRow; // (CPY,CPY_M,CPY_N) + GS_STensor tGS_sRow; // (CPY,CPY_M,CPY_N) + GS_CTensor tGS_cRow; // (CPY,CPY_M,CPY_N) + Tiled_G2S tiled_G2S; + + SR_STensor tSR_sRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + SR_RTensor tSR_rRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + + CTensor tCcRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + ThrResidue residue_tCcRow; // (m, n) + ThrNum thr_num; + Params const& params; + + CUTLASS_DEVICE void 
+ begin() { + if (!params.row_broadcast) { + fill(tSR_rRow, *(params.ptr_row)); + return; + } + + auto synchronize = [&] () { cutlass::arch::NamedBarrier::sync(thr_num, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); }; + Tensor tGS_gRow_flt = filter_zeros(tGS_gRow); + Tensor tGS_sRow_flt = filter_zeros(tGS_sRow); + Tensor tGS_cRow_flt = make_tensor(tGS_cRow.data(), make_layout(tGS_gRow_flt.shape(), tGS_cRow.stride())); + + for (int i = 0; i < size(tGS_gRow_flt); ++i) { + if (get<1>(tGS_cRow_flt(i)) >= size<1>(CtaTileShapeMNK{})) { + continue; // OOB of SMEM, + } + if (elem_less(tGS_cRow_flt(i), make_coord(get<0>(residue_tCcRow), get<1>(residue_tCcRow)))) { + tGS_sRow_flt(i) = tGS_gRow_flt(i); + } + else { + tGS_sRow_flt(i) = Element(0); // Set to Zero when OOB so LDS could be issue without any preds. + } + } + synchronize(); + } + + CUTLASS_DEVICE void + begin_loop(int epi_m, int epi_n) { + if (epi_m == 0) { // Assumes M-major subtile loop + if (!params.row_broadcast) return; // Do not issue LDS when row is scalar + Tensor tSR_sRow_flt = filter_zeros(tSR_sRow(_,_,_,epi_m,epi_n)); + Tensor tSR_rRow_flt = filter_zeros(tSR_rRow); + copy(tSR_sRow_flt, tSR_rRow_flt); + } + } + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + Array frg_row; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FragmentSize; ++i) { + frg_row[i] = tSR_rRow(epi_v * FragmentSize + i); + } + + return frg_row; + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [m, n, k, l] = args.tile_coord_mnkl; + using ThreadCount = decltype(size(args.tiled_copy)); + + Tensor mRow = make_tensor(make_gmem_ptr(params.ptr_row), make_shape(M,N,L), params.dRow); + Tensor gRow = local_tile(mRow(_,_,l), take<0,2>(args.tile_shape_mnk), make_coord(m, n)); // (CTA_M, CTA_N) + Tensor sRow = make_tensor(make_smem_ptr(smem), + make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{})), make_shape(_0{}, _1{})); // (CTA_M, CTA_N) + //// G2S: Gmem to Smem + auto tiled_g2s = make_tiled_copy(Copy_Atom{}, + Layout< Shape<_1, ThreadCount>, + Stride<_0, _1>>{}, + Layout<_1>{}); + auto thr_g2s = tiled_g2s.get_slice(args.thread_idx); + Tensor tGS_gRow = thr_g2s.partition_S(gRow); + Tensor tGS_sRow = thr_g2s.partition_D(sRow); + + //// G2S: Coord + auto cRow = make_identity_tensor(make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{}))); + Tensor tGS_cRow = thr_g2s.partition_S(cRow); + + //// S2R: Smem to Reg + Tensor tSR_sRow = sm90_partition_for_epilogue(sRow, args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tSR_rRow = make_tensor_like(take<0,3>(tSR_sRow)); // (CPY,CPY_M,CPY_N) + + return ConsumerStoreCallbacks( + tGS_gRow, + tGS_sRow, + tGS_cRow, tiled_g2s, + tSR_sRow, + tSR_rRow, + args.tCcD, + args.residue_cD, + ThreadCount{}, + params); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Column vector broadcast +template< + int Stages, + class CtaTileShapeMNK, + class Element, + class StrideMNL = Stride<_1,_0,_0>, + int Alignment = 128 / sizeof_bits_v +> +struct Sm90ColOrScalarBroadcast { + static_assert(Stages == 0, "Column broadcast doesn't support smem usage yet"); + static_assert(Alignment * sizeof_bits_v % 128 == 0, "sub-16B alignment not supported yet"); + static_assert( + 
(cute::is_same_v>) || // col vector broadcast, e.g. per-row alpha/bias + (cute::is_same_v>)); // batched col vector broadcast, e.g. batched per-row bias + + // Accumulator distributes col elements evenly amongst threads so we can just directly load from gmem + struct SharedStorage { }; + + // This struct has been modified to have a bool indicating that ptr_col is a + // scalar that must be broadcast, instead of containing a scalar that is + // valid if ptr_col is null. + struct Arguments { + Element const* ptr_col = nullptr; + bool col_broadcast = true; + StrideMNL dCol = {}; + }; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_zero() const { + return (!params.col_broadcast && *(params.ptr_col) == Element(0)); + } + + CUTLASS_HOST_DEVICE + Sm90ColOrScalarBroadcast() { } + + CUTLASS_HOST_DEVICE + Sm90ColOrScalarBroadcast(Params const& params, SharedStorage const& shared_storage) + : params(params) { } + + Params params; + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks( + GTensor&& tCgCol, + RTensor&& tCrCol, + CTensor&& tCcCol, + ProblemShape problem_shape, + Params const& params + ): + tCgCol(cute::forward(tCgCol)), + tCrCol(cute::forward(tCrCol)), + tCcCol(cute::forward(tCcCol)), + m(get<0>(problem_shape)), + params(params) {} + + GTensor tCgCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + RTensor tCrCol; + CTensor tCcCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + Params const& params; + int m; + + CUTLASS_DEVICE void + begin() { + Tensor pred = make_tensor(shape(tCgCol)); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(pred); ++i) { + pred(i) = get<0>(tCcCol(i)) < m; + } + + if (!params.col_broadcast) { + fill(tCrCol, *(params.ptr_col)); + return; + } + + // Filter so we don't issue redundant copies over stride-0 modes + // (only works if 0-strides are in same location, which is by construction) + copy_if(pred, filter(tCgCol), filter(tCrCol)); + } + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + Array frg_col; + Tensor tCrCol_mn = tCrCol(_,_,_,epi_m,epi_n); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FragmentSize; ++i) { + frg_col[i] = tCrCol_mn(epi_v * FragmentSize + i); + } + + return frg_col; + } + + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... 
Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + + auto [M, N, K, L] = args.problem_shape_mnkl; + Tensor mCol = make_tensor(make_gmem_ptr(params.ptr_col), make_shape(M,N,L), params.dCol); + Tensor tCgCol = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + mCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tCrCol = make_tensor_like(tCgCol); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + + // Generate an identity tensor matching the shape of the global tensor and + // partition the same way, this will be used to generate the predicate + // tensor for loading + Tensor cCol = make_identity_tensor(mCol.shape()); + Tensor tCcCol = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + cCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); + + return ConsumerStoreCallbacks( + cute::move(tCgCol), + cute::move(tCrCol), + cute::move(tCcCol), + args.problem_shape_mnkl, + params + ); + } +}; + +} diff --git a/csrc/quantization/cutlass_test/common.hpp b/csrc/quantization/cutlass_test/common.hpp new file mode 100644 index 0000000000000..bf04bb400790f --- /dev/null +++ b/csrc/quantization/cutlass_test/common.hpp @@ -0,0 +1,27 @@ +#pragma once + +#include "cutlass/cutlass.h" +#include + +/** + * Helper function for checking CUTLASS errors + */ +#define CUTLASS_CHECK(status) \ + { \ + TORCH_CHECK(status == cutlass::Status::kSuccess, \ + cutlassGetStatusString(status)) \ + } + +inline uint32_t next_pow_2(uint32_t const num) { + if (num <= 1) return num; + return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); +} + +inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) { + int max_shared_mem_per_block_opt_in = 0; + cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in, + cudaDevAttrMaxSharedMemoryPerBlockOptin, + device); + return max_shared_mem_per_block_opt_in; +} + diff --git a/csrc/quantization/cutlass_test/common_gemm.cuh b/csrc/quantization/cutlass_test/common_gemm.cuh new file mode 100644 index 0000000000000..b0298a6bf5971 --- /dev/null +++ b/csrc/quantization/cutlass_test/common_gemm.cuh @@ -0,0 +1,568 @@ +using namespace cute; + +/* + This file defines quantized GEMM operations using the CUTLASS 3.x API, for + NVIDIA GPUs with sm90a (Hopper) or later. + + Epilogue functions can be defined to post-process the output before it is + written to GPU memory. + Epilogues must contain a public type named EVTCompute of type Sm90EVT, + as well as a static prepare_args function that constructs an + EVTCompute::Arguments struct. +*/ + +namespace { + +// A wrapper for the GEMM kernel that is used to guard against compilation on +// architectures that will never use the kernel. The purpose of this is to +// reduce the size of the compiled binary. +// __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef +// into code that will be executed on the device where it is defined. +template +struct enable_sm90_or_later : Kernel { + template + CUTLASS_DEVICE void operator()(Args&&... args) { + #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 900 + Kernel::operator()(std::forward(args)...); + #endif + } +}; + +/* + * This class provides the common load descriptors for the + * ScaledEpilogue[...] 
classes + */ +template +struct ScaledEpilogueBase { + protected: + using Accum = cutlass::epilogue::fusion::Sm90AccFetch; + + template + using ColOrScalarLoad = cutlass::epilogue::fusion::Sm90ColOrScalarBroadcast< + 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, + Stride, Int<0>, Int<0>>>; + + template + using RowOrScalarLoad = cutlass::epilogue::fusion::Sm90RowOrScalarBroadcast< + 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, + Stride, Int<1>, Int<0>>>; + + // Don't want to support nullptr by default + template + using ColLoad = cutlass::epilogue::fusion::Sm90ColBroadcast< + 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, T, + Stride, Int<0>, Int<0>>, 128 / sizeof_bits_v, EnableNullPtr>; + + // Don't want to support nullptr by default + template + using RowLoad = cutlass::epilogue::fusion::Sm90RowBroadcast< + 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, T, + Stride, Int<1>, Int<0>>, 128 / sizeof_bits_v, EnableNullPtr>; + + // This utility function constructs the arguments for the load descriptors + // from a tensor. It can handle both row and column, as well as row/column or + // scalar cases. + template + static auto args_from_tensor(torch::Tensor const& tensor) { + using Arguments = typename Descriptor::Arguments; + auto* data_ptr = static_cast(tensor.data_ptr()); + if constexpr (std::is_same_v> || + std::is_same_v>) { + return Arguments{data_ptr, tensor.numel() != 1}; + } else { + static_assert(!std::is_same_v> && + !std::is_same_v>); + return Arguments{data_ptr}; + } + } + + // This overload handles the case where there might not be a tensor, in which + // case a nullptr is passed and a constant (0) is used. + template + static auto args_from_tensor(c10::optional const& tensor) { + using Arguments = typename Descriptor::Arguments; + auto* data_ptr = tensor ? static_cast(tensor->data_ptr()) : nullptr; + static_assert(std::is_same_v> || + std::is_same_v>); + return Arguments{data_ptr}; + } +}; + +/* + This epilogue function defines a quantized GEMM operation similar to + torch.scaled_mm_. + + A and B may be both either int8 or fp8_e4m3. A can be + quantized per-tensor or per-row. B can be quantized per-tensor or per-column. + Any combination of per-tensor and per-row or column is supported. + A and B must have symmetric quantization (zero point == 0). + + So the GEMM operation is D = (a_scales * A) (b_scales * B), where the + scales are applied elementwise with numpy-style broadcasting. + + ScaleA and ScaleB define the epilogue functions that apply the scales for + the A and B operands respectively. These scales may be either per-tensor or + per row or column. 
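+
+   For example, with per-token a_scales of shape (m, 1) and per-channel
+   b_scales of shape (1, n), the output is
+     D[i, j] = a_scales[i] * b_scales[j] * sum_k A[i, k] * B[k, j],
+   and a per-tensor scale is the degenerate case where the vector collapses
+   to a single scalar.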
+*/ +template +struct ScaledEpilogue + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + + using Compute0 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTCompute0 = + cutlass::epilogue::fusion::Sm90EVT; + + using Compute1 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::fusion::Sm90EVT; + using ArgumentType = typename EVTCompute::Arguments; + + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + + typename EVTCompute0::Arguments evt0_args{b_args}; + return ArgumentType{a_args, evt0_args}; + } +}; + +/* + * This epilogue performs the same operation as ScaledEpilogue, but adds a bias. + * This bias can also be used in the per-tensor azp case, where the activation + * zero point (azp) is used to compute an azp correction term, + * which is folded into the bias. + * + * The bias tensor must be per-output channel. + * ScaleA and ScaleB can be per-tensor or per-token/per-channel. + */ +template +struct ScaledEpilogueBias + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + using Bias = typename SUPER::template RowLoad; + + using Compute0 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTCompute0 = + cutlass::epilogue::fusion::Sm90EVT; + + using Compute1 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiply_add, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::fusion::Sm90EVT; + + using ArgumentType = typename EVTCompute::Arguments; + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& bias) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + auto bias_args = SUPER::template args_from_tensor(bias); + + typename EVTCompute0::Arguments evt0_args{b_args}; + return ArgumentType{a_args, evt0_args, bias_args}; + } +}; + +/* + * This epilogue directly supports per-tensor azp in int32 form. + * As opposed to the per-token epilogue below, this epilogue only has an azp_adj + * term, which should already be multiplied with the scalar azp. + * The azp_adj term is a 1D tensor of shape (1,n), computed as azp * J @ B. + * + * This epilogue also supports bias, which remains per-channel. 
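+ *
+ * Written out, the computation is
+ *   D = scale_a * (scale_b * (Acc - azp_adj)) + bias,
+ * where Acc is the int32 accumulator of the quantized GEMM and azp_adj is
+ * broadcast across the rows of the output.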
+ */ +template +struct ScaledEpilogueBiasAzp + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + using Bias = typename SUPER::template RowLoad; + + // This is the full AZP term, azp * J @ B, shape (1,n) + using AzpWithAdj = typename SUPER::template RowLoad; + + // Compute float(accum - azp_adj), both operands are int32_t + using ComputeAzp = cutlass::epilogue::fusion::Sm90Compute< + cutlass::minus, float, int32_t, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeAzp = + cutlass::epilogue::fusion::Sm90EVT; + + using ComputeScaleB = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeScaleB = + cutlass::epilogue::fusion::Sm90EVT; + + using ComputeScaleBiasA = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiply_add, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::fusion::Sm90EVT; + using ArgumentType = typename EVTCompute::Arguments; + + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& azp_adj, + c10::optional const& bias) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + auto bias_args = SUPER::template args_from_tensor(bias); + auto azp_adj_args = + SUPER::template args_from_tensor(azp_adj); + + typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args}; + typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_azp_args}; + return ArgumentType{a_args, evt_scale_b_args, bias_args}; + } +}; + +/* + * This epilogue supports per-token azp by computing and applying + * the correction term using a rank-1 update. If the term were materialized, + * it would require O(m*n) space, and this way it only requires O(m+n) space. + * The azp term is a 1D tensor of shape (m,1), and represents the unscaled zero + * point for each row of A. + * The azp_adj term is a 1D tensor of shape (1,n), computed as J @ B. + * + * This epilogue also supports bias, which remains per-channel. 
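+ *
+ * In other words, the correction for element (i, j) is azp[i] * azp_adj[j],
+ * i.e. the outer product of the two vectors. The epilogue computes
+ * accum[i, j] - azp[i] * azp_adj[j] on the fly (ComputeAzp forms the product,
+ * ComputeAcc subtracts it from the accumulator), so only the (m,1) and (1,n)
+ * vectors ever need to be stored.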
+ */ +template +struct ScaledEpilogueBiasAzpToken + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + using Bias = typename SUPER::template RowLoad; + + // Per-token azp term, shape (m,1) + using Azp = typename SUPER::template ColLoad; + + // This is the AZP adjustment term, J @ B, shape (1,n) + using AzpAdj = typename SUPER::template RowLoad; + + // Compute azp * azp_adj + using ComputeAzp = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, int32_t, int32_t, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeAzp = + cutlass::epilogue::fusion::Sm90EVT; + + // Compute float(accum - azp*azp_adj), all operands are int32_t + using ComputeAcc = cutlass::epilogue::fusion::Sm90Compute< + cutlass::minus, float, int32_t, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeAcc = + cutlass::epilogue::fusion::Sm90EVT; + + using ComputeScaleB = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeScaleB = + cutlass::epilogue::fusion::Sm90EVT; + + using ComputeScaleBiasA = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiply_add, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::fusion::Sm90EVT; + using ArgumentType = typename EVTCompute::Arguments; + + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& azp_adj, + torch::Tensor const& azp, + c10::optional const& bias) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + auto bias_args = SUPER::template args_from_tensor(bias); + auto azp_args = SUPER::template args_from_tensor(azp); + auto azp_adj_args = + SUPER::template args_from_tensor(azp_adj); + + typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args}; + typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args}; + typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_acc_args}; + return ArgumentType{a_args, evt_scale_b_args, bias_args}; + } +}; + +template typename Epilogue_, + typename TileShape, typename ClusterShape, typename KernelSchedule, + typename EpilogueSchedule> +struct cutlass_3x_gemm { + using ElementAB = ElementAB_; + using ElementD = ElementD_; + using ElementAcc = + typename std::conditional, int32_t, + float>::type; + + using EpilogueDescriptor = + cutlass::epilogue::collective::detail::EpilogueDescriptor< + TileShape, cutlass::epilogue::collective::EpilogueTileAuto, ElementD, + ElementD, EpilogueSchedule>; + + using Epilogue = Epilogue_; + + using StrideD = Stride, Int<0>>; + using ElementC = void; + using StrideC = StrideD; + + using EVTCompute = typename Epilogue::EVTCompute; + + using CollectiveEpilogue = + typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, TileShape, + ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto, + ElementAcc, float, ElementC, StrideC, 4, ElementD, StrideD, 4, + EpilogueSchedule, EVTCompute>::CollectiveOp; + + static constexpr size_t CEStorageSize = + sizeof(typename CollectiveEpilogue::SharedStorage); + using Stages = typename cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(CEStorageSize)>; + + // 
clang-format off + using CollectiveMainloop = + typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassSparseTensorOp, + ElementAB, cutlass::layout::RowMajor, 32, + ElementAB, cutlass::layout::ColumnMajor, 16, + ElementAcc, TileShape, ClusterShape, + Stages, + KernelSchedule>::CollectiveOp; + // clang-format on + + using KernelType = enable_sm90_or_later, CollectiveMainloop, CollectiveEpilogue, + cutlass::gemm::PersistentScheduler>>; + + struct GemmKernel : public KernelType {}; +}; + +template +void cutlass_test_gemm_caller(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& e, torch::Tensor const& b, + EpilogueArgs&&... epilogue_params) { + using ElementAB = typename Gemm::ElementAB; + using ElementD = typename Gemm::ElementD; + + int32_t m = a.size(0); + int32_t n = b.size(1); + int32_t k = a.size(1); + + int64_t lda = a.stride(0); + int64_t ldb = b.stride(1); + int64_t ldc = out.stride(0); + + using StrideA = Stride, int64_t>; + using StrideB = Stride, int64_t>; + using StrideC = typename Gemm::StrideC; + + StrideA a_stride{lda, Int<1>{}, 0}; + StrideB b_stride{ldb, Int<1>{}, 0}; + StrideC c_stride{ldc, Int<1>{}, Int<0>{}}; + + using GemmKernel = typename Gemm::GemmKernel; + typename GemmKernel::ProblemShape prob_shape{m, n, k, 1}; + + using LayoutA = typename GemmKernel::CollectiveMainloop::LayoutA; + using LayoutE = typename GemmKernel::CollectiveMainloop::LayoutE; + + using ElementE = typename GemmKernel::CollectiveMainloop::ElementE; + using SparseConfig = typename GemmKernel::CollectiveMainloop::SparseConfig; + + LayoutA a_layout = SparseConfig::fill_layoutA(prob_shape); + LayoutE e_layout = SparseConfig::fill_layoutE(prob_shape); + + auto a_ptr = static_cast(a.data_ptr()); + auto b_ptr = static_cast(b.data_ptr()); + auto e_ptr = static_cast(e.data_ptr()); + typename GemmKernel::MainloopArguments mainloop_args{a_ptr, a_layout, + b_ptr, b_stride, + e_ptr, e_layout}; + + auto c_ptr = static_cast(out.data_ptr()); + typename GemmKernel::EpilogueArguments epilogue_args{ + Gemm::Epilogue::prepare_args( + std::forward(epilogue_params)...), + c_ptr, c_stride, c_ptr, c_stride}; + + typename GemmKernel::Arguments args{cutlass::gemm::GemmUniversalMode::kGemm, + prob_shape, mainloop_args, epilogue_args}; + + // Launch the CUTLASS GEMM kernel. 
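+  // The steps below follow the usual CUTLASS 3.x host-side flow: wrap the
+  // kernel in a device-level GemmUniversalAdapter, validate the arguments
+  // with can_implement(), allocate the workspace as a uint8 torch tensor on
+  // A's device, and run() on the current CUDA stream.
+  //
+  // A hypothetical call site for an fp8 instantiation could look like the
+  // sketch below; the template parameter list and the tensor names
+  // (a_compressed, e_metadata, a_scales, b_scales) are assumptions, since
+  // they are elided in this hunk:
+  //
+  //   using Cfg = sm90_fp8_config_default<cutlass::float_e4m3_t,
+  //                                       cutlass::bfloat16_t, ScaledEpilogue>;
+  //   cutlass_test_gemm_caller<typename Cfg::Cutlass3xGemm>(
+  //       out, a_compressed, e_metadata, b, a_scales, b_scales);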
+ using GemmOp = cutlass::gemm::device::GemmUniversalAdapter; + GemmOp gemm_op; + CUTLASS_CHECK(gemm_op.can_implement(args)); + + size_t workspace_size = gemm_op.get_workspace_size(args); + auto const workspace_options = + torch::TensorOptions().dtype(torch::kUInt8).device(a.device()); + auto workspace = torch::empty(workspace_size, workspace_options); + + auto stream = at::cuda::getCurrentCUDAStream(a.get_device()); + + cutlass::Status status = gemm_op.run(args, workspace.data_ptr(), stream); + CUTLASS_CHECK(status); +} + +template typename Epilogue> +struct sm90_fp8_config_default { + // M in (128, inf) + static_assert(std::is_same()); + using KernelSchedule = + cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_128, _128, _128>; + using ClusterShape = Shape<_2, _1, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + +template typename Epilogue> +struct sm90_fp8_config_M128 { + // M in (64, 128] + static_assert(std::is_same()); + using KernelSchedule = + cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_64, _128, _128>; + using ClusterShape = Shape<_2, _1, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + +template typename Epilogue> +struct sm90_fp8_config_M64 { + // M in [1, 64] + static_assert(std::is_same()); + using KernelSchedule = + cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_64, _64, _128>; + using ClusterShape = Shape<_1, _8, _1>; + + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + +template typename Epilogue> +struct sm90_int8_config_default { + // For M > 128 and any N + static_assert(std::is_same()); + using KernelSchedule = + typename cutlass::gemm::KernelTmaWarpSpecializedPingpong; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_128, _128, _128>; + using ClusterShape = Shape<_2, _1, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + +template typename Epilogue> +struct sm90_int8_config_M128 { + // For M in (64, 128] and any N + static_assert(std::is_same()); + using KernelSchedule = + typename cutlass::gemm::KernelTmaWarpSpecializedPingpong; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_64, _128, _128>; + using ClusterShape = Shape<_2, _1, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + +template typename Epilogue> +struct sm90_int8_config_M64 { + // For M in (32, 64] and any N + static_assert(std::is_same()); + using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_64, _64, _256>; + using ClusterShape = Shape<_1, _1, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + +template typename Epilogue> +struct sm90_int8_config_M32_NBig { + // For M in [1, 32] and N >= 8192 + static_assert(std::is_same()); + using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_64, _128, _256>; + using ClusterShape = Shape<_1, _4, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + +template typename Epilogue> +struct sm90_int8_config_M32_NSmall { + // For M in [1, 32] and N < 8192 + 
static_assert(std::is_same()); + using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized; + using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; + using TileShape = Shape<_64, _64, _256>; + using ClusterShape = Shape<_1, _8, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm; +}; + +} // namespace \ No newline at end of file diff --git a/csrc/quantization/cutlass_test/device_memory.h b/csrc/quantization/cutlass_test/device_memory.h new file mode 100644 index 0000000000000..7d3fa73f62df8 --- /dev/null +++ b/csrc/quantization/cutlass_test/device_memory.h @@ -0,0 +1,377 @@ +/****************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +/** + * \file + * \brief C++ interface to CUDA device memory management functions. 
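+ *
+ * The header provides free functions for raw allocation (allocate / free),
+ * typed copy helpers (copy_to_device, copy_to_host, copy_device_to_device,
+ * copy_host_to_host, insert_to_host, insert_to_device), and a
+ * DeviceAllocation smart-pointer wrapper, also exposed as
+ * device_memory::allocation, that owns a device buffer and frees it on
+ * destruction.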
+ */ + +#include +#include + +#include "cutlass/platform/platform.h" +#include "cutlass/numeric_types.h" +#include "cutlass/trace.h" +#include "exceptions.h" + +namespace cutlass { +namespace device_memory { + +/****************************************************************************** + * Allocation lifetime + ******************************************************************************/ + +/// Allocate a buffer of \p count elements of type \p T on the current CUDA device +template +T* allocate(size_t count = 1) { + + T* ptr = 0; + size_t bytes = 0; + + bytes = count * sizeof(T); + + cudaError_t cuda_error = cudaMalloc((void**)&ptr, bytes); + + if (cuda_error != cudaSuccess) { +#if (CUTLASS_DEBUG_TRACE_LEVEL > 0) + std::ostringstream os; + os << "cutlass::device_memory::allocate: cudaMalloc failed: bytes=" << bytes; + CUTLASS_TRACE_HOST(os.str()); +#endif + throw cuda_exception("Failed to allocate memory", cuda_error); + } +#if (CUTLASS_DEBUG_TRACE_LEVEL > 1) + else { + std::ostringstream os; + os << "cutlass::device_memory::allocate: Successful cudaMalloc: bytes=" << bytes; + CUTLASS_TRACE_HOST(os.str()); + } +#endif + + return ptr; +} + +/// Free the buffer pointed to by \p ptr +template +void free(T* ptr) { + if (ptr) { + cudaError_t cuda_error = (cudaFree(ptr)); + if (cuda_error != cudaSuccess) { + throw cuda_exception("Failed to free device memory", cuda_error); + } + } +} + +/****************************************************************************** + * Data movement + ******************************************************************************/ + +template +void copy(T* dst, T const* src, size_t count, cudaMemcpyKind kind) { + size_t bytes = count * sizeof_bits::value / 8; + if (bytes == 0 && count > 0) { + bytes = 1; + } + cudaError_t cuda_error = (cudaMemcpy(dst, src, bytes, kind)); + if (cuda_error != cudaSuccess) { + std::ostringstream os; + os << "cutlass::device_memory::copy: cudaMemcpy() failed: " + << "dst=" << dst << ", src=" << src + << ", bytes=" << bytes << ", count=" << count; + if (kind == cudaMemcpyHostToDevice) { + os << ", kind=cudaMemcpyHostToDevice"; + } + else if (kind == cudaMemcpyDeviceToHost) { + os << ", kind=cudaMemcpyDeviceToHost"; + } + else if (kind == cudaMemcpyDeviceToDevice) { + os << ", kind=cudaMemcpyDeviceToDevice"; + } + else if (kind == cudaMemcpyHostToHost) { + os << ", kind=cudaMemcpyHostToHost"; + } + else if (kind == cudaMemcpyDefault) { + os << ", kind=cudaMemcpyDefault"; + } + else { + os << ", kind=Unknown"; + } + os << ", error: " << cudaGetErrorString(cuda_error); + + throw cuda_exception(os.str().c_str(), cuda_error); + } +} + +template +void copy_to_device(T* dst, T const* src, size_t count = 1) { + copy(dst, src, count, cudaMemcpyHostToDevice); +} + +template +void copy_to_host(T* dst, T const* src, size_t count = 1) { + copy(dst, src, count, cudaMemcpyDeviceToHost); +} + +template +void copy_device_to_device(T* dst, T const* src, size_t count = 1) { + copy(dst, src, count, cudaMemcpyDeviceToDevice); +} + +template +void copy_host_to_host(T* dst, T const* src, size_t count = 1) { + copy(dst, src, count, cudaMemcpyHostToHost); +} + +/// Copies elements from device memory to host-side range +template +void insert_to_host(OutputIterator begin, OutputIterator end, T const* device_begin) { + size_t elements = end - begin; + copy_to_host(&*begin, device_begin, elements); +} + +/// Copies elements to device memory from host-side range +template +void insert_to_device(T* device_begin, InputIterator begin, InputIterator end) { + 
size_t elements = end - begin; + copy_to_device(device_begin, &*begin, elements); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace device_memory + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +class DeviceAllocation { +public: + + /// Delete functor for CUDA device memory + struct deleter { + void operator()(T* ptr) { + cudaError_t cuda_error = (cudaFree(ptr)); + if (cuda_error != cudaSuccess) { + // noexcept + // throw cuda_exception("cudaFree() failed", cuda_error); + return; + } + } + }; + +public: + // + // Data members + // + + /// Number of elements of T allocated on the current CUDA device + size_t capacity; + + /// Smart pointer + platform::unique_ptr smart_ptr; + +public: + + // + // Static methods + // + + /// Static member to compute the number of bytes needed for a given number of elements + static size_t bytes(size_t elements) { + if (sizeof_bits::value < 8) { + size_t const kElementsPerByte = 8 / sizeof_bits::value; + return elements / kElementsPerByte; + } + else { + size_t const kBytesPerElement = sizeof_bits::value / 8; + return elements * kBytesPerElement; + } + } + +public: + + // + // Methods + // + + /// Constructor: allocates no memory + DeviceAllocation() : capacity(0) {} + + /// Constructor: allocates \p capacity elements on the current CUDA device + DeviceAllocation(size_t _capacity) : + smart_ptr(device_memory::allocate(_capacity)), capacity(_capacity) {} + + /// Constructor: allocates \p capacity elements on the current CUDA device taking ownership of the allocation + DeviceAllocation(T *ptr, size_t _capacity) : smart_ptr(ptr), capacity(_capacity) {} + + /// Copy constructor + DeviceAllocation(DeviceAllocation const &p): + smart_ptr(device_memory::allocate(p.capacity)), capacity(p.capacity) { + + device_memory::copy_device_to_device(smart_ptr.get(), p.get(), capacity); + } + + /// Move constructor + DeviceAllocation(DeviceAllocation &&p): capacity(0) { + std::swap(smart_ptr, p.smart_ptr); + std::swap(capacity, p.capacity); + } + + /// Destructor + ~DeviceAllocation() { reset(); } + + /// Returns a pointer to the managed object + T* get() const { return smart_ptr.get(); } + + /// Releases the ownership of the managed object (without deleting) and resets capacity to zero + T* release() { + capacity = 0; + return smart_ptr.release(); + } + + /// Deletes the managed object and resets capacity to zero + void reset() { + capacity = 0; + smart_ptr.reset(); + } + + /// Deletes managed object, if owned, and allocates a new object + void reset(size_t _capacity) { + reset(device_memory::allocate(_capacity), _capacity); + } + + /// Deletes managed object, if owned, and replaces its reference with a given pointer and capacity + void reset(T* _ptr, size_t _capacity) { + smart_ptr.reset(_ptr); + capacity = _capacity; + } + + /// Allocates a new buffer and copies the old buffer into it. The old buffer is then released. 
+ void reallocate(size_t new_capacity) { + + platform::unique_ptr new_allocation(device_memory::allocate(new_capacity)); + + device_memory::copy_device_to_device( + new_allocation.get(), + smart_ptr.get(), + std::min(new_capacity, capacity)); + + std::swap(smart_ptr, new_allocation); + std::swap(new_capacity, capacity); + } + + /// Returns the number of elements + size_t size() const { + return capacity; + } + + /// Returns the number of bytes needed to store the allocation + size_t bytes() const { + return bytes(capacity); + } + + /// Returns a pointer to the object owned by *this + T* operator->() const { return smart_ptr.get(); } + + /// Returns the deleter object which would be used for destruction of the managed object. + deleter& get_deleter() { return smart_ptr.get_deleter(); } + + /// Returns the deleter object which would be used for destruction of the managed object (const) + const deleter& get_deleter() const { return smart_ptr.get_deleter(); } + + /// Copies a device-side memory allocation + DeviceAllocation & operator=(DeviceAllocation const &p) { + if (capacity != p.capacity) { + smart_ptr.reset(device_memory::allocate(p.capacity)); + capacity = p.capacity; + } + device_memory::copy_device_to_device(smart_ptr.get(), p.get(), capacity); + return *this; + } + + /// Move assignment + DeviceAllocation & operator=(DeviceAllocation && p) { + std::swap(smart_ptr, p.smart_ptr); + std::swap(capacity, p.capacity); + return *this; + } + + /// Copies the entire allocation from another location in device memory. + void copy_from_device(T const *ptr) const { + copy_from_device(ptr, capacity); + } + + /// Copies a given number of elements from device memory + void copy_from_device(T const *ptr, size_t elements) const { + device_memory::copy_device_to_device(get(), ptr, elements); + } + + void copy_to_device(T *ptr) const { + copy_to_device(ptr, capacity); + } + + void copy_to_device(T *ptr, size_t elements) const { + device_memory::copy_device_to_device(ptr, get(), elements); + } + + void copy_from_host(T const *ptr) const { + copy_from_host(ptr, capacity); + } + + void copy_from_host(T const *ptr, size_t elements) const { + device_memory::copy_to_device(get(), ptr, elements); + } + + void copy_to_host(T *ptr) const { + copy_to_host(ptr, capacity); + } + + void copy_to_host(T *ptr, size_t elements) const { + device_memory::copy_to_host(ptr, get(), elements); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace device_memory { + +/// Device allocation abstraction that tracks size and capacity +template +using allocation = cutlass::DeviceAllocation; + +} // namespace device_memory + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/quantization/cutlass_test/example/62_hopper_sparse_gemm.cu b/csrc/quantization/cutlass_test/example/62_hopper_sparse_gemm.cu new file mode 100644 index 0000000000000..5b7361f805098 --- /dev/null +++ b/csrc/quantization/cutlass_test/example/62_hopper_sparse_gemm.cu @@ -0,0 +1,596 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Hopper Sparse GEMM example. + + This example demonstrates how to construct and run a structured sparse GEMM kernel + on NVIDIA Hopper architecture. 
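+
+    In outline, the example parses the problem size and epilogue scalars from
+    the command line, fills A, B and C with random data, sparsifies A on the
+    host and compresses it into packed values plus E metadata with the
+    structured-sparse compressor, then runs the sparse kernel and a dense
+    reference kernel, compares the two D tensors, and benchmarks both paths.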
+ +*/ + +#include + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" +#include "cutlass/tensor_ref.h" +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/transform/device/transform_universal_adapter.hpp" +#include "cutlass/transform/kernel/sparse_gemm_compressor.hpp" + +#include "util/command_line.h" +#include "util/distribution.h" +#include "util/host_tensor.h" +#include "util/packed_stride.hpp" +#include "util/tensor_view_io.h" +#include "util/reference/device/gemm.h" +#include "util/reference/device/tensor_compare.h" +#include "util/reference/device/tensor_fill.h" + +#include "util/helper.h" + +using namespace cute; + +#if defined(CUTLASS_ARCH_MMA_SPARSE_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// GEMM kernel configurations +///////////////////////////////////////////////////////////////////////////////////////////////// + +// A matrix configuration +using ElementA = cutlass::half_t; // Element type for A matrix operand +using LayoutTagA = cutlass::layout::RowMajor; // Layout type for A matrix operand +constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes) + +// B matrix configuration +using ElementB = cutlass::half_t; // Element type for B matrix operand +using LayoutTagB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand +constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes) + +// C/D matrix configuration +using ElementC = float; // Element type for C and D matrix operands +using LayoutTagC = cutlass::layout::ColumnMajor; // Layout type for C and D matrix operands +constexpr int AlignmentC = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes) + +// Core kernel configurations +using ElementAccumulator = float; // Element type for internal accumulation +using TileShape = Shape<_128,_128,_128>; // Threadblock-level tile size for sparse kernel +using TileShapeRef = Shape<_128,_128, _64>; // Threadblock-level tile size for reference (dense) kernel +using ClusterShape = Shape<_1,_2,_1>; // Shape of the threadblocks in a cluster +using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecialized; // Kernel schedule policy +using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized; // Epilogue schedule policy + +using ProblemShape = Shape; + +// Sparse kernel setup + +using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + TileShape, ClusterShape, + cutlass::epilogue::collective::EpilogueTileAuto, + ElementAccumulator, ElementAccumulator, + ElementC, LayoutTagC, AlignmentC, + ElementC, LayoutTagC, AlignmentC, + EpilogueSchedule + >::CollectiveOp; + +using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassSparseTensorOp, + ElementA, LayoutTagA, AlignmentA, + ElementB, LayoutTagB, AlignmentB, + ElementAccumulator, + 
TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(sizeof(typename CollectiveEpilogue::SharedStorage))>, + KernelSchedule + >::CollectiveOp; + +using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + ProblemShape, + CollectiveMainloop, + CollectiveEpilogue +>; + +using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + +// Reference (dense) kernel setup + +using CollectiveEpilogueRef = typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + TileShapeRef, ClusterShape, + cutlass::epilogue::collective::EpilogueTileAuto, + ElementAccumulator, ElementAccumulator, + ElementC, LayoutTagC, AlignmentC, + ElementC, LayoutTagC, AlignmentC, + EpilogueSchedule + >::CollectiveOp; + +using CollectiveMainloopRef = typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + ElementA, LayoutTagA, AlignmentA, + ElementB, LayoutTagB, AlignmentB, + ElementAccumulator, + TileShapeRef, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(sizeof(typename CollectiveEpilogue::SharedStorage))>, + KernelSchedule + >::CollectiveOp; + +using GemmKernelRef = cutlass::gemm::kernel::GemmUniversal< + ProblemShape, + CollectiveMainloopRef, + CollectiveEpilogue +>; + +using GemmRef = cutlass::gemm::device::GemmUniversalAdapter; + +// Layouts +using LayoutA = typename Gemm::GemmKernel::CollectiveMainloop::LayoutA; +using LayoutE = typename Gemm::GemmKernel::CollectiveMainloop::LayoutE; +using StrideB = typename Gemm::GemmKernel::StrideB; +using StrideC = typename Gemm::GemmKernel::StrideC; +using StrideD = typename Gemm::GemmKernel::StrideD; + +// Layouts for reference (non-sparse) tensors +using StrideA = cutlass::gemm::TagToStrideA_t; +using StrideE = StrideA; + +using ElementE = typename Gemm::GemmKernel::CollectiveMainloop::ElementE; +using SparseConfig = typename Gemm::GemmKernel::CollectiveMainloop::SparseConfig; + +// Offline compressor kernel +using CompressorUtility = cutlass::transform::kernel::StructuredSparseCompressorUtility< + ProblemShape, + ElementA, + LayoutTagA, + SparseConfig>; + +using CompressorKernel = cutlass::transform::kernel::StructuredSparseCompressor< + ProblemShape, + ElementA, + LayoutTagA, + SparseConfig, + cutlass::arch::Sm90>; + +using Compressor = cutlass::transform::device::TransformUniversalAdapter; + +// +// Data members +// + +ProblemShape problem_shape; + +StrideA stride_A; +StrideA stride_A_compressed; +StrideE stride_E; +StrideB stride_B; +StrideC stride_C; +StrideD stride_D; + +LayoutA layout_A; +LayoutE layout_E; + +uint64_t seed; + +cutlass::DeviceAllocation block_A; +cutlass::DeviceAllocation block_A_compressed; +cutlass::DeviceAllocation block_E; +cutlass::DeviceAllocation block_B; +cutlass::DeviceAllocation block_C; +cutlass::DeviceAllocation block_D; +cutlass::DeviceAllocation block_D_ref; + +#endif // defined(CUTLASS_ARCH_MMA_SPARSE_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Testbed utility types +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + + float alpha, beta; + int iterations; + int m, n, k, l; + + Options(): + help(false), + m(5120), n(4096), k(16384), l(1), + alpha(1.f), beta(0.f), + iterations(10) + { } + + // Parses the command line + void parse(int argc, char const **args) { + 
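+    // Flags recognized below: --help, --m, --n, --k, --l, --alpha, --beta
+    // and --iterations; anything else is silently ignored (see print_usage()).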
cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("m", m); + cmd.get_cmd_line_argument("n", n); + cmd.get_cmd_line_argument("k", k); + cmd.get_cmd_line_argument("l", l); + cmd.get_cmd_line_argument("alpha", alpha); + cmd.get_cmd_line_argument("beta", beta); + cmd.get_cmd_line_argument("iterations", iterations); + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "62_hopper_sparse_gemm\n\n" + << " Hopper Sparse GEMM example.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --m= Sets the M extent of the GEMM\n" + << " --n= Sets the N extent of the GEMM\n" + << " --k= Sets the K extent of the GEMM\n" + << " --l= Sets the L extent of the GEMM (batch size)\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n" + << " --iterations= Number of profiling iterations to perform.\n\n"; + + out + << "\n\nExamples:\n\n" + << "$ " << "62_hopper_sparse_gemm" << " --m=4096 --n=5120 --k=8192 --l=1 --alpha=2 --beta=0.707 \n\n"; + + return out; + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const + { + // Two flops per multiply-add + uint64_t flop = uint64_t(2) * m * n * k; + double gflop = double(flop) / double(1.0e9); + return gflop / runtime_s; + } +}; + +#if defined(CUTLASS_ARCH_MMA_SPARSE_SM90_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// GEMM setup and evaluation +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Helper to initialize a block of device data +template +bool initialize_block( + cutlass::DeviceAllocation& block, + uint64_t seed) { + + Element scope_max, scope_min; + int bits_input = cutlass::sizeof_bits::value; + + if (bits_input == 1) { + scope_max = Element(2); + scope_min = Element(0); + } else if (bits_input <= 8) { + scope_max = Element(2); + scope_min = Element(-2); + } else { + scope_max = Element(8); + scope_min = Element(-8); + } + + cutlass::reference::device::BlockFillRandomUniform( + block.get(), block.size(), seed, scope_max, scope_min, 0); + + return true; +} + +/// Make A structured sparse by replacing elements with 0 and compress it +bool sparsify_and_compress() +{ + auto [M, N, K, L] = problem_shape; + CompressorUtility compressor_utility(problem_shape, stride_A); + + int ME = compressor_utility.get_metadata_m_physical(); + int KE = compressor_utility.get_metadata_k_physical(); + int KC = compressor_utility.get_tensorA_k_physical(); + + block_A_compressed.reset(M * KC * L); + block_E.reset(ME * KE * L); + + stride_A_compressed = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, KC, L)); + stride_E = cutlass::make_cute_packed_stride(StrideE{}, cute::make_shape(ME, KE, L)); + + // Random sparsification is performed on host + std::vector block_A_host(block_A.size()); + cutlass::device_memory::copy_to_host(block_A_host.data(), block_A.get(), block_A.size()); + compressor_utility.structure_sparse_zero_mask_fill(block_A_host.data(), static_cast(seed + 2024)); + cutlass::device_memory::copy_to_device(block_A.get(), block_A_host.data(), block_A.size()); + + cutlass::KernelHardwareInfo hw_info; + hw_info.device_id = 0; + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + typename Compressor::Arguments arguments { + problem_shape, + { 
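+      // Dense input: pointer to A and its stride; outputs: the buffer that
+      // receives the compressed A values and the buffer for the E metadata.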
block_A.get(), + stride_A, + block_A_compressed.get(), + block_E.get() }, + {hw_info} }; + + Compressor compressor_op; + size_t workspace_size = Compressor::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + CUTLASS_CHECK(compressor_op.can_implement(arguments)); + CUTLASS_CHECK(compressor_op.initialize(arguments, workspace.get())); + CUTLASS_CHECK(compressor_op.run()); + CUDA_CHECK(cudaDeviceSynchronize()); + + return true; +} + +/// Initialize operands to be used in the GEMM and reference GEMM +bool initialize(Options const& options) { + + problem_shape = make_tuple(options.m, options.n, options.k, options.l); + auto [M, N, K, L] = problem_shape; + + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); + stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); + stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); + stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); + + // Allocate memory for tensors + block_A.reset(M * K * L); + block_B.reset(N * K * L); + block_C.reset(M * N * L); + block_D.reset(M * N * L); + block_D_ref.reset(M * N * L); + + // Fill input tensors with data + initialize_block(block_A, seed + 2021); + initialize_block(block_B, seed + 2022); + initialize_block(block_C, seed + 2023); + + // Replace 0 in A with 1 to avoid metadata changes + std::vector block_A_host(block_A.size()); + cutlass::device_memory::copy_to_host(block_A_host.data(), block_A.get(), block_A.size()); + for (size_t i = 0; i < block_A.size(); ++i) if (block_A_host[i] == ElementA(0)) block_A_host[i] = ElementA(1.0); + cutlass::device_memory::copy_to_device(block_A.get(), block_A_host.data(), block_A.size()); + + if (!sparsify_and_compress()) { + return false; + }; + + // Build the compressed/metadata layouts + layout_A = SparseConfig::fill_layoutA(problem_shape); + layout_E = SparseConfig::fill_layoutE(problem_shape); + + return true; +} + +/// Populates a Gemm::Arguments structure from the given commandline options +typename Gemm::Arguments make_args(Options const& options) +{ + typename Gemm::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_shape, + { block_A_compressed.get(), layout_A, block_B.get(), stride_B, block_E.get(), layout_E }, + { { ElementAccumulator(options.alpha), ElementAccumulator(options.beta) }, + block_C.get(), stride_C, block_D.get(), stride_D } + }; + + return arguments; +} + +typename GemmRef::Arguments make_args_ref(Options const& options) +{ + typename GemmRef::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_shape, + { block_A.get(), stride_A, block_B.get(), stride_B }, + { { ElementAccumulator(options.alpha), ElementAccumulator(options.beta) }, + block_C.get(), stride_C, block_D_ref.get(), stride_D } + }; + + return arguments; +} + +template +void print_device_tensor(cute::Tensor const& t) +{ + // Assumes size = cosize, i.e. 
compact tensor + std::vector data_host(t.size()); + cutlass::device_memory::copy_to_host(data_host.data(), t.data(), t.size()); + auto t_host = cute::make_tensor(data_host.data(), t.layout()); + cute::print_tensor(t_host); +} + +bool verify(Options const& options) { + CUDA_CHECK(cudaDeviceSynchronize()); + + bool passed = cutlass::reference::device::BlockCompareEqual(block_D_ref.get(), block_D.get(), block_D.size()); + +#if 0 + if (!passed) { + auto [M, N, K, L] = problem_shape; + CompressorUtility compressor_utility(problem_shape, stride_A); + int ME = compressor_utility.get_metadata_m_physical(); + int KE = compressor_utility.get_metadata_k_physical(); + int KC = compressor_utility.get_tensorA_k_physical(); + + cute::print("A (original): "); print_device_tensor(make_tensor(block_A.get(), make_shape(M, K, L), stride_A)); + cute::print("A (compressed): "); print_device_tensor(make_tensor(block_A_compressed.get(), make_shape(M, KC, L), stride_A_compressed)); + cute::print("E (physical): "); print_device_tensor(make_tensor(block_E.get(), make_shape(ME, KE, L), stride_E)); + cute::print("E (logical): "); print_device_tensor(make_tensor(block_E.get(), upcast(layout_E))); + cute::print("B: "); print_device_tensor(make_tensor(block_B.get(), make_shape(N, K, L), stride_B)); + cute::print("C: "); print_device_tensor(make_tensor(block_C.get(), make_shape(M, N, L), stride_C)); + cute::print("D reference: "); print_device_tensor(make_tensor(block_D_ref.get(), make_shape(M, N, L), stride_D)); + cute::print("D computed: "); print_device_tensor(make_tensor(block_D.get(), make_shape(M, N, L), stride_D)); + } +#endif + + return passed; +} + +template +struct Runner +{ + using Arguments = typename Gemm::Arguments; + + Runner(Arguments args): arguments(args) { + // Using the arguments, query for extra workspace required for matrix multiplication computation + size_t workspace_size = Gemm::get_workspace_size(arguments); + + // Allocate workspace memory + workspace.reset(workspace_size); + + // Check if the problem size is supported or not + CUTLASS_CHECK(gemm.can_implement(arguments)); + } + + void run() { + CUTLASS_CHECK(gemm.initialize(arguments, workspace.get())); + CUTLASS_CHECK(gemm.run()); + } + + void benchmark(Options const& options) { + if (options.iterations > 0) + { + GpuTimer timer; + timer.start(); + for (int iter = 0; iter < options.iterations; ++iter) { + run(); + } + timer.stop(); + + // Compute average runtime and GFLOPs. + float elapsed_ms = timer.elapsed_millis(); + double avg_runtime_ms = double(elapsed_ms) / double(options.iterations); + double gflops = options.gflops(avg_runtime_ms / 1000.0); + + std::cout << " Avg runtime: " << avg_runtime_ms << " ms" << std::endl; + std::cout << " GFLOPS: " << gflops << std::endl; + } + } + + Gemm gemm; + Arguments arguments; + cutlass::device_memory::allocation workspace; +}; + +/// Execute the example (verification and timing) +void run(Options &options) { + bool init = initialize(options); + if (!init) { + std::cout << "Initialization failure" << std::endl; + exit(EXIT_FAILURE); + } + + Runner gemm(make_args(options)); + Runner gemm_ref(make_args_ref(options)); + + gemm.run(); + gemm_ref.run(); + + bool passed = verify(options); + + std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << std::endl; + std::cout << " Disposition: " << (passed ? 
"Passed" : "Failed") << std::endl; + + if (!passed) { + exit(EXIT_FAILURE); + } + + std::cout << "Sparse GEMM:" << std::endl; + gemm.benchmark(options); + + std::cout << "Dense GEMM:" << std::endl; + gemm_ref.benchmark(options); +} + +#endif // defined(CUTLASS_ARCH_MMA_SPARSE_SM90_SUPPORTED) + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + // CUTLASS must be compiled with CUDA 12.2 Toolkit to run this example + // and must have compute capability at least 90. + if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 2)) { + std::cerr << "This example requires CUDA 12.2 or newer.\n"; + // Returning zero so this test passes on older Toolkits. Its actions are no-op. + return 0; + } + + cudaDeviceProp props; + int current_device_id; + CUDA_CHECK(cudaGetDevice(¤t_device_id)); + CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id)); + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (props.major < 9) { + std::cerr + << "This example requires a GPU of NVIDIA's Hopper Architecture or " + << "later (compute capability 90 or greater).\n"; + return 0; + } + // + // Parse options + // + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + // + // Evaluate CUTLASS kernels + // + +#if defined(CUTLASS_ARCH_MMA_SPARSE_SM90_SUPPORTED) + run(options); +#endif + + return EXIT_SUCCESS; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// \ No newline at end of file diff --git a/csrc/quantization/cutlass_test/example/Makefile b/csrc/quantization/cutlass_test/example/Makefile new file mode 100644 index 0000000000000..7e5eac250d2e3 --- /dev/null +++ b/csrc/quantization/cutlass_test/example/Makefile @@ -0,0 +1,68 @@ +# Copyright 2023 The FLash-LLM Authors. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# host compiler +HOST_COMPILER ?= g++ +CUDA_PATH ?= /usr/local/cuda/ +#below is the path for Narval +#CUDA_PATH ?= /cvmfs/soft.computecanada.ca/easybuild/software/2020/Core/cudacore/11.7.0/ +# CUDA_PATH ?= /cvmfs/soft.computecanada.ca/easybuild/software/2023/x86-64-v3/Core/cudacore/12.2.2/ +NVCC := /usr/local/cuda/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m$(shell getconf LONG_BIT) +CCFLAGS := -fPIC +LDFLAGS := + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I/usr/local/cuda/include/ -I /home/ferrar/vllm/.deps/cutlass-src/include +LIBRARIES := -lcublas -lcusparse + +################################################################################ + +# Gencode arguments +SMS ?= 90 +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) + +ALL_CCFLAGS += --threads 0 --std=c++11 -lineinfo -O3 + +FLASHLLM_CCFLAGS := -maxrregcount=255 +ALL_CCFLAGS += --use_fast_math +ALL_CCFLAGS += --ptxas-options=-v,-warn-lmem-usage,--warn-on-spills +################################################################################ + +HEAD_FILES = ./util/command_line.h \ + ./util/distribution.h \ + ./util/host_tensor.h \ + ./util/packed_stride.hpp \ + ./util/tensor_view_io.h \ + ./util/reference/device/gemm.h \ + ./util/reference/device/tensor_compare.h \ + ./util/reference/device/tensor_fill.h + + +# Target rules +all: example + +example: 62_hopper_sparse_gemm.cu $(HEAD_FILES) + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(OUR_CCFLAGS) $(GENCODE_FLAGS) $< -o $@ + +clean: + rm -f example \ No newline at end of file diff --git a/csrc/quantization/cutlass_test/example/util/command_line.h b/csrc/quantization/cutlass_test/example/util/command_line.h new file mode 100644 index 0000000000000..9dc3a1174067a --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/command_line.h @@ -0,0 +1,313 @@ +/****************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +/** + * \file + * Utility for parsing command line arguments + */ + +#include +#include +#include +#include +#include + +#include + +#include "cutlass/cutlass.h" + +namespace cutlass { + +/****************************************************************************** + * command_line + ******************************************************************************/ + +/** + * Utility for parsing command line arguments + */ +struct CommandLine { + std::vector keys; + std::vector values; + std::vector args; + + /** + * Constructor + */ + CommandLine(int argc, const char** argv) { + using namespace std; + + for (int i = 1; i < argc; i++) { + string arg = argv[i]; + + if ((arg[0] != '-') || (arg[1] != '-')) { + args.push_back(arg); + continue; + } + + string::size_type pos; + string key, val; + if ((pos = arg.find('=')) == string::npos) { + key = string(arg, 2, arg.length() - 2); + val = ""; + } else { + key = string(arg, 2, pos - 2); + val = string(arg, pos + 1, arg.length() - 1); + } + + keys.push_back(key); + values.push_back(val); + } + } + + /** + * Checks whether a flag "--" is present in the commandline + */ + bool check_cmd_line_flag(const char* arg_name) const { + using namespace std; + + for (int i = 0; i < int(keys.size()); ++i) { + if (keys[i] == string(arg_name)) return true; + } + return false; + } + + /** + * Returns number of naked (non-flag and non-key-value) commandline parameters + */ + size_t num_naked_args() const { + return args.size(); + } + + /** + * Print naked (non-flag and non-key-value) commandline parameters + */ + void print_naked_args(std::ostream &out) const { + for (auto arg : args) { + out << " " << arg <<"\n"; + } + } + + /** + * Returns the commandline parameter for a given index (not including flags) + */ + template + void get_cmd_line_argument(size_t index, value_t& val) const { + using namespace std; + if (index < args.size()) { + istringstream str_stream(args[index]); + str_stream >> val; + } + } + + /** + * Obtains the boolean value specified for a given commandline parameter --= + */ + void get_cmd_line_argument(const char* arg_name, bool& val, bool _default) const { + val = _default; + if (check_cmd_line_flag(arg_name)) { + std::string value; + get_cmd_line_argument(arg_name, value); + + val = !(value == "0" || value == "false"); + } + } + + /** + * Obtains the value specified for a given commandline parameter --= + */ + template + void get_cmd_line_argument(const char* arg_name, + value_t& val) const { + + get_cmd_line_argument(arg_name, val, val); + } + + /** + * Obtains the value specified for a given commandline parameter --= + */ + template + void get_cmd_line_argument(const char* arg_name, + value_t& val, + value_t const& _default) const { + using namespace std; + + val = _default; + + for (int i = 0; i < int(keys.size()); ++i) { + if (keys[i] == string(arg_name)) { + istringstream str_stream(values[i]); + 
str_stream >> val; + } + } + } + + /** + * Returns the values specified for a given commandline parameter --=,* + */ + template + void get_cmd_line_arguments(const char* arg_name, + std::vector& vals, + char sep = ',') const { + using namespace std; + + if (check_cmd_line_flag(arg_name)) { + // Clear any default values + vals.clear(); + + // Recover from multi-value string + for (size_t i = 0; i < keys.size(); ++i) { + if (keys[i] == string(arg_name)) { + string val_string(values[i]); + separate_string(val_string, vals, sep); + } + } + } + } + + /** + * Returns the values specified for a given commandline parameter + * --=,* + */ + void get_cmd_line_argument_pairs(const char* arg_name, + std::vector >& tokens, + char delim = ',', + char sep = ':') const { + if (check_cmd_line_flag(arg_name)) { + std::string value; + get_cmd_line_argument(arg_name, value); + + tokenize(tokens, value, delim, sep); + } + } + + /** + * Returns a list of ranges specified for a given commandline parameter + * --=,* + */ + void get_cmd_line_argument_ranges(const char* arg_name, + std::vector >& vals, + char delim = ',', + char sep = ':') const { + std::vector ranges; + get_cmd_line_arguments(arg_name, ranges, delim); + + for (std::vector::const_iterator range = ranges.begin(); + range != ranges.end(); ++range) { + + std::vector range_vals; + separate_string(*range, range_vals, sep); + vals.push_back(range_vals); + } + } + + /** + * The number of pairs parsed + */ + int parsed_argc() const { return (int)keys.size(); } + + //------------------------------------------------------------------------- + // Utility functions + //------------------------------------------------------------------------- + + /// Tokenizes a comma-delimited list of string pairs delimited by ':' + static void tokenize(std::vector >& tokens, + std::string const& str, + char delim = ',', + char sep = ':') { + // Home-built to avoid Boost dependency + size_t s_idx = 0; + size_t d_idx = std::string::npos; + while (s_idx < str.size()) { + d_idx = str.find_first_of(delim, s_idx); + + size_t end_idx = (d_idx != std::string::npos ? 
d_idx : str.size()); + size_t sep_idx = str.find_first_of(sep, s_idx); + size_t offset = 1; + if (sep_idx == std::string::npos || sep_idx >= end_idx) { + sep_idx = end_idx; + offset = 0; + } + + std::pair item( + str.substr(s_idx, sep_idx - s_idx), + str.substr(sep_idx + offset, end_idx - sep_idx - offset)); + + tokens.push_back(item); + s_idx = end_idx + 1; + } + } + + /// Tokenizes a comma-delimited list of string pairs delimited by ':' + static void tokenize(std::vector& tokens, + std::string const& str, + char delim = ',', + char sep = ':') { + typedef std::vector > TokenVector; + typedef TokenVector::const_iterator token_iterator; + + std::vector > token_pairs; + tokenize(token_pairs, str, delim, sep); + for (token_iterator tok = token_pairs.begin(); tok != token_pairs.end(); ++tok) { + tokens.push_back(tok->first); + } + } + + template + static void separate_string(std::string const& str, + std::vector& vals, + char sep = ',') { + std::istringstream str_stream(str); + std::string::size_type old_pos = 0; + std::string::size_type new_pos = 0; + + // Iterate -delimited values + value_t val; + while ((new_pos = str.find(sep, old_pos)) != std::string::npos) { + if (new_pos != old_pos) { + str_stream.width(new_pos - old_pos); + str_stream >> val; + vals.push_back(val); + } + + // skip over delimiter + str_stream.ignore(1); + old_pos = new_pos + 1; + } + + // Read last value + str_stream >> val; + vals.push_back(val); + } +}; + +} // namespace cutlass diff --git a/csrc/quantization/cutlass_test/example/util/distribution.h b/csrc/quantization/cutlass_test/example/util/distribution.h new file mode 100644 index 0000000000000..649a573603ff5 --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/distribution.h @@ -0,0 +1,154 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +/*! \file + \brief This header contains a class to parametrize a statistical distribution function. +*/ + +#include + +namespace cutlass { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Distribution type +struct Distribution { + /// Variant types + enum Kind { Invalid, Uniform, Gaussian, Identity, Sequential, AllZeros, AllOnes }; + + /// Distribution state + union { + /// Uniform distribution + struct { + double min; + double max; + // Percent elements set to NaN + double pnan; + } uniform; + + /// Gaussian distribution + struct { + double mean; + double stddev; + double pnz; + double pnzA; + double pnzB; + double pnzC; + } gaussian; + + /// Elements are linear combination of row and column index + struct { + double start; + double delta; + } sequential; + }; + + /// Active variant kind + Kind kind; + + /// Random values are cast to integer after scaling by this power of two + int int_scale; + + // + // Methods + // + + Distribution() : kind(Invalid), int_scale(0) {} + +/// Configures distribution as uniform random + Distribution &set_uniform(double _min, double _max, int _int_scale = 0, double _pnan = 0) { + kind = Uniform; + uniform.min = _min; + uniform.max = _max; + int_scale = _int_scale; + uniform.pnan = _pnan; + return *this; + } + + /// Configures distribution as Gaussian distribution + Distribution &set_gaussian(double _mean, double _stddev, int _int_scale = 0, double _pnz = 1.0) { + kind = Gaussian; + gaussian.mean = _mean; + gaussian.stddev = _stddev; + gaussian.pnz = _pnz; + int_scale = _int_scale; + return *this; + } + + /// Sets identity + Distribution &set_identity() { + kind = Identity; + return *this; + } + + /// Sets sequential + Distribution &set_sequential(double start, double delta, int _int_scale = 0) { + kind = Sequential; + sequential.start = start; + sequential.delta = delta; + int_scale = _int_scale; + return *this; + } +}; + +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Prints a Distribution to ostream +inline std::ostream &operator<<(std::ostream &out, cutlass::Distribution const &dist) { + switch (dist.kind) { + case cutlass::Distribution::Uniform: + out << "uniform, min: " << dist.uniform.min << ", max: " << dist.uniform.max + << ", pnan: " << dist.uniform.pnan; + break; + case cutlass::Distribution::Gaussian: + out << "gaussian, mean: " << dist.gaussian.mean << ", stddev: " << dist.gaussian.stddev + << ", pnzA: " << dist.gaussian.pnzA << ", pnzB: " + << dist.gaussian.pnzB << ", pnzC: " << dist.gaussian.pnzC; + break; + case cutlass::Distribution::Identity: + out << "identity"; + break; + case cutlass::Distribution::Sequential: + out << "sequential"; + break; + default: + out << "unknown"; + } + + out << ", int_scale: " << dist.int_scale; + + return out; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/quantization/cutlass_test/example/util/gather_tensor.hpp b/csrc/quantization/cutlass_test/example/util/gather_tensor.hpp new file mode 100644 index 0000000000000..62616e00c7357 --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/gather_tensor.hpp @@ -0,0 +1,215 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA 
CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include "cute/layout.hpp" +#include "cute/tensor.hpp" +#include "cute/util/print.hpp" + +namespace example { + +using namespace cute; + +// Empty type used to disable gather/scatter for a GEMM argument +struct NoGather +{ + template + NoGather(Ts...) 
{}; +}; + +/// Function object that applies an index to its argument +template +struct IndexedGather +{ + CUTE_HOST_DEVICE constexpr + IndexedGather(Index const *indices = {}): indices_(indices) {} + + template + CUTE_HOST_DEVICE constexpr + Index + operator()(I i) const { return indices_[i]; } + + CUTE_HOST_DEVICE friend + void + print(IndexedGather const &s) { + cute::print("Indexed"); + } + + Index const *indices_; +}; + +/// Function object that applies a stride to its argument +/// Example: StridedFunc gathers every other row/column +template +struct StridedGather +{ + CUTE_HOST_DEVICE constexpr + StridedGather(Stride stride = {}): stride_(stride) {} + + template + CUTE_HOST_DEVICE constexpr + auto + operator()(I i) const { return i * stride_; } + + CUTE_HOST_DEVICE friend + void + print(StridedGather const &s) { + cute::print("Strided{"); + print(s.stride_); + cute::print("}"); + } + + Stride stride_; +}; + +/// Custom stride object that applies a function followed by a stride +template +struct CustomStride +{ + CUTE_HOST_DEVICE constexpr + CustomStride(Func const &func, Stride const &stride): func_(func), stride_(stride) {} + + template + CUTE_HOST_DEVICE constexpr friend + auto + operator*(I i, CustomStride const &s) { return s.func_(i) * s.stride_; } + + template + CUTE_HOST_DEVICE constexpr friend + auto + operator*(CustomStride const &s, I i) { return s.func_(i) * s.stride_; } + + CUTE_HOST_DEVICE friend + void + print(CustomStride const & s) { + cute::print("Custom{"); + print(s.func_); + cute::print(","); + print(s.stride_); + cute::print("}"); + } + + template + CUTE_HOST_DEVICE constexpr friend + auto + safe_div(CustomStride const &s, Div const &div) + { + return CustomStride(s.func_, safe_div(s.stride_, div)); + } + + // Circumvent the requirement on make_layout that shape and stride are integral + template + CUTE_HOST_DEVICE constexpr friend + auto + make_layout(Shape const &shape, CustomStride const &stride) + { + return Layout(shape, stride); + } + + Func func_; + Stride stride_; +}; + +template +CUTLASS_HOST_DEVICE +auto +make_custom_stride_layout(Stride const &stride, Func&& func) +{ + // Use a dummy shape and replace the first non-unit stride with a custom gather stride + auto idx = find_if(stride, [](auto x){ return not is_constant<1, decltype(x)>{}; }); + constexpr int I = decltype(idx)::value; + return make_layout(repeat_like(stride, _1{}), + replace(stride, CustomStride{static_cast(func), get(stride)})); +} + +/// Helper function to optionally create a gather tensor +template +CUTLASS_HOST_DEVICE +auto +make_gather_tensor(Iterator iter, Shape const &shape, Stride const &stride, Func &&func) +{ + if constexpr (not cutlass::platform::is_same, NoGather>::value) { + Layout matrix_layout = make_identity_layout(shape); + auto offset = as_arithmetic_tuple(repeat_like(shape, _0{})); + Layout gather_layout = make_custom_stride_layout(stride, static_cast(func)); + return make_tensor(iter, ComposedLayout{gather_layout, offset, matrix_layout}); + } else { + return make_tensor(iter, shape, stride); + } +} + +} // namespace example + +namespace cute +{ + +template +CUTE_HOST_DEVICE constexpr +auto +upcast(Shape const& shape, Stride const& stride) +{ + if constexpr (is_tuple::value) { + return transform_layout(shape, stride, [](auto const& s, auto const& d) { return upcast(s,d); }); + } else if constexpr (is_scaled_basis::value) { + if constexpr (Stride::mode() == I) { + return make_layout(shape_div(shape, Int{}), shape_div(stride, Int{})); + } else { + return make_layout(shape, 
stride); + } + } else { + return upcast(shape, stride); + } + + CUTE_GCC_UNREACHABLE; +} + +template +CUTE_HOST_DEVICE constexpr +auto +upcast(ComposedLayout,Offset,Layout> const& layout) +{ + // Find index of the stride-1 mode - that is the only one that requires updating inner shape and offset + auto idx = find_if(layout.layout_a().stride(), [](auto x){ return is_constant<1, decltype(x)>{}; }); + constexpr int I = decltype(idx)::value; + + // Upcast the outer layout (works as expected) + auto outer = upcast(layout.layout_a()); + + // Upcast the accumulated offset along stride-1 mode + auto offset = as_arithmetic_tuple(replace(layout.offset(), upcast(get(layout.offset())))); + + // Upcast the inner layout's shape along stride-1 mode + auto inner = upcast(layout.layout_b().shape(), layout.layout_b().stride()); + + return composition(outer, offset, inner); +} + +} // namespace example diff --git a/csrc/quantization/cutlass_test/example/util/helper.h b/csrc/quantization/cutlass_test/example/util/helper.h new file mode 100644 index 0000000000000..a7a81e7479022 --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/helper.h @@ -0,0 +1,108 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include "cuda_runtime.h" +#include + +/** + * Panic wrapper for unwinding CUTLASS errors + */ +#define CUTLASS_CHECK(status) \ + { \ + cutlass::Status error = status; \ + if (error != cutlass::Status::kSuccess) { \ + std::cerr << "Got cutlass error: " << cutlassGetStatusString(error) << " at: " << __LINE__ \ + << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } + + +/** + * Panic wrapper for unwinding CUDA runtime errors + */ +#define CUDA_CHECK(status) \ + { \ + cudaError_t error = status; \ + if (error != cudaSuccess) { \ + std::cerr << "Got bad cuda status: " << cudaGetErrorString(error) \ + << " at line: " << __LINE__ << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } + + +/** + * GPU timer for recording the elapsed time across kernel(s) launched in GPU stream + */ +struct GpuTimer +{ + cudaStream_t _stream_id; + cudaEvent_t _start; + cudaEvent_t _stop; + + /// Constructor + GpuTimer() : _stream_id(0) + { + CUDA_CHECK(cudaEventCreate(&_start)); + CUDA_CHECK(cudaEventCreate(&_stop)); + } + + /// Destructor + ~GpuTimer() + { + CUDA_CHECK(cudaEventDestroy(_start)); + CUDA_CHECK(cudaEventDestroy(_stop)); + } + + /// Start the timer for a given stream (defaults to the default stream) + void start(cudaStream_t stream_id = 0) + { + _stream_id = stream_id; + CUDA_CHECK(cudaEventRecord(_start, _stream_id)); + } + + /// Stop the timer + void stop() + { + CUDA_CHECK(cudaEventRecord(_stop, _stream_id)); + } + + /// Return the elapsed time (in milliseconds) + float elapsed_millis() + { + float elapsed = 0.0; + CUDA_CHECK(cudaEventSynchronize(_stop)); + CUDA_CHECK(cudaEventElapsedTime(&elapsed, _start, _stop)); + return elapsed; + } +}; diff --git a/csrc/quantization/cutlass_test/example/util/host_tensor.h b/csrc/quantization/cutlass_test/example/util/host_tensor.h new file mode 100644 index 0000000000000..3f061875b48dc --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/host_tensor.h @@ -0,0 +1,541 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +/*! \file + \brief HostTensor contributes management for both host and device memory. + + HostTensor allocates host and device memory upon construction. Basic element-wise operations on + host memory synchronize device memory automatically. Explicit copy operations provide abstractions + for CUDA memcpy operations. + + Call {host, device}_{data, ref, view}() for accessing host or device memory. + + See cutlass/tensor_ref.h and cutlass/tensor_view.h for more details. +*/ + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/fast_math.h" + +#include "device_memory.h" + +namespace cutlass { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Host tensor +template < + /// Data type of element stored within tensor (concept: NumericType) + typename Element_, + /// Defines a mapping from logical coordinate to linear memory (concept: Layout) + typename Layout_ +> +class HostTensor { +public: + + /// Data type of individual access + using Element = Element_; + + /// Mapping function from logical coordinate to linear memory + using Layout = Layout_; + + /// Logical rank of tensor index space + static int const kRank = Layout::kRank; + + /// Index type + using Index = typename Layout::Index; + + /// Long index used for pointer offsets + using LongIndex = typename Layout::LongIndex; + + /// Coordinate in logical tensor space + using TensorCoord = typename Layout::TensorCoord; + + /// Layout's stride vector + using Stride = typename Layout::Stride; + + /// Tensor reference to device memory + using TensorRef = TensorRef; + + /// Tensor reference to constant device memory + using ConstTensorRef = typename TensorRef::ConstTensorRef; + + /// Tensor reference to device memory + using TensorView = TensorView; + + /// Tensor reference to constant device memory + using ConstTensorView = typename TensorView::ConstTensorView; + + /// Reference to element in tensor + using Reference = typename TensorRef::Reference; + + /// Constant reference to element in tensor + using ConstReference = typename ConstTensorRef::Reference; + +private: + using StorageUnit = typename platform::conditional_t, uint8_t, // Avoid the std::vector specialization + typename platform::conditional_t::value % 8 == 0, // Handle subbyte types + Element, uint8_t>>; + using StorageContainerCalculator = cutlass::detail::StorageContainerCalculator; + static constexpr int kContainerTypeNumBits = StorageContainerCalculator::kContainerTypeNumBits; + static constexpr int kContainerTypeNumLogicalElements = StorageContainerCalculator::kContainerTypeNumLogicalElements; + static constexpr int kContainerTypeNumBytes = StorageContainerCalculator::kContainerTypeNumBytes; + static constexpr int kContainerTypeNumStorageUnit = 
StorageContainerCalculator::kContainerTypeNumStorageUnit; + + // + // Data members + // + + /// Extent of tensor in logical dimensions + TensorCoord extent_; + + /// Layout object + Layout layout_; + + /// Host-side memory allocation + std::vector host_; + + /// Device-side memory + device_memory::allocation device_; + + /// number of containers + size_t count_to_container_storage_unit_count(size_t count) { + return (count + kContainerTypeNumLogicalElements - 1) / kContainerTypeNumLogicalElements * kContainerTypeNumStorageUnit; + } + +public: + // + // Device and Host Methods + // + + /// Default constructor + HostTensor() {} + + /// Constructs a tensor given an extent. Assumes a packed layout + HostTensor( + TensorCoord const &extent, + bool device_backed = true + ) { + + this->reset(extent, Layout::packed(extent), device_backed); + } + + /// Constructs a tensor given an extent and layout + HostTensor( + TensorCoord const &extent, + Layout const &layout, + bool device_backed = true + ) { + + this->reset(extent, layout, device_backed); + } + + ~HostTensor() { } + + /// Clears the HostTensor allocation to size/capacity = 0 + void reset() { + extent_ = TensorCoord(); + layout_ = Layout::packed(extent_); + + host_.clear(); + device_.reset(); + } + + /// Resizes internal memory allocations without affecting layout or extent + void reserve( + size_t count, ///< size of tensor in elements + bool device_backed_ = true) { ///< if true, device memory is also allocated +#if (CUTLASS_DEBUG_TRACE_LEVEL > 1) + CUTLASS_TRACE_HOST("cutlass::HostTensor::reserve(count=" << count << ", device_backed_=" << (device_backed_ ? "true" : "false") << ")"); +#endif + + device_.reset(); + host_.clear(); + + size_t count_container = count_to_container_storage_unit_count(count); +#if (CUTLASS_DEBUG_TRACE_LEVEL > 1) + CUTLASS_TRACE_HOST("cutlass::HostTensor::reserve: host_.resize(" << count_container << ")"); +#endif + host_.resize(count_container); + + // Allocate memory + StorageUnit* device_memory = nullptr; + if (device_backed_) { +#if (CUTLASS_DEBUG_TRACE_LEVEL > 1) + CUTLASS_TRACE_HOST("cutlass::HostTensor::reserve: device_memory::allocate(" << count_container << ")"); +#endif + device_memory = device_memory::allocate(count_container); + } + device_.reset(device_memory, device_backed_ ? count_container : 0); + } + + /// Updates the extent and layout of the HostTensor. Allocates memory according to the new + /// extent and layout. + void reset( + TensorCoord const &extent, ///< extent of logical tensor + Layout const &layout, ///< layout object of tensor + bool device_backed_ = true) { ///< if true, device memory is also allocated. + + extent_ = extent; + layout_ = layout; + + reserve(size_t(layout_.capacity(extent_)), device_backed_); + } + + /// Updates the extent and layout of the HostTensor. Allocates memory according to the new + /// extent and layout. Assumes a packed tensor configuration. + void reset( + TensorCoord const &extent, ///< extent of logical tensor + bool device_backed_ = true) { ///< if true, device memory is also allocated. + + reset(extent, Layout::packed(extent), device_backed_); + } + + /// Changes the size of the logical tensor. Only allocates memory if new capacity exceeds reserved capacity. + /// To force allocation, call reset(). + void resize( + TensorCoord const &extent, ///< extent of logical tensor + Layout const &layout, ///< layout object of tensor + bool device_backed_ = true) { ///< if true, device memory is also allocated. 
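+    // Unlike reset(), resize() reallocates only when the new container count
+    // exceeds the current host allocation; otherwise the existing host and
+    // device buffers are kept and simply reinterpreted with the new extent/layout.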
+ + extent_ = extent; + layout_ = layout; + + LongIndex new_size = size_t(layout_.capacity(extent_)); + LongIndex new_size_container = count_to_container_storage_unit_count((layout_.capacity(extent_))); + + if (static_cast(new_size_container) > host_.size()) { + reserve(new_size, device_backed_); + } + } + + /// Changes the size of the logical tensor. Only allocates memory if new capacity exceeds reserved capacity. + /// To force allocation, call reset(). Note, this form of resize() assumes a packed tensor configuration. + void resize( + TensorCoord const &extent, ///< extent of logical tensor + bool device_backed_ = true) { ///< if true, device memory is also allocated. + + resize(extent, Layout::packed(extent), device_backed_); + } + + /// Returns the logical number of elements stored in the host tensor + size_t size() const { + return layout_.capacity(extent_); + } + + /// Returns the logical capacity in terms of number of elements. May be larger than the size(). + LongIndex capacity() const { + return host_.size() / kContainerTypeNumStorageUnit * kContainerTypeNumLogicalElements; + } + + /// Gets pointer to host data + Element * host_data() { return reinterpret_cast(host_.data()); } + + /// Gets pointer to host data with a pointer offset + Element * host_data_ptr_offset(LongIndex ptr_element_offset) { return &ReferenceFactory::get(host_data(), ptr_element_offset); } + + /// Gets a reference to an element in host memory + Reference host_data(LongIndex idx) { + return ReferenceFactory::get(host_data(), idx); + } + + /// Gets pointer to host data + Element const * host_data() const { return reinterpret_cast(host_.data()); } + + /// Gets pointer to host data with a pointer offset + Element const * host_data_ptr_offset(LongIndex ptr_element_offset) const { return &ReferenceFactory::get(host_data(), ptr_element_offset); } + + /// Gets a constant reference to an element in host memory + ConstReference host_data(LongIndex idx) const { + return ReferenceFactory::get(host_data(), idx); + } + + /// Gets pointer to device data + Element * device_data() { return reinterpret_cast(device_.get()); } + + /// Gets pointer to device data + Element const * device_data() const { return reinterpret_cast(device_.get()); } + + /// Gets pointer to device data with a pointer offset + Element * device_data_ptr_offset(LongIndex ptr_element_offset) { return &ReferenceFactory::get(device_data(), ptr_element_offset); } + + /// Gets pointer to device data with a pointer offset + Element const * device_data_ptr_offset(LongIndex ptr_element_offset) const { return &ReferenceFactory::get(device_data(), ptr_element_offset); } + + /// Accesses the tensor reference pointing to data + TensorRef host_ref(LongIndex ptr_element_offset=0) { return TensorRef(host_data_ptr_offset(ptr_element_offset), layout_); } + + /// Accesses the tensor reference pointing to data + ConstTensorRef host_ref(LongIndex ptr_element_offset=0) const { return ConstTensorRef(host_data_ptr_offset(ptr_element_offset), layout_); } + + /// Accesses the tensor reference pointing to data + TensorRef device_ref(LongIndex ptr_element_offset=0) { + return TensorRef(device_data_ptr_offset(ptr_element_offset), layout_); + } + + /// Accesses the tensor reference pointing to data + ConstTensorRef device_ref(LongIndex ptr_element_offset=0) const { + return TensorRef(device_data_ptr_offset(ptr_element_offset), layout_); + } + + /// Accesses the tensor reference pointing to data + TensorView host_view(LongIndex ptr_element_offset=0) { + return 
TensorView(host_data_ptr_offset(ptr_element_offset), layout_, extent_); + } + + /// Accesses the tensor reference pointing to data + ConstTensorView host_view(LongIndex ptr_element_offset=0) const { + return ConstTensorView(host_data_ptr_offset(ptr_element_offset), layout_, extent_); + } + + /// Accesses the tensor reference pointing to data + TensorView device_view(LongIndex ptr_element_offset=0) { + return TensorView(device_data_ptr_offset(ptr_element_offset), layout_, extent_); + } + + /// Accesses the tensor reference pointing to data + ConstTensorView device_view(LongIndex ptr_element_offset=0) const { + return ConstTensorView(device_data_ptr_offset(ptr_element_offset), layout_, extent_); + } + + /// Returns true if device memory is allocated + bool device_backed() const { + return (device_.get() == nullptr) ? false : true; + } + + + /// Returns the layout object + Layout & layout() { + return layout_; + } + + /// Returns the layout object + Layout layout() const { + return layout_; + } + + /// Returns the layout object's stride vector + Stride stride() const { + return layout_.stride(); + } + + /// Returns the layout object's stride vector + Stride & stride() { + return layout_.stride(); + } + + /// Returns the layout object's stride in a given physical dimension + LongIndex stride(int dim) const { + return layout_.stride().at(dim); + } + + /// Returns the layout object's stride in a given physical dimension + LongIndex & stride(int dim) { + return layout_.stride().at(dim); + } + + /// Computes the offset of an index from the origin of the tensor + LongIndex offset(TensorCoord const& coord) const { + return layout_(coord); + } + + /// Returns a reference to the element at the logical Coord in host memory + Reference at(TensorCoord const& coord) { + return host_data(offset(coord)); + } + + /// Returns a const reference to the element at the logical Coord in host memory + ConstReference at(TensorCoord const& coord) const { + return host_data(offset(coord)); + } + + /// Returns the extent of the tensor + TensorCoord extent() const { + return extent_; + } + + /// Returns the extent of the tensor + TensorCoord & extent() { + return extent_; + } + + /// Copies data from device to host + void sync_host() { + if (device_backed()) { + device_memory::copy_to_host( + host_.data(), device_.get(), device_.size()); + } + } + + /// Copies data from host to device + void sync_device() { + if (device_backed()) { + device_memory::copy_to_device( + device_.get(), host_.data(), host_.size()); + } + } + + /// Copy data from a caller-supplied device pointer into host memory. + void copy_in_device_to_host( + Element const* ptr_device, ///< source device memory + LongIndex count = -1) { ///< number of elements to transfer; if negative, entire tensor is overwritten. + + if (count < 0) { + count = capacity(); + } + else { + count = __NV_STD_MIN(capacity(), count); + } + size_t container_count = count_to_container_storage_unit_count(count); + device_memory::copy_to_host( + host_.data(), reinterpret_cast(ptr_device), container_count); + } + + /// Copy data from a caller-supplied device pointer into host memory. + void copy_in_device_to_device( + Element const* ptr_device, ///< source device memory + LongIndex count = -1) { ///< number of elements to transfer; if negative, entire tensor is overwritten. 
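+    // Actual behavior: copies from the caller-supplied device pointer into this
+    // tensor's device allocation (device_memory::copy_device_to_device); the
+    // host buffer is not touched.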
+ + if (count < 0) { + count = capacity(); + } + else { + count = __NV_STD_MIN(capacity(), count); + } + size_t container_count = count_to_container_storage_unit_count(count); + device_memory::copy_device_to_device( + device_.get(), reinterpret_cast(ptr_device), container_count); + } + + /// Copy data from a caller-supplied device pointer into host memory. + void copy_in_host_to_device( + Element const* ptr_host, ///< source host memory + LongIndex count = -1) { ///< number of elements to transfer; if negative, entire tensor is overwritten. + + if (count < 0) { + count = capacity(); + } + else { + count = __NV_STD_MIN(capacity(), count); + } + size_t container_count = count_to_container_storage_unit_count(count); + device_memory::copy_to_device( + device_.get(), reinterpret_cast(ptr_host), container_count); + } + + /// Copy data from a caller-supplied device pointer into host memory. + void copy_in_host_to_host( + Element const* ptr_host, ///< source host memory + LongIndex count = -1) { ///< number of elements to transfer; if negative, entire tensor is overwritten. + + if (count < 0) { + count = capacity(); + } + else { + count = __NV_STD_MIN(capacity(), count); + } + size_t container_count = count_to_container_storage_unit_count(count); + device_memory::copy_host_to_host( + host_.data(), reinterpret_cast(ptr_host), container_count); + } + + /// Copy data from a caller-supplied device pointer into host memory. + void copy_out_device_to_host( + Element * ptr_host, ///< source device memory + LongIndex count = -1) const { ///< number of elements to transfer; if negative, entire tensor is overwritten. + + if (count < 0) { + count = capacity(); + } + else { + count = __NV_STD_MIN(capacity(), count); + } + size_t container_count = count_to_container_storage_unit_count(count); + device_memory::copy_to_host( + reinterpret_cast(ptr_host), device_.get(), container_count); + } + + /// Copy data from a caller-supplied device pointer into host memory. + void copy_out_device_to_device( + Element * ptr_device, ///< source device memory + LongIndex count = -1) const { ///< number of elements to transfer; if negative, entire tensor is overwritten. + + if (count < 0) { + count = capacity(); + } + else { + count = __NV_STD_MIN(capacity(), count); + } + size_t container_count = count_to_container_storage_unit_count(count); + device_memory::copy_device_to_device( + reinterpret_cast(ptr_device), device_.get(), container_count); + } + + /// Copy data from a caller-supplied device pointer into host memory. + void copy_out_host_to_device( + Element * ptr_device, ///< source host memory + LongIndex count = -1) const { ///< number of elements to transfer; if negative, entire tensor is overwritten. + + if (count < 0) { + count = capacity(); + } + else { + count = __NV_STD_MIN(capacity(), count); + } + size_t container_count = count_to_container_storage_unit_count(count); + device_memory::copy_to_device( + reinterpret_cast(ptr_device), host_.data(), container_count); + } + + /// Copy data from a caller-supplied device pointer into host memory. + void copy_out_host_to_host( + Element * ptr_host, ///< source host memory + LongIndex count = -1) const { ///< number of elements to transfer; if negative, entire tensor is overwritten. 
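+    // Actual behavior: copies this tensor's host data out to the caller-supplied
+    // host pointer (device_memory::copy_host_to_host); device memory is not involved.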
+ + if (count < 0) { + count = capacity(); + } + else { + count = __NV_STD_MIN(capacity(), count); + } + size_t container_count = count_to_container_storage_unit_count(count); + device_memory::copy_host_to_host( + reinterpret_cast(ptr_host), host_.data(), container_count); + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass diff --git a/csrc/quantization/cutlass_test/example/util/packed_stride.hpp b/csrc/quantization/cutlass_test/example/util/packed_stride.hpp new file mode 100644 index 0000000000000..e9a243a1322cc --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/packed_stride.hpp @@ -0,0 +1,570 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Utilities for packing constructing canonical CuTe stride types for 3.x mainloop params. +*/ + +#pragma once + +#include "cute/layout.hpp" +#include "cute/container/array.hpp" // cute::array +#include "cutlass/conv/convolution.h" // cutlass::conv::Operator + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Strides without batch mode + +template +CUTLASS_HOST_DEVICE +cute::Stride> +make_cute_packed_stride(cute::Stride> s, cute::Shape shape_MKL) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. 
Static strides not supported."); + auto s_copy = s; + cute::get<0>(s_copy) = static_cast(cute::get<1>(shape_MKL)); + return s_copy; +} + +template +CUTLASS_HOST_DEVICE +cute::Stride, IntT> +make_cute_packed_stride(cute::Stride, IntT> s, cute::Shape shape_MKL) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + auto s_copy = s; + cute::get<1>(s_copy) = static_cast(cute::get<0>(shape_MKL)); + return s_copy; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Strides with batch mode + +template +CUTLASS_HOST_DEVICE +cute::Stride, int64_t> +make_cute_packed_stride(cute::Stride, int64_t> s, cute::Shape shape_MKL) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + auto s_copy = s; + cute::get<0>(s_copy) = static_cast(cute::get<1>(shape_MKL)); + int batch_count = cute::get<2>(shape_MKL); + if (batch_count > 1) { + cute::get<2>(s_copy) = static_cast(cute::get<0>(shape_MKL) * cute::get<1>(shape_MKL)); + } + else { + cute::get<2>(s_copy) = static_cast(0); + } + return s_copy; +} + +template +CUTLASS_HOST_DEVICE +cute::Stride, IntT, int64_t> +make_cute_packed_stride(cute::Stride, IntT, int64_t> s, cute::Shape shape_MKL) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + auto s_copy = s; + cute::get<1>(s_copy) = static_cast(cute::get<0>(shape_MKL)); + int batch_count = cute::get<2>(shape_MKL); + if (batch_count > 1) { + cute::get<2>(s_copy) = static_cast(cute::get<0>(shape_MKL) * cute::get<1>(shape_MKL)); + } + else { + cute::get<2>(s_copy) = static_cast(0); + } + return s_copy; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Strides with group mode + +template +CUTLASS_HOST_DEVICE +cute::Stride, cute::Int<0>> +make_cute_packed_stride(cute::Stride, cute::Int<0>> s, cute::Shape shape_MKL) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + auto s_copy = s; + cute::get<0>(s_copy) = static_cast(cute::get<1>(shape_MKL)); + return s_copy; +} + +template +CUTLASS_HOST_DEVICE +cute::Stride, StrideIntT, cute::Int<0>> +make_cute_packed_stride(cute::Stride, StrideIntT, cute::Int<0>> s, cute::Shape shape_MKL) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + auto s_copy = s; + cute::get<1>(s_copy) = static_cast(cute::get<0>(shape_MKL)); + return s_copy; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Strides for convolutions + +// Output cutlass::layout::TensorNDHWC -> rank-3 stride (InT,_1,_0) +// Note: For fprop/dgrad kernel, strides are assumed to be layout right in NZPQK/NDHWC order +// and therefore can be coalesced to just q/w. For wgrad kernel, strides are assumed to be layout +// right in KTRSC order and can be coalesced to just k. +// We enforce this condition here with asserts. +template +CUTLASS_HOST_DEVICE +cute::Stride, cute::Int<0>> +make_cute_packed_stride( + cute::Stride, cute::Int<0>> s, + cute::array shape_output, + cute::array stride_output, + cutlass::conv::Operator conv_op) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. 
Static strides not supported."); + static_assert(RankT_ >= 3u); + constexpr static int RankT = static_cast(RankT_); + + assert(stride_output[RankT-1] == 1); + cute::for_each(cute::make_seq{}, [&](auto i) { + assert(stride_output[i] == shape_output[i+1] * stride_output[i+1]); + }); + + auto s_copy = s; + cute::get<0>(s_copy) = (conv_op == cutlass::conv::Operator::kWgrad) ? + stride_output[0] : + stride_output[RankT-2]; + return s_copy; +} + +// +// Activation tensor ((w, h, d, n), _1) for fprop kernel +// + +// Activation cutlass::layout::TensorNWC -> rank-2 stride ((W,N),_1) +template +CUTLASS_HOST_DEVICE +cute::Stride, cute::Int<1>> +make_cute_packed_stride( + cute::Stride, cute::Int<1>> s, + cute::array stride_nwc, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + assert(stride_nwc[2] == 1); + auto s_copy = s; + cute::get<0,0>(s_copy) = stride_nwc[1]; + cute::get<0,1>(s_copy) = stride_nwc[0]; + return s_copy; +} + +// Activation cutlass::layout::TensorNHWC -> rank-2 stride ((W,H,N),_1) +template +CUTLASS_HOST_DEVICE +cute::Stride, cute::Int<1>> +make_cute_packed_stride( + cute::Stride, cute::Int<1>> s, + cute::array stride_nhwc, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + assert(stride_nhwc[3] == 1); + auto s_copy = s; + cute::for_each(cute::make_seq<3>{}, [&](auto i) { + cute::get<0,i>(s_copy) = stride_nhwc[2-i]; + }); + return s_copy; +} + +// Activation cutlass::layout::TensorNDHWC -> rank-2 stride ((W,H,D,N),_1) +template +CUTLASS_HOST_DEVICE +cute::Stride, cute::Int<1>> +make_cute_packed_stride( + cute::Stride, cute::Int<1>> s, + cute::array stride_ndhwc, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + + assert(stride_ndhwc[4] == 1); + auto s_copy = s; + cute::for_each(cute::make_seq<4>{}, [&](auto i) { + cute::get<0,i>(s_copy) = stride_ndhwc[3-i]; + }); + return s_copy; +} + +// +// Filter tensor (k, (_1, s, r, t)) for fprop kernel +// + +// Filter cutlass::layout::TensorNWC -> rank-2 stride (k, (_1, s)) +template +CUTLASS_HOST_DEVICE +cute::Stride, IntT>> +make_cute_packed_stride( + cute::Stride, IntT>> s, + cute::array stride_ksc, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + + assert(stride_ksc[2] == 1); + auto s_copy = s; + cute::get<0,0>(s_copy) = stride_ksc[0]; + cute::get<1,1>(s_copy) = stride_ksc[1]; + return s_copy; +} + +// Filter cutlass::layout::TensorNHWC -> rank-2 stride (k, (_1, s, r)) +template +CUTLASS_HOST_DEVICE +cute::Stride, IntT, IntT>> +make_cute_packed_stride( + cute::Stride, IntT, IntT>> s, + cute::array stride_krsc, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. 
Static strides not supported."); + + assert(stride_krsc[3] == 1); + auto s_copy = s; + cute::get<0,0>(s_copy) = stride_krsc[0]; + cute::for_each(cute::make_seq<2>{}, [&](auto i) { + cute::get<1,2-i>(s_copy) = stride_krsc[i+1]; + }); + return s_copy; +} + +// Filter cutlass::layout::TensorNDHWC -> rank-2 stride (k, (_1, s, r, t)) +template +CUTLASS_HOST_DEVICE +cute::Stride, IntT, IntT, IntT>> +make_cute_packed_stride( + cute::Stride, IntT, IntT, IntT>> s, + cute::array stride_ktrsc, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + + assert(stride_ktrsc[4] == 1); + auto s_copy = s; + cute::get<0,0>(s_copy) = stride_ktrsc[0]; + cute::for_each(cute::make_seq<3>{}, [&](auto i) { + cute::get<1,3-i>(s_copy) = stride_ktrsc[i+1]; + }); + return s_copy; +} + +// +// Activation tensor (_1, (w, h, d, n)) for wgrad kernel +// +// It is also Filter tensor ((_1), (k, s, r, t)) for dgrad kernel +// + +// Activation cutlass::layout::TensorNWC -> rank-2 stride (_1, (W,N)) in wgrad +// Filter cutlass::layout::TensorNWC -> rank-2 stride ((_1), (k, s)) in dgrad +template +CUTLASS_HOST_DEVICE +cute::Stride, cute::Stride> +make_cute_packed_stride( + cute::Stride, cute::Stride> s, + cute::array stride_nwc, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + + assert(stride_nwc[2] == 1); + auto s_copy = s; + if (ConvOp == cutlass::conv::Operator::kWgrad) { + cute::get<1,0>(s_copy) = stride_nwc[1]; + cute::get<1,1>(s_copy) = stride_nwc[0]; + } + else if (ConvOp == cutlass::conv::Operator::kDgrad) { + // stride_nwc in dgrad is ksc. + cute::get<1,0>(s_copy) = stride_nwc[0]; + cute::get<1,1>(s_copy) = stride_nwc[1]; + } + return s_copy; +} + +// Activation cutlass::layout::TensorNHWC -> rank-2 stride (_1, (W,H,N)) in wgrad +// Filter cutlass::layout::TensorNHWC -> rank-2 stride ((_1), (k, s, r)) in dgrad +template +CUTLASS_HOST_DEVICE +cute::Stride, cute::Stride> +make_cute_packed_stride( + cute::Stride, cute::Stride> s, + cute::array stride_nhwc, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + + assert(stride_nhwc[3] == 1); + auto s_copy = s; + if (ConvOp == cutlass::conv::Operator::kWgrad) { + cute::for_each(cute::make_seq<3>{}, [&](auto i) { + cute::get<1,i>(s_copy) = stride_nhwc[2-i]; + }); + } + else if (ConvOp == cutlass::conv::Operator::kDgrad) { + // stride_nhwc in dgrad is krsc. + cute::get<1,0>(s_copy) = stride_nhwc[0]; + cute::for_each(cute::make_seq<2>{}, [&](auto i) { + cute::get<1,2-i>(s_copy) = stride_nhwc[i+1]; + }); + } + return s_copy; +} + +// Activation cutlass::layout::TensorNDHWC -> rank-2 stride (_1, (W,H,D,N)) in wgrad +// Filter cutlass::layout::TensorNDHWC -> rank-2 stride ((_1), (k, s, r, t)) in dgrad +template +CUTLASS_HOST_DEVICE +cute::Stride, cute::Stride> +make_cute_packed_stride( + cute::Stride, cute::Stride> s, + cute::array stride_ndhwc, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. 
Static strides not supported."); + + assert(stride_ndhwc[4] == 1); + auto s_copy = s; + if (ConvOp == cutlass::conv::Operator::kWgrad) { + cute::for_each(cute::make_seq<4>{}, [&](auto i) { + cute::get<1,i>(s_copy) = stride_ndhwc[3-i]; + }); + } + else if (ConvOp == cutlass::conv::Operator::kDgrad) { + // stride_ndhwc in dgrad is ktrsc. + cute::get<1,0>(s_copy) = stride_ndhwc[0]; + cute::for_each(cute::make_seq<3>{}, [&](auto i) { + cute::get<1,3-i>(s_copy) = stride_ndhwc[i+1]; + }); + } + return s_copy; +} + +// +// NZPQ tensor (_1, nzpq) for wgrad kernel +// + +// cutlass::layout::TensorNWC -> rank-2 stride (_1, nzpq) +template +CUTLASS_HOST_DEVICE +cute::Stride, IntT> +make_cute_packed_stride( + cute::Stride, IntT> s, + cute::array stride_nqk, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + + assert(stride_nqk[2] == 1); + auto s_copy = s; + cute::get<1>(s_copy) = stride_nqk[1]; + return s_copy; +} + +// cutlass::layout::TensorNHWC -> rank-2 stride (_1, nzpq) +template +CUTLASS_HOST_DEVICE +cute::Stride, IntT> +make_cute_packed_stride( + cute::Stride, IntT> s, + cute::array stride_npqk, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + + assert(stride_npqk[3] == 1); + auto s_copy = s; + cute::get<1>(s_copy) = stride_npqk[2]; + return s_copy; +} + +// cutlass::layout::TensorNDHWC -> rank-2 stride (_1, nzpq) +template +CUTLASS_HOST_DEVICE +cute::Stride, IntT> +make_cute_packed_stride( + cute::Stride, IntT> s, + cute::array stride_nzpqk, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + + assert(stride_nzpqk[4] == 1); + auto s_copy = s; + cute::get<1>(s_copy) = stride_nzpqk[3]; + return s_copy; +} + + + +// +// Wgrad output tensor (k, (_1, s, r, t), _0) +// + +// Filter cutlass::layout::TensorKCS -> rank-3 stride (k, (_1, s), _0) +template +CUTLASS_HOST_DEVICE +cute::Stride, IntT>, cute::Int<0>> +make_cute_packed_stride( + cute::Stride, IntT>, cute::Int<0>> s, + [[maybe_unused]] cute::array shape_output, + cute::array stride_ksc, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + + assert(stride_ksc[2] == 1); + auto s_copy = s; + cute::get<0,0>(s_copy) = stride_ksc[0]; + cute::get<1,1>(s_copy) = stride_ksc[1]; + return s_copy; +} + +// Filter cutlass::layout::TensorKCSR -> rank-3 stride (k, (_1, s, r), _0) +template +CUTLASS_HOST_DEVICE +cute::Stride, IntT, IntT>, cute::Int<0>> +make_cute_packed_stride( + cute::Stride, IntT, IntT>, cute::Int<0>> s, + [[maybe_unused]] cute::array shape_output, + cute::array stride_krsc, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. 
Static strides not supported."); + + assert(stride_krsc[3] == 1); + auto s_copy = s; + cute::get<0,0>(s_copy) = stride_krsc[0]; + cute::for_each(cute::make_seq<2>{}, [&](auto i) { + cute::get<1,2-i>(s_copy) = stride_krsc[i+1]; + }); + return s_copy; +} + +// Filter cutlass::layout::TensorKCSRT -> rank-3 stride (k, (_1, s, r, t), _0) +template +CUTLASS_HOST_DEVICE +cute::Stride, IntT, IntT, IntT>, cute::Int<0>> +make_cute_packed_stride( + cute::Stride, IntT, IntT, IntT>, cute::Int<0>> s, + [[maybe_unused]] cute::array shape_output, + cute::array stride_ktrsc, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + + assert(stride_ktrsc[4] == 1); + auto s_copy = s; + cute::get<0,0>(s_copy) = stride_ktrsc[0]; + cute::for_each(cute::make_seq<3>{}, [&](auto i) { + cute::get<1,3-i>(s_copy) = stride_ktrsc[i+1]; + }); + return s_copy; +} + + +// +// Wgrad output tensor ((_1, s, r, t), k, _0) +// + +// Filter cutlass::layout::TensorCSK -> rank-3 stride ((_1, s), k, _0) +template +CUTLASS_HOST_DEVICE +cute::Stride, IntT>, IntT, cute::Int<0>> +make_cute_packed_stride( + cute::Stride, IntT>, IntT, cute::Int<0>> s, + [[maybe_unused]] cute::array shape_output, + cute::array stride_ksc, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + + assert(stride_ksc[2] == 1); + auto s_copy = s; + cute::get<1,0>(s_copy) = stride_ksc[0]; + cute::get<0,1>(s_copy) = stride_ksc[1]; + return s_copy; +} + +// Filter cutlass::layout::TensorCSRK -> rank-3 stride ((_1, s, r), k, _0) +template +CUTLASS_HOST_DEVICE +cute::Stride, IntT, IntT>, IntT, cute::Int<0>> +make_cute_packed_stride( + cute::Stride, IntT, IntT>, IntT, cute::Int<0>> s, + [[maybe_unused]] cute::array shape_output, + cute::array stride_krsc, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + + assert(stride_krsc[3] == 1); + auto s_copy = s; + cute::get<1,0>(s_copy) = stride_krsc[0]; + cute::for_each(cute::make_seq<2>{}, [&](auto i) { + cute::get<0,2-i>(s_copy) = stride_krsc[i+1]; + }); + return s_copy; +} + +// Filter cutlass::layout::TensorCSRTK -> rank-3 stride ((_1, s, r, t), k, _0) +template +CUTLASS_HOST_DEVICE +cute::Stride, IntT, IntT, IntT>, IntT, cute::Int<0>> +make_cute_packed_stride( + cute::Stride, IntT, IntT, IntT>, IntT, cute::Int<0>> s, + [[maybe_unused]] cute::array shape_output, + cute::array stride_ktrsc, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. 
Static strides not supported."); + + assert(stride_ktrsc[4] == 1); + auto s_copy = s; + cute::get<1,0>(s_copy) = stride_ktrsc[0]; + cute::for_each(cute::make_seq<3>{}, [&](auto i) { + cute::get<0,3-i>(s_copy) = stride_ktrsc[i+1]; + }); + return s_copy; +} +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass diff --git a/csrc/quantization/cutlass_test/example/util/reference/detail/inner_product.h b/csrc/quantization/cutlass_test/example/util/reference/detail/inner_product.h new file mode 100644 index 0000000000000..2bce60b1390c0 --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/detail/inner_product.h @@ -0,0 +1,135 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Reference implementation for GEMM in host-side code. +*/ +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" + +namespace cutlass { +namespace reference { +namespace detail { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Template function to compute an inner product. 
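+/// The generic overload simply promotes both operands to Ctype and accumulates,
+/// i.e. inner_product(a, b, c) == Ctype(a) * Ctype(b) + c; for example,
+/// inner_product(int8_t(3), int8_t(-2), 0.0f) evaluates to -6.0f.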
+#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate with a + // host-only type +template +CUTLASS_HOST_DEVICE +Ctype inner_product(Atype a, Btype b, Ctype c) { + return Ctype(a) * Ctype(b) + c; +} + +/// Specialization for matrix multiplication with binary operands +template <> +CUTLASS_HOST_DEVICE +int inner_product, Array, int>( + Array a, + Array b, + int c) { + + int accum = 0; + for (int bit = 0; bit < 32; bit++) { + accum += a[bit] ^ b[bit]; + } + return accum + c; +} + +/* +/// Specialization for matrix multiplication with signed 4-bit integer operands +template <> +CUTLASS_HOST_DEVICE +int inner_product, Array, int>( + Array a, + Array b, + int c) { + + int accum = 0; + for (int k = 0; k < 8; k++) { + accum += a[k] * b[k]; + } + return accum + c; +} + +/// Specialization for matrix multiplication with unsigned 4-bit integer operands +template <> +CUTLASS_HOST_DEVICE +int inner_product, Array, int>( + Array a, + Array b, + int c) { + + int accum = 0; + for (int k = 0; k < 8; k++) { + accum += a[k] * b[k]; + } + return accum + c; +} +*/ + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Cast { + // Default behavior: convert to the destination type +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type + CUTLASS_HOST_DEVICE + static DstType apply(SrcType src) { return static_cast(src); }; +}; + +template <> +struct Cast { + CUTLASS_HOST_DEVICE + static int8_t apply(float src) { + // Clamp to the range of signed 8-bit integers. + return static_cast(fmaxf(-128.f, fminf(127.f, src))); + }; +}; + +template <> +struct Cast { + CUTLASS_HOST_DEVICE + static uint8_t apply(float src) { + // Clamp to the range of signed 8-bit integers. + return static_cast(fmaxf(0.f, fminf(255.f, src))); + }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace detail +} // namespace reference +} // namespace cutlass + diff --git a/csrc/quantization/cutlass_test/example/util/reference/detail/linear_to_coordinate.h b/csrc/quantization/cutlass_test/example/util/reference/detail/linear_to_coordinate.h new file mode 100644 index 0000000000000..1f784c46f6eb9 --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/detail/linear_to_coordinate.h @@ -0,0 +1,94 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Reference implementation for GEMM in host-side code. +*/ +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/coord.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace reference { +namespace detail { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct LinearToCoordinateHelper { + + CUTLASS_HOST_DEVICE + void operator()(Coord &coord, int64_t idx, Coord const &extent) const { + + int64_t prod = 1; + + CUTLASS_PRAGMA_UNROLL + for (int i = Rank - Index; i < Rank; ++i) { + prod *= int64_t(extent[i]); + } + + coord[Rank - Index - 1] = int(idx / prod); + + int64_t residual = idx % prod; + LinearToCoordinateHelper()(coord, residual, extent); + } +}; + +template +struct LinearToCoordinateHelper { + + CUTLASS_HOST_DEVICE + void operator()(Coord &coord, int64_t idx, Coord const &) const { + coord[Rank - 1] = int(idx); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct LinearToCoordinate { + + CUTLASS_HOST_DEVICE + void operator()(Coord &coord, int64_t idx, Coord const &extent) const { + LinearToCoordinateHelper()(coord, idx, extent); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace detail +} // namespace reference +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/csrc/quantization/cutlass_test/example/util/reference/device/convolution.h b/csrc/quantization/cutlass_test/example/util/reference/device/convolution.h new file mode 100644 index 0000000000000..c91cd0e229bdd --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/device/convolution.h @@ -0,0 +1,1549 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Reference implementation for convolution in device-side code. +*/ + +#pragma once + +#include "cutlass/coord.h" +#include "cutlass/functional.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/numeric_types.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/conv3d_problem_size.h" + +namespace cutlass { +namespace reference { +namespace device { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace kernel { + +//////////////////////////////////////////////////////////////////////////////////////////////////// +/// Conv2d device reference kernel +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2d Fprop kernel - y = fprop(x, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add, + int kThreadM = 2, // shape of a thread's tile in the GEMM M dimension + int kThreadN = 4, // shape of a thread's tile in the GEMM N dimension + int kCtaShapeM = 16, // shape of a threadblock in units of threads + int kCtaShapeN = 8 // shape of a threadblock in units of threads +> +__global__ void Conv2dFprop( + conv::Conv2dProblemSize problem_size, + TensorRef tensor_x, + TensorRef tensor_w, + TensorRef tensor_y_in, + TensorRef tensor_y_out, + ElementCompute alpha, + ElementCompute beta + ) { + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + ElementAccumulator element_A[kThreadM]; + ElementAccumulator element_B[kThreadN]; + ElementAccumulator accum[kThreadM][kThreadN]; + + int64_t npq_start = int64_t(blockIdx.x) * kCtaShapeM * kThreadM + threadIdx.x * kThreadM; + int k_start = blockIdx.y * kCtaShapeN * kThreadN + threadIdx.y * kThreadN; + + int thread_n[kThreadM]; + int thread_p[kThreadM]; + int thread_q[kThreadM]; + + // Compute N, P, Q coordinates 
for each row of a thread's tile + int64_t PQ = int64_t(problem_size.P) * problem_size.Q; + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + + int64_t npq = npq_start + m; + + thread_n[m] = int(npq / PQ); + + int64_t residual = npq % PQ; + thread_p[m] = int(residual / problem_size.Q); + thread_q[m] = int(residual % problem_size.Q); + } + + // Clear accumulators + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = ElementAccumulator(); + } + } + + int c_per_group = problem_size.C / problem_size.groups; + int k_per_group = problem_size.K / problem_size.groups; + + // Compute convolution + for (int R = 0; R < problem_size.R; ++R) { + for (int S = 0; S < problem_size.S; ++S) { + for (int C = 0; C < problem_size.C; ++C) { + + // Get group id of currnet channel + int c_group_idx = C / c_per_group; + + // Load from activations tensor + int filter_r = R; + int filter_s = S; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_r = problem_size.R - 1 - R; + filter_s = problem_size.S - 1 - S; + } + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + int h = thread_p[m] * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h; + int w = thread_q[m] * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w; + + if (thread_n[m] < problem_size.N && h >= 0 && h < problem_size.H && w >= 0 && w < problem_size.W) { + element_A[m] = ElementAccumulator(tensor_x.at({thread_n[m], h, w, C})); + } + else { + element_A[m] = ElementAccumulator(); + } + } + + // Load from filters tensor + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + int thread_k = k_start + n; + int k_group_idx = thread_k / k_per_group; + + if (thread_k < problem_size.K && k_group_idx == c_group_idx) { + element_B[n] = ElementAccumulator(tensor_w.at({thread_k, R, S, C % c_per_group})); + } + else { + element_B[n] = ElementAccumulator(); + } + } + + // Accumulate matrix product + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]); + } + } + } + } + } + + // Write out the results + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + if (thread_n[m] < problem_size.N && thread_p[m] < problem_size.P && thread_q[m] < problem_size.Q) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + int thread_k = k_start + n; + if (thread_k < problem_size.K) { + + ElementCompute c_ref = ElementCompute(); + if (beta != ElementCompute()) { + c_ref = ElementCompute(tensor_y_in.at({thread_n[m], thread_p[m], thread_q[m], thread_k})); + } + + tensor_y_out.at({thread_n[m], thread_p[m], thread_q[m], thread_k}) = convert_op( + alpha * ElementCompute(accum[m][n]) + beta * c_ref); + } + } + } + } +} + +// Conv3d Fprop kernel - y = fprop(x, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add, + int kThreadM = 2, // shape of a thread's tile in the GEMM M dimension + int kThreadN = 4, // shape of a thread's tile in the GEMM N dimension + int kCtaShapeM = 16, // shape of a threadblock in units of threads + int kCtaShapeN = 8 // shape of a threadblock in units of threads +> 
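// Note on the blocking above: each thread computes a kThreadM x kThreadN (2 x 4 by
// default) fragment of the implicit GEMM, so a 16 x 8 thread block covers a
// 32 x 32 tile of the (N*Z*P*Q) x K output. The host-side dispatcher later in this
// header overrides kThreadM to 4, widening that coverage to 64 x 32 per block.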
+__global__ void Conv3dFprop( + conv::Conv3dProblemSize problem_size, + TensorRef tensor_x, + TensorRef tensor_w, + TensorRef tensor_y_in, + TensorRef tensor_y_out, + ElementCompute alpha, + ElementCompute beta + ) { + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + ElementAccumulator element_A[kThreadM]; + ElementAccumulator element_B[kThreadN]; + ElementAccumulator accum[kThreadM][kThreadN]; + + int64_t nzpq_start = int64_t(blockIdx.x) * kCtaShapeM * kThreadM + threadIdx.x * kThreadM; + int k_start = blockIdx.y * kCtaShapeN * kThreadN + threadIdx.y * kThreadN; + + int thread_n[kThreadM]; + int thread_z[kThreadM]; + int thread_p[kThreadM]; + int thread_q[kThreadM]; + + // Compute N, Z, P, Q coordinates for each row of a thread's tile + int64_t PQ = int64_t(problem_size.P) * problem_size.Q; + int64_t ZPQ = PQ * problem_size.Z; + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + + int64_t nzpq = nzpq_start + m; + + thread_n[m] = int(nzpq / ZPQ); + + int64_t residual = nzpq % ZPQ; + thread_z[m] = int(residual / PQ); + + residual = residual % PQ; + thread_p[m] = int(residual / problem_size.Q); + thread_q[m] = int(residual % problem_size.Q); + } + + // Clear accumulators + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = ElementAccumulator(); + } + } + + // Compute convolution + for (int T = 0; T < problem_size.T; ++T) { + for (int R = 0; R < problem_size.R; ++R) { + for (int S = 0; S < problem_size.S; ++S) { + for (int C = 0; C < problem_size.C; ++C) { + + // Load from activations tensor + int filter_t = T; + int filter_r = R; + int filter_s = S; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_t = problem_size.T - 1 - T; + filter_r = problem_size.R - 1 - R; + filter_s = problem_size.S - 1 - S; + } + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + int d = thread_z[m] * problem_size.stride_d - problem_size.pad_d + filter_t * problem_size.dilation_d; + int h = thread_p[m] * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h; + int w = thread_q[m] * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w; + + if (thread_n[m] < problem_size.N && + d >= 0 && d < problem_size.D && + h >= 0 && h < problem_size.H && + w >= 0 && w < problem_size.W) { + + element_A[m] = ElementAccumulator(tensor_x.at({thread_n[m], d, h, w, C})); + } + else { + element_A[m] = ElementAccumulator(); + } + } + + // Load from filters tensor + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + int thread_k = k_start + n; + + if (thread_k < problem_size.K) { + element_B[n] = ElementAccumulator(tensor_w.at({thread_k, T, R, S, C})); + } + else { + element_B[n] = ElementAccumulator(); + } + } + + // Accumulate matrix product + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]); + } + } + + } // for (C) + } // for (S) + } // for (R) + } // for (T) + + // Write out the results + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + + if (thread_n[m] < problem_size.N && + thread_z[m] < problem_size.Z && + thread_p[m] < problem_size.P && + thread_q[m] < problem_size.Q) { + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + int thread_k = k_start + n; + if (thread_k < problem_size.K) { + + ElementCompute c_ref = ElementCompute(); + if (beta 
!= ElementCompute()) { + c_ref = ElementCompute(tensor_y_in.at({thread_n[m], thread_z[m], thread_p[m], thread_q[m], thread_k})); + } + + tensor_y_out.at({thread_n[m], thread_z[m], thread_p[m], thread_q[m], thread_k}) = convert_op( + alpha * ElementCompute(accum[m][n]) + beta * c_ref); + } + } // for (n) + + } + } // for (m) +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2d dgrad kernel - dx = dgrad(dy, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add, + int kThreadM = 2, // shape of a thread's tile in the GEMM M dimension + int kThreadN = 4, // shape of a thread's tile in the GEMM N dimension + int kCtaShapeM = 16, // shape of a threadblock in units of threads + int kCtaShapeN = 8 // shape of a threadblock in units of threads +> +__global__ void Conv2dDgrad( + conv::Conv2dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_w, + TensorRef tensor_dx_in, + TensorRef tensor_dx_out, + ElementCompute alpha, + ElementCompute beta + ) { + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + ElementAccumulator element_A[kThreadM]; + ElementAccumulator element_B[kThreadN]; + ElementAccumulator accum[kThreadM][kThreadN]; + + int64_t nhw_start = int64_t(blockIdx.x) * kCtaShapeM * kThreadM + threadIdx.x * kThreadM; + int c_start = blockIdx.y * kCtaShapeN * kThreadN + threadIdx.y * kThreadN; + + int thread_n[kThreadM]; + int thread_h[kThreadM]; + int thread_w[kThreadM]; + + // Compute N, H, W coordinates for each row of a thread's tile + int64_t HW = int64_t(problem_size.H) * problem_size.W; + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + + int64_t nhw = nhw_start + m; + + thread_n[m] = int(nhw / HW); + + int64_t residual = nhw % HW; + thread_h[m] = int(residual / problem_size.W); + thread_w[m] = int(residual % problem_size.W); + } + + // Clear accumulators + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = ElementAccumulator(); + } + } + + // Compute convolution + for (int R = 0; R < problem_size.R; ++R) { + for (int S = 0; S < problem_size.S; ++S) { + for (int K = 0; K < problem_size.K; ++K) { + + // Load from activations tensor + int filter_r = R; + int filter_s = S; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_r = problem_size.R - 1 - R; + filter_s = problem_size.S - 1 - S; + } + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + + int p = thread_h[m] + problem_size.pad_h - filter_r * problem_size.dilation_h; + int q = thread_w[m] + problem_size.pad_w - filter_s * problem_size.dilation_w; + + element_A[m] = ElementAccumulator(); + + if (p >= 0 && !(p % problem_size.stride_h) && q >= 0 && !(q % problem_size.stride_w)) { + + p = p / problem_size.stride_h; + q = q / problem_size.stride_w; + + if (thread_n[m] < problem_size.N && p < problem_size.P && q < problem_size.Q) { + element_A[m] = ElementAccumulator(tensor_dy.at({thread_n[m], p, q, K})); + } + } + } + + // Load from filters tensor + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + int thread_c = c_start + n; + + if (thread_c < problem_size.C) { + element_B[n] = ElementAccumulator(tensor_w.at({K, R, S, thread_c})); + } + else { + element_B[n] = 
ElementAccumulator(); + } + } + + // Accumulate matrix product + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]); + } + } + } + } + } + + // Write out the results + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + + if (thread_n[m] < problem_size.N && thread_h[m] < problem_size.H && thread_w[m] < problem_size.W) { + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + int thread_c = c_start + n; + if (thread_c < problem_size.C) { + + ElementCompute c_ref = ElementCompute(); + if (beta != ElementCompute()) { + c_ref = ElementCompute(tensor_dx_in.at({thread_n[m], thread_h[m], thread_w[m], thread_c})); + } + + tensor_dx_out.at({thread_n[m], thread_h[m], thread_w[m], thread_c}) = convert_op( + alpha * ElementCompute(accum[m][n]) + beta * c_ref); + } + } + } + } +} + +// Conv3d dgrad kernel - dx = dgrad(dy, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add, + int kThreadM = 2, // shape of a thread's tile in the GEMM M dimension + int kThreadN = 4, // shape of a thread's tile in the GEMM N dimension + int kCtaShapeM = 16, // shape of a threadblock in units of threads + int kCtaShapeN = 8 // shape of a threadblock in units of threads +> +__global__ void Conv3dDgrad( + conv::Conv3dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_w, + TensorRef tensor_dx_in, + TensorRef tensor_dx_out, + ElementCompute alpha, + ElementCompute beta + ) { + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + ElementAccumulator element_A[kThreadM]; + ElementAccumulator element_B[kThreadN]; + ElementAccumulator accum[kThreadM][kThreadN]; + + int64_t ndhw_start = int64_t(blockIdx.x) * kCtaShapeM * kThreadM + threadIdx.x * kThreadM; + int c_start = blockIdx.y * kCtaShapeN * kThreadN + threadIdx.y * kThreadN; + + int thread_n[kThreadM]; + int thread_d[kThreadM]; + int thread_h[kThreadM]; + int thread_w[kThreadM]; + + // Compute N, H, W coordinates for each row of a thread's tile + int64_t HW = int64_t(problem_size.H) * problem_size.W; + int64_t DHW = HW * problem_size.D; + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + + int64_t ndhw = ndhw_start + m; + + thread_n[m] = int(ndhw / DHW); + + int64_t residual = ndhw % DHW; + thread_d[m] = int(residual / HW); + + residual = residual % HW; + thread_h[m] = int(residual / problem_size.W); + thread_w[m] = int(residual % problem_size.W); + } + + // Clear accumulators + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = ElementAccumulator(); + } + } + + // Compute convolution + for (int T = 0; T < problem_size.T; ++T) { + for (int R = 0; R < problem_size.R; ++R) { + for (int S = 0; S < problem_size.S; ++S) { + for (int K = 0; K < problem_size.K; ++K) { + + // Load from activations tensor + int filter_t = T; + int filter_r = R; + int filter_s = S; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_t = problem_size.T - 1 - T; + filter_r = problem_size.R - 1 - R; + filter_s = problem_size.S - 1 - S; + } + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + + int z = thread_d[m] + problem_size.pad_d - 
filter_t * problem_size.dilation_d; + int p = thread_h[m] + problem_size.pad_h - filter_r * problem_size.dilation_h; + int q = thread_w[m] + problem_size.pad_w - filter_s * problem_size.dilation_w; + + element_A[m] = ElementAccumulator(); + + if (z >= 0 && !(z % problem_size.stride_d) && + p >= 0 && !(p % problem_size.stride_h) && + q >= 0 && !(q % problem_size.stride_w)) { + + z = z / problem_size.stride_d; + p = p / problem_size.stride_h; + q = q / problem_size.stride_w; + + if (thread_n[m] < problem_size.N && z < problem_size.Z && p < problem_size.P && q < problem_size.Q) { + element_A[m] = ElementAccumulator(tensor_dy.at({thread_n[m], z, p, q, K})); + } + } + } + + // Load from filters tensor + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + int thread_c = c_start + n; + + if (thread_c < problem_size.C) { + element_B[n] = ElementAccumulator(tensor_w.at({K, T, R, S, thread_c})); + } + else { + element_B[n] = ElementAccumulator(); + } + } + + // Accumulate matrix product + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]); + } + } + + } // for (C) + } // for (S) + } // for (R) + } // for (T) + + // Write out the results + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + + if (thread_n[m] < problem_size.N && + thread_d[m] < problem_size.D && + thread_h[m] < problem_size.H && + thread_w[m] < problem_size.W) { + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + int thread_c = c_start + n; + if (thread_c < problem_size.C) { + + ElementCompute c_ref = ElementCompute(); + if (beta != ElementCompute()) { + c_ref = ElementCompute(tensor_dx_in.at({thread_n[m], thread_d[m], thread_h[m], thread_w[m], thread_c})); + } + + tensor_dx_out.at({thread_n[m], thread_d[m], thread_h[m], thread_w[m], thread_c}) = convert_op( + alpha * ElementCompute(accum[m][n]) + beta * c_ref); + } + } + } + } +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2d wgrad kernel - dw = wgrad(dy, x) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add, + int kThreadM = 2, // shape of a thread's tile in the GEMM M dimension + int kThreadN = 4, // shape of a thread's tile in the GEMM N dimension + int kCtaShapeM = 8, // shape of a threadblock in units of threads + int kCtaShapeN = 16 // shape of a threadblock in units of threads +> +__global__ void Conv2dWgrad( + conv::Conv2dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_x, + TensorRef tensor_dw_in, + TensorRef tensor_dw_out, + ElementCompute alpha, + ElementCompute beta + ) { + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + ElementAccumulator element_A[kThreadM]; + ElementAccumulator element_B[kThreadN]; + ElementAccumulator accum[kThreadM][kThreadN]; + + int k_start = blockIdx.x * kCtaShapeM * kThreadM + threadIdx.x * kThreadM; + int64_t rsc_start = int64_t(blockIdx.y) * kCtaShapeN * kThreadN + threadIdx.y * kThreadN; + + int thread_r[kThreadN]; + int thread_s[kThreadN]; + int thread_c[kThreadN]; + + // Compute R, S, C coordinates for each row of a thread's tile + int64_t SC = int64_t(problem_size.S) * problem_size.C; + + CUTLASS_PRAGMA_UNROLL + for (int n 
= 0; n < kThreadN; ++n) { + + int64_t rsc = rsc_start + n; + int64_t residual = rsc % SC; + + thread_r[n] = int(rsc / SC); + thread_s[n] = int(residual / problem_size.C); + thread_c[n] = int(residual % problem_size.C); + } + + // Clear accumulators + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = ElementAccumulator(); + } + } + + // Compute convolution + for (int N = 0; N < problem_size.N; ++N) { + for (int P = 0; P < problem_size.P; ++P) { + for (int Q = 0; Q < problem_size.Q; ++Q) { + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + int thread_k = k_start + m; + + element_A[m] = ElementAccumulator(); + + if (thread_k < problem_size.K) { + element_A[m] = ElementAccumulator(tensor_dy.at({N, P, Q, thread_k})); + } + } + + // Load from filters tensor + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + + // Load from activations tensor + int filter_r = thread_r[n]; + int filter_s = thread_s[n]; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_r = problem_size.R - 1 - filter_r; + filter_s = problem_size.S - 1 - filter_s; + } + + int h = P * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h; + int w = Q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w; + + element_B[n] = ElementAccumulator(); + + if (h >= 0 && h < problem_size.H && w >= 0 && w < problem_size.W && thread_c[n] < problem_size.C) { + element_B[n] = ElementAccumulator(tensor_x.at({N, h, w, thread_c[n]})); + } + } + + // Accumulate matrix product + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]); + } + } + } + } + } + + // Write out the results + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + int thread_k = k_start + m; + + if (thread_k < problem_size.K) { + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + + if (thread_r[n] < problem_size.R && thread_s[n] < problem_size.S && thread_c[n] < problem_size.C) { + + ElementCompute c_ref = ElementCompute(); + + if (beta != ElementCompute()) { + c_ref = ElementCompute(tensor_dw_in.at({thread_k, thread_r[n], thread_s[n], thread_c[n]})); + } + + tensor_dw_out.at({thread_k, thread_r[n], thread_s[n], thread_c[n]}) = convert_op( + alpha * ElementCompute(accum[m][n]) + beta * c_ref); + } + } + } + } +} + +// Conv3d wgrad kernel - dw = wgrad(dy, x) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add, + int kThreadM = 2, // shape of a thread's tile in the GEMM M dimension + int kThreadN = 4, // shape of a thread's tile in the GEMM N dimension + int kCtaShapeM = 8, // shape of a threadblock in units of threads + int kCtaShapeN = 16 // shape of a threadblock in units of threads +> +__global__ void Conv3dWgrad( + conv::Conv3dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_x, + TensorRef tensor_dw_in, + TensorRef tensor_dw_out, + ElementCompute alpha, + ElementCompute beta + ) { + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + ElementAccumulator element_A[kThreadM]; + ElementAccumulator element_B[kThreadN]; + ElementAccumulator 
accum[kThreadM][kThreadN]; + + int k_start = blockIdx.x * kCtaShapeM * kThreadM + threadIdx.x * kThreadM; + int64_t trsc_start = int64_t(blockIdx.y) * kCtaShapeN * kThreadN + threadIdx.y * kThreadN; + + int thread_t[kThreadN]; + int thread_r[kThreadN]; + int thread_s[kThreadN]; + int thread_c[kThreadN]; + + // Compute R, S, C coordinates for each row of a thread's tile + int64_t SC = int64_t(problem_size.S) * problem_size.C; + int64_t RSC = SC * problem_size.R; + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + + int64_t trsc = trsc_start + n; + + thread_t[n] = int(trsc / RSC); + + int64_t residual = trsc % RSC; + thread_r[n] = int(residual / SC); + + residual = residual % SC; + thread_s[n] = int(residual / problem_size.C); + thread_c[n] = int(residual % problem_size.C); + } + + // Clear accumulators + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = ElementAccumulator(); + } + } + + // Compute convolution + for (int N = 0; N < problem_size.N; ++N) { + for (int Z = 0; Z < problem_size.Z; ++Z) { + for (int P = 0; P < problem_size.P; ++P) { + for (int Q = 0; Q < problem_size.Q; ++Q) { + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + int thread_k = k_start + m; + + element_A[m] = ElementAccumulator(); + + if (thread_k < problem_size.K) { + element_A[m] = ElementAccumulator(tensor_dy.at({N, Z, P, Q, thread_k})); + } + } + + // Load from filters tensor + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + + // Load from activations tensor + int filter_t = thread_t[n]; + int filter_r = thread_r[n]; + int filter_s = thread_s[n]; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_t = problem_size.T - 1 - filter_t; + filter_r = problem_size.R - 1 - filter_r; + filter_s = problem_size.S - 1 - filter_s; + } + + int d = Z * problem_size.stride_d - problem_size.pad_w + filter_t * problem_size.dilation_d; + int h = P * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h; + int w = Q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w; + + element_B[n] = ElementAccumulator(); + + if (d >= 0 && d < problem_size.D && + h >= 0 && h < problem_size.H && + w >= 0 && w < problem_size.W && + thread_c[n] < problem_size.C) { + + element_B[n] = ElementAccumulator(tensor_x.at({N, d, h, w, thread_c[n]})); + } + } + + // Accumulate matrix product + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]); + } + } + + } // for (Q) + } // for (P) + } // for (Z) + } // for (N) + + // Write out the results + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + int thread_k = k_start + m; + + if (thread_k < problem_size.K) { + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + + if (thread_t[n] < problem_size.T && + thread_r[n] < problem_size.R && + thread_s[n] < problem_size.S && + thread_c[n] < problem_size.C) { + + ElementCompute c_ref = ElementCompute(); + + if (beta != ElementCompute()) { + c_ref = ElementCompute(tensor_dw_in.at({thread_k, thread_t[n], thread_r[n], thread_s[n], thread_c[n]})); + } + + tensor_dw_out.at({thread_k, thread_t[n], thread_r[n], thread_s[n], thread_c[n]}) = convert_op( + alpha * ElementCompute(accum[m][n]) + beta * c_ref); + } + } + } + } +} + 
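// The kernels above are launched through the host-side dispatchers defined after this
// namespace. A minimal forward-pass call sketch (illustrative only; `problem`,
// `tensor_x`, `tensor_w` and `tensor_y` are hypothetical objects the caller has
// already built, all float NHWC):
//
//   cutlass::Status status = cutlass::reference::device::Conv2dFprop<
//       float, cutlass::layout::TensorNHWC,   // activations
//       float, cutlass::layout::TensorNHWC,   // filters
//       float, cutlass::layout::TensorNHWC,   // output
//       float>(                               // ElementCompute
//       problem, tensor_x, tensor_w, tensor_y, tensor_y, 1.0f, 0.0f);
//
// With beta == 0 the tensor passed as tensor_y_in is never read, so the output tensor
// can safely be supplied for both the y_in and y_out arguments.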
+///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Conv2d Fprop dispatcher - y = fprop(x, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +Status Conv2dFprop( + conv::Conv2dProblemSize problem_size, + TensorRef tensor_x, + TensorRef tensor_w, + TensorRef tensor_y_in, + TensorRef tensor_y_out, + ElementCompute alpha, + ElementCompute beta, + cudaStream_t stream = nullptr) { + + // + // Blocking factors improve performance of reference implementation + // + + int const kThreadM = 4; // shape of a thread's tile in the GEMM M dimension + int const kThreadN = 4; // shape of a thread's tile in the GEMM N dimension + int const kCtaShapeM = 16; // shape of a threadblock in units of threads + int const kCtaShapeN = 8; // shape of a threadblock in units of threads + + int64_t npq = int64_t(problem_size.N) * problem_size.P * problem_size.Q; + int64_t blocks_m = (npq + (kCtaShapeM * kThreadM) - 1) / (kCtaShapeM * kThreadM); + + dim3 block(kCtaShapeM, kCtaShapeN); + dim3 grid(uint32_t(blocks_m), (problem_size.K + (kCtaShapeN * kThreadN) - 1) / (kCtaShapeN * kThreadN)); + + kernel::Conv2dFprop< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, + InnerProductOp, + kThreadM, + kThreadN, + kCtaShapeM, + kCtaShapeN + ><<< grid, block, 0, stream >>>( + problem_size, + tensor_x, + tensor_w, + tensor_y_in, + tensor_y_out, + alpha, + beta + ); + + cudaError_t result = cudaPeekAtLastError(); + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + + return Status::kSuccess; +} + +/// Conv3d Fprop dispatcher - y = fprop(x, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +Status Conv3dFprop( + conv::Conv3dProblemSize problem_size, + TensorRef tensor_x, + TensorRef tensor_w, + TensorRef tensor_y_in, + TensorRef tensor_y_out, + ElementCompute alpha, + ElementCompute beta, + cudaStream_t stream = nullptr) { + + // + // Blocking factors improve performance of reference implementation + // + + int const kThreadM = 4; // shape of a thread's tile in the GEMM M dimension + int const kThreadN = 4; // shape of a thread's tile in the GEMM N dimension + int const kCtaShapeM = 16; // shape of a threadblock in units of threads + int const kCtaShapeN = 8; // shape of a threadblock in units of threads + + int64_t nzpq = int64_t(problem_size.N) * problem_size.Z * problem_size.P * problem_size.Q; + int64_t blocks_m = (nzpq + (kCtaShapeM * kThreadM) - 1) / (kCtaShapeM * kThreadM); + + dim3 block(kCtaShapeM, kCtaShapeN); + dim3 grid(uint32_t(blocks_m), (problem_size.K + (kCtaShapeN * kThreadN) - 1) / (kCtaShapeN * kThreadN)); + + kernel::Conv3dFprop< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, + InnerProductOp, + kThreadM, + kThreadN, + kCtaShapeM, + kCtaShapeN + ><<< grid, block, 0, stream >>>( + 
problem_size, + tensor_x, + tensor_w, + tensor_y_in, + tensor_y_out, + alpha, + beta + ); + + cudaError_t result = cudaPeekAtLastError(); + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + + return Status::kSuccess; +} + +/// Conv2d Dgrad dispatcher - dx = dgrad(dy, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +Status Conv2dDgrad( + conv::Conv2dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_w, + TensorRef tensor_dx_in, + TensorRef tensor_dx_out, + ElementCompute alpha, + ElementCompute beta, + cudaStream_t stream = nullptr) { + + // + // Blocking factors improve performance of reference implementation + // + + int const kThreadM = 2; // shape of a thread's tile in the GEMM M dimension + int const kThreadN = 4; // shape of a thread's tile in the GEMM N dimension + int const kCtaShapeM = 16; // shape of a threadblock in units of threads + int const kCtaShapeN = 8; // shape of a threadblock in units of threads + + int64_t nhw = int64_t(problem_size.N) * problem_size.H * problem_size.W; + int64_t blocks_m = (nhw + (kCtaShapeM * kThreadM) - 1) / (kCtaShapeM * kThreadM); + + dim3 block(kCtaShapeM, kCtaShapeN); + dim3 grid(uint32_t(blocks_m), (problem_size.C + (kCtaShapeN * kThreadN) - 1) / (kCtaShapeN * kThreadN)); + + kernel::Conv2dDgrad< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, + InnerProductOp, + kThreadM, + kThreadN, + kCtaShapeM, + kCtaShapeN + ><<< grid, block, 0, stream >>>( + problem_size, + tensor_dy, + tensor_w, + tensor_dx_in, + tensor_dx_out, + alpha, + beta + ); + + cudaError_t result = cudaPeekAtLastError(); + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + + return Status::kSuccess; +} + +/// Conv3d Dgrad dispatcher - dx = dgrad(dy, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +Status Conv3dDgrad( + conv::Conv3dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_w, + TensorRef tensor_dx_in, + TensorRef tensor_dx_out, + ElementCompute alpha, + ElementCompute beta, + cudaStream_t stream = nullptr) { + + // + // Blocking factors improve performance of reference implementation + // + + int const kThreadM = 2; // shape of a thread's tile in the GEMM M dimension + int const kThreadN = 4; // shape of a thread's tile in the GEMM N dimension + int const kCtaShapeM = 16; // shape of a threadblock in units of threads + int const kCtaShapeN = 8; // shape of a threadblock in units of threads + + int64_t ndhw = int64_t(problem_size.N) * problem_size.D * problem_size.H * problem_size.W; + int64_t blocks_m = (ndhw + (kCtaShapeM * kThreadM) - 1) / (kCtaShapeM * kThreadM); + + dim3 block(kCtaShapeM, kCtaShapeN); + dim3 grid(uint32_t(blocks_m), (problem_size.C + (kCtaShapeN * kThreadN) - 1) / (kCtaShapeN * kThreadN)); + + kernel::Conv3dDgrad< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, + InnerProductOp, + kThreadM, + kThreadN, + kCtaShapeM, + kCtaShapeN + ><<< grid, block, 
0, stream >>>( + problem_size, + tensor_dy, + tensor_w, + tensor_dx_in, + tensor_dx_out, + alpha, + beta + ); + + cudaError_t result = cudaPeekAtLastError(); + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + + return Status::kSuccess; +} + +/// Conv2d Wgrad dispatcher - dw = wgrad(dy, x) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +Status Conv2dWgrad( + conv::Conv2dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_x, + TensorRef tensor_dw_in, + TensorRef tensor_dw_out, + ElementCompute alpha, + ElementCompute beta, + cudaStream_t stream = nullptr) { + + // + // Blocking factors improve performance of reference implementation + // + + int const kThreadM = 2; // shape of a thread's tile in the GEMM M dimension + int const kThreadN = 4; // shape of a thread's tile in the GEMM N dimension + int const kCtaShapeM = 8; // shape of a threadblock in units of threads + int const kCtaShapeN = 16; // shape of a threadblock in units of threads + + int64_t rsc = int64_t(problem_size.R) * problem_size.S * problem_size.C; + int64_t blocks_n = (rsc + (kCtaShapeN * kThreadN) - 1) / (kCtaShapeN * kThreadN); + + dim3 block(kCtaShapeM, kCtaShapeN); + dim3 grid((problem_size.K + (kCtaShapeM * kThreadM) - 1) / (kCtaShapeM * kThreadM), uint32_t(blocks_n)); + + kernel::Conv2dWgrad< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, + InnerProductOp, + kThreadM, + kThreadN, + kCtaShapeM, + kCtaShapeN + ><<< grid, block, 0, stream >>>( + problem_size, + tensor_dy, + tensor_x, + tensor_dw_in, + tensor_dw_out, + alpha, + beta + ); + + cudaError_t result = cudaPeekAtLastError(); + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + + return Status::kSuccess; +} + +/// Conv3d Wgrad dispatcher - dw = wgrad(dy, x) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +Status Conv3dWgrad( + conv::Conv3dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_x, + TensorRef tensor_dw_in, + TensorRef tensor_dw_out, + ElementCompute alpha, + ElementCompute beta, + cudaStream_t stream = nullptr) { + + // + // Blocking factors improve performance of reference implementation + // + + int const kThreadM = 2; // shape of a thread's tile in the GEMM M dimension + int const kThreadN = 4; // shape of a thread's tile in the GEMM N dimension + int const kCtaShapeM = 8; // shape of a threadblock in units of threads + int const kCtaShapeN = 16; // shape of a threadblock in units of threads + + int64_t trsc = int64_t(problem_size.T) * problem_size.R * problem_size.S * problem_size.C; + int64_t blocks_n = (trsc + (kCtaShapeN * kThreadN) - 1) / (kCtaShapeN * kThreadN); + + dim3 block(kCtaShapeM, kCtaShapeN); + dim3 grid((problem_size.K + (kCtaShapeM * kThreadM) - 1) / (kCtaShapeM * kThreadM), uint32_t(blocks_n)); + + kernel::Conv3dWgrad< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, + InnerProductOp, + kThreadM, + kThreadN, + kCtaShapeM, + kCtaShapeN 
+ ><<< grid, block, 0, stream >>>( + problem_size, + tensor_dy, + tensor_x, + tensor_dw_in, + tensor_dw_out, + alpha, + beta + ); + + cudaError_t result = cudaPeekAtLastError(); + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + + return Status::kSuccess; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Generic 2D convolution targeting Conv2dFprop, Conv2dDgrad, and Conv2dWgrad. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +Status Conv2d( + conv::Operator convolutional_operator, + conv::Conv2dProblemSize problem_size, + TensorRef tensor_A, + TensorRef tensor_B, + TensorRef tensor_C, + TensorRef tensor_D, + ElementCompute alpha, + ElementCompute beta, + cudaStream_t stream = nullptr) { + + switch (convolutional_operator) { + case conv::Operator::kFprop: + return Conv2dFprop< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream); + break; + + case conv::Operator::kDgrad: + return Conv2dDgrad< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream); + break; + + case conv::Operator::kWgrad: + return Conv2dWgrad< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream); + break; + + default: break; + } + + return Status::kErrorNotSupported; +} + +/// Generic 3D convolution targeting Conv3dFprop, Conv3dDgrad, and Conv3dWgrad. 
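///
/// Like the 2D dispatcher above, this switches on conv::Operator and forwards to the
/// matching reference kernel. A call sketch (illustrative only; `problem` and the
/// TensorRef operands are hypothetical, float NDHWC):
///
///   auto status = cutlass::reference::device::Conv3d<
///       float, cutlass::layout::TensorNDHWC,
///       float, cutlass::layout::TensorNDHWC,
///       float, cutlass::layout::TensorNDHWC,
///       float>(
///       cutlass::conv::Operator::kFprop,
///       problem, tensor_A, tensor_B, tensor_C, tensor_D, 1.0f, 0.0f);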
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +Status Conv3d( + conv::Operator convolutional_operator, + conv::Conv3dProblemSize problem_size, + TensorRef tensor_A, + TensorRef tensor_B, + TensorRef tensor_C, + TensorRef tensor_D, + ElementCompute alpha, + ElementCompute beta, + cudaStream_t stream = nullptr) { + + switch (convolutional_operator) { + case conv::Operator::kFprop: + return Conv3dFprop< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream); + + case conv::Operator::kDgrad: + return Conv3dDgrad< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream); + + case conv::Operator::kWgrad: + return Conv3dWgrad< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream); + + default: break; + } + + return Status::kErrorNotSupported; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace device +} // namespace reference +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/csrc/quantization/cutlass_test/example/util/reference/device/gemm.h b/csrc/quantization/cutlass_test/example/util/reference/device/gemm.h new file mode 100644 index 0000000000000..1a1bd3751801a --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/device/gemm.h @@ -0,0 +1,385 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Reference implementation for GEMM in device-side code. +*/ + +#pragma once + +#include "cutlass/coord.h" + +#include "cutlass/numeric_types.h" +#include "cutlass/functional.h" +#include "cutlass/numeric_conversion.h" + +#include "cutlass/tensor_view.h" +#include "cutlass/gemm/gemm.h" + +#include "cutlass/util/reference/device/kernel/gemm.h" + +namespace cutlass { +namespace reference { +namespace device { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef +/// objects. +/// +/// Explicitly naming types needed by this template can be cumbersome, particularly for the +/// accumulator type, so a function argument 'initial_accum' is exposed. Passing +/// AccumulatorType(0) as the last function argument can be easier than naming all template +/// arguments explicitly. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ScalarType, + typename AccumulatorType, + typename InnerProductOp = multiply_add, + typename ConvertOp = NumericConverter +> +void compute_gemm( + gemm::GemmCoord problem_size, + ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, + ScalarType beta, + TensorRef tensor_c, + TensorRef tensor_d, + AccumulatorType initial_accum) { + + static_assert( + LayoutA::kRank == 2 && + LayoutB::kRank == 2 && + LayoutC::kRank == 2, "Tensors must be of rank 2"); + + // Blocking structure potentially improves performance of reference implementation + // with a minor increase in complexity. + // + // Note, this reference implementation is NOT expected to approach peak performance. + using OutputTile = MatrixShape<4, 4>; + + dim3 block(16, 8); + + dim3 grid( + (problem_size.m() + block.x * OutputTile::kRow - 1) / (block.x * OutputTile::kRow), + (problem_size.n() + block.y * OutputTile::kColumn - 1) / (block.y * OutputTile::kColumn) + ); + + // Launch a GEMM kernel + kernel::Gemm< + TensorRef, + TensorRef, + TensorRef, + ScalarType, + AccumulatorType, + OutputTile, + InnerProductOp, + ConvertOp + ><<< grid, block >>>( + problem_size, + alpha, + tensor_a, + tensor_b, + beta, + tensor_c, + tensor_d, + initial_accum + ); +} +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef +/// objects. +/// +/// This assumes the accumulator type is the same type as the scalars. 
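///
/// Call sketch (illustrative only; `problem`, `A`, `B` and `C` are hypothetical
/// TensorRef<float, cutlass::layout::ColumnMajor> operands built by the caller):
///
///   cutlass::reference::device::compute_gemm<
///       float, cutlass::layout::ColumnMajor,
///       float, cutlass::layout::ColumnMajor,
///       float, cutlass::layout::ColumnMajor,
///       float, float>(
///       problem, 1.0f, A, B, 0.0f, C, 0.0f);   // last argument is initial_accum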
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ScalarType, + typename AccumulatorType, + typename InnerProductOp = multiply_add, + typename ConvertOp = NumericConverter +> +void compute_gemm( + gemm::GemmCoord problem_size, + ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, + ScalarType beta, + TensorRef tensor_c, + AccumulatorType initial_accum) { + + compute_gemm( + problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_c, + initial_accum); +} + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ScalarType, + typename AccumulatorType, + typename InnerProductOp = cutlass::arch::OpMultiplyAdd +> +struct Gemm; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for multiply-add +template +struct Gemm { + + void operator()(gemm::GemmCoord problem_size, ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, ScalarType beta, + TensorRef tensor_c, + AccumulatorType initial_accum = AccumulatorType(0)) { + + static_assert( + LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, + "Tensors must be of rank 2"); + + compute_gemm>( + problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum); + } + + void operator()(gemm::GemmCoord problem_size, ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, ScalarType beta, + TensorRef tensor_c, + TensorRef tensor_d, + AccumulatorType initial_accum = AccumulatorType(0)) { + static_assert( + LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, + "Tensors must be of rank 2"); + + compute_gemm>( + problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for multiply-add-saturate +template +struct Gemm { + + void operator()(gemm::GemmCoord problem_size, ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, ScalarType beta, + TensorRef tensor_c, + AccumulatorType initial_accum = AccumulatorType(0)) { + static_assert( + LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, + "Tensors must be of rank 2"); + + compute_gemm, + NumericConverterClamp>( + problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum); + } + + void operator()(gemm::GemmCoord problem_size, ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, ScalarType beta, + TensorRef tensor_c, + TensorRef tensor_d, + AccumulatorType initial_accum = AccumulatorType(0)) { + static_assert( + LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, + "Tensors must be of rank 2"); + + compute_gemm, + NumericConverterClamp>( + problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for XOR-popc +template +struct Gemm { + + void operator()(gemm::GemmCoord problem_size, ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, ScalarType beta, + TensorRef tensor_c, + AccumulatorType initial_accum = AccumulatorType(0)) { + static_assert( + LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, + "Tensors must be of rank 2"); + + compute_gemm>( + problem_size, alpha, 
tensor_a, tensor_b, beta, tensor_c, initial_accum); + } + + void operator()(gemm::GemmCoord problem_size, ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, ScalarType beta, + TensorRef tensor_c, + TensorRef tensor_d, + AccumulatorType initial_accum = AccumulatorType(0)) { + static_assert( + LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, + "Tensors must be of rank 2"); + + compute_gemm>( + problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum); + } +}; + + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Batched GEMM +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Computes a batch of GEMMs over a set of matrices of common dimension. +// +// TensorRefCollection* is a type satisfying the TensorRefCollection concept. +// +template < + typename TensorRefCollectionA, + typename TensorRefCollectionB, + typename TensorRefCollectionC, + typename ScalarType, + typename AccumulatorType, + typename InnerProductOp, + typename ConvertOp +> +void BatchedGemm( + gemm::GemmCoord problem_size, + int batch_count, + ScalarType alpha, + TensorRefCollectionA const& tensor_a, + TensorRefCollectionB const& tensor_b, + ScalarType beta, + TensorRefCollectionC &tensor_c, + AccumulatorType initial_accum) { + + static_assert( + TensorRefCollectionA::kRank == 2 && + TensorRefCollectionB::kRank == 2 && + TensorRefCollectionC::kRank == 2, "Tensors must be of rank 2"); + + // Blocking structure potentially improves performance of reference implementation + // with a minor increase in complexity. + // + // Note, this reference implementation is NOT expected to approach peak performance. + using OutputTile = MatrixShape<4, 4>; + + dim3 block(16, 8); + dim3 grid( + (problem_size.m() + block.x * OutputTile::kRow - 1) / (block.x * OutputTile::kRow), + (problem_size.n() + block.y * OutputTile::kColumn - 1) / (block.y * OutputTile::kColumn), + batch_count + ); + + // Launch a GEMM kernel + kernel::BatchedGemm< + TensorRefCollectionA, + TensorRefCollectionB, + TensorRefCollectionC, + ScalarType, + AccumulatorType, + OutputTile, + InnerProductOp, + ConvertOp + ><<< grid, block >>>( + problem_size, + alpha, + tensor_a, + tensor_b, + beta, + tensor_c, + initial_accum + ); +} + +/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef +/// objects. +// +// TensorRefCollection* is a type satisfying the TensorRefCollection concept. 
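// For the blocking used above (a 16 x 8 thread block, 4 x 4 outputs per thread), a
// batch of 8 GEMMs with M = N = 128 is launched as
//
//   dim3 grid((128 + 63) / 64, (128 + 31) / 32, 8);   // = (2, 4, 8)
//
// and blockIdx.z selects which TensorRef of each collection a given block reads.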
+// +template < + typename TensorRefCollectionA, + typename TensorRefCollectionB, + typename TensorRefCollectionC, + typename ScalarType, + typename AccumulatorType +> +void BatchedGemm( + gemm::GemmCoord problem_size, + int batch_count, + ScalarType alpha, + TensorRefCollectionA const& tensor_a, + TensorRefCollectionB const& tensor_b, + ScalarType beta, + TensorRefCollectionC &tensor_c) { + + BatchedGemm(problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, ScalarType(0)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace device +} // namespace reference +} // namespace cutlass diff --git a/csrc/quantization/cutlass_test/example/util/reference/device/gemm_complex.h b/csrc/quantization/cutlass_test/example/util/reference/device/gemm_complex.h new file mode 100644 index 0000000000000..b4d41bd28efb5 --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/device/gemm_complex.h @@ -0,0 +1,350 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Reference implementation for complex-valued GEMM in device-side code. +*/ + +#pragma once + +#include "cutlass/coord.h" +#include "cutlass/complex.h" +#include "cutlass/numeric_types.h" +#include "cutlass/functional.h" +#include "cutlass/numeric_conversion.h" + +#include "cutlass/tensor_view.h" +#include "cutlass/gemm/gemm.h" + +namespace cutlass { +namespace reference { +namespace device { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace kernel { + +/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef +/// objects. 
+/// +/// Explicitly naming types needed by this template can be cumbersome, particularly for the +/// accumulator type, so a function argument 'initial_accum' is exposed. Passing +/// AccumulatorType(0) as the last function argument can be easier than naming all template +/// arguments explicitly. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ScalarType, + typename ComputeType, + typename ElementD = ElementC, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add, + int kMblock = 4, + int kNblock = 4 +> +__global__ void GemmComplex( + gemm::GemmCoord problem_size, + ScalarType alpha, + TensorRef tensor_a, + ComplexTransform transform_a, + TensorRef tensor_b, + ComplexTransform transform_b, + ScalarType beta, + TensorRef tensor_c, + TensorRef tensor_d, + ComputeType initial_accum, + int batch_count = 1, + int64_t batch_stride_A = 0, + int64_t batch_stride_B = 0, + int64_t batch_stride_C = 0, + int64_t batch_stride_D = 0) { + + static_assert( + LayoutA::kRank == 2 && + LayoutB::kRank == 2 && + LayoutC::kRank == 2, "Tensors must be of rank 2"); + + int const M = problem_size.m(); + int const N = problem_size.n(); + int const K = problem_size.k(); + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + int row_block = (blockIdx.x * blockDim.x + threadIdx.x) * kMblock; + int col_block = (blockIdx.y * blockDim.y + threadIdx.y) * kNblock; + int batch_idx = blockIdx.z; + + tensor_a.add_pointer_offset(batch_idx * batch_stride_A); + tensor_b.add_pointer_offset(batch_idx * batch_stride_B); + tensor_c.add_pointer_offset(batch_idx * batch_stride_C); + tensor_d.add_pointer_offset(batch_idx * batch_stride_D); + + for (; batch_idx < batch_count; batch_idx += gridDim.z) { + + // Compute matrix product using blocks + ComputeType accum[kMblock][kNblock]; + + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < kNblock; j++) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kMblock; i++) { + accum[i][j] = initial_accum; + } + } + + for (int k_block = 0; k_block < K; ++k_block) { + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < kNblock; j++) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kMblock; i++) { + int row = row_block + i; + int col = col_block + j; + + if (row < M && col < N) { + ElementA a = tensor_a.at(MatrixCoord(row, k_block)); + ElementB b = tensor_b.at(MatrixCoord(k_block, col)); + + ComputeType a_ik = ComputeType(a); + ComputeType b_kj = ComputeType(b); + + if (transform_a == ComplexTransform::kConjugate) { + a_ik = conj(a_ik); + } + + if (transform_b == ComplexTransform::kConjugate) { + b_kj = conj(b_kj); + } + + accum[i][j] = inner_product_op(a_ik, b_kj, accum[i][j]); + } + } + } + } + + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < kNblock; j++) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kMblock; i++) { + int row = row_block + i; + int col = col_block + j; + + MatrixCoord coord = MatrixCoord(row, col); + + if (row < M && col < N) { + + tensor_d.at(coord) = convert_op( + alpha * ScalarType(accum[i][j]) + + beta * ScalarType(tensor_c.at(coord))); + } + } + } + + tensor_a.add_pointer_offset(batch_stride_A * gridDim.z); + tensor_b.add_pointer_offset(batch_stride_B * gridDim.z); + tensor_c.add_pointer_offset(batch_stride_C * gridDim.z); + tensor_d.add_pointer_offset(batch_stride_D * gridDim.z); + + } // for (batch_idx) +} + +} // namespace kernel + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Computes a 
general matrix product among matrices (tensors of rank=2) pointed to by TensorRef +/// objects. +/// +/// Explicitly naming types needed by this template can be cumbersome, particularly for the +/// accumulator type, so a function argument 'initial_accum' is exposed. Passing +/// AccumulatorType(0) as the last function argument can be easier than naming all template +/// arguments explicitly. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ScalarType, + typename ComputeType, + typename ElementD = ElementC, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +void GemmComplex( + gemm::GemmCoord problem_size, + ScalarType alpha, + TensorRef tensor_a, + ComplexTransform transform_a, + TensorRef tensor_b, + ComplexTransform transform_b, + ScalarType beta, + TensorRef tensor_c, + TensorRef tensor_d, + ComputeType initial_accum, + int batch_count = 1, + int64_t batch_stride_A = 0, + int64_t batch_stride_B = 0, + int64_t batch_stride_C = 0, + int64_t batch_stride_D = 0) { + + static_assert( + LayoutA::kRank == 2 && + LayoutB::kRank == 2 && + LayoutC::kRank == 2, "Tensors must be of rank 2"); + + int const kMblock = 4; + int const kNblock = 4; + + dim3 block(16, 8); + dim3 grid( + (problem_size.m() + block.x * kMblock - 1) / (block.x * kMblock), + (problem_size.n() + block.y * kNblock - 1) / (block.y * kNblock), + batch_count % std::numeric_limits::max() + ); + + if (grid.y <= std::numeric_limits::max()) { + kernel::GemmComplex< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ScalarType, + ComputeType, + ElementD, + ConvertOp, + InnerProductOp, + kMblock, + kNblock + ><<< grid, block >>>( + problem_size, + alpha, + tensor_a, + transform_a, + tensor_b, + transform_b, + beta, + tensor_c, + tensor_d, + initial_accum, + batch_count, + batch_stride_A, + batch_stride_B, + batch_stride_C, + batch_stride_D + ); + } else { + // Using bigger thread tile size + int const kBigMblock = 4; + int const kBigNblock = 16; + + dim3 Bigblock(16, 8); + dim3 Biggrid( + (problem_size.m() + block.x * kBigMblock - 1) / (block.x * kBigMblock), + (problem_size.n() + block.y * kBigNblock - 1) / (block.y * kBigNblock), + batch_count % std::numeric_limits::max() + ); + + kernel::GemmComplex< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ScalarType, + ComputeType, + ElementD, + ConvertOp, + InnerProductOp, + kBigMblock, + kBigNblock + ><<< Biggrid, Bigblock >>>( + problem_size, + alpha, + tensor_a, + transform_a, + tensor_b, + transform_b, + beta, + tensor_c, + tensor_d, + initial_accum, + batch_count, + batch_stride_A, + batch_stride_B, + batch_stride_C, + batch_stride_D + ); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef +/// objects. +/// +/// This assumes the accumulator type is the same type as the scalars. 
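+
+// A minimal host-side usage sketch of the GemmComplex launcher above. The
+// d_A/d_B/d_C/d_D device pointers and the leading dimensions are assumed to
+// exist; ScalarType and ComputeType are both taken to be cutlass::complex<float>.
+//
+//   using Element = cutlass::complex<float>;
+//   using Layout  = cutlass::layout::ColumnMajor;
+//
+//   cutlass::gemm::GemmCoord problem(M, N, K);
+//
+//   cutlass::TensorRef<Element, Layout> ref_A(d_A, Layout(lda));
+//   cutlass::TensorRef<Element, Layout> ref_B(d_B, Layout(ldb));
+//   cutlass::TensorRef<Element, Layout> ref_C(d_C, Layout(ldc));
+//   cutlass::TensorRef<Element, Layout> ref_D(d_D, Layout(ldd));
+//
+//   cutlass::reference::device::GemmComplex(
+//       problem, Element(1.0f),
+//       ref_A, cutlass::ComplexTransform::kNone,
+//       ref_B, cutlass::ComplexTransform::kNone,
+//       Element(0.0f), ref_C, ref_D,
+//       Element(0.0f));   // explicit initial_accum lets every type be deduced
+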
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ScalarType, + typename ElementD = ElementC +> +void GemmComplex( + gemm::GemmCoord problem_size, + ScalarType alpha, + TensorRef tensor_a, + ComplexTransform transform_a, + TensorRef tensor_b, + ComplexTransform transform_b, + ScalarType beta, + TensorRef tensor_c, + TensorRef tensor_d) { + + GemmComplex(problem_size, alpha, tensor_a, transform_a, tensor_b, transform_b, beta, tensor_c, tensor_d, ScalarType(0)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace device +} // namespace reference +} // namespace cutlass diff --git a/csrc/quantization/cutlass_test/example/util/reference/device/gemm_planar_complex.h b/csrc/quantization/cutlass_test/example/util/reference/device/gemm_planar_complex.h new file mode 100644 index 0000000000000..37c103c3fcb45 --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/device/gemm_planar_complex.h @@ -0,0 +1,311 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Reference implementation for complex-valued GEMM in device code. 
+*/ + +#pragma once + +#include "cutlass/coord.h" +#include "cutlass/complex.h" +#include "cutlass/matrix_coord.h" +#include "cutlass/numeric_types.h" +#include "cutlass/functional.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/tensor_ref_planar_complex.h" + +#include "cutlass/tensor_view.h" +#include "cutlass/gemm/gemm.h" + +namespace cutlass { +namespace reference { +namespace device { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace kernel { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static int const kGemmPlanarComplexBlockSize = 4; + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ScalarType, + typename ComputeType, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add> +> +__global__ void GemmPlanarComplex( + gemm::GemmCoord problem_size, + complex alpha, + TensorRefPlanarComplex tensor_a, + ComplexTransform transform_a, + TensorRefPlanarComplex tensor_b, + ComplexTransform transform_b, + complex beta, + TensorRefPlanarComplex tensor_c, + TensorRefPlanarComplex tensor_d, + complex initial_accum) { + + int const kMblock = kGemmPlanarComplexBlockSize; + int const kNblock = kGemmPlanarComplexBlockSize; + + using ComplexA = typename TensorRefPlanarComplex::ComplexElement; + using ComplexB = typename TensorRefPlanarComplex::ComplexElement; + using ComplexC = typename TensorRefPlanarComplex::ComplexElement; + + // Note: batch is ignored. + int const M = problem_size.m(); + int const N = problem_size.n(); + int const K = problem_size.k(); + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + complex accum[kMblock][kNblock]; + + int row_block = (blockIdx.x * blockDim.x + threadIdx.x) * kMblock; + int col_block = (blockIdx.y * blockDim.y + threadIdx.y) * kNblock; + + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < kNblock; j++) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kMblock; i++) { + accum[i][j] = initial_accum; + } + } + + CUTLASS_PRAGMA_NO_UNROLL + for (int k_block = 0; k_block < K; ++k_block) { + + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < kNblock; j++) { + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kMblock; i++) { + + int row = row_block + i; + int col = col_block + j; + + if (row < M && col < N) { + + ComplexA a_ik = tensor_a.at(MatrixCoord(row, k_block)); + ComplexB b_kj = tensor_b.at(MatrixCoord(k_block, col)); + + complex a = complex{ + ComputeType(a_ik.real()), + ComputeType(a_ik.imag()) + }; + + complex b = complex{ + ComputeType(b_kj.real()), + ComputeType(b_kj.imag()) + }; + + if (transform_a == ComplexTransform::kConjugate) { + a = conj(a); + } + + if (transform_b == ComplexTransform::kConjugate) { + b = conj(b); + } + + accum[i][j] = inner_product_op(a, b, accum[i][j]); + } + } + } + } + + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < kNblock; j++) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kMblock; i++) { + + int row = row_block + i; + int col = col_block + j; + + MatrixCoord coord = MatrixCoord(row, col); + + if (row < M && col < N) { + + complex acc{ + ScalarType(accum[i][j].real()), + ScalarType(accum[i][j].imag()) + }; + + ComplexC c_ij = ComplexC(); + + if (beta.real() != ScalarType() || beta.imag() != ScalarType()) { + c_ij = tensor_c.at(coord); + } + + complex src{ + ScalarType(c_ij.real()), + ScalarType(c_ij.imag()) + }; + + complex result = alpha * acc + beta * src; + + 
ComplexC d_ij; + + d_ij.real() = convert_op(result.real()); + d_ij.imag() = convert_op(result.imag()); + + tensor_d.at(coord) = d_ij; + } + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef +/// objects. +/// +/// Explicitly naming types needed by this template can be cumbersome, particularly for the +/// accumulator type, so a function argument 'initial_accum' is exposed. Passing +/// AccumulatorType(0) as the last function argument can be easier than naming all template +/// arguments explicitly. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ScalarType, + typename ComputeType, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add> +> +void GemmPlanarComplex( + gemm::GemmCoord problem_size, + complex alpha, + TensorRefPlanarComplex tensor_a, + ComplexTransform transform_a, + TensorRefPlanarComplex tensor_b, + ComplexTransform transform_b, + complex beta, + TensorRefPlanarComplex tensor_c, + TensorRefPlanarComplex tensor_d, + complex initial_accum) { + + static_assert( + LayoutA::kRank == 2 && + LayoutB::kRank == 2 && + LayoutC::kRank == 2, "Tensors must be of rank 2"); + + int const kMblock = kernel::kGemmPlanarComplexBlockSize; + int const kNblock = kernel::kGemmPlanarComplexBlockSize; + + dim3 block(16, 8); + + dim3 grid( + (problem_size.m() + block.x * kMblock - 1) / (block.x * kMblock), + (problem_size.n() + block.y * kNblock - 1) / (block.y * kNblock), + 1); + + kernel::GemmPlanarComplex< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ScalarType, + ComputeType, + ConvertOp, + InnerProductOp + ><<< grid, block >>>( + problem_size, + alpha, + tensor_a, + transform_a, + tensor_b, + transform_b, + beta, + tensor_c, + tensor_d, + initial_accum + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef +/// objects. +/// +/// This assumes the accumulator type is the same type as the scalars. 
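+
+// Storage convention assumed by the planar-complex reference above (names are
+// illustrative): rather than interleaving (real, imag) pairs, each operand
+// keeps all real parts in one plane and all imaginary parts in a second plane
+// offset by a fixed element count, so for a column-major M x N operand with
+// leading dimension ld:
+//
+//   real(i, j) = base[i + j * ld]
+//   imag(i, j) = base[i + j * ld + imaginary_stride]
+//
+// TensorRefPlanarComplex::at() reassembles a cutlass::complex<> from the two
+// planes, which is what the kernel reads before its conjugation and
+// multiply-add steps.
+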
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ScalarType +> +void GemmPlanarComplex( + gemm::GemmCoord problem_size, + complex alpha, + TensorRefPlanarComplex tensor_a, + ComplexTransform transform_a, + TensorRefPlanarComplex tensor_b, + ComplexTransform transform_b, + complex beta, + TensorRefPlanarComplex tensor_c, + TensorRefPlanarComplex tensor_d) { + + GemmPlanarComplex( + problem_size, + alpha, + tensor_a, transform_a, + tensor_b, transform_b, + beta, + tensor_c, + tensor_d, + complex()); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace device +} // namespace reference +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/quantization/cutlass_test/example/util/reference/device/gett.hpp b/csrc/quantization/cutlass_test/example/util/reference/device/gett.hpp new file mode 100644 index 0000000000000..78586ad62dc18 --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/device/gett.hpp @@ -0,0 +1,146 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief GETT device reference code +*/ +#pragma once + +#include + +namespace cutlass::reference::device { + +template < + class ATensor, + class BTensor, + class CTensor, + class DTensor, + class ElementAccumulator, + class ElementEpilogue> +__global__ static +void +gett_kernel( + DTensor D, + ATensor const A, + BTensor const B, + CTensor const C, + ElementEpilogue alpha, ElementEpilogue beta, + ElementAccumulator acc_init) +{ + using namespace cute; + + static_assert(DTensor::rank == 3, "(M,N,L)"); + static_assert(ATensor::rank == 3, "(M,K,L)"); + static_assert(BTensor::rank == 3, "(N,K,L)"); + static_assert(CTensor::rank == 3, "(M,N,L)"); + + assert(size<0>(A) == size<0>(D)); // M + assert(size<0>(C) == size<0>(D)); // M + assert(size<0>(B) == size<1>(D)); // N + assert(size<1>(C) == size<1>(D)); // N + assert(size<1>(A) == size<1>(B)); // K + assert(size<2>(A) == size<2>(D)); // L + assert(size<2>(B) == size<2>(D)); // L + assert(size<2>(C) == size<2>(D)); // L + + NumericConverter a_converter; + NumericConverter b_converter; + NumericConverter acc_converter; + NumericConverter source_converter; + NumericConverter output_converter; + + // Thread id to each element of D + for (int tid = threadIdx.x + blockDim.x * blockIdx.x; + tid < size(D); + tid += blockDim.x * gridDim.x) { + // (m,n,l) coordinate + auto mnl_coord = idx2crd(tid, product_each(shape(D))); + auto m = get<0>(mnl_coord); + auto n = get<1>(mnl_coord); + auto l = get<2>(mnl_coord); + + auto A_ml = A(m,_,l); + auto B_nl = B(n,_,l); + + ElementAccumulator accum = ElementAccumulator(0); + for (int k = 0; k < size<1>(A); ++k) { + ElementAccumulator a = a_converter(A_ml(k)); + ElementAccumulator b = b_converter(B_nl(k)); + accum += a * b; + } + + ElementEpilogue scaled_output = (alpha * acc_converter(accum)) + (beta * source_converter(C(m,n,l))); + D(m,n,l) = output_converter(scaled_output); + } +} + +// Most general version +template < + class ProblemShapeMNKL, + class ElementA, + class StrideA, + class ElementB, + class StrideB, + class ElementAccumulator, + class ElementC, + class StrideC, + class ElementD, + class StrideD, + class ElementEpilogue> +void +gett( + ProblemShapeMNKL problem_shape_mnkl, + ElementA const* ptr_A, StrideA stride_a_mkl, + ElementB const* ptr_B, StrideB stride_b_nkl, + ElementAccumulator _, + ElementC const* ptr_C, StrideC stride_c_mnl, + ElementD * ptr_D, StrideD stride_d_mnl, + ElementEpilogue alpha, ElementEpilogue beta, + cudaStream_t stream = 0) { + using namespace cute; + + static_assert(cute::rank(ProblemShapeMNKL{}) == 4); + auto M = get<0>(problem_shape_mnkl); + auto N = get<1>(problem_shape_mnkl); + auto K = get<2>(problem_shape_mnkl); + auto L = get<3>(problem_shape_mnkl); + + // Represent the full tensors + auto A = make_tensor(make_gmem_ptr(ptr_A), make_shape(M,K,L), stride_a_mkl); // (M,K,L) + auto B = make_tensor(make_gmem_ptr(ptr_B), make_shape(N,K,L), stride_b_nkl); // (N,K,L) + auto C = make_tensor(make_gmem_ptr(ptr_C), make_shape(M,N,L), stride_c_mnl); // (M,N,L) + auto D = make_tensor(make_gmem_ptr(ptr_D), make_shape(M,N,L), stride_d_mnl); // (M,N,L) + + dim3 dimBlock(256); + dim3 dimGrid(240); + gett_kernel<<< dimGrid, dimBlock, 0, stream >>>(D, A, B, C, alpha, beta, ElementAccumulator(0)); +} + +} // namespace cutlass::reference::device diff --git a/csrc/quantization/cutlass_test/example/util/reference/device/kernel/gemm.h b/csrc/quantization/cutlass_test/example/util/reference/device/kernel/gemm.h new file mode 100644 index 0000000000000..f7731213013d5 --- /dev/null +++ 
b/csrc/quantization/cutlass_test/example/util/reference/device/kernel/gemm.h @@ -0,0 +1,162 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Reference implementation for GEMM in host-side code. +*/ + +#pragma once + +#include "cutlass/coord.h" +#include "cutlass/tensor_view.h" +#include "cutlass/gemm/gemm.h" + +#include "cutlass/util/reference/device/thread/gemm.h" + +namespace cutlass { +namespace reference { +namespace device { +namespace kernel { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef +/// objects. 
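+
+// How the reference kernel below divides the work (summary; the 4 x 4 tile
+// matches the OutputTile used by the launchers in device/gemm.h): each thread
+// owns one OutputTile::kRow x OutputTile::kColumn patch of the output at
+//
+//   row0 = (threadIdx.x + blockIdx.x * blockDim.x) * OutputTile::kRow
+//   col0 = (threadIdx.y + blockIdx.y * blockDim.y) * OutputTile::kColumn
+//
+// accumulates the full K extent for that patch through thread::Gemm, then
+// applies the epilogue D = alpha * accum + beta * C element-wise, with
+// out-of-range rows and columns masked inside thread::Gemm.
+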
+template < + typename TensorRefA, + typename TensorRefB, + typename TensorRefC, + typename ScalarType, + typename AccumulatorType, + typename OutputTile, + typename InnerProductOp, + typename ConvertOp +> +__global__ void Gemm( + gemm::GemmCoord problem_size, + ScalarType alpha, + TensorRefA tensor_a, + TensorRefB tensor_b, + ScalarType beta, + TensorRefC tensor_c, + TensorRefC tensor_d, + AccumulatorType initial_accum) { + + // Map each thread to a unique tile of the output matrix + MatrixCoord output_coord( + MatrixCoord::Index((threadIdx.x + blockIdx.x * blockDim.x) * OutputTile::kRow), + MatrixCoord::Index((threadIdx.y + blockIdx.y * blockDim.y) * OutputTile::kColumn) + ); + + // Compute the general matrix product + thread::Gemm< + TensorRefA, + TensorRefB, + TensorRefC, + ScalarType, + AccumulatorType, + OutputTile, + InnerProductOp, + ConvertOp + > gemm(initial_accum); + + gemm.multiply_add( + problem_size, + tensor_a, + tensor_b, + output_coord); + + gemm.epilogue(problem_size, alpha, beta, tensor_c, tensor_d, output_coord); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef +/// objects. +template < + typename TensorRefCollectionA, + typename TensorRefCollectionB, + typename TensorRefCollectionC, + typename ScalarType, + typename AccumulatorType, + typename OutputTile, + typename InnerProductOp, + typename ConvertOp +> +__global__ void BatchedGemm( + gemm::GemmCoord problem_size, + ScalarType alpha, + TensorRefCollectionA tensor_collection_a, + TensorRefCollectionB tensor_collection_b, + ScalarType beta, + TensorRefCollectionC tensor_collection_c, + AccumulatorType initial_accum) { + + // Obtain batch ID + int batch_id = blockIdx.z; + + // Dereference based on batch_id + typename TensorRefCollectionA::TensorRef tensor_a = tensor_collection_a.at(batch_id); + typename TensorRefCollectionB::TensorRef tensor_b = tensor_collection_b.at(batch_id); + typename TensorRefCollectionC::TensorRef tensor_c = tensor_collection_c.at(batch_id); + + // Map each thread to a unique tile of the output matrix + MatrixCoord output_coord( + (threadIdx.x + blockIdx.x * blockDim.x) * OutputTile::kColumn, + (threadIdx.y + blockIdx.y * blockDim.y) * OutputTile::kRow + ); + + // Compute the general matrix product + thread::Gemm< + typename TensorRefCollectionA::TensorRef, + typename TensorRefCollectionB::TensorRef, + typename TensorRefCollectionC::TensorRef, + ScalarType, + AccumulatorType, + OutputTile, + InnerProductOp, + ConvertOp + > gemm(initial_accum); + + gemm.multiply_add( + problem_size, + tensor_a, + tensor_b, + output_coord); + + gemm.epilogue(problem_size, alpha, beta, tensor_c, output_coord); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace device +} // namespace reference +} // namespace cutlass diff --git a/csrc/quantization/cutlass_test/example/util/reference/device/kernel/tensor_elementwise.h b/csrc/quantization/cutlass_test/example/util/reference/device/kernel/tensor_elementwise.h new file mode 100644 index 0000000000000..c703f07f78a24 --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/device/kernel/tensor_elementwise.h @@ -0,0 +1,168 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include + +#include "cutlass/cutlass.h" + +namespace cutlass { +namespace reference { +namespace device { +namespace kernel { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Kernel to initialize tensor to uniform random distribution +template +__global__ void TensorInitializeUniform( + Distribution dist, int64_t seed, int dim_contiguous, int dim_strided, T *tensor, int ldm) { + __shared__ curandState_t rng_state[1024]; + + uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * blockDim.x; + + curand_init(seed, gtid, 0, &rng_state[threadIdx.x]); + + int c_idx = blockIdx.x * blockDim.x + threadIdx.x; + int s_idx = blockIdx.y * blockDim.x; + + tensor += s_idx * ldm + c_idx; + + for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) { + if (s_idx < dim_strided && c_idx < dim_contiguous) { + double range = dist.uniform.max - dist.uniform.min; + + double rnd = curand_uniform(&rng_state[threadIdx.x]); + + rnd = dist.uniform.min + range * rnd; + + // Random values are cast to integer after scaling by a power of two to facilitate error + // testing + if (dist.int_scale >= 0) { + rnd = double(int(rnd * double(1 << dist.int_scale))); + *tensor = T(rnd / double(1 << dist.int_scale)); + } else { + *tensor = T(rnd); + } + + tensor += ldm; + } + } +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Kernel to initialize tensor to uniform distribution +template +__global__ void TensorInitializeGaussian( + Distribution dist, int64_t seed, int dim_contiguous, int dim_strided, T *tensor, int ldm) { + __shared__ curandState_t rng_state[1024]; + + uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * blockDim.x; + + curand_init(seed, gtid, 0, 
&rng_state[threadIdx.x]); + + int c_idx = blockIdx.x * blockDim.x + threadIdx.x; + int s_idx = blockIdx.y * blockDim.x; + + tensor += s_idx * ldm + c_idx; + + for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) { + if (s_idx < dim_strided && c_idx < dim_contiguous) { + // Random values are cast to integer after scaling by a power of two to facilitate error + // testing + + double rnd = curand_normal(&rng_state[threadIdx.x]); + + rnd = dist.gaussian.mean + dist.gaussian.stddev * rnd; + + if (dist.int_scale >= 0) { + rnd = double(int(rnd * double(1 << dist.int_scale))); + *tensor = T(rnd / double(1 << dist.int_scale)); + } else { + *tensor = T(rnd); + } + } + } +} + +/// Kernel to initialize tensor to an identity matrix +template +__global__ void TensorInitializeLinear( + Distribution dist, int64_t seed, int dim_contiguous, int dim_strided, T *tensor, int ldm) { + __shared__ curandState_t rng_state[1024]; + + uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * blockDim.x; + + curand_init(seed, gtid, 0, &rng_state[threadIdx.x]); + + int c_idx = blockIdx.x * blockDim.x + threadIdx.x; + int s_idx = blockIdx.y * blockDim.x; + + tensor += s_idx * ldm + c_idx; + + for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) { + if (s_idx < dim_strided && c_idx < dim_contiguous) { + *tensor = + dist.linear.offset + dist.linear.delta_row * c_idx + dist.linear.delta_column * s_idx; + } + } +} + +/// Kernel to initialize tensor to an identity matrix +template +__global__ void TensorInitializeIdentity( + Distribution dist, int64_t seed, int dim_contiguous, int dim_strided, T *tensor, int ldm) { + __shared__ curandState_t rng_state[1024]; + + uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * blockDim.x; + + curand_init(seed, gtid, 0, &rng_state[threadIdx.x]); + + int c_idx = blockIdx.x * blockDim.x + threadIdx.x; + int s_idx = blockIdx.y * blockDim.x; + + tensor += s_idx * ldm + c_idx; + + for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) { + if (s_idx < dim_strided && c_idx < dim_contiguous) { + *tensor = (c_idx == s_idx ? T(1) : T(0)); + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace device +} // namespace reference +} // namespace cutlass diff --git a/csrc/quantization/cutlass_test/example/util/reference/device/kernel/tensor_foreach.h b/csrc/quantization/cutlass_test/example/util/reference/device/kernel/tensor_foreach.h new file mode 100644 index 0000000000000..a64a419d8a193 --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/device/kernel/tensor_foreach.h @@ -0,0 +1,159 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/coord.h" +#include "cutlass/subbyte_reference.h" +#include "cutlass/fast_math.h" + +namespace cutlass { +namespace reference { +namespace device { +namespace kernel { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines several helpers +namespace detail { + +/// Helper to perform for-each operation +template +struct TensorForEachHelper { + + /// Constructor for general rank + __inline__ __device__ + TensorForEachHelper(Func &func, Coord const &size, Coord &coord, int64_t index) { + + int64_t product = 1; + + CUTLASS_PRAGMA_UNROLL + for (int i = Rank - RankRemaining; i < Rank; ++i) { + product *= size[i]; + } + + coord[Rank - 1 - RankRemaining] = index / product; + int64_t remaining = index % product; + + TensorForEachHelper(func, size, coord, remaining); + } +}; + +/// Helper to perform for-each operation +template +struct TensorForEachHelper { + + /// Constructor for fastest changing rank + __inline__ __device__ + TensorForEachHelper(Func &func, Coord const &size, Coord &coord, int64_t index) { + + coord[Rank - 1] = index; + + if (coord < size) { + func(coord); + } + } +}; + +} // namespace detail + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Kernel calls a functor for each element in a tensor's index space +template +__global__ void TensorForEach(Coord size, Params params = Params()) { + + Func func(params); + + int64_t index = threadIdx.x + blockIdx.x * blockDim.x; + int64_t max_index = 1; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Rank; ++i) { + max_index *= size[i]; + } + + CUTLASS_PRAGMA_NO_UNROLL + while (index < max_index) { + Coord coord; + + detail::TensorForEachHelper(func, size, coord, index); + index += blockDim.x * gridDim.x; + } +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Kernel calls a functor for each element along a tensor's diagonal +template +__global__ void TensorDiagonalForEach(Coord size, Params params, int start, int end) { + + Func func(params); + + int64_t index = threadIdx.x + blockIdx.x * blockDim.x + start; + + if (index < end) { + Coord coord; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Rank; ++i) { + coord[i] = index; + } + + func(coord); + } +} + 
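+
+// Sketch of the index -> coordinate decomposition performed by
+// TensorForEachHelper above (host-side pseudocode with illustrative extents):
+// for a rank-3 tensor of extent (S0, S1, S2), a linear index decomposes as
+//
+//   coord[0] = index / (S1 * S2);   remaining = index % (S1 * S2);
+//   coord[1] = remaining / S2;      coord[2]  = remaining % S2;
+//
+// which is what the recursive helper computes one rank at a time, with the
+// fastest-changing rank handled by its terminal specialization. TensorForEach
+// then strides the grid over the flattened extent, so any launch size covers
+// the whole index space.
+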
+/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +__global__ void BlockForEach( + Element *ptr, + size_t capacity, + typename Func::Params params) { + + Func func(params); + + size_t index = threadIdx.x + blockIdx.x * blockDim.x; + + for (; index < capacity; index += blockDim.x * gridDim.x) { + ReferenceFactory::get(ptr, index) = func(); + } +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace device +} // namespace reference +} // namespace cutlass + diff --git a/csrc/quantization/cutlass_test/example/util/reference/device/rank_2k_complex.h b/csrc/quantization/cutlass_test/example/util/reference/device/rank_2k_complex.h new file mode 100644 index 0000000000000..d5892457ca942 --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/device/rank_2k_complex.h @@ -0,0 +1,355 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Reference implementation for complex-valued GEMM in device-side code. +*/ + +#pragma once + +#include "cutlass/blas3.h" +#include "cutlass/complex.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/tensor_view.h" +#include "cutlass/gemm/gemm.h" + +namespace cutlass { +namespace reference { +namespace device { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace kernel { + +/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef +/// objects. 
+/// +/// Explicitly naming types needed by this template can be cumbersome, particularly for the +/// accumulator type, so a function argument 'initial_accum' is exposed. Passing +/// AccumulatorType(0) as the last function argument can be easier than naming all template +/// arguments explicitly. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ScalarType, + typename ComputeType, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add, + int kMblock = 4, + int kNblock = 4 +> +__global__ void Rank2KComplex( + gemm::GemmCoord problem_size, + ScalarType alpha, + TensorRef tensor_a, + ComplexTransform transform_a, + TensorRef tensor_b, + ComplexTransform transform_b, + ScalarType beta, + TensorRef tensor_c, + TensorRef tensor_d, + ComputeType initial_accum, + FillMode fill_mode_c, + BlasMode blas_mode, + int batch_count = 1, + int64_t batch_stride_A = 0, + int64_t batch_stride_B = 0, + int64_t batch_stride_C = 0, + int64_t batch_stride_D = 0) { + + static_assert( + LayoutA::kRank == 2 && + LayoutB::kRank == 2 && + LayoutC::kRank == 2, "Tensors must be of rank 2"); + + int const M = problem_size.m(); + int const N = problem_size.n(); + int const K = problem_size.k(); + + assert(M=N); + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + int row_block = (blockIdx.x * blockDim.x + threadIdx.x) * kMblock; + int col_block = (blockIdx.y * blockDim.y + threadIdx.y) * kNblock; + int batch_idx = blockIdx.z; + + tensor_a.add_pointer_offset(batch_idx * batch_stride_A); + tensor_b.add_pointer_offset(batch_idx * batch_stride_B); + tensor_c.add_pointer_offset(batch_idx * batch_stride_C); + tensor_d.add_pointer_offset(batch_idx * batch_stride_D); + + for (; batch_idx < batch_count; batch_idx += gridDim.z) { + + // Compute matrix product using blocks + ComputeType accum[kMblock][kNblock]; + + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < kNblock; j++) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kMblock; i++) { + accum[i][j] = initial_accum; + } + } + + for (int k_block = 0; k_block < K; ++k_block) { + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < kNblock; j++) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kMblock; i++) { + int row = row_block + i; + int col = col_block + j; + + if (row < M && col < N && + ( (fill_mode_c == FillMode::kLower && row >= col) || + (fill_mode_c == FillMode::kUpper && row <= col) ) + ) { + + // A x B^T (Symmetric) or A x B^H (Hermitian) + // complex conjugation on operandB (b_t) is function of blas3 computation + ElementA a = tensor_a.at(MatrixCoord(row, k_block)); + ElementB b_t = (blas_mode == BlasMode::kHermitian) ? + conj(tensor_b.at(MatrixCoord(col, k_block))) : + tensor_b.at(MatrixCoord(col, k_block)); + + ComputeType a_ik = ComputeType(a); + ComputeType b_jk = ComputeType(b_t); + + // complex conjugation is a function of operand layouts + if (transform_a == ComplexTransform::kConjugate) { + a_ik = conj(a_ik); + } + // complex conjugation is a function of operand layouts + if (transform_b == ComplexTransform::kConjugate) { + b_jk = conj(b_jk); + } + + accum[i][j] = inner_product_op(a_ik, b_jk, accum[i][j]); + + // B x A^T (Symmetric) or B x A^H (Hermitian) + // complex conjugation on operandB (a_t) is function of blas3 computation + ElementB b = tensor_b.at(MatrixCoord(row, k_block)); + ElementA a_t = (blas_mode == BlasMode::kHermitian) ? 
+                            conj(tensor_a.at(MatrixCoord(col, k_block))) :
+                            tensor_a.at(MatrixCoord(col, k_block));
+
+            ComputeType b_ik = ComputeType(b);
+            ComputeType a_jk = ComputeType(a_t);
+
+            // complex conjugation here is a function of operand layouts
+            if (transform_b == ComplexTransform::kConjugate) {
+              b_ik = conj(b_ik);
+            }
+            // complex conjugation here is a function of operand layouts
+            if (transform_a == ComplexTransform::kConjugate) {
+              a_jk = conj(a_jk);
+            }
+
+            accum[i][j] = inner_product_op(b_ik, a_jk, accum[i][j]);
+          }
+        }
+      }
+    }
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < kNblock; j++) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kMblock; i++) {
+        int row = row_block + i;
+        int col = col_block + j;
+
+        MatrixCoord coord = MatrixCoord(row, col);
+
+        if (row < M && col < N &&
+            ((fill_mode_c == FillMode::kLower && row >= col) ||
+             (fill_mode_c == FillMode::kUpper && row <= col))
+           ) {
+
+          ScalarType c = tensor_c.at(coord);
+          // The imaginary parts of the diagonal elements of
+          // a complex data type are assumed and set to zero
+          if (blas_mode == BlasMode::kHermitian) {
+            c = (row == col) ? real(c) : c;
+          }
+
+          tensor_d.at(coord) = convert_op(
+            alpha * ScalarType(accum[i][j]) +
+            beta * c);
+        }
+      }
+    }
+
+    tensor_a.add_pointer_offset(batch_stride_A * gridDim.z);
+    tensor_b.add_pointer_offset(batch_stride_B * gridDim.z);
+    tensor_c.add_pointer_offset(batch_stride_C * gridDim.z);
+    tensor_d.add_pointer_offset(batch_stride_D * gridDim.z);
+
+  } // for (batch_idx)
+}
+
+} // namespace kernel
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
+/// objects.
+///
+/// Explicitly naming types needed by this template can be cumbersome, particularly for the
+/// accumulator type, so a function argument 'initial_accum' is exposed. Passing
+/// AccumulatorType(0) as the last function argument can be easier than naming all template
+/// arguments explicitly.
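+
+// What kernel::Rank2KComplex above computes, restated for reference (notation
+// is illustrative): a rank-2k update of an N x N symmetric or Hermitian matrix,
+//
+//   symmetric:  D = alpha * (A * B^T + B * A^T) + beta * C
+//   Hermitian:  D = alpha * (A * B^H + B * A^H) + beta * C
+//
+// where only one triangle of D is written, as selected by fill_mode_c:
+//
+//   write(row, col) = (fill_mode_c == FillMode::kLower && row >= col) ||
+//                     (fill_mode_c == FillMode::kUpper && row <= col)
+//
+// In the Hermitian case the diagonal of C is treated as real, matching the
+// real(c) adjustment in the epilogue.
+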
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ScalarType, + typename ComputeType, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +void Rank2KComplex( + gemm::GemmCoord problem_size, + ScalarType alpha, + TensorRef tensor_a, + ComplexTransform transform_a, + TensorRef tensor_b, + ComplexTransform transform_b, + ScalarType beta, + TensorRef tensor_c, + TensorRef tensor_d, + ComputeType initial_accum, + FillMode fill_mode_c, + BlasMode blas_mode, + int batch_count = 1, + int64_t batch_stride_A = 0, + int64_t batch_stride_B = 0, + int64_t batch_stride_C = 0, + int64_t batch_stride_D = 0) { + + static_assert( + LayoutA::kRank == 2 && + LayoutB::kRank == 2 && + LayoutC::kRank == 2, "Tensors must be of rank 2"); + + int const kMblock = 4; + int const kNblock = 4; + + dim3 block(16, 8); + dim3 grid( + (problem_size.m() + block.x * kMblock - 1) / (block.x * kMblock), + (problem_size.n() + block.y * kNblock - 1) / (block.y * kNblock), + batch_count % std::numeric_limits::max() + ); + + kernel::Rank2KComplex< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ScalarType, + ComputeType, + ConvertOp, + InnerProductOp, + kMblock, + kNblock + ><<< grid, block >>>( + problem_size, + alpha, + tensor_a, + transform_a, + tensor_b, + transform_b, + beta, + tensor_c, + tensor_d, + initial_accum, + fill_mode_c, + blas_mode, + batch_count, + batch_stride_A, + batch_stride_B, + batch_stride_C, + batch_stride_D + ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef +/// objects. +/// +/// This assumes the accumulator type is the same type as the scalars. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ScalarType +> +void Rank2KComplex( + gemm::GemmCoord problem_size, + ScalarType alpha, + TensorRef tensor_a, + ComplexTransform transform_a, + TensorRef tensor_b, + ComplexTransform transform_b, + ScalarType beta, + TensorRef tensor_c, + TensorRef tensor_d, + FillMode fill_mode_c, + BlasMode blas_mode) { + + Rank2KComplex( + problem_size, alpha, + tensor_a, transform_a, + tensor_b, transform_b, + beta, tensor_c, tensor_d, + ScalarType(0), + fill_mode_c, + blas_mode); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace device +} // namespace reference +} // namespace cutlass diff --git a/csrc/quantization/cutlass_test/example/util/reference/device/tensor_compare.h b/csrc/quantization/cutlass_test/example/util/reference/device/tensor_compare.h new file mode 100644 index 0000000000000..e6b36990f0f1a --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/device/tensor_compare.h @@ -0,0 +1,246 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Defines host-side elementwise operations on TensorView. +*/ + +#pragma once +// Standard Library includes +#include + +// Cutlass includes +#include "cutlass/cutlass.h" +#include "cutlass/relatively_equal.h" + +#include "cutlass/util/distribution.h" + +#include "tensor_foreach.h" + +namespace cutlass { +namespace reference { +namespace device { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace kernel { + +template +__global__ void BlockCompareEqual( + int *equal, + Element const *ptr_A, + Element const *ptr_B, + size_t capacity) { + + size_t idx = threadIdx.x + blockDim.x * blockIdx.x; + + for (; idx < capacity; idx += gridDim.x * blockDim.x) { + + Element a = cutlass::ReferenceFactory::get(ptr_A, idx); + Element b = cutlass::ReferenceFactory::get(ptr_B, idx); + + if (a != b) { + *equal = 0; + + return; + } + } +} + +template +__global__ void BlockCompareRelativelyEqual( + int *equal, + Element const *ptr_A, + Element const *ptr_B, + size_t capacity, + Element epsilon, + Element nonzero_floor) { + + size_t idx = threadIdx.x + blockDim.x * blockIdx.x; + + for (; idx < capacity; idx += gridDim.x * blockDim.x) { + + Element a = cutlass::ReferenceFactory::get(ptr_A, idx); + Element b = cutlass::ReferenceFactory::get(ptr_B, idx); + + if (!relatively_equal(a, b, epsilon, nonzero_floor)) { + *equal = 0; + return; + } + } +} + +} // namespace kernel + + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Performs a bit-level equality check between two blocks +template +bool BlockCompareEqual( + Element const *ptr_A, + Element const *ptr_B, + size_t capacity, + int grid_size = 0, + int block_size = 0) { + + int equal_flag = 1; + int *device_equal_flag = nullptr; + + if (cudaMalloc((void **)&device_equal_flag, sizeof(int)) != cudaSuccess) { + throw std::runtime_error("Failed to allocate device flag."); + } + + if (cudaMemcpy( + device_equal_flag, + &equal_flag, + sizeof(int), + cudaMemcpyHostToDevice) != cudaSuccess) { + + throw std::runtime_error("Failed to copy equality flag to device."); + } + + if (!grid_size || !block_size) { + + // if grid_size or 
block_size are zero, query occupancy using the CUDA Occupancy API + cudaError_t result = cudaOccupancyMaxPotentialBlockSize( + &grid_size, + &block_size, + reinterpret_cast(kernel::BlockCompareEqual)); + + if (result != cudaSuccess) { + throw std::runtime_error("Failed to query occupancy."); + } + + // Limit block size. This has the effect of increasing the number of items processed by a + // single thread and reduces the impact of initialization overhead. + block_size = (block_size < 128 ? block_size : 128); + } + + dim3 grid(grid_size, 1, 1); + dim3 block(block_size, 1, 1); + + kernel::BlockCompareEqual<<< grid, block >>>(device_equal_flag, ptr_A, ptr_B, capacity); + + if (cudaMemcpy( + &equal_flag, + device_equal_flag, + sizeof(int), + cudaMemcpyDeviceToHost) != cudaSuccess) { + + cudaFree(device_equal_flag); + + throw std::runtime_error("Failed to copy equality flag from device."); + } + + cudaFree(device_equal_flag); + + return equal_flag; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Performs a bit-level equality check between two blocks +template +bool BlockCompareRelativelyEqual( + Element const *ptr_A, + Element const *ptr_B, + size_t capacity, + Element epsilon, + Element nonzero_floor, + int grid_size = 0, + int block_size = 0) { + + int equal_flag = 1; + int *device_equal_flag = nullptr; + + if (cudaMalloc((void **)&device_equal_flag, sizeof(int)) != cudaSuccess) { + throw std::runtime_error("Failed to allocate device flag."); + } + + if (cudaMemcpy( + device_equal_flag, + &equal_flag, + sizeof(int), + cudaMemcpyHostToDevice) != cudaSuccess) { + + throw std::runtime_error("Failed to copy equality flag to device."); + } + + if (!grid_size || !block_size) { + + // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API + cudaError_t result = cudaOccupancyMaxPotentialBlockSize( + &grid_size, + &block_size, + reinterpret_cast(kernel::BlockCompareRelativelyEqual)); + + if (result != cudaSuccess) { + throw std::runtime_error("Failed to query occupancy."); + } + + // Limit block size. This has the effect of increasing the number of items processed by a + // single thread and reduces the impact of initialization overhead. + block_size = (block_size < 128 ? block_size : 128); + } + + dim3 grid(grid_size, 1, 1); + dim3 block(block_size, 1, 1); + + kernel::BlockCompareRelativelyEqual<<< grid, block >>>( + device_equal_flag, + ptr_A, + ptr_B, + capacity, + epsilon, + nonzero_floor + ); + + if (cudaMemcpy( + &equal_flag, + device_equal_flag, + sizeof(int), + cudaMemcpyDeviceToHost) != cudaSuccess) { + + cudaFree(device_equal_flag); + + throw std::runtime_error("Failed to copy equality flag from device."); + } + + cudaFree(device_equal_flag); + + return equal_flag; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // device +} // reference +} // cutlass diff --git a/csrc/quantization/cutlass_test/example/util/reference/device/tensor_fill.h b/csrc/quantization/cutlass_test/example/util/reference/device/tensor_fill.h new file mode 100644 index 0000000000000..13aedf14d113f --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/device/tensor_fill.h @@ -0,0 +1,2077 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Defines device-side elementwise operations on TensorView. Note, the operations defined + in this header are not specialized for any particular data layout and are therefore not + intended to offer the best possible performance. Rather, they are intended to be generic + reference implementations to support the CUTLASS unit tests. 
+*/ + +#pragma once + +#if !defined(__CUDACC_RTC__) + +// Standard Library includes +#include +#include +#include +#include +#include + +#endif + +// CUDA includes +#include + +// Cutlass includes +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/complex.h" +#include "cutlass/tensor_view.h" +#include "cutlass/blas3.h" +#include "cutlass/numeric_types.h" + +#include "cutlass/layout/vector.h" + +#include "cutlass/util/reference/device/tensor_foreach.h" +#include "cutlass/util/distribution.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace reference { +namespace device { + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +template +CUTLASS_DEVICE +FloatType random_normal_float(curandState_t *state) { + return curand_normal(state); +} + +template <> +CUTLASS_DEVICE +double random_normal_float(curandState_t *state) { + return curand_normal_double(state); +} + +template +CUTLASS_DEVICE +FloatType random_uniform_float(curandState_t *state) { + return curand_uniform(state); +} + +template <> +CUTLASS_DEVICE +double random_uniform_float(curandState_t *state) { + return curand_uniform_double(state); +} + +template +struct RandomGaussianFunc { + + using FloatType = typename std::conditional<(sizeof(Element) > 4), double, float>::type; + using IntType = typename std::conditional<(sizeof(Element) > 4), int64_t, int>::type; + + /// Parameters structure + struct Params { + + // + // Data members + // + + uint64_t seed; + FloatType mean; + FloatType stddev; + int int_scale; + FloatType float_scale_up; + FloatType float_scale_down; + int exclude_zero; ///< If non-negative, excludes zeros + + // + // Methods + // + + /// Construction of Gaussian RNG functor. 
+ Params( + uint64_t seed_ = 0, + Element mean_ = 0, + Element stddev_ = 1, + int int_scale_ = -1, + int exclude_zero_ = -1 + ): + seed(seed_), + mean(static_cast(mean_)), + stddev(static_cast(stddev_)), + int_scale(int_scale_), + exclude_zero(exclude_zero_) { + + float_scale_up = FloatType(IntType(2) << int_scale); // scale up to clamp low order bits + float_scale_down = FloatType(1) / FloatType(IntType(2) << int_scale); + } + }; + + // + // Data members + // + + /// Parameters object + Params params; + + /// RNG state object + curandState_t rng_state; + + // + // Methods + // + + /// Device-side initialization of RNG + CUTLASS_DEVICE + RandomGaussianFunc(Params const ¶ms): params(params) { + + uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x; + + curand_init(params.seed, gtid, 0, &rng_state); + } + + /// Compute random value and update RNG state + CUTLASS_DEVICE + Element operator()() { + + FloatType rnd = random_normal_float(&rng_state); + rnd = params.mean + params.stddev * rnd; + + Element result; + if (params.int_scale >= 0) { + rnd = FloatType(IntType(std::llround(rnd * params.float_scale_up))); + result = Element(IntType(rnd * params.float_scale_down)); + } + else { + result = Element(rnd); + } + + if (params.exclude_zero >=0 && result == Element(0.0)) { + if (rnd > FloatType(0)) { + rnd += FloatType(1); + } else { + rnd -= FloatType(1); + } + result = Element(rnd); + } + + return result; + } +}; + + +template +struct RandomGaussianFunc> { + + using Element = complex; + using FloatType = typename std::conditional<(sizeof(Real) > 4), double, float>::type; + using IntType = typename std::conditional<(sizeof(Real) > 4), int64_t, int>::type; + + /// Parameters structure + struct Params { + + // + // Data members + // + + uint64_t seed; + FloatType mean; + FloatType stddev; + int int_scale; + FloatType float_scale_up; + FloatType float_scale_down; + int exclude_zero; ///< If non-negative, excludes zeros + + // + // Methods + // + + /// Construction of Gaussian RNG functor. 
+ Params( + uint64_t seed_ = 0, + Real mean_ = 0, + Real stddev_ = 1, + int int_scale_ = -1, + int exclude_zero_ = -1 + ): + seed(seed_), + mean(static_cast(mean_)), + stddev(static_cast(stddev_)), + int_scale(int_scale_), + exclude_zero(exclude_zero_) { + + float_scale_up = FloatType(IntType(1) << int_scale); + float_scale_up += FloatType(0.5) * float_scale_up; + float_scale_down = FloatType(1) / FloatType(IntType(1) << int_scale); + } + }; + + // + // Data members + // + + /// Parameters object + Params params; + + /// RNG state object + curandState_t rng_state; + + // + // Methods + // + + /// Device-side initialization of RNG + CUTLASS_DEVICE + RandomGaussianFunc(Params const ¶ms): params(params) { + + uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x; + + curand_init(params.seed, gtid, 0, &rng_state); + } + + /// Compute random value and update RNG state + CUTLASS_DEVICE + Element operator()() { + + FloatType rnd_r = random_normal_float(&rng_state); + FloatType rnd_i = random_normal_float(&rng_state); + rnd_r = params.mean + params.stddev * rnd_r; + rnd_i = params.mean + params.stddev * rnd_i; + + Element result; + if (params.int_scale >= 0) { + rnd_r = FloatType(IntType(rnd_r * params.float_scale_up)); + rnd_i = FloatType(IntType(rnd_i * params.float_scale_down)); + + result = { + Real(rnd_r * params.float_scale_down), + Real(rnd_i * params.float_scale_down) + }; + } + else { + result = Element(Real(rnd_r), Real(rnd_i)); + } + + if (params.exclude_zero >= 0 && + result.real() == Real(0.0) && + result.imag() == Real(0.0)) { + + if (rnd_r > FloatType(0)) { + rnd_r += FloatType(1); + } else { + rnd_r -= FloatType(1); + } + result = Element(Real(rnd_r), Real(rnd_i)); + } + + return result; + } +}; + +/// Computes a random Gaussian distribution +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +struct TensorFillRandomGaussianFunc { + + /// View type + using TensorView = TensorView; + + /// Scalar type + typedef typename TensorView::Element T; + + /// Coordinate in tensor's index space + typedef typename TensorView::TensorCoord TensorCoord; + + using RandomFunc = RandomGaussianFunc; + + /// Parameters structure + struct Params { + + // + // Data members + // + + TensorView view; + typename RandomFunc::Params random; + + // + // Methods + // + + /// Construction of Gaussian RNG functor. + Params( + TensorView view_ = TensorView(), + typename RandomFunc::Params random_ = typename RandomFunc::Params() + ): + view(view_), random(random_) { + + } + }; + + // + // Data members + // + + Params params; + RandomFunc random; + + // + // Methods + // + + /// Device-side initialization of RNG + CUTLASS_DEVICE + TensorFillRandomGaussianFunc(Params const ¶ms): params(params), random(params.random) { + + } + + /// Compute random value and update RNG state + CUTLASS_DEVICE + void operator()(TensorCoord const &coord) { + + params.view.at(coord) = random(); + } +}; + +} // namespace detail + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Fills a tensor with random values with a Gaussian distribution. 
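// [Editor's note: not part of the original patch] A minimal usage sketch for the
// TensorFillRandomGaussian helper declared below. It assumes the caller includes
// "cutlass/util/host_tensor.h"; the shape, seed, and distribution parameters are
// illustrative only.
//
//   cutlass::HostTensor<float, cutlass::layout::RowMajor> tensor({128, 256});
//   // Fill the device-side view with N(0, 1) samples, then copy back to the host.
//   TensorFillRandomGaussian(tensor.device_view(), /*seed=*/2024ULL,
//                            /*mean=*/0.0f, /*stddev=*/1.0f);
//   tensor.sync_host();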
+template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorFillRandomGaussian( + TensorView view, ///< destination tensor + uint64_t seed, ///< seed for RNG + typename RealType::Type mean = Element(0), ///< Gaussian distribution's mean + typename RealType::Type stddev = Element(1), ///< Gaussian distribution's standard deviation + int bits = -1, ///< If non-negative, specifies number of fractional bits that + /// are not truncated to zero. Permits reducing precision of + /// data. + int exclude_zero = -1, ///< If non-negative, excludes zeros from tensor init + cudaStream_t stream = nullptr) { + + using RandomFunc = detail::RandomGaussianFunc; + using Func = detail::TensorFillRandomGaussianFunc; + using Params = typename Func::Params; + + TensorForEach( + view.extent(), + Params(view, typename RandomFunc::Params(seed, mean, stddev, bits, exclude_zero)), + /*grid_size*/0, /*block_size*/0, + stream + ); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Fills a tensor with random values with a Gaussian distribution. +template ///< Element type +void BlockFillRandomGaussian( + Element *ptr, + size_t capacity, + uint64_t seed, ///< seed for RNG + typename RealType::Type mean, ///< Gaussian distribution's mean + typename RealType::Type stddev, ///< Gaussian distribution's standard deviation + int bits = -1, ///< If non-negative, specifies number of fractional bits that + /// are not truncated to zero. Permits reducing precision of + /// data. + cudaStream_t stream = nullptr) { + + using RandomFunc = detail::RandomGaussianFunc; + + typename RandomFunc::Params params(seed, mean, stddev, bits); + + BlockForEach(ptr, capacity, params, /*grid_size*/0, /*block_size*/0, stream); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +/// Computes a random uniform distribution +template ///< Element type +struct RandomUniformFunc { + + using FloatType = typename std::conditional< + (sizeof(Element) > 4), + double, + float>::type; + + using IntType = typename std::conditional< + (sizeof(Element) > 4), + int64_t, + int>::type; + + /// Parameters structure + struct Params { + + // + // Data members + // + + uint64_t seed; + FloatType range; + FloatType max; + int int_scale; + double pnan; + FloatType float_scale_up; + FloatType float_scale_down; + int exclude_zero; ///< If non-negative, excludes zeros + + /// Default ctor + CUTLASS_HOST_DEVICE + Params() { } + + // + // Methods + // + + /// Construction of Gaussian RNG functor. + Params( + uint64_t seed_ = 0, + Element max_ = 1, + Element min = 0, + int int_scale_ = -1, + double pnan_ = 0, + int exclude_zero_ = -1 + ): + seed(seed_), + range(static_cast(max_) - static_cast(min)), + max(static_cast(max_)), + int_scale(int_scale_), + pnan(pnan_), + exclude_zero(exclude_zero_) { + + float_scale_up = FloatType(IntType(2) << int_scale); // scale up to clamp low order bits + float_scale_down = FloatType(1) / FloatType(IntType(2) << int_scale); + + // Handle cases where min = 0 or max = 0 for excluding zeros + if (exclude_zero >= 0) { + range = (min == Element(0)) ? range - FloatType(1): range; + max = (max_ == Element(0)) ? 
max - FloatType(1): max; + } + } + }; + + // + // Data members + // + + /// Parameters object + Params params; + + /// RNG state object + curandState_t rng_state; + + // + // Methods + // + + /// Device-side initialization of RNG + CUTLASS_DEVICE + RandomUniformFunc(Params const ¶ms): params(params) { + + uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x; + + curand_init(params.seed, gtid, 0, &rng_state); + } + + /// Compute random value and update RNG state + CUTLASS_DEVICE + Element operator()() { + + // Draw random float in [0.0, 1.0] to determine if element should be NaN. + if constexpr (std::numeric_limits::has_quiet_NaN) { + if (params.pnan > 0 && (curand_uniform(&rng_state) < (params.pnan))) { + return Element(NAN); + } + } + + FloatType rnd = random_uniform_float(&rng_state); + rnd = params.max - params.range * rnd; + + // Random values are cast to integer after scaling by a power of two to facilitate error + // testing + Element result; + + if (params.int_scale >= 0) { + rnd = FloatType(IntType(std::llround(rnd * params.float_scale_up))); + result = Element(IntType(rnd * params.float_scale_down)); + } + else { + result = Element(rnd); + } + + if (params.exclude_zero >=0 && result == Element(0.0)) { + if (rnd > FloatType(0)) { + rnd = std::min(params.max, rnd + FloatType(1)); + } else { + rnd = std::max((params.max - params.range), rnd - FloatType(1)); + } + result = Element(rnd); + } + + return result; + } +}; + +/// Computes a random Gaussian distribution +template +struct RandomUniformFunc> { + + using Element = complex; + + using FloatType = typename std::conditional< + (sizeof(Real) > 4), + double, + float>::type; + + using IntType = typename std::conditional< + (sizeof(Real) > 4), + int64_t, + int>::type; + + /// Parameters structure + struct Params { + + // + // Data members + // + + uint64_t seed; + FloatType range; + FloatType min; + int int_scale; + double pnan; + FloatType float_scale_up; + FloatType float_scale_down; + int exclude_zero; ///< If non-negative, excludes zeros + + /// Default ctor + CUTLASS_HOST_DEVICE + Params() { } + + // + // Methods + // + + /// Construction of Gaussian RNG functor. + Params( + uint64_t seed_ = 0, + FloatType max = 1, + FloatType min_ = 0, + int int_scale_ = -1, + double pnan_ = 0, + int exclude_zero_ = -1 + ): + seed(seed_), + range(static_cast(max - min_)), + min(static_cast(min_)), + int_scale(int_scale_), + pnan(pnan_), + exclude_zero(exclude_zero_) { + + float_scale_up = FloatType(IntType(1) << int_scale); + float_scale_up += FloatType(0.5) * float_scale_up; + float_scale_down = FloatType(1) / FloatType(IntType(1) << int_scale); + + // Handle cases where min = 0 or max = 0 for excluding zeros + if (exclude_zero >= 0) { + min = (min == FloatType(0)) ? min + FloatType(1): min; + range = (max == FloatType(0)) ? range - FloatType(1): range; + } + } + }; + + // + // Data members + // + + /// Parameters object + Params params; + + /// RNG state object + curandState_t rng_state; + + // + // Methods + // + + /// Device-side initialization of RNG + CUTLASS_DEVICE + RandomUniformFunc(Params const ¶ms): params(params) { + + uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x; + + curand_init(params.seed, gtid, 0, &rng_state); + } + + /// Compute random value and update RNG state + CUTLASS_DEVICE + Element operator()() { + + // Draw random float in [0.0, 1.0] to determine if element should be NaN. 
+ if constexpr (std::numeric_limits::has_quiet_NaN) { + if (params.pnan > 0 && (curand_uniform(&rng_state) < (params.pnan))) { + return Element(Real(NAN), Real(NAN)); + } + } + + FloatType rnd_r = random_uniform_float(&rng_state); + FloatType rnd_i = random_uniform_float(&rng_state); + + rnd_r = params.min + params.range * rnd_r; + rnd_i = params.min + params.range * rnd_i; + + // Random values are cast to integer after scaling by a power of two to facilitate error + // testing + Element result; + + if (params.int_scale >= 0) { + rnd_r = FloatType(IntType(rnd_r * params.float_scale_up)); + rnd_i = FloatType(IntType(rnd_i * params.float_scale_up)); + + result = { + Real(rnd_r * params.float_scale_down), + Real(rnd_i * params.float_scale_down) + }; + } + else { + result = Element(Real(rnd_r), Real(rnd_i)); + } + + if (params.exclude_zero >= 0 && + result.real() == Real(0.0) && + result.imag() == Real(0.0)) { + + if (rnd_r > FloatType(0)) { + rnd_r = std::min(params.min + params.range, rnd_r + FloatType(1)); + } else { + rnd_r = std::max((params.min), rnd_r - FloatType(1)); + } + result = Element(Real(rnd_r), Real(rnd_i)); + } + + return result; + } +}; + +/// Computes a random uniform distribution +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +struct TensorFillRandomUniformFunc { + + /// View type + using TensorView = TensorView; + + /// Scalar type + typedef typename TensorView::Element T; + + /// Coordinate in tensor's index space + typedef typename TensorView::TensorCoord TensorCoord; + + using RandomFunc = RandomUniformFunc; + + /// Parameters structure + struct Params { + + // + // Data members + // + + TensorView view; + typename RandomFunc::Params random; + + /// Default ctor + CUTLASS_HOST_DEVICE + Params() { } + + // + // Methods + // + + /// Construction of Gaussian RNG functor. + Params( + TensorView view_ = TensorView(), + typename RandomFunc::Params random_ = RandomFunc::Params() + ): + view(view_), random(random_) { + + } + }; + + // + // Data members + // + + Params params; + RandomFunc random; + + // + // Methods + // + + /// Device-side initialization of RNG + CUTLASS_DEVICE + TensorFillRandomUniformFunc(Params const ¶ms): params(params), random(params.random) { + } + + /// Compute random value and update RNG state + CUTLASS_DEVICE + void operator()(TensorCoord const &coord) { + + params.view.at(coord) = random(); + } +}; + +} // namespace detail + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Fills a tensor with random values with a uniform random distribution. +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorFillRandomUniform( + TensorView view, ///< destination tensor + uint64_t seed, ///< seed for RNG + typename RealType::Type max = Element(1), ///< upper bound of distribution + typename RealType::Type min = Element(0), ///< lower bound for distribution + int bits = -1, ///< If non-negative, specifies number of fractional bits that + /// are not truncated to zero. Permits reducing precision of + /// data. + double pnan = 0, ///< Percentage of NaN elements. 
+ int exclude_zero = -1, ///< If non-negative, excludes zeros from tensor init + cudaStream_t stream = nullptr) { + + using RandomFunc = detail::RandomUniformFunc; + using Func = detail::TensorFillRandomUniformFunc; + using Params = typename Func::Params; + + typename RandomFunc::Params random(seed, max, min, bits, pnan, exclude_zero); + + TensorForEach( + view.extent(), + Params(view, random), + /*grid_size*/0, /*block_size*/0, + stream + ); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Fills a tensor with random values with a uniform random distribution. +template +void BlockFillRandomUniform( + Element *ptr, + size_t capacity, + uint64_t seed, ///< seed for RNG + typename RealType::Type max, ///< upper bound of distribution + typename RealType::Type min, ///< lower bound for distribution + int bits = -1, ///< If non-negative, specifies number of fractional bits that + /// are not truncated to zero. Permits reducing precision of + /// data. + double pnan = 0, ///< Percentage of NaN elements. + cudaStream_t stream = nullptr) { + + using RandomFunc = detail::RandomUniformFunc; + + typename RandomFunc::Params params(seed, max, min, bits, pnan); + + BlockForEach(ptr, capacity, params, /*grid_size*/0, /*block_size*/0, stream); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +/// Computes a random sparse meta +template ///< Element type +struct RandomSparseMetaFunc { + + using FloatType = float; + + using IntType = int32_t; + + /// Parameters structure + struct Params { + + // + // Data members + // + + uint64_t seed; + FloatType range; + int MetaSizeInBits; + + /// Default ctor + CUTLASS_HOST_DEVICE + Params() { } + + // + // Methods + // + + /// Construction of Gaussian RNG functor. + Params( + uint64_t seed_ = 0, + int MetaSizeInBits_ = 2 + ): + seed(seed_), + MetaSizeInBits(MetaSizeInBits_) { + if (MetaSizeInBits_ == 2) { + range = 6; + } + else if (MetaSizeInBits_ == 4) { + range = 2; + } + else { + throw std::invalid_argument("Invalid MetaSizeInBits"); + } + } + }; + + // + // Data members + // + + /// Parameters object + Params params; + + /// RNG state object + curandState_t rng_state; + + // + // Methods + // + + /// Device-side initialization of RNG + CUTLASS_DEVICE + RandomSparseMetaFunc(Params const ¶ms): params(params) { + + uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x; + + curand_init(params.seed, gtid, 0, &rng_state); + } + + /// Compute random value and update RNG state + CUTLASS_DEVICE + Element operator()() { + Element FourToTwoMeta[6] = {0x4, 0x8, 0x9, 0xc, 0xd, 0xe}; + Element TwoToOneMeta[2] = {0x4, 0xe}; + + Element *MetaArray = + (params.MetaSizeInBits == 2) ? 
FourToTwoMeta : TwoToOneMeta; + + Element result = 0x0; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < cutlass::sizeof_bits::value / 4; ++i) { + FloatType rnd = random_uniform_float(&rng_state); + rnd = params.range * rnd; + Element meta = MetaArray[(int)rnd]; + + result = (Element)(result | ((Element)(meta << (i * 4)))); + } + + return result; + } +}; + +/// Computes a random Gaussian distribution +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +struct TensorFillRandomSparseMetaFunc { + + /// View type + using TensorView = TensorView; + + /// Scalar type + typedef typename TensorView::Element T; + + /// Coordinate in tensor's index space + typedef typename TensorView::TensorCoord TensorCoord; + + using RandomFunc = RandomSparseMetaFunc; + + /// Parameters structure + struct Params { + + // + // Data members + // + + TensorView view; + typename RandomFunc::Params random; + + /// Default ctor + CUTLASS_HOST_DEVICE + Params() { } + + // + // Methods + // + + /// Construction of Gaussian RNG functor. + Params( + TensorView view_ = TensorView(), + typename RandomFunc::Params random_ = RandomFunc::Params() + ): + view(view_), random(random_) { + + } + }; + + // + // Data members + // + + Params params; + RandomFunc random; + + // + // Methods + // + + /// Device-side initialization of RNG + CUTLASS_DEVICE + TensorFillRandomSparseMetaFunc(Params const ¶ms): params(params), random(params.random) { + } + + /// Compute random value and update RNG state + CUTLASS_DEVICE + void operator()(TensorCoord const &coord) { + + params.view.at(coord) = random(); + } +}; + +} // namespace detail + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Fills a tensor with random values with a uniform random distribution. +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorFillRandomSparseMeta( + TensorView view, ///< destination tensor + uint64_t seed, ///< seed for RNG + int MetaSizeInBits = 2, ///< meta data size + cudaStream_t stream = nullptr) { + + using RandomFunc = detail::RandomSparseMetaFunc; + using Func = detail::TensorFillRandomUniformFunc; + using Params = typename Func::Params; + + typename RandomFunc::Params random(seed, MetaSizeInBits); + + TensorForEach( + view.extent(), + Params(view, random), + /*grid_size*/0, /*block_size*/0, + stream + ); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Fills a tensor with random values with a uniform random distribution. +template +void BlockFillRandomSparseMeta( + Element *ptr, + size_t capacity, + uint64_t seed, ///< seed for RNG + int MetaSizeInBits = 2, ///< meta data size + cudaStream_t stream = nullptr) { + + using RandomFunc = detail::RandomSparseMetaFunc; + + typename RandomFunc::Params params(seed, MetaSizeInBits); + + BlockForEach(ptr, capacity, params, /*grid_size*/0, /*block_size*/0, stream); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +/// Functor to fill a tensor with zeros off the diagonal and a uniform value on the diagonal. 
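// [Editor's note: not part of the original patch] Before the diagonal-fill functor
// below, a brief note on the sparse-metadata helpers above: RandomSparseMetaFunc packs
// one 4-bit nibble per group of four source elements, drawn from
// {0x4, 0x8, 0x9, 0xc, 0xd, 0xe}, i.e. the six valid index pairs of a 2:4 pattern.
// A minimal sketch of BlockFillRandomSparseMeta, assuming "cutlass/util/device_memory.h"
// is available and that the metadata element type is uint16_t (a common choice for
// Ampere sparse GEMM; illustrative only):
//
//   size_t meta_elements = 4096;
//   cutlass::device_memory::allocation<uint16_t> meta(meta_elements);
//   // Each uint16_t receives four random nibbles (2-bit metadata, MetaSizeInBits == 2).
//   BlockFillRandomSparseMeta(meta.get(), meta_elements, /*seed=*/2024ULL,
//                             /*MetaSizeInBits=*/2);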
+template < + typename Element, ///< Element type + typename Layout> ///< Layout function +struct TensorFillDiagonalFunc { + + /// View type + using TensorView = TensorView; + + /// Scalar type + typedef typename TensorView::Element T; + + /// Coordinate in tensor's index space + typedef typename TensorView::TensorCoord TensorCoord; + + /// Parameters structure + struct Params { + + // + // Data members + // + + TensorView view; + Element diag; + Element other; + + /// Default ctor + CUTLASS_HOST_DEVICE + Params() { } + + // + // Methods + // + + Params( + TensorView view_ = TensorView(), + Element diag_ = Element(1), + Element other_ = Element(0) + ): + view(view_), diag(diag_), other(other_) { + + } + }; + + // + // Data members + // + + /// Parameters object + Params params; + + // + // Methods + // + + /// Device-side initialization of RNG + CUTLASS_DEVICE + TensorFillDiagonalFunc(Params const ¶ms): params(params) { + + } + + /// Updates the tensor + CUTLASS_DEVICE + void operator()(TensorCoord const &coord) { + + bool is_diag = true; + + CUTLASS_PRAGMA_UNROLL + for (int i = 1; i < Layout::kRank; ++i) { + if (coord[i] != coord[i - 1]) { + is_diag = false; + break; + } + } + + params.view.at(coord) = (is_diag ? params.diag : params.other); + } +}; + +// Overwrites the elements of a tensor with a uniform value depending on fill mode +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +struct TensorFillPartialFunc { + + /// View type + using TensorView = TensorView; + + /// Scalar type + typedef typename TensorView::Element T; + + /// Coordinate in tensor's index space + typedef typename TensorView::TensorCoord TensorCoord; + + /// Parameters structure + struct Params { + + // + // Data members + // + + TensorView view; + Element element; + FillMode fill_mode; + + /// Default ctor + CUTLASS_HOST_DEVICE + Params(): fill_mode(FillMode::kNone) { } + + // + // Methods + // + + /// Construction of Gaussian RNG functor. + Params( + TensorView view_, + Element element_, + FillMode fill_mode_ + ): + view(view_), element(element_), fill_mode(fill_mode_) { + + } + }; + + // + // Data members + // + + /// Parameters object + Params params; + + // + // Methods + // + + CUTLASS_DEVICE + TensorFillPartialFunc(Params const ¶ms): params(params) { + + } + + /// Overwrites the element if it is within the covered region. 
+ CUTLASS_DEVICE + void operator()(TensorCoord const &coord) { + + bool predicate = true; + + switch (params.fill_mode) { + case FillMode::kFull: + predicate = true; + break; + + case FillMode::kLower: + CUTLASS_PRAGMA_UNROLL + for (int i = 1; i < Layout::kRank; ++i) { + if (coord[i - 1] < coord[i]) { + predicate = false; + break; + } + } + break; + + case FillMode::kUpper: + CUTLASS_PRAGMA_UNROLL + for (int i = 1; i < Layout::kRank; ++i) { + if (coord[i - 1] > coord[i]) { + predicate = false; + break; + } + } + break; + + case FillMode::kDiagonal: + CUTLASS_PRAGMA_UNROLL + for (int i = 1; i < Layout::kRank; ++i) { + if (coord[i - 1] != coord[i]) { + predicate = false; + break; + } + } + break; + + case FillMode::kNone: // fall-through + + default: + predicate = false; + break; + } + + if (predicate) { + params.view.at(coord) = params.element; + } + } +}; + + +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +struct TensorClearPartialFunc { + + /// View type + using TensorView = TensorView; + + /// Scalar type + typedef typename TensorView::Element T; + + /// Coordinate in tensor's index space + typedef typename TensorView::TensorCoord TensorCoord; + + /// + static_assert((Layout::kRank == 2), "TensorClearPartial is only supported for matrices"); + + /// Parameters structure + struct Params { + TensorView view{}; + Element element{}; + FillMode fill_mode{FillMode::kNone}; + int alignment{0}; + }; + + // + // Data members + // + + /// Parameters object + Params params; + + // + // Methods + // + + CUTLASS_DEVICE + TensorClearPartialFunc(Params const ¶ms): params(params) { + + } + + /// Overwrites the element if it is within the covered region. + CUTLASS_DEVICE + void operator()(TensorCoord const &coord) { + + bool predicate = true; + + switch (params.fill_mode) { + + case FillMode::kLower: + if ((coord[0] >= coord[1]) || + ((coord[1] - coord[0]) >= params.alignment)) { + predicate = false; + break; + } + break; + + case FillMode::kUpper: + if ((coord[0] <= coord[1]) || + ((coord[0] - coord[1]) >= params.alignment)) { + predicate = false; + break; + } + break; + + case FillMode::kNone: // fall-through + + default: + predicate = false; + break; + } + + if (predicate) { + params.view.at(coord) = params.element; + } + } +}; + +} // namespace detail + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Fills a tensor everywhere with a unique value for its diagonal. +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorFillDiagonal( + TensorView view, ///< destination tensor + Element diag = Element(1), ///< value to write in the diagonal + Element other = Element(0), ///< value to write off the diagonal + cudaStream_t stream = nullptr) { + + typedef detail::TensorFillDiagonalFunc Func; + typedef typename Func::Params Params; + + TensorForEach( + view.extent(), + Params(view, diag, other), + /*grid_size*/0, /*block_size*/0, + stream + ); +} + +/// Fills a tensor partially depending on fill mode. Elements not covered by the fillmode are +/// not written. 
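// [Editor's note: not part of the original patch] A minimal usage sketch for the
// partial-fill helper declared below, assuming the caller includes
// "cutlass/util/host_tensor.h" (shape and fill value are illustrative only):
//
//   cutlass::HostTensor<float, cutlass::layout::RowMajor> tensor({64, 64});
//   // Overwrite only the lower-triangular part (including the diagonal) with 1.0f;
//   // elements outside FillMode::kLower are left untouched.
//   TensorFillPartial(tensor.device_view(), 1.0f, FillMode::kLower);
//   tensor.sync_host();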
+template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorFillPartial( + TensorView view, ///< destination tensor + Element element, + FillMode fill_mode, + cudaStream_t stream = nullptr) { + + typedef detail::TensorFillPartialFunc Func; + typedef typename Func::Params Params; + + TensorForEach( + view.extent(), + Params(view, element, fill_mode), + stream + ); +} + +/// Clears a tensor partially depending on fill mode and alignment. Elements on the wrong-side +/// of fillmode (upto the alignment) are overwritten with the user supplied element (typically zeros) +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorClearPartial( + TensorView view, ///< destination tensor + Element element, + FillMode fill_mode, + int alignment, + cudaStream_t stream = nullptr) { + + typedef detail::TensorClearPartialFunc Func; + typedef typename Func::Params Params; + + TensorForEach( + view.extent(), + Params{view, element, fill_mode, alignment}, + /*grid_size*/0, /*block_size*/0, + stream + ); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Fills a tensor with a uniform value +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorFill( + TensorView view, ///< destination tensor + Element val = Element(0), ///< value to uniformly fill it with + cudaStream_t stream = nullptr) { + + TensorFillDiagonal(view, val, val, stream); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Fills a tensor's diagonal with 1 and 0 everywhere else. +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorFillIdentity( + TensorView view, ///< destination tensor + cudaStream_t stream = nullptr) { + + TensorFillDiagonal(view, Element(1), Element(0), stream); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +/// Computes a random Gaussian distribution +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +struct TensorUpdateDiagonalFunc { + + /// View type + using TensorView = TensorView; + + /// Scalar type + typedef typename TensorView::Element T; + + /// Coordinate in tensor's index space + typedef typename TensorView::TensorCoord TensorCoord; + + /// Parameters structure + struct Params { + + // + // Data members + // + + TensorView view; + Element diag; + + /// Default ctor + CUTLASS_HOST_DEVICE + Params() { } + + // + // Methods + // + + /// Construction of Gaussian RNG functor. 
+ Params( + TensorView view_ = TensorView(), + Element diag_ = Element(1) + ): + view(view_), diag(diag_) { + + } + }; + + // + // Data members + // + + /// Parameters object + Params params; + + // + // Methods + // + + /// Device-side initialization of RNG + CUTLASS_DEVICE + TensorUpdateDiagonalFunc(Params const ¶ms): params(params) { + + } + + /// Compute random value and update RNG state + CUTLASS_DEVICE + void operator()(TensorCoord const &coord) { + + bool is_diag = true; + + CUTLASS_PRAGMA_UNROLL + for (int i = 1; i < Layout::kRank; ++i) { + if (coord[i] != coord[i - 1]) { + is_diag = false; + break; + } + } + + if (is_diag) { + params.view.at(coord) = params.diag; + } + } +}; + +} // namespace detail + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Writes a uniform value to the diagonal of a tensor without modifying off-diagonal elements. +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorUpdateDiagonal( + TensorView view, ///< destination tensor + Element diag = Element(1), + cudaStream_t stream = nullptr) { + + typedef detail::TensorUpdateDiagonalFunc Func; + typedef typename Func::Params Params; + + TensorForEach( + view.extent(), + Params(view, diag), + /*grid_size*/0, /*block_size*/0, + stream + ); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +/// Computes a random Gaussian distribution +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +struct TensorUpdateOffDiagonalFunc { + + /// View type + using TensorView = TensorView; + + /// Scalar type + typedef typename TensorView::Element T; + + /// Coordinate in tensor's index space + typedef typename TensorView::TensorCoord TensorCoord; + + /// Parameters structure + struct Params { + + // + // Data members + // + + TensorView view; + Element other; + + /// Default ctor + CUTLASS_HOST_DEVICE + Params() { } + + // + // Methods + // + + /// Construction of Gaussian RNG functor. + Params( + TensorView view_ = TensorView(), + Element other_ = Element(0) + ): + view(view_), other(other_) { + + } + }; + + // + // Data members + // + + /// Parameters object + Params params; + + // + // Methods + // + + /// Device-side initialization of RNG + CUTLASS_DEVICE + TensorUpdateOffDiagonalFunc(Params const ¶ms): params(params) { + + } + + /// Compute random value and update RNG state + CUTLASS_DEVICE + void operator()(TensorCoord const &coord) { + + bool is_diag = true; + + CUTLASS_PRAGMA_UNROLL + for (int i = 1; i < Layout::kRank; ++i) { + if (coord[i] != coord[i - 1]) { + is_diag = false; + break; + } + } + + if (!is_diag) { + params.view.at(coord) = params.other; + } + } +}; + +} // namespace detail + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Writes a uniform value to all elements in the tensor without modifying diagonal elements. 
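// [Editor's note: not part of the original patch] A minimal sketch contrasting the two
// update helpers: TensorUpdateDiagonal (above) touches only the diagonal, while
// TensorUpdateOffDiagonal (below) touches everything except the diagonal. Assumes the
// caller includes "cutlass/util/host_tensor.h"; values are illustrative only.
//
//   cutlass::HostTensor<float, cutlass::layout::RowMajor> tensor({64, 64});
//   TensorUpdateDiagonal(tensor.device_view(), 1.0f);     // diagonal := 1, rest unchanged
//   TensorUpdateOffDiagonal(tensor.device_view(), 0.0f);  // rest := 0, diagonal unchanged
//   tensor.sync_host();  // the result is now the 64x64 identity matrix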
+template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorUpdateOffDiagonal( + TensorView view, ///< destination tensor + Element other = Element(1), + cudaStream_t stream = nullptr) { + + typedef detail::TensorUpdateOffDiagonalFunc Func; + typedef typename Func::Params Params; + + TensorForEach( + view.extent(), + Params(view, other), + /*grid_size*/0, /*block_size*/0, + stream + ); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +/// Computes a random Gaussian distribution +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +struct TensorFillLinearFunc { + + /// View type + using TensorView = TensorView; + + /// Scalar type + typedef typename TensorView::Element T; + + /// Coordinate in tensor's index space + typedef typename TensorView::TensorCoord TensorCoord; + + /// Parameters structure + struct Params { + + // + // Data members + // + + TensorView view; + Array v; + Element s; + + /// Default ctor + CUTLASS_HOST_DEVICE + Params() { } + + // + // Methods + // + + /// Construction of Gaussian RNG functor. + Params( + TensorView view_, ///< destination tensor + Array const & v_, + Element s_ = Element(0) + ): + view(view_), v(v_), s(s_) { + + } + }; + + // + // Data members + // + + /// Parameters object + Params params; + + // + // Methods + // + + /// Device-side initialization of RNG + CUTLASS_DEVICE + TensorFillLinearFunc(Params const ¶ms): params(params) { + + } + + /// Compute random value and update RNG state + CUTLASS_DEVICE + void operator()(TensorCoord const &coord) { + + Element sum = params.s; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Layout::kRank; ++i) { + if constexpr (is_complex::value) { + if constexpr (sizeof_bits::value <= 32) { + sum = Element(static_cast>(sum) + + static_cast>(params.v[i]) * static_cast>(coord[i])); + } + } + else if constexpr (sizeof_bits::value <= 32) { + if constexpr (std::numeric_limits::is_integer) { + sum = Element(static_cast(sum) + + static_cast(params.v[i]) * static_cast(coord[i])); + } + else { + sum = Element(static_cast(sum) + + static_cast(params.v[i]) * static_cast(coord[i])); + } + } + else { + sum += params.v[i] * coord[i]; + } + } + + params.view.at(coord) = sum; + } +}; + +} // namespace detail + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Fills tensor with a linear combination of its coordinate and another vector +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorFillLinear( + TensorView view, ///< destination tensor + Array const & v, + Element s = Element(0), + cudaStream_t stream = nullptr) { + + using Func = detail::TensorFillLinearFunc; + using Params = typename Func::Params; + + TensorForEach( + view.extent(), + Params(view, v, s), + /*grid_size*/0, /*block_size*/0, + stream + ); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Fills a tensor with random values from a distribution. 
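// [Editor's note: not part of the original patch] A minimal sketch for the
// distribution-dispatching fill declared below, assuming cutlass::Distribution from
// "cutlass/util/distribution.h" and "cutlass/util/host_tensor.h" are available
// (bounds and seed are illustrative only):
//
//   cutlass::HostTensor<float, cutlass::layout::RowMajor> tensor({128, 256});
//   cutlass::Distribution dist;
//   dist.set_uniform(/*min=*/-4.0, /*max=*/4.0);
//   dist.int_scale = -1;  // -1: do not truncate fractional bits (see the helpers above)
//   // Distribution::Uniform dispatches to TensorFillRandomUniform;
//   // Distribution::Gaussian dispatches to TensorFillRandomGaussian.
//   TensorFillRandom(tensor.device_view(), /*seed=*/2024ULL, dist);
//   tensor.sync_host();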
+template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorFillRandom( + TensorView view, ///< destination tensor + uint64_t seed, + Distribution dist, + cudaStream_t stream = nullptr, + int exclude_zero = -1 ///< If non-negative, excludes 0. + /// Note that setting this flag will result in more 1's, + /// as we use a simple mechanism to replace 0's by adding/subtracting 1's. + ) { + + using Real = typename RealType::Type; + + if (dist.kind == Distribution::Gaussian) { + TensorFillRandomGaussian( + view, + seed, + static_cast(dist.gaussian.mean), + static_cast(dist.gaussian.stddev), + dist.int_scale, + exclude_zero, + stream); + } else if (dist.kind == Distribution::Uniform) { + TensorFillRandomUniform( + view, + seed, + static_cast(dist.uniform.max), + static_cast(dist.uniform.min), + dist.int_scale, + dist.uniform.pnan, + exclude_zero, + stream); + } +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Fills a block of data with sequential elements +template < + typename Element +> +void BlockFillSequential( + Element *ptr, + int64_t capacity, + Element v = Element(1), + Element s = Element(0)) { + + using Layout = layout::PackedVectorLayout; + Layout::TensorCoord size(static_cast(capacity)); // -Wconversion + Layout layout = Layout::packed(size); + TensorView view(ptr, layout, size); + + Array c{}; + c[0] = v; + + TensorFillLinear(view, c, s); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Fills a block of data with sequential elements +template < + typename Element +> +void BlockFillRandom( + Element *ptr, + size_t capacity, + uint64_t seed, + Distribution dist, + cudaStream_t stream = nullptr) { + + using Real = typename RealType::Type; + + if (dist.kind == Distribution::Gaussian) { + BlockFillRandomGaussian( + ptr, + capacity, + seed, + static_cast(dist.gaussian.mean), + static_cast(dist.gaussian.stddev), + dist.int_scale, + stream); + } + else if (dist.kind == Distribution::Uniform) { + BlockFillRandomUniform( + ptr, + capacity, + seed, + static_cast(dist.uniform.max), + static_cast(dist.uniform.min), + dist.int_scale, + dist.uniform.pnan, + stream); + } +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +/// Computes a random Gaussian distribution +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +struct TensorCopyDiagonalInFunc { + + /// View type + using TensorView = TensorView; + + /// Scalar type + typedef typename TensorView::Element T; + + /// Coordinate in tensor's index space + typedef typename TensorView::TensorCoord TensorCoord; + + /// Parameters structure + struct Params { + + // + // Data members + // + + TensorView view; + Element const *ptr; + + /// Default ctor + CUTLASS_HOST_DEVICE + Params() { } + + // + // Methods + // + + /// Construction of Gaussian RNG functor. 
+ Params( + TensorView view_, ///< destination tensor + Element const *ptr_ + ): + view(view_), ptr(ptr_) { + + } + }; + + // + // Data members + // + + /// Parameters object + Params params; + + // + // Methods + // + + /// Device-side initialization of RNG + CUTLASS_DEVICE + TensorCopyDiagonalInFunc(Params const ¶ms): params(params) { + + } + + /// Only update the diagonal element + CUTLASS_DEVICE + void operator()(TensorCoord const &coord) { + bool is_diagonal = true; + + CUTLASS_PRAGMA_UNROLL + for (int i = 1; i < Layout::kRank; ++i) { + if (coord[i] != coord[0]) { + is_diagonal = false; + } + } + if (is_diagonal) { + params.view.at(coord) = params.ptr[coord[0]]; + } + } +}; + +} // namespace detail + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Copies a diagonal in from host memory without modifying off-diagonal elements. +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorCopyDiagonalIn( + TensorView view, ///< destination tensor + Element const *ptr, ///< dense buffer of elements + cudaStream_t stream = nullptr) { + + using Func = detail::TensorCopyDiagonalInFunc; + using Params = typename Func::Params; + + TensorForEach( + view.extent(), + Params(view, ptr), + /*grid_size*/0, /*block_size*/0, + stream + ); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + + +namespace detail { + +/// Computes a random Gaussian distribution +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +struct TensorCopyDiagonalOutFunc { + + /// View type + using TensorView = TensorView; + + /// Scalar type + typedef typename TensorView::Element T; + + /// Coordinate in tensor's index space + typedef typename TensorView::TensorCoord TensorCoord; + + /// Parameters structure + struct Params { + + // + // Data members + // + + TensorView view; + Element *ptr; + + /// Default ctor + CUTLASS_HOST_DEVICE + Params() { } + + // + // Methods + // + + /// Construction of Gaussian RNG functor. + Params( + TensorView view_, ///< destination tensor + Element *ptr_ + ): + view(view_), ptr(ptr_) { + + } + }; + + // + // Data members + // + + /// Parameters object + Params params; + + // + // Methods + // + + /// Device-side initialization of RNG + CUTLASS_DEVICE + TensorCopyDiagonalOutFunc(Params const ¶ms): params(params) { + + } + + /// Compute random value and update RNG state + CUTLASS_DEVICE + void operator()(TensorCoord const &coord) { + bool is_diagonal = true; + + CUTLASS_PRAGMA_UNROLL + for (int i = 1; i < Layout::kRank; ++i) { + if (coord[i] != coord[0]) { + is_diagonal = false; + } + } + if (is_diagonal) { + params.ptr[coord[0]] = params.view.at(coord); + } + } +}; + +} // namespace detail + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Copies the diagonal of a tensor into a dense buffer in host memory. 
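// [Editor's note: not part of the original patch] A minimal sketch round-tripping a
// diagonal with the copy helpers (TensorCopyDiagonalIn above, TensorCopyDiagonalOut
// below). Although the doc comments mention host memory, the functors dereference the
// raw pointer from device code, so a device-visible buffer is used here. Assumes the
// caller includes "cutlass/util/host_tensor.h" and "cutlass/util/device_memory.h".
//
//   cutlass::HostTensor<float, cutlass::layout::RowMajor> tensor({64, 64});
//   cutlass::device_memory::allocation<float> diag(64);
//   // Extract the 64 diagonal elements of `tensor` into `diag` ...
//   TensorCopyDiagonalOut(diag.get(), tensor.device_view());
//   // ... and write them back without modifying off-diagonal elements.
//   TensorCopyDiagonalIn(tensor.device_view(), diag.get());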
+template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorCopyDiagonalOut( + Element *ptr, ///< dense buffer of elements + TensorView view, ///< source tensor + cudaStream_t stream = nullptr) { + + using Func = detail::TensorCopyDiagonalOutFunc; + using Params = typename Func::Params; + + TensorForEach( + view.extent(), + Params(view, ptr), + /*grid_size*/0, /*block_size*/0, + stream + ); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace device +} // namespace reference +} // namespace cutlass diff --git a/csrc/quantization/cutlass_test/example/util/reference/device/tensor_foreach.h b/csrc/quantization/cutlass_test/example/util/reference/device/tensor_foreach.h new file mode 100644 index 0000000000000..3911b0240c6d2 --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/device/tensor_foreach.h @@ -0,0 +1,144 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include +#include "cutlass/cutlass.h" +#include "cutlass/util/reference/device/kernel/tensor_foreach.h" + +namespace cutlass { +namespace reference { +namespace device { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Launches a kernel calling a functor for each element in a tensor's index space. +template +struct TensorForEach { + + /// Constructor performs the operation. 
+ TensorForEach( + Coord size, Params params = Params(), + int grid_size = 0, int block_size = 0, + cudaStream_t stream = nullptr) { + + if (!grid_size || !block_size) { + + // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API + cudaError_t result = cudaOccupancyMaxPotentialBlockSize( + &grid_size, + &block_size, + reinterpret_cast(kernel::TensorForEach)); + + if (result != cudaSuccess) { + throw std::runtime_error("Failed to query occupancy."); + } + + // Limit block size. This has the effect of increasing the number of items processed by a + // single thread and reduces the impact of initialization overhead. + block_size = (block_size < 128 ? block_size : 128); + } + + dim3 grid(grid_size, 1, 1); + dim3 block(block_size, 1, 1); + + kernel::TensorForEach<<< grid, block, 0, stream >>>(size, params); + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Launches a kernel calling a functor for each element along a tensor's diagonal +template +struct TensorDiagonalForEach { + + /// Constructor performs the operation + TensorDiagonalForEach( + Coord size, Params params = Params(), + int start = 0, int end = -1, + int block_size = 128, cudaStream_t stream = nullptr) { + + if (end < 0) { + end = size.min(); + } + + dim3 block(block_size, 1, 1); + dim3 grid((end - start + block_size - 1) / block_size, 1, 1); + + kernel::TensorDiagonalForEach<<< grid, block, 0, stream >>>( + size, params, start, end); + } +}; + + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct BlockForEach { + + /// Constructor performs the operation. + BlockForEach( + Element *ptr, + size_t capacity, + typename Func::Params params = typename Func::Params(), + int grid_size = 0, + int block_size = 0, + cudaStream_t stream = nullptr) { + + if (!grid_size || !block_size) { + + // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API + cudaError_t result = cudaOccupancyMaxPotentialBlockSize( + &grid_size, + &block_size, + reinterpret_cast(kernel::BlockForEach)); + + if (result != cudaSuccess) { + throw std::runtime_error("Failed to query occupancy."); + } + + // Limit block size. This has the effect of increasing the number of items processed by a + // single thread and reduces the impact of initialization overhead. + block_size = (block_size < 128 ? block_size : 128); + } + + dim3 grid(grid_size, 1, 1); + dim3 block(block_size, 1, 1); + + kernel::BlockForEach<<< grid, block, 0, stream >>>(ptr, capacity, params); + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace device +} // namespace reference +} // namespace cutlass diff --git a/csrc/quantization/cutlass_test/example/util/reference/device/tensor_reduce.h b/csrc/quantization/cutlass_test/example/util/reference/device/tensor_reduce.h new file mode 100644 index 0000000000000..47b898b4fd161 --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/device/tensor_reduce.h @@ -0,0 +1,510 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/complex.h" +#include "cutlass/functional.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/tensor_view.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/reference/detail/linear_to_coordinate.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace reference { +namespace device { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace kernel { + +template < + typename Element, + typename Layout, + typename ComputeType, + typename ReduceOp, + typename TransformOp, + int kBlockSize = 128 +> +__global__ void TensorTransformReducePartial( + TensorView view, /// View of the tensor to reduce over + ComputeType identity, /// Identity element of the reduction operation + ReduceOp reduce, /// Reduces an accumulated value with a transformed element: f(ComputeType, ComputeType) => ComputeType + TransformOp transform, /// Transforms the tensor element to ComputeType: g(Element) => ComputeType + ComputeType *workspace) { /// Device-side workspace for accumulating partial results. The reduced element is stored in workspace[0] + + int64_t idx = threadIdx.x + blockIdx.x * blockDim.x; + int64_t size = view.size(); + + __shared__ ComputeType scratchpad[kBlockSize]; + + for (; idx < size; idx += blockDim.x * gridDim.x) { + + // Map linear thread ID onto tensor coordinate + typename Layout::TensorCoord coord; + + cutlass::reference::detail::LinearToCoordinate()(coord, idx, view.extent()); + + if (view.contains(coord)) { + + // Fetch element + Element x = view.at(coord); + + // Transform + identity = reduce(identity, transform(x)); + } + } + + scratchpad[threadIdx.x] = identity; + + __syncthreads(); + + // One thread performs the final reduction and stores out. This could be enhanced via + // a tree reduction and pipelining. 
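(Aside, not part of the patch: the serial per-block finalization below could be replaced by the tree reduction the comment mentions. A hedged sketch, assuming every thread has already stored its partial value into scratchpad and passed the barrier, and that kBlockSize is a power of two, as with the 128-thread launch used here.)

// Shared-memory tree reduction: halve the number of active threads each step;
// each surviving thread folds in the value held by its partner `offset` away.
template <typename ComputeType, typename ReduceOp, int kBlockSize>
__device__ ComputeType block_tree_reduce(ComputeType (&scratchpad)[kBlockSize],
                                         ReduceOp reduce) {
  for (int offset = kBlockSize / 2; offset > 0; offset >>= 1) {
    if (threadIdx.x < offset) {
      scratchpad[threadIdx.x] =
          reduce(scratchpad[threadIdx.x], scratchpad[threadIdx.x + offset]);
    }
    __syncthreads();
  }
  // After the final barrier every thread can read the reduced value.
  return scratchpad[0];
}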
+ if (threadIdx.x == 0) { + + for (int i = 1; i < kBlockSize; ++i) { + identity = reduce(identity, scratchpad[i]); + } + + workspace[blockIdx.x] = identity; + } +} + +template < + typename Element, + typename Layout, + typename ComputeType, + typename ReduceOp, + typename TransformOp, + int kBlockSize = 128 +> +__global__ void TensorTransformReducePartial( + TensorView view_A, /// View of the tensor to reduce over + TensorView view_B, /// View of the tensor to reduce over + ComputeType identity, /// Identity element of the reduction operation + ReduceOp reduce, /// Reduces an accumulated value with a transformed element: f(ComputeType, ComputeType) => ComputeType + TransformOp transform, /// Transforms the tensor element to ComputeType: g(Element) => ComputeType + ComputeType *workspace) { /// Device-side workspace for accumulating partial results. The reduced element is stored in workspace[0] + + int64_t idx = threadIdx.x + blockIdx.x * blockDim.x; + auto size = static_cast(view_A.size()); + + __shared__ ComputeType scratchpad[kBlockSize]; + + for (; idx < size; idx += blockDim.x * gridDim.x) { + + // Map linear thread ID onto tensor coordinate + typename Layout::TensorCoord coord; + + cutlass::reference::detail::LinearToCoordinate()(coord, idx, view_A.extent()); + + if (view_A.contains(coord)) { + + // Fetch element + Element a = view_A.at(coord); + Element b = view_B.at(coord); + + // Transform + identity = reduce(identity, transform(a, b)); + } + } + + scratchpad[threadIdx.x] = identity; + + __syncthreads(); + + // One thread performs the final reduction and stores out. This could be enhanced via + // a tree reduction and pipelining. + if (threadIdx.x == 0) { + + for (int i = 1; i < kBlockSize; ++i) { + identity = reduce(identity, scratchpad[i]); + } + + workspace[blockIdx.x] = identity; + } +} + + +template < + typename ComputeType, + typename ReduceOp, + int kBlockSize = 32 +> +__global__ void TensorTransformReduceFinalize( + ComputeType *workspace, + ComputeType identity, + int workspace_size, + ReduceOp reduce) { + + __shared__ ComputeType scratchpad[kBlockSize]; + + for (int idx = threadIdx.x; idx < workspace_size; idx += kBlockSize) { + identity = reduce(identity, workspace[idx]); + } + + scratchpad[threadIdx.x] = identity; + + __syncthreads(); + + if (threadIdx.x == 0) { + + for (int i = 1; i < kBlockSize; ++i) { + identity = reduce(identity, scratchpad[i]); + } + + workspace[0] = identity; + } +} + +} // namespace kernel + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Transform-reduce operation over the elements of a tensor +template < + typename Element, + typename Layout, + typename ComputeType, + typename ReduceOp, + typename TransformOp +> +ComputeType TensorTransformReduce( + TensorView view, /// View of the tensor to reduce over + ComputeType identity, /// Identity element of the reduction operation + ReduceOp reduce, /// Reduces an accumulated value with a transformed element: f(ComputeType, ComputeType) => ComputeType + TransformOp transform, /// Transforms the tensor element to ComputeType: g(Element) => ComputeType + ComputeType *workspace, /// Device-side workspace for accumulating partial results. The reduced element is stored in workspace[0] + int workspace_size, /// Number of elements in workspace + cudaStream_t stream = nullptr, /// CUDA stream to launch into + bool copy_out = true /// If true, the value of workspace[0] is copied to host and returned. Otherwise, `identity` is returned. 
+) { + + int const kBlockSize = 128; + + dim3 block(kBlockSize, 1); + dim3 grid(workspace_size, 1); + + kernel::TensorTransformReducePartial< + Element, Layout, ComputeType, ReduceOp, TransformOp, kBlockSize + ><<< grid, block, 0, stream >>>( + view, identity, reduce, transform, workspace + ); + + int const kFinalizeBlockSize = 32; + + kernel::TensorTransformReduceFinalize< + ComputeType, ReduceOp, kFinalizeBlockSize + ><<< dim3(1, 1), dim3(kFinalizeBlockSize, 1), 0, stream >>>( + workspace, identity, workspace_size, reduce + ); + + if (copy_out) { + cudaError_t result = cudaMemcpy(&identity, workspace, sizeof(identity), cudaMemcpyDeviceToHost); + if (result != cudaSuccess) { + throw std::runtime_error("cudaMemcpy() failed"); + } + } + + return identity; +} + +/// Transform-reduce operation over the elements of two tensors, zipped together +template < + typename Element, + typename Layout, + typename ComputeType, + typename ReduceOp, + typename TransformOp +> +ComputeType TensorTransformReduce( + TensorView view_A, /// View of the tensor to reduce over + TensorView view_B, /// View of the tensor to reduce over + ComputeType identity, /// Identity element of the reduction operation + ReduceOp reduce, /// Reduces an accumulated value with a transformed element: f(ComputeType, ComputeType) => ComputeType + TransformOp transform, /// Transforms the tensor element to ComputeType: g(Element) => ComputeType + ComputeType *workspace, /// Device-side workspace for accumulating partial results. The reduced element is stored in workspace[0] + int workspace_size, /// Number of elements in workspace + cudaStream_t stream = nullptr, /// CUDA stream to launch into + bool copy_out = true /// If true, the value of workspace[0] is copied to host and returned. Otherwise, `identity` is returned. +) { + + if (view_A.extent() != view_B.extent()) { + throw std::runtime_error("Extents must be equal."); + } + + int const kBlockSize = 128; + + dim3 block(kBlockSize, 1); + dim3 grid(workspace_size, 1); + + kernel::TensorTransformReducePartial< + Element, Layout, ComputeType, ReduceOp, TransformOp, kBlockSize + ><<< grid, block, 0, stream >>>( + view_A, view_B, identity, reduce, transform, workspace + ); + + int const kFinalizeBlockSize = 32; + + kernel::TensorTransformReduceFinalize< + ComputeType, ReduceOp, kFinalizeBlockSize + ><<< dim3(1, 1), dim3(kFinalizeBlockSize, 1), 0, stream >>>( + workspace, identity, workspace_size, reduce + ); + + if (copy_out) { + cudaError_t result = cudaMemcpy(&identity, workspace, sizeof(identity), cudaMemcpyDeviceToHost); + if (result != cudaSuccess) { + throw std::runtime_error("cudaMemcpy() failed"); + } + } + + return identity; +} + +/// Transform-reduce operation over the elements of a tensor. This helper allocates the device-side +/// workspace +template < + typename Element, + typename Layout, + typename ComputeType, + typename ReduceOp, + typename TransformOp +> +ComputeType TensorTransformReduce( + TensorView view, + ComputeType identity, + ReduceOp reduce, + TransformOp transform, + cudaStream_t stream = nullptr, + int workspace_size = 0 +) { + + // Optionally query for the SM count to size the workspace. 
+ if (!workspace_size) { + + int device_idx = 0; + cudaDeviceProp prop; + + cudaError_t result = cudaGetDevice(&device_idx); + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDevice() failed"); + } + + result = cudaGetDeviceProperties(&prop, device_idx); + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDeviceProp() failed"); + } + + workspace_size = int(prop.multiProcessorCount); + } + + DeviceAllocation workspace(workspace_size); + + ComputeType output = TensorTransformReduce( + view, + identity, + reduce, + transform, + workspace.get(), + workspace_size, + stream, + true); + + return output; +} + + +/// Transform-reduce operation over the elements of a tensor. This helper allocates the device-side +/// workspace +template < + typename Element, + typename Layout, + typename ComputeType, + typename ReduceOp, + typename TransformOp +> +ComputeType TensorTransformReduce( + TensorView view_A, + TensorView view_B, + ComputeType identity, + ReduceOp reduce, + TransformOp transform, + cudaStream_t stream = nullptr, + int workspace_size = 0 +) { + + // Optionally query for the SM count to size the workspace. + if (!workspace_size) { + + int device_idx = 0; + cudaDeviceProp prop; + + cudaError_t result = cudaGetDevice(&device_idx); + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDevice() failed"); + } + + result = cudaGetDeviceProperties(&prop, device_idx); + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDeviceProp() failed"); + } + + workspace_size = int(prop.multiProcessorCount); + } + + DeviceAllocation workspace(workspace_size); + + ComputeType output = TensorTransformReduce( + view_A, + view_B, + identity, + reduce, + transform, + workspace.get(), + workspace_size, + stream, + true); + + return output; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Helper to compute the sum of the elements of a tensor +template < + typename Element, + typename Layout, + typename ComputeType = Element +> +ComputeType TensorSum( + TensorView view, + ComputeType identity = ComputeType(), + cudaStream_t stream = nullptr, + int workspace_size = 0 +) { + + plus reduce; + NumericConverter transform; + + return TensorTransformReduce( + view, identity, reduce, transform, stream, workspace_size); +} + +/// Helper to compute the sum of the squares of the elements of a tensor +template < + typename Element, + typename Layout, + typename ComputeType = Element +> +ComputeType TensorSumSq( + TensorView view, + ComputeType identity = ComputeType(), + cudaStream_t stream = nullptr, + int workspace_size = 0 +) { + + plus reduce; + magnitude_squared transform; + + return TensorTransformReduce( + view, identity, reduce, transform, stream, workspace_size); +} + +/// Helper to compute the norm of the elements of a tensor. 
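(Aside, not part of the patch: a usage sketch for the reduction helpers defined above. The include paths follow the cutlass/util layout these headers use internally; the element type, layout, and function name are illustrative assumptions.)

#include <cmath>
#include <utility>
#include "cutlass/layout/matrix.h"
#include "cutlass/util/reference/device/tensor_reduce.h"

// Mean and RMS of a device-resident tensor, accumulating in double; both
// helpers size their own workspace from the SM count as shown above.
std::pair<double, double> mean_and_rms(
    cutlass::TensorView<float, cutlass::layout::RowMajor> view) {
  double n = static_cast<double>(view.size());
  double sum = cutlass::reference::device::TensorSum<
      float, cutlass::layout::RowMajor, double>(view);
  double sum_sq = cutlass::reference::device::TensorSumSq<
      float, cutlass::layout::RowMajor, double>(view);
  return {sum / n, std::sqrt(sum_sq / n)};
}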
+template < + typename Element, + typename Layout, + typename ComputeType = double +> +ComputeType TensorNorm( + TensorView view, + ComputeType identity = ComputeType(), + cudaStream_t stream = nullptr, + int workspace_size = 0 +) { + + return std::sqrt(TensorSumSq(view, identity, stream, workspace_size)); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Helper to compute the sum of the squares of the differences of two tensors +template < + typename Element, + typename Layout, + typename ComputeType = double +> +ComputeType TensorSumSqDiff( + TensorView view_A, + TensorView view_B, + ComputeType identity = ComputeType(), + cudaStream_t stream = nullptr, + int workspace_size = 0 +) { + + plus reduce; + magnitude_squared_difference transform; + + return TensorTransformReduce( + view_A, view_B, identity, reduce, transform, stream, workspace_size); +} + + +/// Helper to compute the norm of the tensor computed as the difference of two tensors in memory +template < + typename Element, + typename Layout, + typename ComputeType = double +> +ComputeType TensorNormDiff( + TensorView view_A, + TensorView view_B, + ComputeType identity = ComputeType(), + cudaStream_t stream = nullptr, + int workspace_size = 0 +) { + + return std::sqrt(TensorSumSqDiff(view_A, view_B, identity, stream, workspace_size)); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace device +} // namespace reference +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/quantization/cutlass_test/example/util/reference/device/tensor_relu.h b/csrc/quantization/cutlass_test/example/util/reference/device/tensor_relu.h new file mode 100644 index 0000000000000..4e5a50403cf8d --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/device/tensor_relu.h @@ -0,0 +1,141 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Defines device-side elementwise operations on TensorView. Note, the operations defined + in this header are not specialized for any particular data layout and are therefore not + intended to offer the best possible performance. Rather, they are intended to be generic + reference implementations to support the CUTLASS unit tests. +*/ + +#pragma once + +// Cutlass includes +#include "cutlass/cutlass.h" +#include "cutlass/tensor_view.h" + +#include "cutlass/util/reference/device/tensor_foreach.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace reference { +namespace device { + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +struct TensorReLuFunc { + + /// View type + using TensorView = TensorView; + + /// Coordinate in tensor's index space + using TensorCoord = typename TensorView::TensorCoord; + + /// Parameters structure + struct Params { + + // + // Data members + // + + TensorView view; + Element threshold; + + + // + // Methods + // + + Params( + TensorView view_ = TensorView(), + Element threshold_ = Element(0) + ): + view(view_), threshold(threshold_) { + + } + }; + + // + // Data members + // + + Params params; + + // + // Methods + // + + CUTLASS_DEVICE + TensorReLuFunc(Params const ¶ms): params(params) { + + } + + CUTLASS_DEVICE + void operator()(TensorCoord const &coord) { + + Element const & value = params.view.at(coord); + params.view.at(coord) = (value < params.threshold) ? 
params.threshold : value; + } +}; + +} // namespace detail + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Apply ReLu on a tensor +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorReLu( + TensorView view, ///< destination tensor + Element threshold = Element(0)) { ///< ReLu threshold + + using Func = detail::TensorReLuFunc; + using Params = typename Func::Params; + + TensorForEach( + view.extent(), + Params(view, threshold) + ); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace device +} // namespace reference +} // namespace cutlass diff --git a/csrc/quantization/cutlass_test/example/util/reference/device/thread/gemm.h b/csrc/quantization/cutlass_test/example/util/reference/device/thread/gemm.h new file mode 100644 index 0000000000000..04775a746ad16 --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/device/thread/gemm.h @@ -0,0 +1,186 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Reference implementation for GEMM in host-side code. +*/ + +#pragma once + +#include "cutlass/coord.h" +#include "cutlass/tensor_view.h" +#include "cutlass/gemm/gemm.h" + +namespace cutlass { +namespace reference { +namespace device { +namespace thread { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Thread-level blocked general matrix product. +// +// Note, this is a reference implementation. Performance is not expected to approach peak. 
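(Aside, not part of the patch: the arithmetic that the thread-level reference below applies per output tile, written out as a plain host loop for clarity; the matrix sizes, row-major storage, and function name are illustrative assumptions.)

#include <vector>

// Reference GEMM: accumulate the inner product over K, then apply the
// linear-scaling epilogue D = alpha * accum + beta * C, as the device-side
// multiply_add() and epilogue() members do per tile.
void reference_gemm(int M, int N, int K, float alpha,
                    const std::vector<float>& A,  // M x K, row-major
                    const std::vector<float>& B,  // K x N, row-major
                    float beta,
                    const std::vector<float>& C,  // M x N, row-major
                    std::vector<float>& D) {      // M x N, row-major
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
      float accum = 0.0f;
      for (int k = 0; k < K; ++k) {
        accum += A[i * K + k] * B[k * N + j];
      }
      D[i * N + j] = alpha * accum + beta * C[i * N + j];
    }
  }
}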
+// +template < + typename TensorRefA, + typename TensorRefB, + typename TensorRefC, + typename ScalarType, + typename AccumulatorType, + typename OutputTile, + typename InnerProductOp = multiply_add, + typename ConvertOp = NumericConverter +> +struct Gemm { + + using ElementA = typename TensorRefA::Element; + using ElementB = typename TensorRefB::Element; + using ElementC = typename TensorRefC::Element; + + // + // Data members + // + + /// Tile for A operand + ElementA A_tile[OutputTile::kColumn]; + + /// Tile for B operand + ElementB B_tile[OutputTile::kRow]; + + /// Tile for Accumulator + AccumulatorType accum[OutputTile::kColumn][OutputTile::kRow]; + + // + // Methods + // + + /// Constructor + CUTLASS_HOST_DEVICE + Gemm(AccumulatorType initial_accum = AccumulatorType(0)) { + + // Clear fetch registers + for (int i = 0; i < OutputTile::kColumn; ++i) { + A_tile[i] = ElementA(0); + } + + for (int j = 0; j < OutputTile::kRow; ++j) { + B_tile[j] = ElementB(0); + } + + // Clear accumulators + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < OutputTile::kColumn; ++j) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < OutputTile::kRow; ++i) { + accum[j][i] = initial_accum; + } + } + } + + /// Computes a matrix product + CUTLASS_HOST_DEVICE + Gemm & multiply_add( + gemm::GemmCoord problem_size, + TensorRefA tensor_a, + TensorRefB tensor_b, + MatrixCoord output_coord = MatrixCoord()) { + + InnerProductOp inner_product_op; + + // Loop over the GEMM K dimension + CUTLASS_PRAGMA_NO_UNROLL + for (int k = 0; k < problem_size.k(); ++k) { + + // Fetch a slice of the A matrix + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < OutputTile::kColumn; ++i) { + if (output_coord.row() + i < problem_size.m()) { + A_tile[i] = tensor_a.at(make_Coord(output_coord.row() + i, k)); + } + } + + // Fetch a slice of the B matrix + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < OutputTile::kRow; ++j) { + if (output_coord.column() + j < problem_size.n()) { + B_tile[j] = tensor_b.at(make_Coord(k, output_coord.column() + j)); + } + } + + // Compute an accumulated matrix product + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < OutputTile::kRow; ++j) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < OutputTile::kColumn; ++i) { + accum[j][i] = inner_product_op(A_tile[i], B_tile[j], accum[j][i]); + } + } + } + + return *this; + } + + /// Performs linear scaling of matrix product and updates output tensor + CUTLASS_HOST_DEVICE + Gemm & epilogue( + gemm::GemmCoord problem_size, + ScalarType alpha, + ScalarType beta, + TensorRefC tensor_c, + TensorRefC tensor_d, + MatrixCoord output_coord = MatrixCoord()) { + + ConvertOp convert_op; + + // Update the output tensor + for (int j = 0; j < OutputTile::kRow; ++j) { + for (int i = 0; i < OutputTile::kColumn; ++i) { + MatrixCoord coord = output_coord + MatrixCoord(i, j); + if (coord.row() < problem_size.m() && coord.column() < problem_size.n()) { + + tensor_d.at(coord) = convert_op( + alpha * ScalarType(accum[j][i]) + + beta * ScalarType(tensor_c.at(coord)) + ); + } + } + } + + return *this; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace thread +} // namespace device +} // namespace reference +} // namespace cutlass diff --git a/csrc/quantization/cutlass_test/example/util/reference/host/conv.hpp b/csrc/quantization/cutlass_test/example/util/reference/host/conv.hpp new file mode 100644 index 0000000000000..545dbba9a4e89 --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/host/conv.hpp @@ -0,0 +1,698 @@ 
+/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Reference implementation for CONV in host-side code. 
+*/ +#pragma once + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/complex.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/epilogue/thread/activation.h" + +#include "cute/tensor.hpp" + +#include + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::reference::host { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +template +bool +is_activation_in_bounds( + cute::Tensor const& activation, + int32_t n_, int32_t d_, int32_t h_, int32_t w_, int32_t c_) { + return ((n_ >= 0 && n_ < size<4>(activation)) && + (d_ >= 0 && d_ < size<3>(activation)) && + (h_ >= 0 && h_ < size<2>(activation)) && + (w_ >= 0 && w_ < size<1>(activation)) && + (c_ >= 0 && c_ < size<0>(activation))); +} + +template +bool +is_activation_in_bounds( + cute::Tensor const& activation, + int32_t n_, int32_t h_, int32_t w_, int32_t c_) { + return ((n_ >= 0 && n_ < size<3>(activation)) && + (h_ >= 0 && h_ < size<2>(activation)) && + (w_ >= 0 && w_ < size<1>(activation)) && + (c_ >= 0 && c_ < size<0>(activation))); +} + +template +bool +is_activation_in_bounds( + cute::Tensor const& activation, + int32_t n_, int32_t w_, int32_t c_) { + return ((n_ >= 0 && n_ < size<2>(activation)) && + (w_ >= 0 && w_ < size<1>(activation)) && + (c_ >= 0 && c_ < size<0>(activation))); +} + +} // namespace detail + +template< + class ElementAcc_, + class ElementScalar_, + class ElementCompute_, + class ElementC_, + class ElementOut_, + class TensorAlpha_, + class TensorBeta_, + class TensorBias_, + class ActivationFunctor_ = cutlass::epilogue::thread::Identity +> +struct ConvEpilogueFusionParams { + using ElementAcc = ElementAcc_; + using ElementScalar = ElementScalar_; + using ElementCompute = ElementCompute_; + using ElementC = ElementC_; + using ElementOut = ElementOut_; + using TensorAlpha = TensorAlpha_; + using TensorBeta = TensorBeta_; + using TensorBias = TensorBias_; + using ActivationFunctor = ActivationFunctor_; + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + + TensorAlpha tensor_alpha{}; + TensorBeta tensor_beta{}; + TensorBias tensor_bias{}; +}; + +template< + cutlass::conv::Operator ConvOp, + int NumSpatialDims, + class TensorA, + class TensorB, + class TensorC, + class TensorD, + class ShapePadding, + class StrideTraversal, + class ShapeDilation, + class EpilogueFusionParams +> +struct ConvReferenceImpl { + // Hard code accumlulator type to float to avoid data lost in accumulating add. 
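(Aside, not part of the patch: the index arithmetic that the fprop loops below repeat for every spatial rank, pulled out as a standalone helper; the function name and parameter names are illustrative assumptions.)

#include <cstdint>

// For output position q and filter tap s, the contributing input column is
// w = q * stride - pad + s * dilation; taps that fall outside the activation
// extent W contribute nothing, mirroring detail::is_activation_in_bounds.
inline bool fprop_input_coord(int32_t q, int32_t s, int32_t stride,
                              int32_t pad, int32_t dilation, int32_t W,
                              int32_t& w) {
  w = q * stride - pad + s * dilation;
  return w >= 0 && w < W;
}

The dgrad specializations invert this mapping, computing q = w + pad - s * dilation and additionally skipping taps where q is not divisible by the stride.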
+ using ElementAcc = cutlass::platform::conditional_t, double, float>; + using ElementC = typename EpilogueFusionParams::ElementC; + using ElementOut = typename EpilogueFusionParams::ElementOut; + using ElementScalar = typename EpilogueFusionParams::ElementScalar; + using ElementCompute = typename EpilogueFusionParams::ElementCompute; + using ElementBias = typename EpilogueFusionParams::TensorBias::value_type; + using ActivationFunctor = typename EpilogueFusionParams::ActivationFunctor; + + // Input related converter + NumericConverter acc_converter; + NumericConverter residual_converter; + NumericConverter bias_converter; + // Scale related converter + NumericConverter scale_converter; + // Output related converter + NumericConverter output_converter; + + EpilogueFusionParams& epi_fusion_params_; + TensorA const& tensor_a_; + TensorB const& tensor_b_; + TensorC const& tensor_c_; + TensorD& tensor_d_; + + ShapePadding const& padding_; + StrideTraversal const& tstride_; + ShapeDilation const& dilation_; + + // Epilogue activation operation + ActivationFunctor epi_activation; + + ConvReferenceImpl( + TensorA const& tensor_a, + TensorB const& tensor_b, + TensorC const& tensor_c, + TensorD& tensor_d, + ShapePadding const& padding, + StrideTraversal const& tstride, + ShapeDilation const& dilation, + EpilogueFusionParams& epi_fusion_params) + : tensor_a_(tensor_a), + tensor_b_(tensor_b), + tensor_c_(tensor_c), + tensor_d_(tensor_d), + padding_(padding), + tstride_(tstride), + dilation_(dilation), + epi_fusion_params_(epi_fusion_params) + { + static_assert(rank(ShapePadding{}) == rank(ShapeDilation{})); + static_assert(rank(ShapePadding{}) == rank(StrideTraversal{})); + } + + void compute_reference() { + if constexpr (ConvOp == cutlass::conv::Operator::kFprop) { + fprop_reference(cute::Int{}); + } + else if constexpr (ConvOp == cutlass::conv::Operator::kDgrad) { + dgrad_reference(cute::Int{}); + } + else { + wgrad_reference(cute::Int{}); + } + } + +private: + // Specialization for 1D fprop kernel + void fprop_reference(cute::Int<1> spatial_dims) { + int32_t N = size<2>(tensor_d_); + int32_t Q = size<1>(tensor_d_); + int32_t K = size<0>(tensor_d_); + int32_t S = size<1>(tensor_b_); + int32_t C = size<0>(tensor_b_); + +#if defined(_OPENMP) + #pragma omp parallel for collapse(2) +#endif + for (int32_t n = 0; n < N; ++n) { + for (int32_t q = 0; q < Q; ++q) { + for (int32_t k = 0; k < K; ++k) { + auto accumulator = ElementAcc(0); + for (int32_t s = 0; s < S; ++s) { + for (int32_t c = 0; c < C; ++c) { + int32_t w = q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_); + if (detail::is_activation_in_bounds(tensor_a_, n, w, c)) { + auto a = tensor_a_(c, w, n); + auto b = tensor_b_(c, s, k); + accumulator += ElementAcc(a * b); + } + } + } + ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ? + epi_fusion_params_.tensor_alpha[k] : epi_fusion_params_.alpha; + ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ? 
+ epi_fusion_params_.tensor_beta[k] : epi_fusion_params_.beta; + ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) + + scale_converter(beta) * residual_converter(tensor_c_(k, q, n)); + if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) { + output += bias_converter(epi_fusion_params_.tensor_bias[k]); + } + output = epi_activation(output); + tensor_d_(k, q, n) = output_converter(output); + } + } + } + + } + + // Specialization for 2D fprop kernel + void fprop_reference(cute::Int<2> spatial_dims) { + int32_t N = size<3>(tensor_d_); + int32_t P = size<2>(tensor_d_); + int32_t Q = size<1>(tensor_d_); + int32_t K = size<0>(tensor_d_); + int32_t R = size<2>(tensor_b_); + int32_t S = size<1>(tensor_b_); + int32_t C = size<0>(tensor_b_); + +#if defined(_OPENMP) + #pragma omp parallel for collapse(3) +#endif + for (int32_t n = 0; n < N; ++n) { + for (int32_t p = 0; p < P; ++p) { + for (int32_t q = 0; q < Q; ++q) { + for (int32_t k = 0; k < K; ++k) { + auto accumulator = ElementAcc(0); + for (int32_t r = 0; r < R; ++r) { + for (int32_t s = 0; s < S; ++s) { + for (int32_t c = 0; c < C; ++c) { + int32_t w = q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_); + int32_t h = p * cute::get<1>(tstride_) - cute::get<1>(padding_) + r * cute::get<1>(dilation_); + if (detail::is_activation_in_bounds(tensor_a_, n, h, w, c)) { + auto a = tensor_a_(c, w, h, n); + auto b = tensor_b_(c, s, r, k); + accumulator += ElementAcc(a * b); + } + } + } + } + ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ? + epi_fusion_params_.tensor_alpha[k] : epi_fusion_params_.alpha; + ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ? + epi_fusion_params_.tensor_beta[k] : epi_fusion_params_.beta; + ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) + + scale_converter(beta) * residual_converter(tensor_c_(k, q, p, n)); + if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) { + output += bias_converter(epi_fusion_params_.tensor_bias[k]); + } + output = epi_activation(output); + tensor_d_(k, q, p, n) = output_converter(output); + } + } + } + } + + } + + // Specialization for 3D fprop kernel + void fprop_reference(cute::Int<3> spatial_dims) { + int32_t N = size<4>(tensor_d_); + int32_t Z = size<3>(tensor_d_); + int32_t P = size<2>(tensor_d_); + int32_t Q = size<1>(tensor_d_); + int32_t K = size<0>(tensor_d_); + int32_t T = size<3>(tensor_b_); + int32_t R = size<2>(tensor_b_); + int32_t S = size<1>(tensor_b_); + int32_t C = size<0>(tensor_b_); + +#if defined(_OPENMP) + #pragma omp parallel for collapse(3) +#endif + for (int32_t n = 0; n < N; ++n) { + for (int32_t z = 0; z < Z; ++z) { + for (int32_t p = 0; p < P; ++p) { + for (int32_t q = 0; q < Q; ++q) { + for (int32_t k = 0; k < K; ++k) { + auto accumulator = ElementAcc(0); + for (int32_t t = 0; t < T; ++t) { + for (int32_t r = 0; r < R; ++r) { + for (int32_t s = 0; s < S; ++s) { + for (int32_t c = 0; c < C; ++c) { + int32_t w = q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_); + int32_t h = p * cute::get<1>(tstride_) - cute::get<1>(padding_) + r * cute::get<1>(dilation_); + int32_t d = z * cute::get<2>(tstride_) - cute::get<2>(padding_) + t * cute::get<2>(dilation_); + if (detail::is_activation_in_bounds(tensor_a_, n, d, h, w, c)) { + auto a = tensor_a_(c, w, h, d, n); + auto b = tensor_b_(c, s, r, t, k); + accumulator += ElementAcc(a * b); + } + } + } + } + } + ElementScalar alpha = 
raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ? + epi_fusion_params_.tensor_alpha[k] : epi_fusion_params_.alpha; + ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ? + epi_fusion_params_.tensor_beta[k] : epi_fusion_params_.beta; + ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) + + scale_converter(beta) * residual_converter(tensor_c_(k, q, p, z, n)); + if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) { + output += bias_converter(epi_fusion_params_.tensor_bias[k]); + } + output = epi_activation(output); + tensor_d_(k, q, p, z, n) = output_converter(output); + } + } + } + } + } + + } + + // Specialization for 1D dgrad kernel + void dgrad_reference(cute::Int<1> spatial_dims) { + int32_t N = size<2>(tensor_d_); + int32_t W = size<1>(tensor_d_); + int32_t C = size<0>(tensor_d_); + int32_t K = size<2>(tensor_b_); + int32_t S = size<1>(tensor_b_); + +#if defined(_OPENMP) + #pragma omp parallel for collapse(2) +#endif + for (int32_t n = 0; n < N; ++n) { + for (int32_t w = 0; w < W; ++w) { + for (int32_t c = 0; c < C; ++c) { + auto accumulator = ElementAcc(0); + for (int32_t k = 0; k < K; ++k) { + for (int32_t s = 0; s < S; ++s) { + int32_t q = w + cute::get<0>(padding_) - s * cute::get<0>(dilation_); + + if (q % cute::get<0>(tstride_) == 0) { + q /= cute::get<0>(tstride_); + } else { + continue; + } + + if (detail::is_activation_in_bounds(tensor_a_, n, q, k)) { + accumulator += ElementAcc(tensor_a_(k, q, n) * tensor_b_(c, s, k)); + } + } + } + ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) + ? epi_fusion_params_.tensor_alpha[c] : epi_fusion_params_.alpha; + ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) + ? epi_fusion_params_.tensor_beta[c] : epi_fusion_params_.beta; + ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) + + scale_converter(beta) * residual_converter(tensor_c_(c, w, n)); + if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) { + output += bias_converter(epi_fusion_params_.tensor_bias[c]); + } + output = epi_activation(output); + tensor_d_(c, w, n) = output_converter(output); + } + } + } + + } + + // Specialization for 2D dgrad kernel + void dgrad_reference(cute::Int<2> spatial_dims) { + int32_t N = size<3>(tensor_d_); + int32_t H = size<2>(tensor_d_); + int32_t W = size<1>(tensor_d_); + int32_t C = size<0>(tensor_d_); + int32_t K = size<3>(tensor_b_); + int32_t R = size<2>(tensor_b_); + int32_t S = size<1>(tensor_b_); + +#if defined(_OPENMP) + #pragma omp parallel for collapse(3) +#endif + for (int32_t n = 0; n < N; ++n) { + for (int32_t h = 0; h < H; ++h) { + for (int32_t w = 0; w < W; ++w) { + for (int32_t c = 0; c < C; ++c) { + auto accumulator = ElementAcc(0); + for (int32_t k = 0; k < K; ++k) { + for (int32_t r = 0; r < R; ++r) { + for (int32_t s = 0; s < S; ++s) { + int32_t q = w + cute::get<0>(padding_) - s * cute::get<0>(dilation_); + int32_t p = h + cute::get<1>(padding_) - r * cute::get<1>(dilation_); + + if (q % cute::get<0>(tstride_) == 0) { + q /= cute::get<0>(tstride_); + } else { + continue; + } + + if (p % cute::get<1>(tstride_) == 0) { + p /= cute::get<1>(tstride_); + } else { + continue; + } + + if (detail::is_activation_in_bounds(tensor_a_, n, p, q, k)) { + accumulator += ElementAcc(tensor_a_(k, q, p, n) * tensor_b_(c, s, r, k)); + } + } + } + } + ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) + ? 
epi_fusion_params_.tensor_alpha[c] : epi_fusion_params_.alpha; + ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) + ? epi_fusion_params_.tensor_beta[c] : epi_fusion_params_.beta; + ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) + + scale_converter(beta) * residual_converter(tensor_c_(c, w, h, n)); + if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) { + output += bias_converter(epi_fusion_params_.tensor_bias[c]); + } + output = epi_activation(output); + + tensor_d_(c, w, h, n) = output_converter(output); + } + } + } + } + + } + + // Specialization for 3D dgrad kernel + void dgrad_reference(cute::Int<3> spatial_dims) { + int32_t N = size<4>(tensor_d_); + int32_t D = size<3>(tensor_d_); + int32_t H = size<2>(tensor_d_); + int32_t W = size<1>(tensor_d_); + int32_t C = size<0>(tensor_d_); + int32_t K = size<4>(tensor_b_); + int32_t T = size<3>(tensor_b_); + int32_t R = size<2>(tensor_b_); + int32_t S = size<1>(tensor_b_); + +#if defined(_OPENMP) + #pragma omp parallel for collapse(3) +#endif + for (int32_t n = 0; n < N; ++n) { + for (int32_t d = 0; d < D; ++d) { + for (int32_t h = 0; h < H; ++h) { + for (int32_t w = 0; w < W; ++w) { + for (int32_t c = 0; c < C; ++c) { + auto accumulator = ElementAcc(0); + for (int32_t k = 0; k < K; ++k) { + for (int32_t t = 0; t < T; ++t) { + for (int32_t r = 0; r < R; ++r) { + for (int32_t s = 0; s < S; ++s) { + int32_t q = w + cute::get<0>(padding_) - s * cute::get<0>(dilation_); + int32_t p = h + cute::get<1>(padding_) - r * cute::get<1>(dilation_); + int32_t z = d + cute::get<2>(padding_) - t * cute::get<2>(dilation_); + + if (q % cute::get<0>(tstride_) == 0) { + q /= cute::get<0>(tstride_); + } else { + continue; + } + + if (p % cute::get<1>(tstride_) == 0) { + p /= cute::get<1>(tstride_); + } else { + continue; + } + + if (z % cute::get<2>(tstride_) == 0) { + z /= cute::get<2>(tstride_); + } else { + continue; + } + + if (detail::is_activation_in_bounds(tensor_a_, n, z, p, q, k)) { + accumulator += ElementAcc(tensor_a_(k, q, p, z, n) * tensor_b_(c, s, r, t, k)); + } + } + } + } + } + ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) + ? epi_fusion_params_.tensor_alpha[c] : epi_fusion_params_.alpha; + ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) + ? epi_fusion_params_.tensor_beta[c] : epi_fusion_params_.beta; + ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) + + scale_converter(beta) * residual_converter(tensor_c_(c, w, h, d, n)); + if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) { + output += bias_converter(epi_fusion_params_.tensor_bias[c]); + } + output = epi_activation(output); + tensor_d_(c, w, h, d, n) = output_converter(output); + } + } + } + } + } + + } + + // Specialization for 1D wgrad kernel + void wgrad_reference(cute::Int<1> spatial_dims) { + int32_t N = + size<2>(tensor_a_); + int32_t Q = + size<1>(tensor_a_); + int32_t K = + size<0>(tensor_a_); + int32_t S = size<1>(tensor_d_); + int32_t C = size<0>(tensor_d_); + +#if defined(_OPENMP) + #pragma omp parallel for collapse(2) +#endif + for (int32_t k = 0; k < K; ++k) { + ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ? + epi_fusion_params_.tensor_alpha[k] : epi_fusion_params_.alpha; + ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ? 
+ epi_fusion_params_.tensor_beta[k] : epi_fusion_params_.beta; + for (int32_t s = 0; s < S; ++s) { + for (int32_t c = 0; c < C; ++c) { + auto accumulator = ElementAcc(0); + for (int32_t n = 0; n < N; ++n) { + for (int32_t q = 0; q < Q; ++q) { + int32_t w = q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_); + bool is_in_bounds = + detail::is_activation_in_bounds(tensor_b_, n, w, c); + if (is_in_bounds) { + auto act = + tensor_b_(c, w, n); + auto xformed_act = + tensor_a_(k, q, n); + accumulator += ElementAcc(act * xformed_act); + } + } + } + ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) + + scale_converter(beta) * residual_converter(tensor_c_(c, s, k)); + if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) { + output += bias_converter(epi_fusion_params_.tensor_bias[k]); + } + output = epi_activation(output); + tensor_d_(c, s, k) = output_converter(output); + } + } + } + } + + // Specialization for 2D wgrad kernel + void wgrad_reference(cute::Int<2> spatial_dims) { + int32_t N = + size<3>(tensor_a_); + int32_t P = + size<2>(tensor_a_); + int32_t Q = + size<1>(tensor_a_); + int32_t K = + size<0>(tensor_a_); + int32_t R = size<2>(tensor_d_); + int32_t S = size<1>(tensor_d_); + int32_t C = size<0>(tensor_d_); + +#if defined(_OPENMP) + #pragma omp parallel for collapse(3) +#endif + for (int32_t k = 0; k < K; ++k) { + ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ? + epi_fusion_params_.tensor_alpha[k] : epi_fusion_params_.alpha; + ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ? + epi_fusion_params_.tensor_beta[k] : epi_fusion_params_.beta; + for (int32_t r = 0; r < R; ++r) { + for (int32_t s = 0; s < S; ++s) { + for (int32_t c = 0; c < C; ++c) { + auto accumulator = ElementAcc(0); + for (int32_t n = 0; n < N; ++n) { + for (int32_t p = 0; p < P; ++p) { + for (int32_t q = 0; q < Q; ++q) { + int32_t w = q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_); + int32_t h = p * cute::get<1>(tstride_) - cute::get<1>(padding_) + r * cute::get<1>(dilation_); + bool is_in_bounds = + detail::is_activation_in_bounds(tensor_b_, n, h, w, c); + if (is_in_bounds) { + auto act = + tensor_b_(c, w, h, n); + auto xformed_act = + tensor_a_(k, q, p, n); + accumulator += ElementAcc(act * xformed_act); + } + } + } + } + ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) + + scale_converter(beta) * residual_converter(tensor_c_(c, s, r, k)); + if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) { + output += bias_converter(epi_fusion_params_.tensor_bias[k]); + } + output = epi_activation(output); + tensor_d_(c, s, r, k) = output_converter(output); + } + } + } + } + } + + // Specialization for 3D wgrad kernel + void wgrad_reference(cute::Int<3> spatial_dims) { + int32_t N = + size<4>(tensor_a_); + int32_t Z = + size<3>(tensor_a_); + int32_t P = + size<2>(tensor_a_); + int32_t Q = + size<1>(tensor_a_); + int32_t K = + size<0>(tensor_a_); + int32_t T = size<3>(tensor_d_); + int32_t R = size<2>(tensor_d_); + int32_t S = size<1>(tensor_d_); + int32_t C = size<0>(tensor_d_); + +#if defined(_OPENMP) + #pragma omp parallel for collapse(3) +#endif + for (int32_t k = 0; k < K; ++k) { + ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ? + epi_fusion_params_.tensor_alpha[k] : epi_fusion_params_.alpha; + ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ? 
+ epi_fusion_params_.tensor_beta[k] : epi_fusion_params_.beta; + for (int32_t t = 0; t < T; ++t) { + for (int32_t r = 0; r < R; ++r) { + for (int32_t s = 0; s < S; ++s) { + for (int32_t c = 0; c < C; ++c) { + auto accumulator = ElementAcc(0); + for (int32_t n = 0; n < N; ++n) { + for (int32_t z = 0; z < Z; ++z) { + for (int32_t p = 0; p < P; ++p) { + for (int32_t q = 0; q < Q; ++q) { + int32_t w = q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_); + int32_t h = p * cute::get<1>(tstride_) - cute::get<1>(padding_) + r * cute::get<1>(dilation_); + int32_t d = z * cute::get<2>(tstride_) - cute::get<2>(padding_) + t * cute::get<2>(dilation_); + bool is_in_bounds = + detail::is_activation_in_bounds(tensor_b_, n, d, h, w, c); + if (is_in_bounds) { + auto act = + tensor_b_(c, w, h, d, n); + auto xformed_act = + tensor_a_(k, q, p, z, n); + accumulator += ElementAcc(act * xformed_act); + } + } + } + } + } + ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) + + scale_converter(beta) * residual_converter(tensor_c_(c, s, r, t, k)); + if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) { + output += bias_converter(epi_fusion_params_.tensor_bias[k]); + } + output = epi_activation(output); + tensor_d_(c, s, r, t, k) = output_converter(output); + } + } + } + } + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // cutlass::reference::host + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/quantization/cutlass_test/example/util/reference/host/convolution.h b/csrc/quantization/cutlass_test/example/util/reference/host/convolution.h new file mode 100644 index 0000000000000..f28b4a658a388 --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/host/convolution.h @@ -0,0 +1,802 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Reference implementation for convolution in host-side code. +*/ + +#pragma once + +#include "cutlass/coord.h" +#include "cutlass/functional.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/numeric_types.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/conv3d_problem_size.h" +#include + +namespace cutlass { +namespace reference { +namespace host { + +//////////////////////////////////////////////////////////////////////////////////////////////////// +/// Forward propagation +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// y = conv2d(x, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ElementD = ElementC, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +void Conv2dFprop( + conv::Conv2dProblemSize problem_size, + TensorRef tensor_x, + TensorRef tensor_w, + TensorRef tensor_y_in, + TensorRef tensor_y_out, + ElementCompute alpha, + ElementCompute beta) { + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + // Apply MMA and accumulate ElementAccumulator + for (int n = 0; n < problem_size.N; ++n) { + for (int p = 0; p < problem_size.P; ++p) { + for (int q = 0; q < problem_size.Q; ++q) { + for (int k = 0; k < problem_size.K; ++k) { + + int group_idx = k / (problem_size.K / problem_size.groups); + int channels_per_group = problem_size.C / problem_size.groups; + + ElementAccumulator acc = ElementAccumulator(); + + for (int r = 0; r < problem_size.R; ++r) { + for (int s = 0; s < problem_size.S; ++s) { + for (int c = 0; c < channels_per_group; ++c) { + + int filter_r = r; + int filter_s = s; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_r = problem_size.R - 1 - r; + filter_s = problem_size.S - 1 - s; + } + + int h = p * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h; + int w = q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w; + + if (h >= 0 && h < problem_size.H && w >= 0 && w < problem_size.W) { + + ElementA a = tensor_x.at({n, h, w, c + group_idx * channels_per_group}); + ElementB b = tensor_w.at({k, r, s, c}); + + acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc); + + } + } + } + } + + // Apply Epilogue, compute ElementCompute, convert and store ElementC + ElementC c_ref = ElementC(); + + if (beta != ElementCompute()) { + c_ref = tensor_y_in.at(cutlass::make_Coord(n, p, q, k)); + } + + tensor_y_out.at(cutlass::make_Coord(n, p, q, k)) = + convert_op(alpha * 
ElementCompute(acc) + beta * ElementCompute(c_ref)); + } + } + } + } +} + +/// Depthwise-separable convolution +template , + typename InnerProductOp = multiply_add> +void Depsep_Fprop(cutlass::TensorView tensor_A, + cutlass::TensorView tensor_B, + cutlass::TensorView tensor_C, + cutlass::TensorView tensor_D, + ElementCompute alpha, + ElementCompute beta, + cutlass::Tensor4DCoord padding = cutlass::Tensor4DCoord(), + cutlass::Coord<2> conv_stride = cutlass::Coord<2>(), + cutlass::Coord<2> dilation = cutlass::Coord<2>(), + cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation) { + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + // Apply MMA and accumulate ElementAccumulator + for (int n = 0; n < tensor_C.extent().n(); ++n) { + for (int p = 0; p < tensor_C.extent().h(); ++p) { + for (int q = 0; q < tensor_C.extent().w(); ++q) { + for (int g = 0; g < tensor_C.extent().c(); ++g) { + ElementAccumulator acc = ElementAccumulator(); + for (int r = 0; r < tensor_B.extent().h(); ++r) { + for (int s = 0; s < tensor_B.extent().w(); ++s) { + + // input activation H and W + int h = p * conv_stride[0] - padding[0] + r * dilation[0]; + int w = q * conv_stride[1] - padding[2] + s * dilation[1]; + + if (h < tensor_A.extent().h() && h >= 0 && w < tensor_A.extent().w() && w >= 0) { + ElementA a = tensor_A.at(cutlass::make_Coord(n, h, w, g)); + + ElementB b = (mode == cutlass::conv::Mode::kCrossCorrelation) + ? tensor_B.at(cutlass::make_Coord(g, r, s, 0)) + : tensor_B.at(cutlass::make_Coord( + g, tensor_B.extent().h() - r - 1, tensor_B.extent().w() - s - 1, 0)); + + acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc); + } + } + } + + // Apply Epilogue, compute ElementCompute, convert and store ElementC + ElementC c_ref = tensor_C.at(cutlass::make_Coord(n, p, q, g)); + tensor_D.at(cutlass::make_Coord(n, p, q, g)) = + convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref)); + } + } + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +/// Dgrad / Deconv +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// dx = dgrad(dy, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ElementD = ElementC, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +void Conv2dDgrad( + cutlass::conv::Conv2dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_w, + TensorRef tensor_dx_in, + TensorRef tensor_dx_out, + ElementCompute alpha, + ElementCompute beta, + bool is_deconv = false) { + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + // Apply MMA and accumulate ElementAccumulator + for (int n = 0; n < problem_size.N; ++n) { + for (int h = 0; h < problem_size.H; ++h) { + for (int w = 0; w < problem_size.W; ++w) { + for (int c = 0; c < problem_size.C; ++c) { + + ElementAccumulator acc = ElementAccumulator(); + + for (int r = 0; r < problem_size.R; ++r) { + for (int s = 0; s < problem_size.S; ++s) { + for (int k = 0; k < problem_size.K; ++k) { + + int filter_r = r; + int filter_s = s; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_r = problem_size.R - 1 - r; + filter_s = problem_size.S - 1 - s; + } + + int p = h + problem_size.pad_h - filter_r * problem_size.dilation_h; + int q = w + 
problem_size.pad_w - filter_s * problem_size.dilation_w; + + if (p >= 0 && (p % problem_size.stride_h) == 0 && + q >= 0 && (q % problem_size.stride_w) == 0) { + + p = p / problem_size.stride_h; + q = q / problem_size.stride_w; +#if 0 + std::cout << "row:" + << n * problem_size.H * problem_size.W + + h * problem_size.W + + w << " " + << "n, p, q: (" + << n << ", " + << p << ", " + << q << ") * " + << "r, s: (" + << r << ", " + << s << ") [" + << ((p < problem_size.P && q < problem_size.Q) ? "true":"false") << "]" + << std::endl; +#endif + if (p < problem_size.P && q < problem_size.Q) { + + ElementA a = tensor_dy.at(cutlass::make_Coord(n, p, q, k)); + ElementB b = is_deconv ? tensor_w.at(cutlass::make_Coord(c, r, s, k)) + : tensor_w.at(cutlass::make_Coord(k, r, s, c)); + + acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc); + } + } + + } // for (K) + } // for (S) + } // for (R) + + // Apply Epilogue, compute ElementCompute, convert and store ElementC + ElementC c_ref = ElementC(); + + if (beta != ElementCompute()) { + c_ref = tensor_dx_in.at(cutlass::make_Coord(n, h, w, c)); + } + + tensor_dx_out.at(cutlass::make_Coord(n, h, w, c)) = + convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref)); + + } // for (C) + } // for (W) + } // for (H) + } // for (N) +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +/// Wgrad +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// dw = wgrad(dy, x) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ElementD = ElementC, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +void Conv2dWgrad( + cutlass::conv::Conv2dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_x, + TensorRef tensor_dw_in, + TensorRef tensor_dw_out, + ElementCompute alpha, + ElementCompute beta) { + + InnerProductOp inner_product_op; + ConvertOp convert_op; + + // Apply MMA and accumulate ElementAccumulator + for (int k = 0; k < problem_size.K; ++k) { + for (int r = 0; r < problem_size.R; ++r) { + for (int s = 0; s < problem_size.S; ++s) { + for (int c = 0; c < problem_size.C; ++c) { + + ElementAccumulator acc = ElementAccumulator(); + + for (int n = 0; n < problem_size.N; ++n) { + for (int p = 0; p < problem_size.P; ++p) { + for (int q = 0; q < problem_size.Q; ++q) { + + cutlass::Tensor4DCoord b_coord; + + int filter_r = r; + int filter_s = s; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_r = problem_size.R - 1 - r; + filter_s = problem_size.S - 1 - s; + } + + b_coord = make_Coord( + n, + p * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h, + q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w, + c); + + if (b_coord.h() < problem_size.H && b_coord.h() >= 0 && + b_coord.w() < problem_size.W && b_coord.w() >= 0) { + + ElementAccumulator a = ElementAccumulator(tensor_dy.at(cutlass::make_Coord(n, p, q, k))); + ElementAccumulator b = ElementAccumulator(tensor_x.at(b_coord)); + acc = inner_product_op(a, b, acc); + } + } + } + } + + // Apply Epilogue, compute ElementCompute, convert and store ElementC + ElementC c_ref = ElementC(); + + if (beta != ElementCompute()) { + c_ref = tensor_dw_in.at(cutlass::make_Coord(k, r, s, c)); + } + + 
tensor_dw_out.at(cutlass::make_Coord(k, r, s, c)) = + convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref)); + + } // for (C) + } // for (S) + } // for (R) + } // for (K) +} + +/// Generic 2D convolution targeting Conv2dFprop, Conv2dDgrad, and Conv2dWgrad. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ElementD = ElementC, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +void Conv2d( + conv::Operator convolutional_operator, + conv::Conv2dProblemSize problem_size, + TensorRef tensor_A, + TensorRef tensor_B, + TensorRef tensor_C, + TensorRef tensor_D, + ElementCompute alpha, + ElementCompute beta) { + + switch (convolutional_operator) { + case conv::Operator::kFprop: + Conv2dFprop< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ElementD, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta); + break; + + case conv::Operator::kDeconv: + case conv::Operator::kDgrad: + Conv2dDgrad< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ElementD, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, (convolutional_operator == conv::Operator::kDeconv)); + break; + + case conv::Operator::kWgrad: + Conv2dWgrad< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ElementD, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta); + break; + + default: + break; + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +/// 3D convolution +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// y = conv3d(x, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +void Conv3dFprop( + conv::Conv3dProblemSize problem_size, + TensorRef tensor_x, + TensorRef tensor_w, + TensorRef tensor_y_in, + TensorRef tensor_y_out, + ElementCompute alpha, + ElementCompute beta) { + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + // Apply MMA and accumulate ElementAccumulator + for (int n = 0; n < problem_size.N; ++n) { + for (int z = 0; z < problem_size.Z; ++z) { + for (int p = 0; p < problem_size.P; ++p) { + for (int q = 0; q < problem_size.Q; ++q) { + for (int k = 0; k < problem_size.K; ++k) { + + ElementAccumulator acc = ElementAccumulator(); + + for (int t = 0; t < problem_size.T; ++t) { + for (int r = 0; r < problem_size.R; ++r) { + for (int s = 0; s < problem_size.S; ++s) { + for (int c = 0; c < problem_size.C; ++c) { + + int filter_t = t; + int filter_r = r; + int filter_s = s; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_t = problem_size.T - 1 - t; + filter_r = problem_size.R - 1 - r; + filter_s = problem_size.S - 1 - s; + } + + int d = z * problem_size.stride_d - problem_size.pad_d + filter_t * problem_size.dilation_d; + int h = p * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h; + 
int w = q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w; + + if (d >= 0 && d < problem_size.D && + h >=0 && h < problem_size.H && + w >= 0 && w < problem_size.W) { + + ElementA a = tensor_x.at({n, d, h, w, c}); + ElementB b = tensor_w.at({k, t, r, s, c}); + + acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc); + } + } + } + } + } + + // Apply Epilogue, compute ElementCompute, convert and store ElementC + ElementC c_ref = ElementC(); + + if (beta != ElementCompute()) { + c_ref = tensor_y_in.at(cutlass::make_Coord(n, z, p, q, k)); + } + + tensor_y_out.at(cutlass::make_Coord(n, z, p, q, k)) = + convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref)); + } + } + } + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +/// Dgrad / Deconv +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// dx = dgrad(dy, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +void Conv3dDgrad( + cutlass::conv::Conv3dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_w, + TensorRef tensor_dx_in, + TensorRef tensor_dx_out, + ElementCompute alpha, + ElementCompute beta, + bool is_deconv = false) { + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + // Apply MMA and accumulate ElementAccumulator + for (int n = 0; n < problem_size.N; ++n) { + for (int d = 0; d < problem_size.D; ++d) { + for (int h = 0; h < problem_size.H; ++h) { + for (int w = 0; w < problem_size.W; ++w) { + for (int c = 0; c < problem_size.C; ++c) { + + ElementAccumulator acc = ElementAccumulator(); + + for (int t = 0; t < problem_size.T; ++t) { + for (int r = 0; r < problem_size.R; ++r) { + for (int s = 0; s < problem_size.S; ++s) { + for (int k = 0; k < problem_size.K; ++k) { + + int filter_t = t; + int filter_r = r; + int filter_s = s; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_t = problem_size.T - 1 - t; + filter_r = problem_size.R - 1 - r; + filter_s = problem_size.S - 1 - s; + } + + int z = d + problem_size.pad_d - filter_t * problem_size.dilation_d; + int p = h + problem_size.pad_h - filter_r * problem_size.dilation_h; + int q = w + problem_size.pad_w - filter_s * problem_size.dilation_w; + + if (z >= 0 && (z % problem_size.stride_d) == 0 && + p >= 0 && (p % problem_size.stride_h) == 0 && + q >= 0 && (q % problem_size.stride_w) == 0) { + + z = z / problem_size.stride_d; + p = p / problem_size.stride_h; + q = q / problem_size.stride_w; + + if (z < problem_size.Z && p < problem_size.P && q < problem_size.Q) { + + ElementA a = tensor_dy.at(cutlass::make_Coord(n, z, p, q, k)); + ElementB b = is_deconv ? 
tensor_w.at(cutlass::make_Coord(c, t, r, s, k)) + : tensor_w.at(cutlass::make_Coord(k, t, r, s, c)); + acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc); + } + } + + } // for (K) + } // for (S) + } // for (R) + } // for (T) + + // Apply Epilogue, compute ElementCompute, convert and store ElementC + ElementC c_ref = ElementC(); + + if (beta != ElementCompute()) { + c_ref = tensor_dx_in.at(cutlass::make_Coord(n, d, h, w, c)); + } + + tensor_dx_out.at(cutlass::make_Coord(n, d, h, w, c)) = + convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref)); + + } // for (C) + } // for (W) + } // for (H) + } // for (D) + } // for (N) +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +/// Wgrad +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// dw = wgrad(dy, x) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +void Conv3dWgrad( + cutlass::conv::Conv3dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_x, + TensorRef tensor_dw_in, + TensorRef tensor_dw_out, + ElementCompute alpha, + ElementCompute beta) { + + InnerProductOp inner_product_op; + ConvertOp convert_op; + + // Apply MMA and accumulate ElementAccumulator + for (int k = 0; k < problem_size.K; ++k) { + for (int t = 0; t < problem_size.T; ++t) { + for (int r = 0; r < problem_size.R; ++r) { + for (int s = 0; s < problem_size.S; ++s) { + for (int c = 0; c < problem_size.C; ++c) { + + ElementAccumulator acc = ElementAccumulator(); + + for (int n = 0; n < problem_size.N; ++n) { + for (int z = 0; z < problem_size.Z; ++z) { + for (int p = 0; p < problem_size.P; ++p) { + for (int q = 0; q < problem_size.Q; ++q) { + + int filter_t = t; + int filter_r = r; + int filter_s = s; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_t = problem_size.T - 1 - t; + filter_r = problem_size.R - 1 - r; + filter_s = problem_size.S - 1 - s; + } + + Tensor5DCoord b_coord = make_Coord( + n, + z * problem_size.stride_d - problem_size.pad_d + filter_t * problem_size.dilation_d, + p * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h, + q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w, + c); + + if (b_coord.d() < problem_size.D && b_coord.d() >= 0 && + b_coord.h() < problem_size.H && b_coord.h() >= 0 && + b_coord.w() < problem_size.W && b_coord.w() >= 0) { + + ElementAccumulator a = ElementAccumulator(tensor_dy.at(cutlass::make_Coord(n, z, p, q, k))); + ElementAccumulator b = ElementAccumulator(tensor_x.at(b_coord)); + + acc = inner_product_op(a, b, acc); + } + } + } + } + } + + // Apply Epilogue, compute ElementCompute, convert and store ElementC + ElementC c_ref = ElementC(); + + if (beta != ElementCompute()) { + c_ref = tensor_dw_in.at(cutlass::make_Coord(k, t, r, s, c)); + } + + tensor_dw_out.at(cutlass::make_Coord(k, t, r, s, c)) = + convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref)); + + } // for (C) + } // for (S) + } // for (R) + } // for (T) + } // for (K) +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Generic 3D convolution targeting Conv2dFprop, Conv2dDgrad, and Conv2dWgrad. 
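+// Editorial sketch (not part of the upstream CUTLASS header): the generic
+// dispatcher below routes to the Conv3dFprop / Conv3dDgrad / Conv3dWgrad
+// reference kernels above and is typically used to produce a host-side
+// reference result for verification. A minimal, illustrative call, where
+// problem_size and the tensor names are placeholders for a
+// conv::Conv3dProblemSize and HostTensor instances, might look like:
+//
+//   cutlass::reference::host::Conv3d<
+//       float, cutlass::layout::TensorNDHWC,   // ElementA, LayoutA
+//       float, cutlass::layout::TensorNDHWC,   // ElementB, LayoutB
+//       float, cutlass::layout::TensorNDHWC,   // ElementC, LayoutC
+//       float>(                                // ElementCompute
+//     cutlass::conv::Operator::kFprop, problem_size,
+//     tensor_x.host_ref(), tensor_w.host_ref(),
+//     tensor_y.host_ref(), tensor_y_ref.host_ref(),
+//     1.0f, 0.0f);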
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +void Conv3d( + conv::Operator convolutional_operator, + conv::Conv3dProblemSize problem_size, + TensorRef tensor_A, + TensorRef tensor_B, + TensorRef tensor_C, + TensorRef tensor_D, + ElementCompute alpha, + ElementCompute beta) { + + switch (convolutional_operator) { + case conv::Operator::kFprop: + Conv3dFprop< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta); + break; + + case conv::Operator::kDeconv: + case conv::Operator::kDgrad: + Conv3dDgrad< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, (convolutional_operator == conv::Operator::kDeconv)); + break; + + case conv::Operator::kWgrad: + Conv3dWgrad< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta); + break; + + default: + break; + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace host +} // namespace reference +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/csrc/quantization/cutlass_test/example/util/reference/host/error_metrics.h b/csrc/quantization/cutlass_test/example/util/reference/host/error_metrics.h new file mode 100644 index 0000000000000..86db65ccc441e --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/host/error_metrics.h @@ -0,0 +1,66 @@ + +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <cmath>
+
+#include "cutlass/cutlass.h"
+#include "cutlass/complex.h"
+#include "cutlass/util/reference/host/tensor_reduce.h"
+#include "cutlass/core_io.h"
+
+namespace cutlass {
+namespace reference {
+namespace host {
+
+/// Helper to compute the relative error metric for tensor A_computed w.r.t. tensor B_reference
+template <
+  typename Element,
+  typename Layout,
+  typename ComputeType = double
+>
+ComputeType TensorRelativeErrorMetric(
+  TensorView<Element, Layout> view_A_computed,
+  TensorView<Element, Layout> view_B_reference,
+  ComputeType identity = ComputeType()
+) {
+
+  return cutlass::reference::host::TensorNormDiff(view_A_computed, view_B_reference, identity) /
+         cutlass::reference::host::TensorNorm(view_B_reference, identity);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace host
+} // namespace reference
+} // namespace cutlass
diff --git a/csrc/quantization/cutlass_test/example/util/reference/host/gemm.h b/csrc/quantization/cutlass_test/example/util/reference/host/gemm.h
new file mode 100644
index 0000000000000..03888131095fc
--- /dev/null
+++ b/csrc/quantization/cutlass_test/example/util/reference/host/gemm.h
@@ -0,0 +1,531 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Reference implementation for GEMM in host-side code. +*/ + +#pragma once + +#include "cutlass/coord.h" +#include "cutlass/numeric_types.h" +#include "cutlass/functional.h" +#include "cutlass/numeric_conversion.h" + +#include "cutlass/tensor_view.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/arch/mma.h" +#include "cutlass/util/host_tensor.h" + +namespace cutlass { +namespace reference { +namespace host { + +template +struct CastIfScalar { + static Out cast(In in) { + return Out(in); + } +}; + +template +struct CastIfScalar, In> { + typedef cutlass::complex Out; + static Out cast(In in) { + return Out(static_cast(in)); + } +}; + +template +struct CastIfScalar, cutlass::complex> { + typedef cutlass::complex Out; + typedef cutlass::complex In; + static Out cast(In in) { + return Out(in); + } +}; + +template +Out cast_if_scalar(In in) { + return CastIfScalar::cast(in); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef +/// objects. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ScalarType, + typename ComputeType, + typename InnerProductOp = multiply_add, + typename ConvertOp = NumericConverter +> +void compute_gemm( + gemm::GemmCoord problem_size, + ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, + ScalarType beta, + TensorRef tensor_c, + TensorRef tensor_d, + ComputeType initial_accum) { + + static_assert( + LayoutA::kRank == 2 && + LayoutB::kRank == 2 && + LayoutC::kRank == 2, "Tensors must be of rank 2"); + + + // Note: batch is ignored. 
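+  //
+  // Editorial note (not part of the upstream header): the loops below compute
+  // D = alpha * (A * B) + beta * C one Mblock x Nblock output tile at a time.
+  // Accumulators for a tile are held in a local ComputeType array, the full K
+  // extent is reduced into them, and only then is the epilogue (scale by
+  // alpha/beta, add C, convert to the output element type) applied and written
+  // to tensor_d. The blocking only speeds up this host reference; it does not
+  // change the numerical result relative to a straightforward triple loop.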
+ int const M = problem_size.m(); + int const N = problem_size.n(); + int const K = problem_size.k(); + + // Blocking necessary to speedup reference implementation + int const Mblock = 16; + int const Nblock = 16; + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + for (int row_block = 0; row_block < M; row_block += Mblock) { + for (int col_block = 0; col_block < N; col_block += Nblock) { + + ComputeType accum[Mblock][Nblock]; + + for (int j = 0; j < Nblock; j++) { + for (int i = 0; i < Mblock; i++) { + accum[i][j] = initial_accum; + } + } + + for (int k_block = 0; k_block < K; ++k_block) { + for (int j = 0; j < Nblock; j++) { + for (int i = 0; i < Mblock; i++) { + int row = row_block + i; + int col = col_block + j; + + if (row < M && col < N) { + ElementA a = tensor_a.at(MatrixCoord(row, k_block)); + ElementB b = tensor_b.at(MatrixCoord(k_block, col)); + + ComputeType compute_a(cast_if_scalar(a)); + ComputeType compute_b(cast_if_scalar(b)); + + accum[i][j] = inner_product_op(compute_a, compute_b, accum[i][j]); + } + } + } + } + + for (int j = 0; j < Nblock; j++) { + for (int i = 0; i < Mblock; i++) { + int row = row_block + i; + int col = col_block + j; + + MatrixCoord coord = MatrixCoord(row, col); + + if (row < M && col < N) { + tensor_d.at(coord) = convert_op( + alpha * ScalarType(accum[i][j]) + + beta * ScalarType(tensor_c.at(coord))); + } + } + } + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef +/// objects. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ScalarType, + typename ComputeType, + typename InnerProductOp = multiply_add, + typename ConvertOp = NumericConverter +> +void compute_gemm( + gemm::GemmCoord problem_size, + ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, + ScalarType beta, + TensorRef tensor_c, + ComputeType initial_accum) { + compute_gemm( + problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_c, + initial_accum); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ScalarType, + typename ComputeType, + typename InnerProductOp = cutlass::arch::OpMultiplyAdd +> +struct Gemm; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for multiply-add +template +struct Gemm { + + void operator()(gemm::GemmCoord problem_size, ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, ScalarType beta, + TensorRef tensor_c, + ComputeType initial_accum = ComputeType(0)) { + static_assert( + LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, + "Tensors must be of rank 2"); + + compute_gemm>( + problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum); + } + + void operator()(gemm::GemmCoord problem_size, ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, ScalarType beta, + TensorRef tensor_c, + TensorRef tensor_d, + ComputeType initial_accum = ComputeType(0)) { + static_assert( + LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, + "Tensors must be of rank 2"); + + compute_gemm>( + problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, 
tensor_d, initial_accum); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for multiply-add +template +struct Gemm { + + void operator()(gemm::GemmCoord problem_size, ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, ScalarType beta, + TensorRef tensor_c, + ComputeType initial_accum = ComputeType(0)) { + static_assert( + LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, + "Tensors must be of rank 2"); + + compute_gemm>( + problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum); + } + + void operator()(gemm::GemmCoord problem_size, ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, ScalarType beta, + TensorRef tensor_c, + TensorRef tensor_d, + ComputeType initial_accum = ComputeType(0)) { + static_assert( + LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, + "Tensors must be of rank 2"); + + compute_gemm>( + problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for multiply-add-saturate +template +struct Gemm { + + void operator()(gemm::GemmCoord problem_size, ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, ScalarType beta, + TensorRef tensor_c, + ComputeType initial_accum = ComputeType(0)) { + static_assert( + LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, + "Tensors must be of rank 2"); + + compute_gemm, + NumericConverterClamp>( + problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum); + } + + void operator()(gemm::GemmCoord problem_size, ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, ScalarType beta, + TensorRef tensor_c, + TensorRef tensor_d, + ComputeType initial_accum = ComputeType(0)) { + static_assert( + LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, + "Tensors must be of rank 2"); + + compute_gemm, + NumericConverterClamp>( + problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for XOR-popc +template +struct Gemm { + + void operator()(gemm::GemmCoord problem_size, ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, ScalarType beta, + TensorRef tensor_c, + ComputeType initial_accum = ComputeType(0)) { + static_assert( + LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, + "Tensors must be of rank 2"); + + compute_gemm>( + problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum); + } + + void operator()(gemm::GemmCoord problem_size, ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, ScalarType beta, + TensorRef tensor_c, + TensorRef tensor_d, + ComputeType initial_accum = ComputeType(0)) { + static_assert( + LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, + "Tensors must be of rank 2"); + + compute_gemm>( + problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum); + } +}; + +/// Partial specialization for AND-popc +template +struct Gemm { + + void operator()(gemm::GemmCoord problem_size, ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, ScalarType beta, + TensorRef tensor_c, + ComputeType initial_accum = ComputeType(0)) { + static_assert( + LayoutA::kRank == 2 && LayoutB::kRank == 2 && 
LayoutC::kRank == 2, + "Tensors must be of rank 2"); + + compute_gemm>( + problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum); + } + + void operator()(gemm::GemmCoord problem_size, ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, ScalarType beta, + TensorRef tensor_c, + TensorRef tensor_d, + ComputeType initial_accum = ComputeType(0)) { + static_assert( + LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, + "Tensors must be of rank 2"); + + compute_gemm>( + problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for multiply-add +template +struct Gemm { + + void operator()(gemm::GemmCoord problem_size, ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, ScalarType beta, + TensorRef tensor_c, + ComputeType initial_accum = ComputeType(0)) { + static_assert( + LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, + "Tensors must be of rank 2"); + + compute_gemm>( + problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum); + } + + void operator()(gemm::GemmCoord problem_size, ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, ScalarType beta, + TensorRef tensor_c, + TensorRef tensor_d, + ComputeType initial_accum = ComputeType(0)) { + static_assert( + LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, + "Tensors must be of rank 2"); + + compute_gemm>( + problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Batched GEMM +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Computes a batch of GEMMs over a set of matrices of common dimension. +// +// TensorRefCollection* is a type satisfying the TensorRefCollection concept. +// +template < + typename TensorRefCollectionA, + typename TensorRefCollectionB, + typename TensorRefCollectionC, + typename ScalarType, + typename AccumulatorType +> +void BatchedGemm( + gemm::GemmCoord problem_size, + int batch_count, + ScalarType alpha, + TensorRefCollectionA const& tensor_a, + TensorRefCollectionB const& tensor_b, + ScalarType beta, + TensorRefCollectionC &tensor_c, + AccumulatorType initial_accum) { + + typename TensorRefCollectionA::ConstIterator tensor_a_it = tensor_a.begin(); + typename TensorRefCollectionB::ConstIterator tensor_b_it = tensor_b.begin(); + typename TensorRefCollectionC::ConstIterator tensor_c_it = tensor_c.begin(); + + for (int batch = 0; + batch < batch_count; + ++batch, ++tensor_a_it, ++tensor_b_it, ++tensor_c_it) { + + Gemm + gemm; + + gemm(problem_size, alpha, *tensor_a_it, *tensor_b_it, beta, *tensor_c_it, + initial_accum); + } +} + +/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef +/// objects. +// +// TensorRefCollection* is a type satisfying the TensorRefCollection concept. 
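+// Editorial sketch (not part of the upstream header): the overload below
+// forwards to the BatchedGemm above with a zero initial accumulator, so a
+// typical call site only needs, e.g.
+//
+//   cutlass::reference::host::BatchedGemm(
+//       problem_size, batch_count, alpha,
+//       collection_A, collection_B, beta, collection_C);
+//
+// where collection_A/B/C are placeholder names for objects satisfying the
+// TensorRefCollection concept described above.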
+// +template < + typename TensorRefCollectionA, + typename TensorRefCollectionB, + typename TensorRefCollectionC, + typename ScalarType, + typename AccumulatorType +> +void BatchedGemm( + gemm::GemmCoord problem_size, + int batch_count, + ScalarType alpha, + TensorRefCollectionA const& tensor_a, + TensorRefCollectionB const& tensor_b, + ScalarType beta, + TensorRefCollectionC &tensor_c) { + + BatchedGemm(problem_size, batch_count, alpha, tensor_a, tensor_b, beta, tensor_c, ScalarType(0)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace host +} // namespace reference +} // namespace cutlass diff --git a/csrc/quantization/cutlass_test/example/util/reference/host/gemm_complex.h b/csrc/quantization/cutlass_test/example/util/reference/host/gemm_complex.h new file mode 100644 index 0000000000000..92da343a9c222 --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/host/gemm_complex.h @@ -0,0 +1,210 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Reference implementation for complex-valued GEMM in host-side code. +*/ + +#pragma once + +#include "cutlass/coord.h" +#include "cutlass/complex.h" +#include "cutlass/numeric_types.h" +#include "cutlass/functional.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/matrix_coord.h" + +#include "cutlass/tensor_view.h" + +#include "cutlass/gemm/gemm.h" + +namespace cutlass { +namespace reference { +namespace host { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef +/// objects. 
+/// +/// Explicitly naming types needed by this template can be cumbersome, particularly for the +/// accumulator type, so a function argument 'initial_accum' is exposed. Passing +/// AccumulatorType(0) as the last function argument can be easier than naming all template +/// arguments explicitly. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ScalarType, + typename ComputeType, + typename ElementD = ElementC, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +void GemmComplex( + gemm::GemmCoord problem_size, + ScalarType alpha, + TensorRef tensor_a, + ComplexTransform transform_a, + TensorRef tensor_b, + ComplexTransform transform_b, + ScalarType beta, + TensorRef tensor_c, + TensorRef tensor_d, + ComputeType initial_accum, + int batch_count = 1, + int64_t batch_stride_A = 0, + int64_t batch_stride_B = 0, + int64_t batch_stride_C = 0, + int64_t batch_stride_D = 0) { + + static_assert( + LayoutA::kRank == 2 && + LayoutB::kRank == 2 && + LayoutC::kRank == 2, "Tensors must be of rank 2"); + + // Note: batch is ignored. + int const M = problem_size.m(); + int const N = problem_size.n(); + int const K = problem_size.k(); + + // Blocking necessary to speedup reference implementation + int const Mblock = 16; + int const Nblock = 16; + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + for (int batch_idx = 0; batch_idx < batch_count; ++batch_idx) { + + // Compute matrix product using blocks + for (int row_block = 0; row_block < M; row_block += Mblock) { + for (int col_block = 0; col_block < N; col_block += Nblock) { + + ComputeType accum[Mblock][Nblock]; + + for (int j = 0; j < Nblock; j++) { + for (int i = 0; i < Mblock; i++) { + accum[i][j] = initial_accum; + } + } + + for (int k_block = 0; k_block < K; ++k_block) { + for (int j = 0; j < Nblock; j++) { + for (int i = 0; i < Mblock; i++) { + int row = row_block + i; + int col = col_block + j; + + if (row < M && col < N) { + ElementA a = tensor_a.at(MatrixCoord(row, k_block)); + ElementB b = tensor_b.at(MatrixCoord(k_block, col)); + + ComputeType a_ik = ComputeType(a); + ComputeType b_kj = ComputeType(b); + + if (transform_a == ComplexTransform::kConjugate) { + a_ik = conj(a_ik); + } + + if (transform_b == ComplexTransform::kConjugate) { + b_kj = conj(b_kj); + } + + accum[i][j] = inner_product_op(a_ik, b_kj, accum[i][j]); + } + } + } + } + + for (int j = 0; j < Nblock; j++) { + for (int i = 0; i < Mblock; i++) { + int row = row_block + i; + int col = col_block + j; + + MatrixCoord coord = MatrixCoord(row, col); + + if (row < M && col < N) { + + tensor_d.at(coord) = convert_op( + alpha * ScalarType(accum[i][j]) + + beta * ScalarType(tensor_c.at(coord))); + } + } + } + + } // for (col_block) + } // for (row_block) + + tensor_a.add_pointer_offset(batch_stride_A); + tensor_b.add_pointer_offset(batch_stride_B); + tensor_c.add_pointer_offset(batch_stride_C); + tensor_d.add_pointer_offset(batch_stride_D); + + } // for (batch_idx) +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef +/// objects. +/// +/// This assumes the accumulator type is the same type as the scalars. 
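+//
+// Editorial sketch (not part of the upstream header): a typical call to the
+// convenience overload below, with placeholder tensor names, is
+//
+//   cutlass::reference::host::GemmComplex(
+//       problem_size, alpha,
+//       tensor_a.host_ref(), cutlass::ComplexTransform::kNone,
+//       tensor_b.host_ref(), cutlass::ComplexTransform::kConjugate,
+//       beta, tensor_c.host_ref(), tensor_d.host_ref());
+//
+// Element types are typically cutlass::complex<> instantiations, and the
+// accumulator type equals the scalar type, as noted above.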
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ScalarType, + typename ElementD = ElementC +> +void GemmComplex( + gemm::GemmCoord problem_size, + ScalarType alpha, + TensorRef tensor_a, + ComplexTransform transform_a, + TensorRef tensor_b, + ComplexTransform transform_b, + ScalarType beta, + TensorRef tensor_c, + TensorRef tensor_d) { + + GemmComplex(problem_size, alpha, tensor_a, transform_a, tensor_b, transform_b, beta, tensor_c, tensor_d, ScalarType(0)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace host +} // namespace reference +} // namespace cutlass diff --git a/csrc/quantization/cutlass_test/example/util/reference/host/gemm_planar_complex.h b/csrc/quantization/cutlass_test/example/util/reference/host/gemm_planar_complex.h new file mode 100644 index 0000000000000..094af8b37b695 --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/host/gemm_planar_complex.h @@ -0,0 +1,228 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Reference implementation for complex-valued GEMM in host-side code. 
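+
+           (Planar-complex variant: the real and imaginary parts of each
+           operand are stored in separate planes and addressed through
+           TensorRefPlanarComplex.)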
+*/ + +#pragma once + +#include "cutlass/coord.h" +#include "cutlass/complex.h" +#include "cutlass/numeric_types.h" +#include "cutlass/functional.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/tensor_ref_planar_complex.h" + +#include "cutlass/tensor_view.h" +#include "cutlass/gemm/gemm.h" + +namespace cutlass { +namespace reference { +namespace host { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef +/// objects. +/// +/// Explicitly naming types needed by this template can be cumbersome, particularly for the +/// accumulator type, so a function argument 'initial_accum' is exposed. Passing +/// AccumulatorType(0) as the last function argument can be easier than naming all template +/// arguments explicitly. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ScalarType, + typename ComputeType, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add> +> +void GemmPlanarComplex( + gemm::GemmCoord problem_size, + complex alpha, + TensorRefPlanarComplex tensor_a, + ComplexTransform transform_a, + TensorRefPlanarComplex tensor_b, + ComplexTransform transform_b, + complex beta, + TensorRefPlanarComplex tensor_c, + TensorRefPlanarComplex tensor_d, + complex initial_accum) { + + static_assert( + LayoutA::kRank == 2 && + LayoutB::kRank == 2 && + LayoutC::kRank == 2, "Tensors must be of rank 2"); + + using ComplexA = typename TensorRefPlanarComplex::ComplexElement; + using ComplexB = typename TensorRefPlanarComplex::ComplexElement; + using ComplexC = typename TensorRefPlanarComplex::ComplexElement; + + // Note: batch is ignored. 
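+  //
+  // Editorial note (not part of the upstream header): in the planar-complex
+  // representation the real and imaginary parts of each operand live in two
+  // separate planes addressed through TensorRefPlanarComplex; ComplexA/B/C
+  // above are the per-element complex views read from those planes. The loop
+  // structure mirrors the real-valued reference GEMM: 16x16 output tiles and
+  // a full-K reduction into complex accumulators, followed by the alpha/beta
+  // epilogue. Optional conjugation (transform_a / transform_b) is applied to
+  // the loaded A and B values before accumulation.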
+ int const M = problem_size.m(); + int const N = problem_size.n(); + int const K = problem_size.k(); + + // Blocking necessary to speedup reference implementation + int const Mblock = 16; + int const Nblock = 16; + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + for (int row_block = 0; row_block < M; row_block += Mblock) { + for (int col_block = 0; col_block < N; col_block += Nblock) { + + complex accum[Mblock][Nblock]; + + for (int j = 0; j < Nblock; j++) { + for (int i = 0; i < Mblock; i++) { + accum[i][j] = initial_accum; + } + } + + for (int k_block = 0; k_block < K; ++k_block) { + for (int j = 0; j < Nblock; j++) { + for (int i = 0; i < Mblock; i++) { + int row = row_block + i; + int col = col_block + j; + + if (row < M && col < N) { + + ComplexA a_ik = tensor_a.at(MatrixCoord(row, k_block)); + ComplexB b_kj = tensor_b.at(MatrixCoord(k_block, col)); + + complex a = complex{ + ComputeType(a_ik.real()), + ComputeType(a_ik.imag()) + }; + + complex b = complex{ + ComputeType(b_kj.real()), + ComputeType(b_kj.imag()) + }; + + if (transform_a == ComplexTransform::kConjugate) { + a = conj(a); + } + + if (transform_b == ComplexTransform::kConjugate) { + b = conj(b); + } + + accum[i][j] = inner_product_op(a, b, accum[i][j]); + } + } + } + } + + for (int j = 0; j < Nblock; j++) { + for (int i = 0; i < Mblock; i++) { + int row = row_block + i; + int col = col_block + j; + + MatrixCoord coord = MatrixCoord(row, col); + + if (row < M && col < N) { + + complex acc{ + ScalarType(accum[i][j].real()), + ScalarType(accum[i][j].imag()) + }; + + ComplexC d_ij = tensor_c.at(coord); + + complex src{ + ScalarType(d_ij.real()), + ScalarType(d_ij.imag()) + }; + + complex result = alpha * acc + beta * src; + + d_ij.real() = convert_op(result.real()); + d_ij.imag() = convert_op(result.imag()); + + tensor_d.at(coord) = d_ij; + } + } + } + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef +/// objects. +/// +/// This assumes the accumulator type is the same type as the scalars. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ScalarType +> +void GemmPlanarComplex( + gemm::GemmCoord problem_size, + complex alpha, + TensorRefPlanarComplex tensor_a, + ComplexTransform transform_a, + TensorRefPlanarComplex tensor_b, + ComplexTransform transform_b, + complex beta, + TensorRefPlanarComplex tensor_c, + TensorRefPlanarComplex tensor_d) { + + GemmPlanarComplex( + problem_size, + alpha, + tensor_a, transform_a, + tensor_b, transform_b, + beta, + tensor_c, + tensor_d, + complex()); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace host +} // namespace reference +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/quantization/cutlass_test/example/util/reference/host/gett.hpp b/csrc/quantization/cutlass_test/example/util/reference/host/gett.hpp new file mode 100644 index 0000000000000..f6984fb2ba9c5 --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/host/gett.hpp @@ -0,0 +1,538 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Reference implementation for GETT in host-side code. +*/ + +#pragma once + +///////////////////////////////////////////////////////////////////////////////////////////////// +#include "cutlass/gemm/gemm.h" +#include "cutlass/complex.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/epilogue/thread/activation.h" +#include "cutlass/relatively_equal.h" + +#include "cute/tensor.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::reference::host { + +template +struct ElementTraits { + using type = T; +}; + +template +struct ElementTraits().get()), void> > > { + using type = decltype(std::declval().get()); +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template< + class ElementAccumulator_, + class TensorA_, // (M, K, L) + class TensorB_ // (N, K, L) +> +struct GettMainloopParams { + using ElementAccumulator = ElementAccumulator_; + using TensorA = TensorA_; + using TensorB = TensorB_; + using EngineA = typename TensorA::engine_type; + using LayoutA = typename TensorA::layout_type; + using EngineB = typename TensorB::engine_type; + using LayoutB = typename TensorB::layout_type; + + TensorA A{}; + TensorB B{}; + + ComplexTransform transform_A = ComplexTransform::kNone; + ComplexTransform transform_B = ComplexTransform::kNone; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +template< + class ElementScalar_, + class ElementScalingFactor_, + class ElementAccumulator_, + class ElementCompute_, + class TensorC_, // (M, N, L) + class TensorD_, // (M, N, L) + class VectorBias_ = TensorD_, // (M, 1) + class TensorAux_ = TensorD_, // (M, N, L) + class VectorAlpha_ = TensorD_, // (M, 1) + class VectorBeta_ = VectorAlpha_, // (M, 1) + class 
ActivationFunctor_ = cutlass::epilogue::thread::Identity, + class BiasBinaryOp_ = cutlass::plus, + bool PerColumnBias_ = false +> +struct GettEpilogueParams { + using ElementScalar = ElementScalar_; + using ElementScalingFactor = ElementScalingFactor_; + using ElementAccumulator = ElementAccumulator_; + using ElementCompute = ElementCompute_; + using TensorC = TensorC_; + using TensorD = TensorD_; + using TensorAux = TensorAux_; + using VectorBias = VectorBias_; + using VectorAlpha = VectorAlpha_; + using VectorBeta = VectorBeta_; + using ActivationFunctor = ActivationFunctor_; + using BiasBinaryOp = BiasBinaryOp_; + + using EngineC = typename TensorC::engine_type; + using LayoutC = typename TensorC::layout_type; + using EngineD = typename TensorD::engine_type; + using LayoutD = typename TensorD::layout_type; + static constexpr bool PerColumnBias = PerColumnBias_; + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + + TensorC C{}; + TensorD D{}; + VectorBias Bias{}; + TensorAux Aux{}; + VectorAlpha Valpha{}; + VectorBeta Vbeta{}; + ElementCompute st = ElementCompute(1); + + ElementAccumulator* abs_max_D = nullptr; + ElementAccumulator* abs_max_Aux = nullptr; + + ElementScalingFactor scale_a = ElementScalingFactor(1); + ElementScalingFactor scale_b = ElementScalingFactor(1); + ElementScalingFactor scale_c = ElementScalingFactor(1); + ElementScalingFactor scale_d = ElementScalingFactor(1); + ElementScalingFactor scale_aux = ElementScalingFactor(1); + + bool beta_per_channel_scaling = false; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// GETT - General Tensor-Tensor contraction reference kernel +template < + class MainloopParams, + class EpilogueParams +> +void Gett( + MainloopParams const& mainloop_params, + EpilogueParams const& epilogue_params) +{ + + static int constexpr kBlockM = 64; + static int constexpr kBlockN = 64; + +#if defined(_OPENMP) + #pragma omp parallel for collapse(3) +#endif + for (int64_t l = 0; l < cute::size<2>(mainloop_params.A.layout()); ++l) { + for (int64_t m = 0; m < cute::size<0>(mainloop_params.A.layout()); m += kBlockM) { + for (int64_t n = 0; n < cute::size<0>(mainloop_params.B.layout()); n += kBlockN) { + typename MainloopParams::ElementAccumulator acc[kBlockM][kBlockN]; + gett_mainloop(mainloop_params, m, n, l, acc); + gett_epilogue(epilogue_params, m, n, l, acc); + } + } + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// GETT - Mainloop +template +void gett_mainloop( + MainloopParams const& mainloop_params, + int64_t m, + int64_t n, + int64_t l, + ElementAccumulator (&acc)[kBlockM][kBlockN]) +{ + + static_assert(cute::rank(typename MainloopParams::LayoutA{}) == 3, "M, K, B"); + static_assert(cute::rank(typename MainloopParams::LayoutB{}) == 3, "N, K, B"); + + using cute::raw_pointer_cast; + + using ElementA = typename ElementTraits::type; + using ElementB = typename ElementTraits::type; + + using RingOp = multiply_add; + RingOp fma_op; + + // Zero out accumulators + for (int m_b = 0; m_b < kBlockM; ++m_b) { + for (int n_b = 0; n_b < kBlockN; ++n_b) { + acc[m_b][n_b] = ElementAccumulator(0); // RingOp::AdditionIdentity + } + } + + // Compute on this k-block + for (int64_t k = 0; k < cute::size<1>(mainloop_params.A.layout()); ++k) { + // Load A + ElementAccumulator a_frag[kBlockM]; + for (int m_b = 0; m_b < kBlockM; ++m_b) { + if (m + m_b < cute::size<0>(mainloop_params.A.layout())) { + // Perform 
reference GEMM calculations at the accumulator's precision. Cast A value to accumulator type. + a_frag[m_b] = static_cast(ElementA(mainloop_params.A(m + m_b, k, l))); + + if (mainloop_params.transform_A == ComplexTransform::kConjugate) { + a_frag[m_b] = conj(a_frag[m_b]); + } + } else { + a_frag[m_b] = ElementAccumulator(0); // RingOp::AdditionIdentity + } + } + + // Load B + ElementAccumulator b_frag[kBlockN]; + for (int n_b = 0; n_b < kBlockN; ++n_b) { + if (n + n_b < cute::size<0>(mainloop_params.B.layout())) { + // Perform reference GEMM calculations at the accumulator's precision. Cast A value to accumulator type. + b_frag[n_b] = static_cast(ElementB(mainloop_params.B(n + n_b, k, l))); + + if (mainloop_params.transform_B == ComplexTransform::kConjugate) { + b_frag[n_b] = conj(b_frag[n_b]); + } + } else { + b_frag[n_b] = ElementAccumulator(0); // RingOp::AdditionIdentity + } + } + + // do compute + for (int m_b = 0; m_b < kBlockM; ++m_b) { + for (int n_b = 0; n_b < kBlockN; ++n_b) { + acc[m_b][n_b] = fma_op(a_frag[m_b], b_frag[n_b], acc[m_b][n_b]); + } + } + + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// GETT - Epilogue +template +void gett_epilogue( + EpilogueParams const& epilogue_params, + int64_t m, + int64_t n, + int64_t l, + ElementAccumulator (&acc)[kBlockM][kBlockN]) +{ + static_assert(cute::rank(typename EpilogueParams::LayoutC{}) == 3, "M, K, B"); + static_assert(cute::rank(typename EpilogueParams::LayoutD{}) == 3, "N, K, B"); + + using cute::raw_pointer_cast; + + using ElementCompute = typename EpilogueParams::ElementCompute; + using ElementC = typename EpilogueParams::TensorC::value_type; + using ElementD = typename EpilogueParams::TensorD::value_type; + using ElementAux = typename EpilogueParams::TensorAux::value_type; + using ElementBias = typename EpilogueParams::VectorBias::value_type; + using ElementScalar = typename EpilogueParams::ElementScalar; + using ElementScalingFactor = typename EpilogueParams::ElementScalingFactor; + using ActivationFunctor = typename EpilogueParams::ActivationFunctor; + using BiasBinaryOp = typename EpilogueParams::BiasBinaryOp; + + constexpr bool PerColBias = EpilogueParams::PerColumnBias; + constexpr bool IsScalingAndAmaxOutputNeeded = + cute::is_same_v or + cute::is_same_v; + + constexpr bool IsScalingAndAmaxAuxOutputNeeded = + cute::is_same_v or + cute::is_same_v; + + constexpr bool IsReLUAuxNeeded = + (cute::is_same_v> or + cute::is_same_v>) and + cute::is_same_v; + constexpr bool IsClamp = + cute::is_same_v>; + + constexpr bool IsBackpropFusion = + cute::is_same_v> or + cute::is_same_v>; + + // Input related converter + NumericConverter accumulator_converter; + NumericConverter source_converter; + NumericConverter bias_converter; + [[maybe_unused]] NumericConverter aux_source_converter; + + // Scale related converter + NumericConverter scale_converter; + NumericConverter scaling_factor_converter; + + // Abs max converter + [[maybe_unused]] NumericConverter abs_max_output_converter; + + // Output related converter + NumericConverter destination_converter; + [[maybe_unused]] NumericConverter aux_destination_converter; + NumericConverter dBias_converter; + + // Epilogue operations + multiply_add epilogue_fma; + multiplies mul; + plus add; + + // Activation operation + ActivationFunctor activation; + + // Bias binary operation + BiasBinaryOp bias_op; + + // Do conversion + ElementCompute converted_alpha = scale_converter(epilogue_params.alpha); + ElementCompute 
converted_beta = scale_converter(epilogue_params.beta); + ElementCompute converted_scale_a = scaling_factor_converter(epilogue_params.scale_a); + ElementCompute converted_scale_b = scaling_factor_converter(epilogue_params.scale_b); + ElementCompute converted_scale_c = scaling_factor_converter(epilogue_params.scale_c); + ElementCompute converted_scale_d = scaling_factor_converter(epilogue_params.scale_d); + ElementCompute converted_scale_aux = scaling_factor_converter(epilogue_params.scale_aux); + + // Init local var + [[maybe_unused]] ElementCompute local_abs_max_output = ElementCompute(0); + [[maybe_unused]] ElementCompute local_abs_max_aux_output = ElementCompute(0); + + converted_alpha = mul(converted_alpha, mul(converted_scale_a, converted_scale_b)); + converted_beta = mul(converted_beta, converted_scale_c); + + ElementCompute inter_accum[kBlockM][kBlockN]; + + for (int m_b = 0; m_b < kBlockM; ++m_b) { + ElementCompute local_dBias = ElementCompute(0); + + for (int n_b = 0; n_b < kBlockN; ++n_b) { + if (m + m_b < cute::size<0>(epilogue_params.D.layout()) && n + n_b < cute::size<1>(epilogue_params.D.layout())) { + // Convert every type to ElementCompute first, do compute, convert to output type, write it out + ElementCompute converted_acc = accumulator_converter(acc[m_b][n_b]); + // per-row alpha + if (raw_pointer_cast(epilogue_params.Valpha.data())) { + converted_alpha = scale_converter(epilogue_params.Valpha(m + m_b, n + n_b, l)); + converted_alpha = mul(converted_alpha, mul(converted_scale_a, converted_scale_b)); + } + ElementCompute output = mul(converted_alpha, converted_acc); + + if (raw_pointer_cast(epilogue_params.Bias.data()) && not IsBackpropFusion) { + ElementCompute converted_bias = bias_converter(epilogue_params.Bias(PerColBias ? n + n_b : m + m_b)); + output = bias_op(output, converted_bias); + } + + if (raw_pointer_cast(epilogue_params.C.data())) { + ElementCompute converted_src = source_converter(epilogue_params.C(m + m_b, n + n_b, l)); + // per-row beta + if (epilogue_params.Vbeta.data()) { + converted_beta = scale_converter(epilogue_params.Vbeta(m + m_b, n + n_b, l)); + converted_beta = mul(converted_beta, converted_scale_c); + } + output = epilogue_fma(converted_beta, converted_src, output); + } + + if constexpr (IsBackpropFusion) { + ElementAux aux_input = ElementAux(0); + if (raw_pointer_cast(epilogue_params.Aux.data())) { + aux_input = epilogue_params.Aux(m + m_b, n + n_b, l); + } + + output = activation(output, aux_source_converter(aux_input)); + local_dBias = add(local_dBias, output); + } + else { + if (raw_pointer_cast(epilogue_params.Aux.data())) { + auto aux_output = output; + if constexpr (IsScalingAndAmaxAuxOutputNeeded) { + maximum_absolute_value_reduction amax_op; + local_abs_max_aux_output = amax_op(local_abs_max_aux_output, aux_output); + aux_output = epilogue_fma(converted_scale_aux, aux_output, ElementCompute(0)); + } + + if constexpr (IsReLUAuxNeeded) { + epilogue_params.Aux(m + m_b, n + n_b, l) = not (aux_output < 0) ? 
uint1b_t(1) : uint1b_t(0); + } else { + epilogue_params.Aux(m + m_b, n + n_b, l) = aux_destination_converter(aux_output); + } + } + + if constexpr (IsClamp) { // Treat Clamp as ReLU + output = activation(output, {0, std::numeric_limits::max()}); + } + else { + output = activation(output); + } + } + + if constexpr (IsScalingAndAmaxOutputNeeded) { + maximum_absolute_value_reduction amax_op; + local_abs_max_output = amax_op(local_abs_max_output, output); + output = epilogue_fma(converted_scale_d, output, ElementCompute(0)); + } + + inter_accum[m_b][n_b] = ElementCompute(output); + } + } // n_b + + if (m + m_b < cute::size<0>(epilogue_params.D.layout()) && n < cute::size<1>(epilogue_params.D.layout())) { + if (raw_pointer_cast(epilogue_params.Bias.data()) && IsBackpropFusion) { + ElementCompute converted_dBias = bias_converter(epilogue_params.Bias(m + m_b)); + local_dBias = add(local_dBias, converted_dBias); + epilogue_params.Bias(m + m_b) = dBias_converter(local_dBias); + } + } + } // m_b + for (int m_b = 0; m_b < kBlockM; ++m_b) { + for (int n_b = 0; n_b < kBlockN; ++n_b) { + if (m + m_b < cute::size<0>(epilogue_params.D.layout()) && n + n_b < cute::size<1>(epilogue_params.D.layout())) { + epilogue_params.D(m + m_b, n + n_b, l) = destination_converter(inter_accum[m_b][n_b]); + } + } + } + +#if defined(_OPENMP) + #pragma omp critical(Abs_Max_Data_Update) +#endif + { + if constexpr (IsScalingAndAmaxOutputNeeded) { + if (epilogue_params.abs_max_D) { + *epilogue_params.abs_max_D = maximum_with_nan_propogation{}( + *epilogue_params.abs_max_D, abs_max_output_converter(local_abs_max_output)); + } + } + + if constexpr (IsScalingAndAmaxAuxOutputNeeded) { + if (epilogue_params.abs_max_Aux) { + *epilogue_params.abs_max_Aux = maximum_with_nan_propogation{}( + *epilogue_params.abs_max_Aux, abs_max_output_converter(local_abs_max_aux_output)); + } + } + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +auto make_layout_rank3(const TensorType& tensor) { + // append a batch mode of size 1 if we do not have tensors that are rank 3 + return make_layout( + make_shape(cute::get<0>(tensor.shape()), cute::get<1>(tensor.shape()), cute::Int<1>{}), + make_stride(cute::get<0>(tensor.stride()), cute::get<1>(tensor.stride()), int64_t(cosize(tensor.layout())))); +} + +/// GEMM - General Matrix-Matrix contraction without conjugation options +template < + class MainloopParams, + class EpilogueParams +> +void Gemm3x( + MainloopParams const& mainloop_params, + EpilogueParams const& epilogue_params) +{ + using namespace cute; + + static_assert(cute::rank(typename MainloopParams::LayoutA{}) == cute::rank(typename MainloopParams::LayoutB{})); + static_assert(cute::rank(typename EpilogueParams::LayoutC{}) == cute::rank(typename EpilogueParams::LayoutD{})); + static_assert(cute::rank(typename MainloopParams::LayoutA{}) == cute::rank(typename EpilogueParams::LayoutC{})); + + if constexpr (cute::rank(typename MainloopParams::LayoutA{}) == 2) { + cute::Layout layout_A = make_layout_rank3(mainloop_params.A); + cute::Layout layout_B = make_layout_rank3(mainloop_params.B); + cute::Layout layout_C = make_layout_rank3(epilogue_params.C); + cute::Layout layout_D = make_layout_rank3(epilogue_params.D); + cute::Layout layout_Aux = make_layout_rank3(epilogue_params.Aux); + cute::Layout layout_Bias = make_layout_rank3(epilogue_params.Bias); + cute::Layout layout_Valpha = make_layout_rank3(epilogue_params.Valpha); + cute::Layout layout_Vbeta = 
make_layout_rank3(epilogue_params.Vbeta); + + auto TensorA = make_tensor(mainloop_params.A.data(), layout_A); + auto TensorB = make_tensor(mainloop_params.B.data(), layout_B); + auto TensorC = make_tensor(epilogue_params.C.data(), layout_C); + auto TensorD = make_tensor(epilogue_params.D.data(), layout_D); + auto TensorAux = make_tensor(epilogue_params.Aux.data(), layout_Aux); + auto VectorBias = make_tensor(epilogue_params.Bias.data(), layout_Bias); + auto VectorAlpha = make_tensor(epilogue_params.Valpha.data(), layout_Valpha); + auto VectorBeta = make_tensor(epilogue_params.Vbeta.data(), layout_Vbeta); + + // Reconstruct mainloop params + GettMainloopParams + mainloop_params_converted{TensorA, + TensorB, + mainloop_params.transform_A, + mainloop_params.transform_B}; + + // Reconstruct epilogue params + GettEpilogueParams + epilogue_params_converted{epilogue_params.alpha, + epilogue_params.beta, + TensorC, + TensorD, + VectorBias, + TensorAux, + VectorAlpha, + VectorBeta, + epilogue_params.abs_amax_D, + epilogue_params.abs_amax_Aux, + epilogue_params.scale_a, + epilogue_params.scale_b, + epilogue_params.scale_c, + epilogue_params.scale_d, + epilogue_params.scale_aux + }; + + Gett(mainloop_params_converted, epilogue_params_converted); + } + else { + // if we already have a batch mode, just pass it through + Gett(mainloop_params, epilogue_params); + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // cutlass::reference::host + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/quantization/cutlass_test/example/util/reference/host/rank_2k.h b/csrc/quantization/cutlass_test/example/util/reference/host/rank_2k.h new file mode 100644 index 0000000000000..2a99bc03a35ba --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/host/rank_2k.h @@ -0,0 +1,261 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Reference implementation for Rank 2k update in host-side code. + + + +*/ + +#pragma once + +#include "cutlass/blas3.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/tensor_view.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/arch/mma.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" + +namespace cutlass { +namespace reference { +namespace host { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef +/// objects. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + FillMode FillModeC, + typename ScalarType, + typename ComputeType, + typename InnerProductOp = multiply_add, + typename ConvertOp = NumericConverter +> +void compute_rank2k( + gemm::GemmCoord problem_size, + ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, + ScalarType beta, + TensorRef tensor_c, + TensorRef tensor_d, + ComputeType initial_accum) { + + static_assert( + LayoutA::kRank == 2 && + LayoutB::kRank == 2 && + LayoutC::kRank == 2, + "Tensors must be of rank 2"); + + static_assert( + FillModeC == FillMode::kLower || + FillModeC == FillMode::kUpper, + "Fill Mode can either be Lower or Upper."); + + using CompareOp = typename platform::conditional<(FillModeC == FillMode::kLower), + std::greater_equal, + std::less_equal>::type; + + // Note: batch is ignored. 
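+  // The blocked loops below implement the symmetric rank-2k update
+  //
+  //     D = alpha * (A * B^T + B * A^T) + beta * C
+  //
+  // restricted to the triangle selected by FillModeC; CompareOp keeps only the
+  // coordinates on or below (kLower) / on or above (kUpper) the diagonal.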
+ // Note: M is same as N for Rank 2k update + int const N = problem_size.n(); + int const K = problem_size.k(); + + // Blocking necessary to speedup reference implementation + int const Nblock = 16; + + ConvertOp convert_op; + InnerProductOp inner_product_op; + CompareOp compare_op; + + for (int row_block = 0; row_block < N; row_block += Nblock) { + for (int col_block = 0; col_block < N; col_block += Nblock) { + + ComputeType accum[Nblock][Nblock]; + + for (int j = 0; j < Nblock; j++) { + for (int i = 0; i < Nblock; i++) { + accum[i][j] = initial_accum; + } + } + + for (int k_block = 0; k_block < K; ++k_block) { + for (int j = 0; j < Nblock; j++) { + for (int i = 0; i < Nblock; i++) { + int row = row_block + i; + int col = col_block + j; + + if (row < N && col < N && compare_op(row, col)) + { + + // A x B^T + ElementA a = tensor_a.at(MatrixCoord(row, k_block)); + ElementB b_t = tensor_b.at(MatrixCoord(col, k_block)); + + ComputeType compute_a(cast_if_scalar(a)); + ComputeType compute_b_t(cast_if_scalar(b_t)); + + accum[i][j] = inner_product_op(compute_a, compute_b_t, accum[i][j]); + + // B x A^T + ElementB b = tensor_b.at(MatrixCoord(row, k_block)); + ElementA a_t = tensor_a.at(MatrixCoord(col, k_block)); + + ComputeType compute_b(cast_if_scalar(b)); + ComputeType compute_a_t(cast_if_scalar(a_t)); + + accum[i][j] = inner_product_op(compute_b, compute_a_t, accum[i][j]); + } + } + } + } + + for (int j = 0; j < Nblock; j++) { + for (int i = 0; i < Nblock; i++) { + int row = row_block + i; + int col = col_block + j; + + MatrixCoord coord = MatrixCoord(row, col); + + if (row < N && col < N && + ( (FillModeC == FillMode::kLower && row >= col) || + (FillModeC == FillMode::kUpper && row <= col) ) + ) { + tensor_d.at(coord) = convert_op( + alpha * ScalarType(accum[i][j]) + + beta * ScalarType(tensor_c.at(coord))); + } + } + } + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Computes a general Rank 2k update (tensors of rank=2) pointed to by TensorRef +/// objects. 
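+/// This overload updates tensor_c in place (tensor_d aliases tensor_c).
+///
+/// Minimal usage sketch; the element types, layouts, extents and HostTensor
+/// variables below are illustrative assumptions, not part of this header:
+///
+///   cutlass::HostTensor<float, cutlass::layout::ColumnMajor> A({n, k}), B({n, k}), C({n, n});
+///   cutlass::gemm::GemmCoord problem(n, n, k);   // M == N for a rank-2k update
+///   cutlass::reference::host::compute_rank2k<
+///       float, cutlass::layout::ColumnMajor,     // A
+///       float, cutlass::layout::ColumnMajor,     // B
+///       float, cutlass::layout::ColumnMajor,     // C (updated in place)
+///       cutlass::FillMode::kLower, float, float>(
+///       problem, 1.0f, A.host_ref(), B.host_ref(), 0.0f, C.host_ref(), 0.0f);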
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + FillMode FillModeC, + typename ScalarType, + typename ComputeType, + typename InnerProductOp = multiply_add, + typename ConvertOp = NumericConverter +> +void compute_rank2k( + gemm::GemmCoord problem_size, + ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, + ScalarType beta, + TensorRef tensor_c, + ComputeType initial_accum) { + compute_rank2k( + problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_c, + initial_accum); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + FillMode FillModeC, + typename ScalarType, + typename ComputeType, + typename InnerProductOp = cutlass::arch::OpMultiplyAdd +> +struct Rank2K; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for multiply-add +template +struct Rank2K { + + void operator()(gemm::GemmCoord problem_size, ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, ScalarType beta, + TensorRef tensor_c, + ComputeType initial_accum = ComputeType(0)) { + static_assert( + LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, + "Tensors must be of rank 2"); + + compute_rank2k>( + problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum); + } + + void operator()(gemm::GemmCoord problem_size, ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, ScalarType beta, + TensorRef tensor_c, + TensorRef tensor_d, + ComputeType initial_accum = ComputeType(0)) { + static_assert( + LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, + "Tensors must be of rank 2"); + + compute_rank2k>( + problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum); + } +}; + + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace host +} // namespace reference +} // namespace cutlass diff --git a/csrc/quantization/cutlass_test/example/util/reference/host/rank_2k_complex.h b/csrc/quantization/cutlass_test/example/util/reference/host/rank_2k_complex.h new file mode 100644 index 0000000000000..090019c100396 --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/host/rank_2k_complex.h @@ -0,0 +1,318 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Reference implementation for complex-valued Rank 2K update in host-side code. + + +*/ + +#pragma once + +#include "cutlass/blas3.h" +#include "cutlass/complex.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/tensor_view.h" +#include "cutlass/gemm/gemm.h" +#include + +namespace cutlass { +namespace reference { +namespace host { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef +/// objects. +/// +/// Explicitly naming types needed by this template can be cumbersome, particularly for the +/// accumulator type, so a function argument 'initial_accum' is exposed. Passing +/// AccumulatorType(0) as the last function argument can be easier than naming all template +/// arguments explicitly. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ScalarType, + typename ComputeType, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +void Rank2KComplex( + gemm::GemmCoord problem_size, + ScalarType alpha, + TensorRef tensor_a, + ComplexTransform transform_a, + TensorRef tensor_b, + ComplexTransform transform_b, + ScalarType beta, + TensorRef tensor_c, + TensorRef tensor_d, + ComputeType initial_accum, + FillMode fill_mode_c, + BlasMode blas_mode, + int batch_count = 1, + int64_t batch_stride_A = 0, + int64_t batch_stride_B = 0, + int64_t batch_stride_C = 0, + int64_t batch_stride_D = 0) { + + static_assert( + LayoutA::kRank == 2 && + LayoutB::kRank == 2 && + LayoutC::kRank == 2, "Tensors must be of rank 2"); + + // Note: batch is ignored. 
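+  // For BlasMode::kSymmetric the two k-loops below accumulate A * B^T and B * A^T
+  // with the same alpha.  For BlasMode::kHermitian (HER2K) the update is split into
+  // two epilogues so that
+  //
+  //     D = alpha * A * B^H + conj(alpha) * B * A^H + beta * C
+  //
+  // with the diagonal of the output forced to be real-valued.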
+ int const M = problem_size.m(); + int const N = problem_size.n(); + int const K = problem_size.k(); + + // Rank2K update operates on A=NxK, B=NxK, and C=NxN + assert(M==N); + + // Blocking necessary to speedup reference implementation + int const Mblock = 16; + int const Nblock = 16; + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + for (int batch_idx = 0; batch_idx < batch_count; ++batch_idx) { + + // Compute matrix product using blocks + for (int row_block = 0; row_block < M; row_block += Mblock) { + for (int col_block = 0; col_block < N; col_block += Nblock) { + + ComputeType accum[Mblock][Nblock]; + + for (int j = 0; j < Nblock; j++) { + for (int i = 0; i < Mblock; i++) { + accum[i][j] = initial_accum; + } + } + + for (int k_block = 0; k_block < K; ++k_block) { + for (int j = 0; j < Nblock; j++) { + for (int i = 0; i < Mblock; i++) { + int row = row_block + i; + int col = col_block + j; + + if (row < M && col < N && + ( (fill_mode_c == FillMode::kLower && row >= col) || + (fill_mode_c == FillMode::kUpper && row <= col) ) + ) { + + // A x B^T (Symmetric) or A x B^H (Hermitian) + // complex conjugation on operandB (b_t) is function of blas3 computation + ElementA a = tensor_a.at(MatrixCoord(row, k_block)); + ElementB b_t = (blas_mode == BlasMode::kHermitian) ? + conj(tensor_b.at(MatrixCoord(col, k_block))) : + tensor_b.at(MatrixCoord(col, k_block)); + + ComputeType a_ik = ComputeType(a); + ComputeType b_jk = ComputeType(b_t); + + // complex conjugation is a function of operand layouts + if (transform_a == ComplexTransform::kConjugate) { + a_ik = conj(a_ik); + } + // complex conjugation is a function of operand layouts + if (transform_b == ComplexTransform::kConjugate) { + b_jk = conj(b_jk); + } + + accum[i][j] = inner_product_op(a_ik, b_jk, accum[i][j]); + } + } + } + } + + /* HER2K need two epilogues to handle complex alpha value */ + if ( blas_mode == BlasMode::kHermitian ) { + for (int j = 0; j < Nblock; j++) { + for (int i = 0; i < Mblock; i++) { + int row = row_block + i; + int col = col_block + j; + + MatrixCoord coord = MatrixCoord(row, col); + + if (row < M && col < N && + ((fill_mode_c == FillMode::kLower && row >= col) || + (fill_mode_c == FillMode::kUpper && row <= col)) + ) { + + ScalarType c = tensor_c.at(coord); + // The imaginary parts of the diagonal elements of + // a complex data type are assumed and set to zero + if (blas_mode == BlasMode::kHermitian) { + c = (row == col) ? real(c) : c; + } + + tensor_d.at(coord) = convert_op(alpha * + ScalarType(accum[i][j]) + + beta * c); + } + } + } + + /* Zeoring out accum for second HERK */ + for (int j = 0; j < Nblock; j++) { + for (int i = 0; i < Mblock; i++) { + accum[i][j] = initial_accum; + } + } + } + + for (int k_block = 0; k_block < K; ++k_block) { + for (int j = 0; j < Nblock; j++) { + for (int i = 0; i < Mblock; i++) { + int row = row_block + i; + int col = col_block + j; + + if (row < M && col < N && + ( (fill_mode_c == FillMode::kLower && row >= col) || + (fill_mode_c == FillMode::kUpper && row <= col) ) + ) { + + // B x A^T (Symmetric) or B x A^H (Hermitian) + // complex conjugation on operandB (a_t) is function of blas3 computation + ElementB b = tensor_b.at(MatrixCoord(row, k_block)); + ElementA a_t = (blas_mode == BlasMode::kHermitian) ? 
+ conj(tensor_a.at(MatrixCoord(col, k_block))): + tensor_a.at(MatrixCoord(col, k_block)); + + ComputeType b_ik = ComputeType(b); + ComputeType a_jk = ComputeType(a_t); + + // complex conjugation here is a function of operand layouts + if (transform_b == ComplexTransform::kConjugate) { + b_ik = conj(b_ik); + } + // complex conjugation here is a function of operand layouts + if (transform_a == ComplexTransform::kConjugate) { + a_jk = conj(a_jk); + } + + accum[i][j] = inner_product_op(b_ik, a_jk, accum[i][j]); + } + } + } + } + + ScalarType alpha_hermitian = (blas_mode == BlasMode::kHermitian) ? + conj(alpha) : alpha; + ScalarType beta_hermitian = (blas_mode == BlasMode::kHermitian) ? + 1 : beta; + + for (int j = 0; j < Nblock; j++) { + for (int i = 0; i < Mblock; i++) { + int row = row_block + i; + int col = col_block + j; + + MatrixCoord coord = MatrixCoord(row, col); + + if (row < M && col < N && + ((fill_mode_c == FillMode::kLower && row >= col) || + (fill_mode_c == FillMode::kUpper && row <= col)) + ) { + + ScalarType d = (blas_mode == BlasMode::kHermitian) ? + tensor_d.at(coord) : tensor_c.at(coord); + + ScalarType tmp_d = convert_op( + alpha_hermitian * ScalarType(accum[i][j]) + + beta_hermitian * d); + + if (blas_mode == BlasMode::kHermitian && row == col ) { + tensor_d.at(coord) = real(tmp_d); + } else { + tensor_d.at(coord) = tmp_d; + } + } + } + } + + } // for (col_block) + } // for (row_block) + + tensor_a.add_pointer_offset(batch_stride_A); + tensor_b.add_pointer_offset(batch_stride_B); + tensor_c.add_pointer_offset(batch_stride_C); + tensor_d.add_pointer_offset(batch_stride_D); + + } // for (batch_idx) +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef +/// objects. +/// +/// This assumes the accumulator type is the same type as the scalars. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ScalarType +> +void Rank2KComplex( + gemm::GemmCoord problem_size, + ScalarType alpha, + TensorRef tensor_a, + ComplexTransform transform_a, + TensorRef tensor_b, + ComplexTransform transform_b, + ScalarType beta, + TensorRef tensor_c, + TensorRef tensor_d, + FillMode fill_mode_c, + BlasMode blas_mode) { + + Rank2KComplex( + problem_size, alpha, + tensor_a, transform_a, + tensor_b, transform_b, + beta, tensor_c, tensor_d, + ScalarType(0), + fill_mode_c, + blas_mode); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace host +} // namespace reference +} // namespace cutlass diff --git a/csrc/quantization/cutlass_test/example/util/reference/host/rank_k_complex.h b/csrc/quantization/cutlass_test/example/util/reference/host/rank_k_complex.h new file mode 100644 index 0000000000000..ef44270a314a4 --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/host/rank_k_complex.h @@ -0,0 +1,234 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Reference implementation for complex-valued Rank 2K update in host-side code. + + +*/ + +#pragma once + +#include "cutlass/blas3.h" +#include "cutlass/complex.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/tensor_view.h" +#include "cutlass/gemm/gemm.h" +#include + +namespace cutlass { +namespace reference { +namespace host { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef +/// objects. +/// +/// Explicitly naming types needed by this template can be cumbersome, particularly for the +/// accumulator type, so a function argument 'initial_accum' is exposed. Passing +/// AccumulatorType(0) as the last function argument can be easier than naming all template +/// arguments explicitly. +template < + typename ElementA, + typename LayoutA, + typename ElementC, + typename LayoutC, + typename ScalarType, + typename ComputeType, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +void Rank2KComplex( + gemm::GemmCoord problem_size, + ScalarType alpha, + TensorRef tensor_a, + ComplexTransform transform_a, + ScalarType beta, + TensorRef tensor_c, + TensorRef tensor_d, + ComputeType initial_accum, + FillMode fill_mode_c, + BlasMode blas_mode, + int batch_count = 1, + int64_t batch_stride_A = 0, + int64_t batch_stride_C = 0, + int64_t batch_stride_D = 0) { + + static_assert( + LayoutA::kRank == 2 && + LayoutC::kRank == 2, "Tensors must be of rank 2"); + + // Note: batch is ignored. 
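+  // The loops below implement a rank-k update that uses a single operand A:
+  //
+  //     D = alpha * A * A^T + beta * C    (BlasMode::kSymmetric)
+  //     D = alpha * A * A^H + beta * C    (BlasMode::kHermitian, real-valued diagonal)
+  //
+  // restricted to the triangle selected by fill_mode_c.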
+ int const M = problem_size.m(); + int const N = problem_size.n(); + int const K = problem_size.k(); + + // Rank2K update operates on A=NxK, B=NxK, and C=NxN + assert(M==N); + + // Blocking necessary to speedup reference implementation + int const Mblock = 16; + int const Nblock = 16; + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + for (int batch_idx = 0; batch_idx < batch_count; ++batch_idx) { + + // Compute matrix product using blocks + for (int row_block = 0; row_block < M; row_block += Mblock) { + for (int col_block = 0; col_block < N; col_block += Nblock) { + + ComputeType accum[Mblock][Nblock]; + + for (int j = 0; j < Nblock; j++) { + for (int i = 0; i < Mblock; i++) { + accum[i][j] = initial_accum; + } + } + + for (int k_block = 0; k_block < K; ++k_block) { + for (int j = 0; j < Nblock; j++) { + for (int i = 0; i < Mblock; i++) { + int row = row_block + i; + int col = col_block + j; + + if (row < M && col < N && + ( (fill_mode_c == FillMode::kLower && row >= col) || + (fill_mode_c == FillMode::kUpper && row <= col) ) + ) { + + // A x A^T (Symmetric) or A x A^H (Hermitian) + // complex conjugation on operandB (a_t) (function of blas3 computation) + ElementA a = tensor_a.at(MatrixCoord(row, k_block)); + ElementA a_t = (blas_mode == BlasMode::kHermitian) ? + conj(tensor_a.at(MatrixCoord(col, k_block))) : + tensor_a.at(MatrixCoord(col, k_block)); + + ComputeType a_ik = ComputeType(a); + ComputeType b_jk = ComputeType(a_t); + + // complex conjugation (function of input layouts) + if (transform_a == ComplexTransform::kConjugate) { + a_ik = conj(a_ik); + } + // complex conjugation (function of input layouts) + if (transform_a == ComplexTransform::kConjugate) { + b_jk = conj(b_jk); + } + + accum[i][j] = inner_product_op(a_ik, b_jk, accum[i][j]); + + } + } + } + } + + for (int j = 0; j < Nblock; j++) { + for (int i = 0; i < Mblock; i++) { + int row = row_block + i; + int col = col_block + j; + + MatrixCoord coord = MatrixCoord(row, col); + + if (row < M && col < N && + ((fill_mode_c == FillMode::kLower && row >= col) || + (fill_mode_c == FillMode::kUpper && row <= col)) + ) { + + ScalarType c = tensor_c.at(coord); + // The imaginary parts of the diagonal elements of + // a complex data type are assumed and set to zero + if (blas_mode == BlasMode::kHermitian) { + c = (row == col) ? real(c) : c; + } + + ScalarType tmp_d = convert_op( + alpha * ScalarType(accum[i][j]) + + beta * c); + + if (blas_mode == BlasMode::kHermitian && row == col ) { + tensor_d.at(coord) = real(tmp_d); + } else { + tensor_d.at(coord) = tmp_d; + } + } + } + } + + } // for (col_block) + } // for (row_block) + + tensor_a.add_pointer_offset(batch_stride_A); + tensor_c.add_pointer_offset(batch_stride_C); + tensor_d.add_pointer_offset(batch_stride_D); + + } // for (batch_idx) +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef +/// objects. +/// +/// This assumes the accumulator type is the same type as the scalars. 
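+/// Minimal usage sketch; problem, alpha, beta and the HostTensor variables are
+/// illustrative assumptions, not part of this header:
+///
+///   cutlass::reference::host::RankKComplex(
+///       problem, alpha,
+///       A.host_ref(), cutlass::ComplexTransform::kNone,
+///       beta, C.host_ref(), D.host_ref(),
+///       cutlass::FillMode::kLower, cutlass::BlasMode::kHermitian);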
+template < + typename ElementA, + typename LayoutA, + typename ElementC, + typename LayoutC, + typename ScalarType +> +void RankKComplex( + gemm::GemmCoord problem_size, + ScalarType alpha, + TensorRef tensor_a, + ComplexTransform transform_a, + ScalarType beta, + TensorRef tensor_c, + TensorRef tensor_d, + FillMode fill_mode_c, + BlasMode blas_mode) { + + Rank2KComplex( + problem_size, alpha, + tensor_a, transform_a, + beta, tensor_c, tensor_d, + ScalarType(0), + fill_mode_c, + blas_mode); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace host +} // namespace reference +} // namespace cutlass diff --git a/csrc/quantization/cutlass_test/example/util/reference/host/symm.h b/csrc/quantization/cutlass_test/example/util/reference/host/symm.h new file mode 100644 index 0000000000000..a585caf73f64f --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/host/symm.h @@ -0,0 +1,285 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Reference implementation for SYMM update in host-side code. + + + +*/ + +#pragma once + +#include "cutlass/blas3.h" +#include "cutlass/numeric_conversion.h" + +#include "cutlass/tensor_view.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/arch/mma.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" + +namespace cutlass { +namespace reference { +namespace host { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef +/// objects. 
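+///
+/// For SideMode::kLeft this computes D = alpha * A * B + beta * C, where A is symmetric
+/// and only its FillModeA triangle is stored; for SideMode::kRight it computes
+/// D = alpha * B * A + beta * C.  The k-loop below therefore reads the stored triangle
+/// once directly (including the diagonal) and once transposed (excluding the diagonal).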
+template < + typename ElementA, + typename LayoutA, + SideMode SideModeA, + FillMode FillModeA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ScalarType, + typename ComputeType, + typename InnerProductOp = multiply_add, + typename ConvertOp = NumericConverter +> +void compute_symm( + gemm::GemmCoord problem_size, + ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, + ScalarType beta, + TensorRef tensor_c, + TensorRef tensor_d, + ComputeType initial_accum) { + + static_assert( + LayoutA::kRank == 2 && + LayoutB::kRank == 2 && + LayoutC::kRank == 2, + "Tensors must be of rank 2"); + + static_assert(SideModeA != SideMode::kInvalid + , "Side Mode can either be Left or Right."); + + static_assert( + FillModeA == FillMode::kLower || + FillModeA == FillMode::kUpper, + "Fill Mode can either be Lower or Upper."); + + using CompareOp_w_diag = typename TrMatrixCompareOp::Type; + using CompareOp_wo_diag = typename TrMatrixCompareOp::Type; + + // Note: batch is ignored. + int const M = problem_size.m(); + int const N = problem_size.n(); + // Assuming correct k-dimension value is passed + int const K = problem_size.k(); + + // Blocking necessary to speedup reference implementation + int const Mblock = 16; + int const Nblock = 16; + + ConvertOp convert_op; + InnerProductOp inner_product_op; + CompareOp_w_diag compare_op_1; + CompareOp_wo_diag compare_op_2; + + for (int row_block = 0; row_block < M; row_block += Mblock) { + for (int col_block = 0; col_block < N; col_block += Nblock) { + + ComputeType accum[Mblock][Nblock]; + + for (int j = 0; j < Nblock; j++) { + for (int i = 0; i < Mblock; i++) { + accum[i][j] = initial_accum; + } + } + + for (int k_block = 0; k_block < K; ++k_block) { + for (int j = 0; j < Nblock; j++) { + for (int i = 0; i < Mblock; i++) { + int row = row_block + i; + int col = col_block + j; + + if (row < M && col < N) { + ElementA a_1 = ElementA(); + ElementB b_1 = ElementB(); + ElementA a_2 = ElementA(); + ElementB b_2 = ElementB(); + + // A x B or B x A (with diagonal) + if (SideModeA == SideMode::kLeft) { + a_1 = (compare_op_1(row, k_block)) ? + (tensor_a.at(MatrixCoord(row, k_block))) : ElementA(); + b_1 = tensor_b.at(MatrixCoord(k_block, col)); + } else if (SideModeA == SideMode::kRight) { + a_1 = tensor_b.at(MatrixCoord(row, k_block)); + b_1 = (compare_op_1(k_block, col)) ? + tensor_a.at(MatrixCoord(k_block, col)) : ElementA(); + } + + ComputeType compute_a_1(cast_if_scalar(a_1)); + ComputeType compute_b_1(cast_if_scalar(b_1)); + + accum[i][j] = inner_product_op(compute_a_1, compute_b_1, accum[i][j]); + + // A^T x B or B x A^T (without diagonal) + if (SideModeA == SideMode::kLeft) { + a_2 = (compare_op_2(k_block, row)) ? + (tensor_a.at(MatrixCoord(k_block, row))) : ElementA(); + b_2 = tensor_b.at(MatrixCoord(k_block, col)); + } else if (SideModeA == SideMode::kRight) { + a_2 = tensor_b.at(MatrixCoord(row, k_block)); + b_2 = (compare_op_2(col, k_block)) ? 
+ tensor_a.at(MatrixCoord(col, k_block)) : ElementA(); + } + + ComputeType compute_a_2(cast_if_scalar(a_2)); + ComputeType compute_b_2(cast_if_scalar(b_2)); + + accum[i][j] = inner_product_op(compute_a_2, compute_b_2, accum[i][j]); + } + } + } + } + + for (int j = 0; j < Nblock; j++) { + for (int i = 0; i < Mblock; i++) { + int row = row_block + i; + int col = col_block + j; + + MatrixCoord coord = MatrixCoord(row, col); + + if (row < M && col < N) { + tensor_d.at(coord) = convert_op( + alpha * ScalarType(accum[i][j]) + + beta * ScalarType(tensor_c.at(coord))); + } + } + } + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Computes a general Symm update (tensors of rank=2) pointed to by TensorRef +/// objects. +template < + typename ElementA, + typename LayoutA, + SideMode SideModeA, + FillMode FillModeA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ScalarType, + typename ComputeType, + typename InnerProductOp = multiply_add, + typename ConvertOp = NumericConverter +> +void compute_symm( + gemm::GemmCoord problem_size, + ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, + ScalarType beta, + TensorRef tensor_c, + ComputeType initial_accum) { + compute_symm( + problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_c, + initial_accum); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + SideMode SideModeA, + FillMode FillModeA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ScalarType, + typename ComputeType, + typename InnerProductOp = cutlass::arch::OpMultiplyAdd +> +struct Symm; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for multiply-add +template +struct Symm { + + void operator()(gemm::GemmCoord problem_size, ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, ScalarType beta, + TensorRef tensor_c, + ComputeType initial_accum = ComputeType(0)) { + static_assert( + LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, + "Tensors must be of rank 2"); + + compute_symm>( + problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum); + } + + void operator()(gemm::GemmCoord problem_size, ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, ScalarType beta, + TensorRef tensor_c, + TensorRef tensor_d, + ComputeType initial_accum = ComputeType(0)) { + static_assert( + LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, + "Tensors must be of rank 2"); + + compute_symm>( + problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum); + } +}; + + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace host +} // namespace reference +} // namespace cutlass diff --git a/csrc/quantization/cutlass_test/example/util/reference/host/symm_complex.h b/csrc/quantization/cutlass_test/example/util/reference/host/symm_complex.h new file mode 100644 index 0000000000000..2618feaa70cee --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/host/symm_complex.h @@ -0,0 +1,319 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Reference implementation for complex-valued SYMM update in host-side code. + + +*/ + +#pragma once + +#include "cutlass/blas3.h" +#include "cutlass/complex.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/tensor_view.h" +#include "cutlass/gemm/gemm.h" +#include + +namespace cutlass { +namespace reference { +namespace host { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef +/// objects. +/// +/// Explicitly naming types needed by this template can be cumbersome, particularly for the +/// accumulator type, so a function argument 'initial_accum' is exposed. Passing +/// AccumulatorType(0) as the last function argument can be easier than naming all template +/// arguments explicitly. 
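+///
+/// Behaves like compute_symm, with two complex-specific differences visible in the
+/// loop below: for BlasMode::kHermitian the imaginary part of A's diagonal elements is
+/// treated as zero, and the transposed (off-diagonal) contribution of A is conjugated.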
+template < + typename ElementA, + typename LayoutA, + SideMode SideModeA, + FillMode FillModeA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ScalarType, + typename ComputeType, + BlasMode BlasMode_ = BlasMode::kSymmetric, + typename InnerProductOp = multiply_add, + typename ConvertOp = NumericConverter +> +void compute_symm_complex( + gemm::GemmCoord problem_size, + ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, + ScalarType beta, + TensorRef tensor_c, + TensorRef tensor_d, + ComputeType initial_accum, + int batch_count = 1, + int64_t batch_stride_A = 0, + int64_t batch_stride_B = 0, + int64_t batch_stride_C = 0, + int64_t batch_stride_D = 0) { + + static SideMode const kSideModeA = SideModeA; + static FillMode const kFillModeA = FillModeA; + static BlasMode const kBlasMode = BlasMode_; + + static_assert( + LayoutA::kRank == 2 && + LayoutB::kRank == 2 && + LayoutC::kRank == 2, "Tensors must be of rank 2"); + + static_assert(kSideModeA != SideMode::kInvalid + , "Side Mode can either be Left or Right."); + + static_assert( + kFillModeA == FillMode::kLower || + kFillModeA == FillMode::kUpper, + "Fill Mode can either be Lower or Upper."); + + using CompareOp_w_diag = typename TrMatrixCompareOp::Type; + using CompareOp_wo_diag = typename TrMatrixCompareOp::Type; + + // Note: batch is ignored. + int const M = problem_size.m(); + int const N = problem_size.n(); + // Assuming correct k-dimension value is passed + int const K = problem_size.k(); + + // Blocking necessary to speedup reference implementation + int const Mblock = 16; + int const Nblock = 16; + + ConvertOp convert_op; + InnerProductOp inner_product_op; + CompareOp_w_diag compare_op_1; + CompareOp_wo_diag compare_op_2; + + for (int batch_idx = 0; batch_idx < batch_count; ++batch_idx) { + + // Compute matrix product using blocks + for (int row_block = 0; row_block < M; row_block += Mblock) { + for (int col_block = 0; col_block < N; col_block += Nblock) { + + ComputeType accum[Mblock][Nblock]; + + for (int j = 0; j < Nblock; j++) { + for (int i = 0; i < Mblock; i++) { + accum[i][j] = initial_accum; + } + } + + for (int k_block = 0; k_block < K; ++k_block) { + for (int j = 0; j < Nblock; j++) { + for (int i = 0; i < Mblock; i++) { + int row = row_block + i; + int col = col_block + j; + + if (row < M && col < N) + { + ElementA a_1 = ElementA(); + ElementB b_1 = ElementB(); + ElementA a_2 = ElementA(); + ElementB b_2 = ElementB(); + + // A x B or B x A (with diagonal) + if (kSideModeA == SideMode::kLeft) { + a_1 = (compare_op_1(row, k_block)) ? + (tensor_a.at(MatrixCoord(row, k_block))) : ElementA(); + b_1 = tensor_b.at(MatrixCoord(k_block, col)); + } else if (kSideModeA == SideMode::kRight) { + a_1 = tensor_b.at(MatrixCoord(row, k_block)); + b_1 = (compare_op_1(k_block, col)) ? 
+ tensor_a.at(MatrixCoord(k_block, col)) : ElementA(); + } + ComputeType compute_a_1 = ComputeType(a_1); + ComputeType compute_b_1 = ComputeType(b_1); + + // The imaginary parts of the diagonal elements of + // a complex data type are assumed and set to zero + if (kBlasMode == BlasMode::kHermitian && kSideModeA == SideMode::kLeft && row == k_block) { + compute_a_1 = real(compute_a_1); + } else if (kBlasMode == BlasMode::kHermitian && kSideModeA == SideMode::kRight && k_block == col) { + compute_b_1 = real(compute_b_1); + } + + accum[i][j] = inner_product_op(compute_a_1, compute_b_1, accum[i][j]); + + // A^T x B or B x A^T (without diagonal) + if (kSideModeA == SideMode::kLeft) { + a_2 = (compare_op_2(k_block, row)) ? + (tensor_a.at(MatrixCoord(k_block, row))) : ElementA(); + b_2 = tensor_b.at(MatrixCoord(k_block, col)); + if (kBlasMode == BlasMode::kHermitian) + a_2 = conj(a_2); + } else if (kSideModeA == SideMode::kRight) { + a_2 = tensor_b.at(MatrixCoord(row, k_block)); + b_2 = (compare_op_2(col, k_block)) ? + tensor_a.at(MatrixCoord(col, k_block)) : ElementA(); + if (kBlasMode == BlasMode::kHermitian) + b_2 = conj(b_2); + } + + ComputeType compute_a_2 = ComputeType(a_2); + ComputeType compute_b_2 = ComputeType(b_2); + + accum[i][j] = inner_product_op(compute_a_2, compute_b_2, accum[i][j]); + } + } + } + } + + for (int j = 0; j < Nblock; j++) { + for (int i = 0; i < Mblock; i++) { + int row = row_block + i; + int col = col_block + j; + + MatrixCoord coord = MatrixCoord(row, col); + + if (row < M && col < N) { + + ScalarType c = tensor_c.at(coord); + + tensor_d.at(coord) = convert_op( + alpha * ScalarType(accum[i][j]) + + beta * c); + } + } + } + + } // for (col_block) + } // for (row_block) + + tensor_a.add_pointer_offset(batch_stride_A); + tensor_b.add_pointer_offset(batch_stride_B); + tensor_c.add_pointer_offset(batch_stride_C); + tensor_d.add_pointer_offset(batch_stride_D); + + } // for (batch_idx) +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + SideMode SideModeA, + FillMode FillModeA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ScalarType, + typename ComputeType, + BlasMode BlasMode_ = cutlass::BlasMode::kSymmetric, + typename InnerProductOp = cutlass::arch::OpMultiplyAddComplex +> +struct SymmComplex; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for multiply-add +template +struct SymmComplex { + + void operator()(gemm::GemmCoord problem_size, ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, ScalarType beta, + TensorRef tensor_c, + TensorRef tensor_d, + ComputeType initial_accum = ComputeType(0)) { + static_assert( + LayoutA::kRank == 2 && LayoutC::kRank == 2, + "Tensors must be of rank 2"); + + compute_symm_complex>( + problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for gaussian multiply-add +template +struct SymmComplex { + + void operator()(gemm::GemmCoord problem_size, ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, ScalarType beta, + TensorRef tensor_c, + TensorRef tensor_d, + ComputeType initial_accum = ComputeType(0)) { + static_assert( + LayoutA::kRank == 2 && LayoutC::kRank == 2, + "Tensors must be of rank 2"); + + 
compute_symm_complex>( + problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace host +} // namespace reference +} // namespace cutlass diff --git a/csrc/quantization/cutlass_test/example/util/reference/host/tensor_compare.h b/csrc/quantization/cutlass_test/example/util/reference/host/tensor_compare.h new file mode 100644 index 0000000000000..df164a37e9297 --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/host/tensor_compare.h @@ -0,0 +1,423 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Defines host-side elementwise operations on TensorView. 
+*/ + +#pragma once + +// Standard Library includes +#include + +// Cutlass includes +#include "cutlass/cutlass.h" +#include "cutlass/relatively_equal.h" +#include "cutlass/tensor_view.h" +#include "cutlass/tensor_view_planar_complex.h" + +#include "cutlass/util/distribution.h" +#include "tensor_foreach.h" + +namespace cutlass { +namespace reference { +namespace host { + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +struct TensorEqualsFunc { + + // + // Data members + // + + TensorView lhs; + TensorView rhs; + bool result; + + /// Ctor + TensorEqualsFunc(): result(true) { } + + /// Ctor + TensorEqualsFunc( + TensorView const &lhs_, + TensorView const &rhs_ + ) : + lhs(lhs_), rhs(rhs_), result(true) { } + + /// Visits a coordinate + void operator()(Coord const &coord) { + + Element lhs_ = lhs.at(coord); + Element rhs_ = rhs.at(coord); + + if (lhs_ != rhs_) { + result = false; + } + } + + /// Returns true if equal + operator bool() const { + return result; + } +}; + +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +struct TensorRelativelyEqualsFunc { + + // + // Data members + // + + TensorView lhs; + TensorView rhs; + Element epsilon; + Element nonzero_floor; + bool result; + + /// Ctor + TensorRelativelyEqualsFunc( + TensorView const &lhs_, + TensorView const &rhs_, + Element epsilon_, + Element nonzero_floor_ + ) : + lhs(lhs_), + rhs(rhs_), + epsilon(epsilon_), + nonzero_floor(nonzero_floor_), + result(true) { } + + /// Visits a coordinate + void operator()(Coord const &coord) { + + Element lhs_ = lhs.at(coord); + Element rhs_ = rhs.at(coord); + + if (!relatively_equal(lhs_, rhs_, epsilon, nonzero_floor)) { + result = false; + } + } + + /// Returns true if equal + operator bool() const { + return result; + } +}; + +} // namespace detail + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Returns true if two tensor views are equal. +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +bool TensorEquals( + TensorView const &lhs, + TensorView const &rhs) { + + // Extents must be identical + if (lhs.extent() != rhs.extent()) { + return false; + } + + detail::TensorEqualsFunc func(lhs, rhs); + TensorForEach( + lhs.extent(), + func + ); + + return bool(func); +} + +/// Returns true if two tensor views are equal. 
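+/// The planar-complex overload below compares the real and imaginary planes separately.
+///
+/// Minimal usage sketch for the TensorView overload above; the HostTensor variables
+/// are illustrative assumptions:
+///
+///   bool same = cutlass::reference::host::TensorEquals(
+///       reference.host_view(), computed.host_view());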
+template < + typename Element, ///< Element type + typename Layout> ///< Layout function +bool TensorEquals( + TensorViewPlanarComplex const &lhs, + TensorViewPlanarComplex const &rhs) { + + // Extents must be identical + if (lhs.extent() != rhs.extent()) { + return false; + } + + detail::TensorEqualsFunc real_func( + {lhs.data(), lhs.layout(), lhs.extent()}, + {rhs.data(), rhs.layout(), rhs.extent()} + ); + + TensorForEach( + lhs.extent(), + real_func + ); + + if (!bool(real_func)) { + return false; + } + + detail::TensorEqualsFunc imag_func( + {lhs.data() + lhs.imaginary_stride(), lhs.layout(), lhs.extent()}, + {rhs.data() + rhs.imaginary_stride(), rhs.layout(), rhs.extent()} + ); + + TensorForEach( + lhs.extent(), + imag_func + ); + + return bool(imag_func); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Returns true if two tensor views are relatively equal. +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +bool TensorRelativelyEquals( + TensorView const &lhs, + TensorView const &rhs, + Element epsilon, + Element nonzero_floor) { + + // Extents must be identical + if (lhs.extent() != rhs.extent()) { + return false; + } + + detail::TensorRelativelyEqualsFunc func(lhs, rhs, epsilon, nonzero_floor); + TensorForEach( + lhs.extent(), + func + ); + + return bool(func); +} + +/// Returns true if two tensor views are relatively equal. +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +bool TensorRelativelyEquals( + TensorViewPlanarComplex const &lhs, + TensorViewPlanarComplex const &rhs, + Element epsilon, + Element nonzero_floor) { + + // Extents must be identical + if (lhs.extent() != rhs.extent()) { + return false; + } + + detail::TensorRelativelyEqualsFunc real_func( + {lhs.data(), lhs.layout(), lhs.extent()}, + {rhs.data(), rhs.layout(), rhs.extent()}, + epsilon, + nonzero_floor + ); + + TensorForEach( + lhs.extent(), + real_func + ); + + if (!bool(real_func)) { + return false; + } + + detail::TensorEqualsFunc imag_func( + {lhs.data() + lhs.imaginary_stride(), lhs.layout(), lhs.extent()}, + {rhs.data() + rhs.imaginary_stride(), rhs.layout(), rhs.extent()}, + epsilon, + nonzero_floor + ); + + TensorForEach( + lhs.extent(), + imag_func + ); + + return bool(imag_func); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Returns true if two tensor views are NOT equal. +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +bool TensorNotEquals( + TensorView const &lhs, + TensorView const &rhs) { + + // Extents must be identical + if (lhs.extent() != rhs.extent()) { + return true; + } + + detail::TensorEqualsFunc func(lhs, rhs); + TensorForEach( + lhs.extent(), + func + ); + + return !bool(func); +} + +/// Returns true if two tensor views are equal. 
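//
// Illustrative sketch (not part of the original files): TensorRelativelyEquals is the
// tolerance-based variant used when exact equality is too strict, e.g. for half-precision
// accumulations. The epsilon / nonzero_floor pair is forwarded to cutlass::relatively_equal
// at every coordinate. The tensors and tolerance values below are assumptions.
//
//   cutlass::HostTensor<cutlass::half_t, cutlass::layout::RowMajor> ref(cutlass::MatrixCoord(64, 64));
//   cutlass::HostTensor<cutlass::half_t, cutlass::layout::RowMajor> out(cutlass::MatrixCoord(64, 64));
//
//   cutlass::half_t epsilon(0.001f);                 // relative tolerance
//   cutlass::half_t nonzero_floor(1.0f / 1024.0f);   // magnitudes below this are treated as zero
//
//   bool close_enough = cutlass::reference::host::TensorRelativelyEquals(
//       ref.host_view(), out.host_view(), epsilon, nonzero_floor);
//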
+template < + typename Element, ///< Element type + typename Layout> ///< Layout function +bool TensorNotEquals( + TensorViewPlanarComplex const &lhs, + TensorViewPlanarComplex const &rhs) { + + return !TensorEquals(lhs, rhs); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +struct TensorContainsFunc { + + // + // Data members + // + + TensorView view; + Element value; + bool contains; + Coord location; + + // + // Methods + // + + /// Ctor + TensorContainsFunc(): contains(false) { } + + /// Ctor + TensorContainsFunc( + TensorView const &view_, + Element value_ + ) : + view(view_), value(value_), contains(false) { } + + /// Visits a coordinate + void operator()(Coord const &coord) { + + if (view.at(coord) == value) { + if (!contains) { + location = coord; + } + contains = true; + } + } + + /// Returns true if equal + operator bool() const { + return contains; + } +}; + +} // namespace detail + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Returns true if a value is present in a tensor +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +bool TensorContains( + TensorView const & view, + Element value) { + + detail::TensorContainsFunc func( + view, + value + ); + + TensorForEach( + view.extent(), + func + ); + + return bool(func); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Returns a pair containing a boolean of whether a value exists in a tensor and the location of +/// of the first occurrence. If the value is not contained in the tensor, the second element of the +/// pair is undefined. +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +std::pair > TensorFind( + TensorView const & view, + Element value) { + + detail::TensorContainsFunc func( + view, + value + ); + + TensorForEach( + view.extent(), + func + ); + + return std::make_pair(bool(func), func.location); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace host +} // namespace reference +} // namespace cutlass diff --git a/csrc/quantization/cutlass_test/example/util/reference/host/tensor_compare.hpp b/csrc/quantization/cutlass_test/example/util/reference/host/tensor_compare.hpp new file mode 100644 index 0000000000000..a1f3f5b14e6f0 --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/host/tensor_compare.hpp @@ -0,0 +1,101 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+  \brief Provides several functions for comparing tensors.
+*/
+
+#pragma once
+
+// Standard Library includes
+#include
+#include
+#include
+
+// Cute includes
+#include "cute/tensor.hpp"
+
+// Cutlass includes
+#include "cutlass/cutlass.h"
+#include "cutlass/complex.h"
+#include "cutlass/quaternion.h"
+#include "cutlass/array.h"
+#include "cutlass/numeric_types.h"
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace reference {
+namespace host {
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Returns true if two tensor views are equal.
+template <
+  typename TensorL,
+  typename TensorR
+>
+bool TensorEquals(
+  TensorL lhs,
+  TensorR rhs) {
+
+  // Extents must be identical
+  if (cute::size(lhs) != cute::size(rhs)) {
+    return false;
+  }
+
+  for (int64_t idx = 0; idx < cute::size(lhs); ++idx) {
+    if (lhs(idx) != rhs(idx)) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+/// Returns true if two tensor views are NOT equal.
+template <
+  typename TensorL,
+  typename TensorR
+>
+bool TensorNotEquals(
+  TensorL lhs,
+  TensorR rhs) {
+
+  return !TensorEquals(lhs, rhs);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace host
+} // namespace reference
+} // namespace cutlass
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/csrc/quantization/cutlass_test/example/util/reference/host/tensor_copy.h b/csrc/quantization/cutlass_test/example/util/reference/host/tensor_copy.h
new file mode 100644
index 0000000000000..0b963b72e9152
--- /dev/null
+++ b/csrc/quantization/cutlass_test/example/util/reference/host/tensor_copy.h
@@ -0,0 +1,256 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Defines host-side elementwise operations on TensorView. +*/ + +#pragma once + +// Standard Library includes +#include + +// Cutlass includes +#include "cutlass/cutlass.h" +#include "tensor_foreach.h" + +namespace cutlass { +namespace reference { +namespace host { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +/// Helper to convert between types +template < + typename DstElement, + typename SrcElement +> +struct TrivialConvert { + + TrivialConvert() { } + + DstElement operator()(SrcElement src) const { + return DstElement(src); + } +}; + +/// Helper to conditionally copy between tensor views. +template < + typename DstElement, + typename DstLayout, + typename SrcElement, + typename SrcLayout, + typename F +> +struct TensorCopyIf { + + using DstTensorView = TensorView; + using SrcTensorView = TensorView; + + // + // Data members + // + + DstTensorView dst; + SrcTensorView src; + F convert; + + // + // Methods + // + + TensorCopyIf() { } + + TensorCopyIf( + DstTensorView const &dst_, + SrcTensorView const &src_, + F const &convert_): dst(dst_), src(src_), convert(convert_) {} + + /// Copies based on destination and source bounds + void operator()(Coord const &coord) { + if (dst.contains(coord) && src.contains(coord)) { + dst.at(coord) = convert(src.at(coord)); + } + } +}; + +} // namespace detail + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Copies elements from one tensor view into another, satisfying bounds of each tensor. 
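//
// Illustrative sketch (not part of the original files): TensorCopyIf only writes
// coordinates contained in *both* views, so copying between views of different extents
// silently clips to their intersection, with TrivialConvert handling the element
// conversion. The tensors below are assumptions for illustration.
//
//   cutlass::HostTensor<float, cutlass::layout::RowMajor> src(cutlass::MatrixCoord(128, 128));
//   cutlass::HostTensor<float, cutlass::layout::RowMajor> dst(cutlass::MatrixCoord(64, 64));
//
//   // Copies only the top-left 64x64 block; out-of-bounds coordinates are skipped.
//   cutlass::reference::host::TensorCopy(dst.host_view(), src.host_view());
//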
+template < + typename DstElement, /// Destination tensor's element type + typename DstLayout, /// Destination tensor's layout + typename SrcElement, /// Source tensor's element type + typename SrcLayout, /// Source tensor's layout + typename F /// Transformation functor +> +void TensorCopy( + TensorView dst, + TensorView src, + F const &transform) { + + using CopyIf = detail::TensorCopyIf< + DstElement, + DstLayout, + SrcElement, + SrcLayout, + F>; + + CopyIf copy_if(dst, src, transform); + + TensorForEach(dst.extent(), copy_if); +} + + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Copies elements from a TensorRef into a TensorView. Assumes source tensor has sufficient extent +/// to avoid out of bounds accesses. +template < + typename DstElement, /// Destination tensor's element type + typename DstLayout, /// Destination tensor's layout + typename SrcElement, /// Source tensor's element type + typename SrcLayout, /// Source tensor's layout + typename F /// Transformation functor +> +void TensorCopy( + TensorView dst, + TensorRef src, + F const &transform) { + + using CopyIf = detail::TensorCopyIf< + DstElement, + DstLayout, + SrcElement, + SrcLayout, + F>; + + TensorView src_view(src, dst.extent()); + + CopyIf copy_if(dst, src_view, transform); + + TensorForEach(dst.extent(), copy_if); +} + +/// Copies elements from a TensorRef into a TensorView. Assumes source tensor has sufficient extent +/// to avoid out of bounds accesses. +template < + typename DstElement, /// Destination tensor's element type + typename DstLayout, /// Destination tensor's layout + typename SrcElement, /// Source tensor's element type + typename SrcLayout, /// Source tensor's layout + typename F /// Transformation functor +> +void TensorCopy( + TensorRef dst, + TensorView src, + F const &transform) { + + using CopyIf = detail::TensorCopyIf< + DstElement, + DstLayout, + SrcElement, + SrcLayout, + F>; + + TensorView dst_view(dst, src.extent()); + + CopyIf copy_if(dst_view, src, transform); + + TensorForEach(src.extent(), copy_if); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Copies elements from one tensor view into another, satisfying bounds of each tensor. Succeeds +/// if SrcElement can be converted to DstElement. +template < + typename DstElement, /// Destination tensor's element type + typename DstLayout, /// Destination tensor's layout + typename SrcElement, /// Source tensor's element type + typename SrcLayout /// Source tensor's layout +> +void TensorCopy( + TensorView dst, + TensorView src) { + + detail::TrivialConvert convert; + + TensorCopy(dst, src, convert); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Copies elements from one tensor view into another, satisfying bounds of each tensor. Succeeds +/// if SrcElement can be converted to DstElement. 
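//
// Illustrative sketch (not part of the original files): the transform-functor overloads
// above allow converting element types while copying, e.g. widening a half-precision
// operand to float before comparison. Using cutlass::NumericConverter here is an
// assumption about a typical call site, not something this patch prescribes.
//
//   #include "cutlass/numeric_conversion.h"
//
//   cutlass::HostTensor<cutlass::half_t, cutlass::layout::RowMajor> a_fp16(cutlass::MatrixCoord(32, 32));
//   cutlass::HostTensor<float, cutlass::layout::RowMajor>           a_fp32(cutlass::MatrixCoord(32, 32));
//
//   cutlass::NumericConverter<float, cutlass::half_t> convert;
//   cutlass::reference::host::TensorCopy(a_fp32.host_view(), a_fp16.host_view(), convert);
//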
+template < + typename DstElement, /// Destination tensor's element type + typename DstLayout, /// Destination tensor's layout + typename SrcElement, /// Source tensor's element type + typename SrcLayout, /// Source tensor's layout + typename F /// Transformation functor +> +void TensorCopy( + TensorView dst, + TensorRef src) { + + detail::TrivialConvert convert; + + TensorCopy(dst, src, convert); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Copies elements from one tensor view into another, satisfying bounds of each tensor. Succeeds +/// if SrcElement can be converted to DstElement. +template < + typename DstElement, /// Destination tensor's element type + typename DstLayout, /// Destination tensor's layout + typename SrcElement, /// Source tensor's element type + typename SrcLayout /// Source tensor's layout +> +void TensorCopy( + TensorRef dst, + TensorView src) { + + detail::TrivialConvert convert; + + TensorCopy(dst, src, convert); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace host +} // namespace reference +} // namespace cutlass diff --git a/csrc/quantization/cutlass_test/example/util/reference/host/tensor_elementwise.h b/csrc/quantization/cutlass_test/example/util/reference/host/tensor_elementwise.h new file mode 100644 index 0000000000000..42ce2183b6a24 --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/host/tensor_elementwise.h @@ -0,0 +1,341 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Defines host-side elementwise operations on TensorView. 
+*/ + +#pragma once + +// Cutlass includes +#include "cutlass/cutlass.h" +#include "cutlass/functional.h" + +#include "tensor_foreach.h" + +namespace cutlass { +namespace reference { +namespace host { + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Helper to apply a binary operator in place +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementD, + typename LayoutD, + typename BinaryFunc> +struct TensorFuncBinaryOp { + + // + // Data members + // + + /// View of left-hand-side tensor + TensorView view_d; + TensorRef view_a; + TensorRef view_b; + BinaryFunc func; + + // + // Methods + // + + /// Constructor + TensorFuncBinaryOp() { } + + /// Constructor + TensorFuncBinaryOp( + TensorView const & view_d_, + TensorRef const & view_a_, + TensorRef const & view_b_, + BinaryFunc func = BinaryFunc() + ): + view_d(view_d_), view_a(view_a_), view_b(view_b_), func(func) { } + + /// Equality check + void operator()(Coord const &coord) const { + view_d.at(coord) = func( + ElementD(view_a.at(coord)), + ElementD(view_b.at(coord)) + ); + } +}; + +} // namespace detail + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Adds two tensors and stores in the destination tensor: d = a + b +template < + typename ElementD, + typename LayoutD, + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB +> +void TensorAdd( + TensorView d, ///< destination tensor view + TensorRef a, ///< A tensor reference + TensorRef b ///< B tensor reference +) { + + detail::TensorFuncBinaryOp< + ElementD, + LayoutD, + ElementA, + LayoutA, + ElementB, + LayoutB, + cutlass::plus + > func(d, a, b); + + TensorForEach( + d.extent(), + func); +} + +/// Adds a tensor in place: d = d .+ a +template < + typename ElementD, + typename LayoutD, + typename ElementA, + typename LayoutA +> +void TensorAdd( + TensorView d, ///< destination tensor view + TensorRef a ///< A tensor reference +) { + TensorAdd(d, d, a); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Subtracts two tensors and stores in the destination tensor: d = a - b +template < + typename ElementD, + typename LayoutD, + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB +> +void TensorSub( + TensorView d, ///< destination tensor view + TensorRef a, ///< A tensor reference + TensorRef b ///< B tensor reference + ) { + + detail::TensorFuncBinaryOp< + ElementD, + LayoutD, + ElementA, + LayoutA, + ElementB, + LayoutB, + cutlass::minus + > func(d, a, b); + + TensorForEach( + d.extent(), + func); +} + +/// Subtracts two tensors in place: d = d .- a +template < + typename ElementD, + typename LayoutD, + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB +> +void TensorSub( + TensorView d, ///< destination tensor view + TensorRef a ///< A tensor reference + ) { + + TensorSub(d, d, a); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Multiplies two tensors and stores in the destination 
tensor: d = a .* b +template < + typename ElementD, + typename LayoutD, + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB +> +void TensorMul( + TensorView d, ///< destination tensor view + TensorRef a, ///< A tensor reference + TensorRef b ///< B tensor reference +) { + + detail::TensorFuncBinaryOp< + ElementD, + LayoutD, + ElementA, + LayoutA, + ElementB, + LayoutB, + cutlass::multiplies + > func(d, a, b); + + TensorForEach( + d.extent(), + func); +} + +/// Multiplies tensors in place: d = d .* a +template < + typename ElementD, + typename LayoutD, + typename ElementA, + typename LayoutA +> +void TensorMul( + TensorView d, ///< destination tensor view + TensorRef a ///< A tensor reference +) { + TensorMul(d, d, a); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Divides two tensors and stores in the destination tensor: d = a ./ b +template < + typename ElementD, + typename LayoutD, + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB +> +void TensorDiv( + TensorView d, ///< destination tensor view + TensorRef a, ///< A tensor reference + TensorRef b ///< B tensor reference +) { + + detail::TensorFuncBinaryOp< + ElementD, + LayoutD, + ElementA, + LayoutA, + ElementB, + LayoutB, + cutlass::divides + > func(d, a, b); + + TensorForEach( + d.extent(), + func); +} + +/// Divides tensors in place: d = d ./ a +template < + typename ElementD, + typename LayoutD, + typename ElementA, + typename LayoutA +> +void TensorDiv( + TensorView d, ///< destination tensor view + TensorRef a ///< A tensor reference +) { + TensorDiv(d, d, a); +} + + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Divides two tensors and stores in the destination tensor: d = a ./ b +template < + typename ElementD, + typename LayoutD, + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB +> +void TensorModulus( + TensorView d, ///< destination tensor view + TensorRef a, ///< A tensor reference + TensorRef b ///< B tensor reference +) { + + detail::TensorFuncBinaryOp< + ElementD, + LayoutD, + ElementA, + LayoutA, + ElementB, + LayoutB, + cutlass::divides + > func(d, a, b); + + TensorForEach( + d.extent(), + func); +} + +/// Divides tensors in place: d = d ./ a +template < + typename ElementD, + typename LayoutD, + typename ElementA, + typename LayoutA +> +void TensorModulus( + TensorView d, ///< destination tensor view + TensorRef a ///< A tensor reference +) { + TensorDiv(d, d, a); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace host +} // namespace reference +} // namespace cutlass diff --git a/csrc/quantization/cutlass_test/example/util/reference/host/tensor_fill.h b/csrc/quantization/cutlass_test/example/util/reference/host/tensor_fill.h new file mode 100644 index 0000000000000..b9f0c84d9a2a9 --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/host/tensor_fill.h @@ -0,0 +1,1718 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Provides several functions for filling tensors with data. +*/ + +#pragma once + +// Standard Library includes +#include +#include +#include +#include +#include + +// Cutlass includes +#include "cutlass/cutlass.h" +#include "cutlass/complex.h" +#include "cutlass/quaternion.h" +#include "cutlass/array.h" +#include "cutlass/numeric_types.h" +#include "cutlass/subbyte_reference.h" +#include "cutlass/tensor_view.h" +#include "cutlass/tensor_view_planar_complex.h" +#include "cutlass/blas3.h" + +#include "cutlass/util/distribution.h" +#include "tensor_foreach.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace reference { +namespace host { + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +struct TensorFillFunc { + + using TensorView = TensorView; + + // + // Data members + // + + TensorView view; + Element value; + + // + // Methods + // + + TensorFillFunc( + TensorView const &view_ = TensorView(), + Element value_ = Element(0) + ): view(view_), value(value_) { } + + void operator()(Coord const & coord) const { + view.at(coord) = value; + } +}; + +/// Returns a pair of values of the Gaussian distribution generated by the Box Muller method +struct BoxMullerFunc { + + BoxMullerFunc() {} + + void operator()( + double* rnd, ///< Size-2 vector to be filled with random values + double mean = 0, ///< Mean of the Gaussian distribution + double stddev = 1, ///< Standard deviation of the Gaussian distribution + double pi = std::acos(-1)) const { + + double u1 = double(std::rand()) / double(RAND_MAX); + double u2 = double(std::rand()) / double(RAND_MAX); + rnd[0] = std::sqrt(-2 * std::log(u1)) * std::cos(2 * pi * u2); + rnd[1] = std::sqrt(-2 * std::log(u1)) * std::sin(2 * pi * u2); + rnd[0] = mean + 
stddev * rnd[0]; + rnd[1] = mean + stddev * rnd[1]; + } +}; +} // namespace detail + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Fills a tensor with a uniform value +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorFill( + TensorView dst, ///< destination tensor + Element val = Element(0)) { ///< value to uniformly fill it with + + detail::TensorFillFunc func(dst, val); + + TensorForEach( + dst.extent(), + func + ); +} + +/// Fills a tensor with a uniform value +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorFill( + TensorViewPlanarComplex dst, ///< destination tensor + cutlass::complex val = cutlass::complex(0)) { ///< value to uniformly fill it with + + TensorFill(dst.view_real(), val.real()); + TensorFill(dst.view_imag(), val.imag()); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +template +struct RandomGaussianFunc { + + uint64_t seed; + double mean; + double stddev; + int int_scale; + double pi; + double pnz; + bool exclude_zero; + + // + // Methods + // + RandomGaussianFunc( + uint64_t seed_ = 0, + double mean_ = 0, + double stddev_ = 1, + int int_scale_ = -1, + double pnz_ = 1.0, + bool exclude_zero_ = false + ): + seed(seed_), mean(mean_), stddev(stddev_), int_scale(int_scale_), pi(std::acos(-1)), pnz(pnz_), exclude_zero(exclude_zero_) { + std::srand((unsigned)seed); + } + + /// Compute random value and update RNG state + Element operator()() const { + + // Box-Muller transform to generate random numbers with Normal distribution + double u1 = double(std::rand()) / double(RAND_MAX); + double u2 = double(std::rand()) / double(RAND_MAX); + + // Compute Gaussian random value + double rnd = std::sqrt(-2 * std::log(u1)) * std::cos(2 * pi * u2); + rnd = mean + stddev * rnd; + + // Scale and convert final result + Element result; + + // Sample from the Bernoulli distribution, and use the result to sample from the Gaussian + std::random_device rnd_device; + std::mt19937 bernoulli_rnd(rnd_device()); + std::bernoulli_distribution bernoulli_dist(pnz); + bool bernoulli_result = bernoulli_dist(bernoulli_rnd); + + // Sample from the Gaussian distribution for a nonzero element + if (bernoulli_result) { + if (int_scale >= 0) { + rnd = double(std::llround(rnd * double(1 << int_scale))) / double(1 << int_scale); + result = static_cast(rnd); + } + else { + result = static_cast(rnd); + } + } + else { + result = static_cast(0); + } + + // Note that exclude_zero = true will disable the bernoulli_result above by unsetting zeros + if (exclude_zero && result == Element(0)) { + if (rnd > 0) { + rnd += 1; + } else { + rnd -= 1; + } + result = Element(rnd); + } + + return result; + } +}; + +/// Partial specialization for initializing a complex value. 
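//
// Worked sketch of the Box-Muller step used above (illustration only, not part of the
// original files): two uniform samples are mapped to one N(mean, stddev) sample. This
// mirrors the arithmetic in BoxMullerFunc / RandomGaussianFunc using only the standard
// library.
//
//   #include <cmath>
//   #include <cstdlib>
//
//   double box_muller_sample(double mean, double stddev) {
//     double pi = std::acos(-1);
//     double u1 = double(std::rand()) / double(RAND_MAX);
//     double u2 = double(std::rand()) / double(RAND_MAX);
//     // sqrt(-2 ln u1) * cos(2 pi u2) is standard normal; scale and shift it.
//     return mean + stddev * std::sqrt(-2.0 * std::log(u1)) * std::cos(2.0 * pi * u2);
//   }
//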
+template +struct RandomGaussianFunc > { + + uint64_t seed; + double mean; + double stddev; + int int_scale; + double pi; + double pnz; + bool exclude_zero; + + // + // Methods + // + RandomGaussianFunc( + uint64_t seed_ = 0, + double mean_ = 0, + double stddev_ = 1, + int int_scale_ = -1, + double pnz_ = 1.0, + bool exclude_zero_ = false + ): + seed(seed_), mean(mean_), stddev(stddev_), int_scale(int_scale_), pi(std::acos(-1)), pnz(pnz_), exclude_zero(exclude_zero_) { + std::srand((unsigned)seed); + } + + /// Compute random value and update RNG state + complex operator()() const { + + Element reals[2]; + + double rnd[2]; + detail::BoxMullerFunc func; + func(rnd, mean, stddev, pi); + + // Sample from the Bernoulli distribution, and use the result to sample from the Gaussian + std::random_device rnd_device; + std::mt19937 bernoulli_rnd(rnd_device()); + std::bernoulli_distribution bernoulli_dist(pnz); + bool bernoulli_result = bernoulli_dist(bernoulli_rnd); + + // Sample from the Gaussian distribution for a nonzero element + if (bernoulli_result) { + if (int_scale >= 0) { + rnd[0] = double(int(rnd[0] * double(1 << int_scale))); + rnd[1] = double(int(rnd[1] * double(1 << int_scale))); + reals[0] = from_real(rnd[0] / double(1 << int_scale)); + reals[1] = from_real(rnd[1] / double(1 << int_scale)); + } + else { + reals[0] = from_real(rnd[0]); + reals[1] = from_real(rnd[1]); + } + } + else { + reals[0] = from_real(0); + reals[1] = from_real(0); + } + + // Note that this will invalidate the above else statement because it unsets zero elements + if (exclude_zero && + reals[0] == from_real(0.0) && + reals[1] == from_real(0.0)) { + + if (rnd[0] > 0.0) { + rnd[0] += 1.0; + } else { + rnd[0] -= 1.0; + } + reals[0] = from_real(rnd[0]); + } + + return complex(reals[0], reals[1]); + } +}; + +/// Partial specialization for initializing a complex value. 
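//
// Note on the int_scale parameter used by these RandomGaussianFunc specializations
// (illustration, not from the original files): when int_scale >= 0, each random value is
// snapped to a multiple of 2^-int_scale so that reference and device results can match
// exactly in low-precision error testing. For example, with int_scale = 2:
//
//   double rnd = 0.6180;
//   int int_scale = 2;
//   rnd = double(std::llround(rnd * double(1 << int_scale))) / double(1 << int_scale);
//   // rnd == 0.5, i.e. rounded to the nearest multiple of 0.25
//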
+template +struct RandomGaussianFunc > { + + uint64_t seed; + double mean; + double stddev; + int int_scale; + double pi; + double pnz; + bool exclude_zero; + + // + // Methods + // + RandomGaussianFunc( + uint64_t seed_ = 0, + double mean_ = 0, + double stddev_ = 1, + int int_scale_ = -1, + double pnz_ = 1.0, + bool exclude_zero_ = false + ): + seed(seed_), mean(mean_), stddev(stddev_), int_scale(int_scale_), pi(std::acos(-1)), pnz(pnz_), exclude_zero(exclude_zero_) { + std::srand((unsigned)seed); + } + + /// Compute random value and update RNG state + Quaternion operator()() const { + + Element reals[4]; + + double rnd1[2]; + double rnd2[2]; + detail::BoxMullerFunc func; + func(rnd1, mean, stddev, pi); + func(rnd2, mean, stddev, pi); + + // Sample from the Bernoulli distribution, and use the result to sample from the Gaussian + std::random_device rnd_device; + std::mt19937 bernoulli_rnd(rnd_device()); + std::bernoulli_distribution bernoulli_dist(pnz); + bool bernoulli_result = bernoulli_dist(bernoulli_rnd); + + // Sample from the Gaussian distribution for a nonzero element + if (bernoulli_result) { + if (int_scale >= 0) { + rnd1[0] = double(int(rnd1[0] * double(1 << int_scale))); + rnd1[1] = double(int(rnd1[1] * double(1 << int_scale))); + rnd2[0] = double(int(rnd2[0] * double(1 << int_scale))); + rnd2[1] = double(int(rnd2[1] * double(1 << int_scale))); + + reals[0] = from_real(rnd1[0] / double(1 << int_scale)); + reals[1] = from_real(rnd1[1] / double(1 << int_scale)); + reals[2] = from_real(rnd2[0] / double(1 << int_scale)); + reals[3] = from_real(rnd2[1] / double(1 << int_scale)); + } + else { + reals[0] = from_real(rnd1[0]); + reals[1] = from_real(rnd1[1]); + reals[2] = from_real(rnd2[0]); + reals[3] = from_real(rnd2[1]); + } + } + else { + reals[0] = from_real(0); + reals[1] = from_real(0); + reals[2] = from_real(0); + reals[3] = from_real(0); + } + + // Note that this will invalidate the above else statement because it unsets zero elements + if (exclude_zero && + reals[0] == from_real(0) && + reals[1] == from_real(0) && + reals[2] == from_real(0) && + reals[3] == from_real(0)) { + + if (rnd1[0] > 0.0) { + rnd1[0] += 1.0; + } else { + rnd1[0] -= 1.0; + } + reals[0] = from_real(rnd1[0]); + } + + return Quaternion(reals[0], reals[1], reals[2], reals[3]); + } +}; + +/// Computes a random Gaussian distribution +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +struct TensorFillGaussianFunc { + + using TensorView = TensorView; + + // + // Data members + // + + TensorView view; + RandomGaussianFunc func; + + // + // Methods + // + + /// Construction of Gaussian RNG functor. + TensorFillGaussianFunc( + TensorView view_ = TensorView(), + RandomGaussianFunc func_ = RandomGaussianFunc() + ): + view(view_), func(func_) { + + } + + /// Compute random value and update RNG state + void operator()(Coord const &coord) const { + view.at(coord) = func(); + } +}; + +/// Computes a random Gaussian distribution for a rank-2 tensor +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +struct TensorFillSymmetricGaussianFunc { + + using TensorView = TensorView; + + // + // Data members + // + + TensorView view; + RandomGaussianFunc func; + cutlass::FillMode fill_mode; + + // + // Methods + // + + /// Construction of Gaussian RNG functor. 
+ TensorFillSymmetricGaussianFunc( + TensorView view_ = TensorView(), + RandomGaussianFunc func_ = RandomGaussianFunc(), + cutlass::FillMode fill_mode_ = cutlass::FillMode::kInvalid + ): + view(view_), func(func_), fill_mode(fill_mode_) { + + } + + /// Compute random value and update RNG state + void operator()(Coord const &coord) const { + // Fill half of matrix based on FillMode + if (Layout::kRank == 2 && + fill_mode == cutlass::FillMode::kLower && + coord[0] >= coord[1]) { + view.at(coord) = func(); + } else if (Layout::kRank == 2 && + fill_mode == cutlass::FillMode::kUpper && + coord[0] <= coord[1]) { + view.at(coord) = func(); + } + } +}; + +} // namespace detail + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Fills a tensor with random values with a Gaussian distribution. +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorFillRandomGaussian( + TensorView dst, ///< destination tensor + uint64_t seed, ///< seed for RNG + double mean = 0, ///< Gaussian distribution's mean + double stddev = 1, ///< Gaussian distribution's standard deviation + int bits = -1, ///< If non-negative, specifies number of fractional bits that + double pnz = 1.0, /// are not truncated to zero. Permits reducing precision of + /// data. + bool exclude_zero = false) { ///< Exclude zeros from tensor init. + + detail::RandomGaussianFunc random_func(seed, mean, stddev, bits, pnz, exclude_zero); + + detail::TensorFillGaussianFunc func( + dst, + random_func + ); + + TensorForEach( + dst.extent(), + func + ); +} + +/// Fills a tensor with random values with a Gaussian distribution. +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorFillRandomGaussian( + TensorViewPlanarComplex dst, ///< destination tensor + uint64_t seed, ///< seed for RNG + double mean = 0, ///< Gaussian distribution's mean + double stddev = 1, ///< Gaussian distribution's standard deviation + int bits = -1, ///< If non-negative, specifies number of fractional bits that + double pnz = 1.0, /// are not truncated to zero. Permits reducing precision of + /// data. + bool exclude_zero = false) { ///< Exclude zeros from tensor init. + + TensorFillRandomGaussian(dst.view_real(), seed, mean, stddev, bits, pnz); + TensorFillRandomGaussian(dst.view_imag(), ~seed, mean, stddev, bits, pnz); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/// Fills the upper or lower part of a symmetric rank-2 tensor with random values of a Gaussian distribution. +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorFillSymmetricRandomGaussian( + TensorView dst, ///< destination tensor + uint64_t seed, ///< seed for RNG + cutlass::FillMode fill_mode, ///< FillMode for symmetric matrices + double mean = 0, ///< Gaussian distribution's mean + double stddev = 1, ///< Gaussian distribution's standard deviation + int bits = -1, ///< If non-negative, specifies number of fractional bits that + double pnz = 1.0) { /// are not truncated to zero. Permits reducing precision of + /// data. 
+ + detail::RandomGaussianFunc random_func(seed, mean, stddev, bits, pnz); + + detail::TensorFillSymmetricGaussianFunc func( + dst, + random_func, + fill_mode + ); + + TensorForEach( + dst.extent(), + func + ); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Fills a tensor with random values of a Gaussian distribution. +template < + typename Element ///< Element type +> +void BlockFillRandomGaussian( + Element *ptr, ///< destination buffer + size_t capacity, ///< number of elements + uint64_t seed, ///< seed for RNG + double mean = 0, ///< Gaussian distribution's mean + double stddev = 1, ///< Gaussian distribution's standard deviation + int bits = -1, ///< If non-negative, specifies number of fractional bits that + double pnz = 1.0) { /// are not truncated to zero. Permits reducing precision of + /// data. + + + detail::RandomGaussianFunc random_func(seed, mean, stddev, bits, pnz); + + for (size_t i = 0; i < capacity; ++i) { + ReferenceFactory::get(ptr, i) = random_func(); + } +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +template +struct RandomUniformFunc { + + using Real = typename RealType::Type; + + uint64_t seed; + double range; + double min; + int int_scale; + + double pnan; +private: + using engine_type = std::mt19937; +public: + engine_type bernoulli_rnd; + std::bernoulli_distribution bernoulli_dist; + + bool exclude_zero; + + RandomUniformFunc( + uint64_t seed_ = 0, + double max = 1, + double min_ = 0, + int int_scale_ = -1, + double pnan_ = 0, + bool exclude_zero_ = false + ): + seed(seed_), range(max - min_), min(min_), int_scale(int_scale_), pnan(pnan_) + , bernoulli_rnd{static_cast(seed_)} + , bernoulli_dist(pnan_) + , exclude_zero(exclude_zero_) + { + std::srand((unsigned)seed); + + // Handle cases where min = 0 or max = 0 for excluding zeros + if (exclude_zero) { + min = (min == 0.0) ? min + 1: min; + range = (max == 0.0) ? range - 1: range; + } + } + + + /// Compute random value and update RNG state + Element operator()() { + + // Sample from NaN distribution. + if constexpr (std::numeric_limits::has_quiet_NaN) { + if (pnan > 0 && bernoulli_dist(bernoulli_rnd)) { + return Element(NAN); + } + } + + double rnd = double(std::rand()) / double(RAND_MAX); + + rnd = min + range * rnd; + + // Random values are cast to integer after scaling by a power of two to facilitate error + // testing + Element result; + if (int_scale >= 0) { + rnd = double(std::llround(rnd * double(1 << int_scale))) / double(1 << int_scale); + result = static_cast(Real(rnd)); + } + else { + result = static_cast(Real(rnd)); + } + + if (exclude_zero && result == Element(0)) { + if (rnd > 0.0) { + rnd = std::min(min + range, rnd + 1.0); + } else { + rnd = std::max(min, rnd - 1.0); + } + result = static_cast(Real(rnd)); + } + + return result; + } +}; + +/// Partial specialization for initializing a complex value. 
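//
// Illustrative sketch (not part of the original files): the Block* variants above operate
// on raw buffers rather than TensorView objects, which is convenient when the data is
// later reinterpreted by the kernel under test. The buffer below is an assumption.
//
//   #include <vector>
//
//   std::vector<cutlass::half_t> block(1024);
//   cutlass::reference::host::BlockFillRandomGaussian(
//       block.data(), block.size(), /*seed=*/2024ull, /*mean=*/0.0, /*stddev=*/1.0, /*bits=*/-1);
//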
+template +struct RandomUniformFunc > { + + using Real = typename RealType::Type; + + uint64_t seed; + double range; + double min; + int int_scale; + + double pnan; +private: + using engine_type = std::mt19937; +public: + engine_type bernoulli_rnd; + std::bernoulli_distribution bernoulli_dist; + + bool exclude_zero; + + // + // Methods + // + + RandomUniformFunc( + uint64_t seed_ = 0, + double max = 1, + double min_ = 0, + int int_scale_ = -1, + double pnan_ = 0, + bool exclude_zero_ = false + ): + seed(seed_), range(max - min_), min(min_), int_scale(int_scale_), pnan(pnan_) + , bernoulli_rnd{static_cast(seed_)} + , bernoulli_dist(pnan_) + , exclude_zero(exclude_zero_) { + std::srand((unsigned)seed); + + // Handle cases where min = 0 or max = 0 for excluding zeros + if (exclude_zero) { + min = (min == 0.0) ? min + 1: min; + range = (max == 0.0) ? range - 1: range; + } + } + + + /// Compute random value and update RNG state + complex operator()() { + + // Sample from NaN distribution. + if constexpr (std::numeric_limits::has_quiet_NaN) { + if (pnan > 0 && bernoulli_dist(bernoulli_rnd)) { + return Element(NAN); + } + } + + Element reals[2]; + + for (int i = 0; i < 2; ++i) { + double rnd = double(std::rand()) / double(RAND_MAX); + + rnd = min + range * rnd; + + // Random values are cast to integer after scaling by a power of two to facilitate error + // testing + + if (int_scale >= 0) { + rnd = double(int(rnd * double(1 << int_scale))); + reals[i] = from_real(Real(rnd / double(1 << int_scale))); + } + else { + reals[i] = from_real(Real(rnd)); + } + + if (exclude_zero && + i == 0 && + reals[0] == from_real(0.0)) { + + if (rnd > 0.0) { + rnd = std::min(min + range, rnd + 1.0); + } else { + rnd = std::max(min, rnd - 1.0); + } + reals[0] = from_real(Real(rnd)); + } + + } + + return complex(reals[0], reals[1]); + } +}; + +/// Partial specialization for initializing a Quaternion value. +template +struct RandomUniformFunc > { + + using Real = typename RealType::Type; + + uint64_t seed; + double range; + double min; + int int_scale; + + double pnan; +private: + using engine_type = std::mt19937; +public: + engine_type bernoulli_rnd; + std::bernoulli_distribution bernoulli_dist; + + // + // Methods + // + + RandomUniformFunc( + uint64_t seed_ = 0, + double max = 1, + double min_ = 0, + int int_scale_ = -1, + double pnan_ = 0 + ): + seed(seed_), range(max - min_), min(min_), int_scale(int_scale_), pnan(pnan_), + bernoulli_rnd{static_cast(seed_)}, + bernoulli_dist(pnan_) + { + std::srand((unsigned)seed); + } + + + /// Compute random value and update RNG state + Quaternion operator()() { + + // Sample from NaN distribution. 
+ if constexpr (std::numeric_limits::has_quiet_NaN) { + if (pnan > 0 && bernoulli_dist(bernoulli_rnd)) { + return Element(NAN); + } + } + + Element reals[4]; + + for (int i = 0; i < 4; ++i) { + double rnd = double(std::rand()) / double(RAND_MAX); + + rnd = min + range * rnd; + + // Random values are cast to integer after scaling by a power of two to facilitate error + // testing + + if (int_scale >= 0) { + rnd = double(int(rnd * double(1 << int_scale))); + reals[i] = from_real(Real(rnd / double(1 << int_scale))); + } + else { + reals[i] = from_real(Real(rnd)); + } + } + + return make_Quaternion(reals[0], reals[1], reals[2], reals[3]); + } +}; + +/// Computes a random uniform distribution +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +struct TensorFillRandomUniformFunc { + + using TensorView = TensorView; + + // + // Data members + // + + TensorView view; + RandomUniformFunc func; + + // + // Methods + // + + /// Construction of uniform RNG functor. + TensorFillRandomUniformFunc( + TensorView view_ = TensorView(), + RandomUniformFunc func_ = RandomUniformFunc() + ): + view(view_), func(func_) { + + } + + /// Compute random value and update RNG state + void operator()(Coord const &coord) { + + view.at(coord) = func(); + } +}; + +/// Fills the upper or lower part of a symmetric rank-2 tensor with random values of a uniform distribution. +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +struct TensorFillSymmetricRandomUniformFunc { + + using TensorView = TensorView; + + // + // Data members + // + + TensorView view; + RandomUniformFunc func; + cutlass::FillMode fill_mode; + + // + // Methods + // + + /// Construction of uniform RNG functor. + TensorFillSymmetricRandomUniformFunc( + TensorView view_ = TensorView(), + RandomUniformFunc func_ = RandomUniformFunc(), + cutlass::FillMode fill_mode_ = cutlass::FillMode::kInvalid + ): + view(view_), func(func_), fill_mode(fill_mode_) { + + } + + /// Compute random value and update RNG state + void operator()(Coord const &coord) { + // Fill half of matrix based on FillMode + if (Layout::kRank == 2 && + fill_mode == cutlass::FillMode::kLower && + coord[0] >= coord[1]) { + view.at(coord) = func(); + } else if (Layout::kRank == 2 && + fill_mode == cutlass::FillMode::kUpper && + coord[0] <= coord[1]) { + view.at(coord) = func(); + } + } +}; + +/// Computes a random Uniform distribution and pads diagonal with zeros +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +struct TensorFillPadDiagonalRandomUniformFunc { + + using TensorView = TensorView; + + // + // Data members + // + + TensorView view; + RandomUniformFunc func; + cutlass::FillMode fill_mode; + int alignment; + + // + // Methods + // + + /// Construction of uniform RNG functor. 
+ TensorFillPadDiagonalRandomUniformFunc( + TensorView view_ = TensorView(), + RandomUniformFunc func_ = RandomUniformFunc(), + cutlass::FillMode fill_mode_ = cutlass::FillMode::kInvalid, + int alignment_ = 1 + ): + view(view_), func(func_), fill_mode(fill_mode_), alignment(alignment_) { + + } + + /// Compute random value and update RNG state + void operator()(Coord const &coord) { + // Fill half of matrix based on FillMode + if (Layout::kRank == 2 && + (fill_mode == cutlass::FillMode::kLower) && + (coord[0] >= coord[1]) || + ((coord[1] - coord[0]) >= alignment)) { + view.at(coord) = func(); + } else if (Layout::kRank == 2 && + fill_mode == cutlass::FillMode::kUpper && + (coord[0] <= coord[1]) || + ((coord[0] - coord[1]) >= alignment)) { + view.at(coord) = func(); + } + } +}; + +} // namespace detail + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Fills a tensor with random values of a uniform random distribution. +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorFillRandomUniform( + TensorView dst, ///< destination tensor + uint64_t seed, ///< seed for RNG + double max = 1, ///< upper bound of distribution + double min = 0, ///< lower bound for distribution + int bits = -1, ///< If non-negative, specifies number of fractional bits that + /// are not truncated to zero. Permits reducing precision of + /// data. + double pnan = 0, ///< Percentage of NaN elements. + bool exclude_zero = false) { ///< Exclude zero from tensor init + detail::RandomUniformFunc random_func(seed, max, min, bits, pnan, exclude_zero); + + detail::TensorFillRandomUniformFunc func( + dst, + random_func + ); + + TensorForEach( + dst.extent(), + func + ); +} + +/// Fills a tensor with random values of a uniform random distribution. +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorFillRandomUniform( + TensorViewPlanarComplex dst, ///< destination tensor + uint64_t seed, ///< seed for RNG + double max = 1, ///< upper bound of distribution + double min = 0, ///< lower bound for distribution + int bits = -1, ///< If non-negative, specifies number of fractional bits that + /// are not truncated to zero. Permits reducing precision of + /// data. + double pnan = 0, ///< Percentage of NaN elements. + bool exclude_zero = false) { ///< Exclude zero from tensor init + + TensorFillRandomUniform(dst.view_real(), seed, max, min, bits, pnan, exclude_zero); + TensorFillRandomUniform(dst.view_imag(), ~seed, max, min, bits, pnan, exclude_zero); +} + + +/// Fills a tensor with random values with a uniform random distribution. +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorFillRandomUniform( + TensorView, Layout> dst, ///< destination tensor + uint64_t seed, ///< seed for RNG + double max = 1, ///< upper bound of distribution + double min = 0, ///< lower bound for distribution + int bits = -1) { ///< If non-negative, specifies number of fractional bits that + /// are not truncated to zero. Permits reducing precision of + /// data. + detail::RandomUniformFunc> random_func(seed, max, min, bits); + + detail::TensorFillRandomUniformFunc, Layout> func( + dst, + random_func + ); + + TensorForEach( + dst.extent(), + func + ); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Fills a tensor with random values with a uniform random distribution. 
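//
// Illustrative sketch (not part of the original files): for integer operands such as int8
// GEMM inputs, TensorFillRandomUniform above is usually given an explicit [min, max] range
// and bits = 0 so that every generated value is an exact integer. The tensor below is an
// assumption.
//
//   cutlass::HostTensor<int8_t, cutlass::layout::RowMajor> a(cutlass::MatrixCoord(128, 256));
//
//   cutlass::reference::host::TensorFillRandomUniform(
//       a.host_view(), /*seed=*/2024ull, /*max=*/2, /*min=*/-2, /*bits=*/0);
//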
+template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorFillSymmetricRandomUniform( + TensorView dst, ///< destination tensor + uint64_t seed, ///< seed for RNG + cutlass::FillMode fill_mode, ///< FillMode for symmetric matrices + double max = 1, ///< upper bound of distribution + double min = 0, ///< lower bound for distribution + int bits = -1) { ///< If non-negative, specifies number of fractional bits that + /// are not truncated to zero. Permits reducing precision of + /// data. + + detail::RandomUniformFunc random_func(seed, max, min, bits); + + detail::TensorFillSymmetricRandomUniformFunc func( + dst, + random_func, + fill_mode + ); + + TensorForEach( + dst.extent(), + func + ); +} + +/// Fills a tensor with random values with a uniform random distribution pads zeros along diagonal +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorFillPadDiagonalRandomUniform( + TensorView dst, ///< destination tensor + uint64_t seed, ///< seed for RNG + cutlass::FillMode fill_mode, ///< FillMode for symmetric matrices + double max = 1, ///< upper bound of distribution + double min = 0, ///< lower bound for distribution + int bits = -1, ///< If non-negative, specifies number of fractional bits that + /// are not truncated to zero. Permits reducing precision of + /// data. + int alignment = 1 +) { + + detail::RandomUniformFunc random_func(seed, max, min, bits); + + detail::TensorFillPadDiagonalRandomUniformFunc func( + dst, + random_func, + fill_mode, + alignment + ); + + TensorForEach( + dst.extent(), + func + ); +} +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Fills a tensor with a uniform value +template < + typename Element ///< Element type +> +void BlockFill( + Element *ptr, + size_t capacity, + Element val + ) { + for (size_t i = 0; i < capacity; ++i) { + ReferenceFactory::get(ptr, i) = val; + } +} + +/// Fills a tensor with random values with a uniform random distribution. +template < + typename Element ///< Element type +> +void BlockFillRandomUniform( + Element *ptr, + size_t capacity, + uint64_t seed, ///< seed for RNG + double max = 1, ///< upper bound of distribution + double min = 0, ///< lower bound for distribution + int bits = -1, ///< If non-negative, specifies number of fractional bits that + /// are not truncated to zero. Permits reducing precision of + /// data. + double pnan = 0) { ///< Percentage of NaN elements. 
+ detail::RandomUniformFunc random_func(seed, max, min, bits, pnan); + + for (size_t i = 0; i < capacity; ++i) { + ReferenceFactory::get(ptr, i) = random_func(); + } +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +struct TensorFillDiagonalFunc { + + using TensorView = TensorView; + + // + // Data members + // + + TensorView view; + Element diag; + Element other; + + // + // Methods + // + + TensorFillDiagonalFunc( + TensorView const &view_ = TensorView(), + Element diag_ = Element(1), + Element other_ = Element(0) + ): + view(view_), diag(diag_), other(other_) { } + + void operator()(Coord const & coord) const { + bool is_diag = true; + + CUTLASS_PRAGMA_UNROLL + for (int i = 1; i < Layout::kRank; ++i) { + if (coord[i] != coord[i - 1]) { + is_diag = false; + break; + } + } + + view.at(coord) = (is_diag ? diag : other); + } +}; + +} // namespace detail + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Fills a tensor everywhere with a unique value for its diagonal. +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorFillDiagonal( + TensorView dst, ///< destination tensor + Element diag = Element(1), ///< value to write in the diagonal + Element other = Element(0)) { ///< value to write off the diagonal + + detail::TensorFillDiagonalFunc func( + dst, + diag, + other + ); + + TensorForEach( + dst.extent(), + func + ); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Helper to fill a tensor's diagonal with 1 and 0 everywhere else. +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorFillIdentity( + TensorView dst) { ///< destination tensor + + TensorFillDiagonal(dst, Element(1), Element(0)); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Writes a uniform value to the diagonal of a tensor without modifying off-diagonal elements. 
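//
// Illustrative sketch (not part of the original files): the diagonal helpers above are
// handy for constructing known-answer inputs, e.g. multiplying by an identity matrix to
// sanity-check an epilogue. The tensor below is an assumption.
//
//   cutlass::HostTensor<float, cutlass::layout::RowMajor> eye(cutlass::MatrixCoord(64, 64));
//
//   cutlass::reference::host::TensorFillIdentity(eye.host_view());   // 1 on the diagonal, 0 elsewhere
//   cutlass::reference::host::TensorFillDiagonal(eye.host_view(),
//                                                /*diag=*/2.0f, /*other=*/0.0f);
//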
+template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorUpdateDiagonal( + TensorView dst, ///< destination tensor + Element val = Element(1)) { + + typename Layout::Index extent = dst.extent().min(); + + for (typename Layout::Index i = 0; i < extent; ++i) { + Coord coord(i); + dst.at(coord) = val; + } +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +struct TensorUpdateOffDiagonalFunc { + + using TensorView = TensorView; + + // + // Data members + // + + TensorView view; + Element other; + + // + // Methods + // + + TensorUpdateOffDiagonalFunc( + TensorView const &view_ = TensorView(), + Element other_ = Element(0) + ): + view(view_), other(other_) { } + + void operator()(Coord const & coord) const { + bool is_diag = true; + + CUTLASS_PRAGMA_UNROLL + for (int i = 1; i < Layout::kRank; ++i) { + if (coord[i] != coord[i - 1]) { + is_diag = false; + break; + } + } + + if (!is_diag) { + view.at(coord) = other; + } + } +}; + +} // namespace detail + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Writes a uniform value to all elements in the tensor without modifying diagonal elements. +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorUpdateOffDiagonal( + TensorView dst, ///< destination tensor + Element other = Element(1)) { + + detail::TensorUpdateOffDiagonalFunc func( + dst, + other + ); + + TensorForEach( + dst.extent(), + func + ); +} + + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +struct TensorFillLinearFunc { + + using TensorView = TensorView; + + // + // Data members + // + + TensorView view; + Array v; + Element s; + + // + // Methods + // + + TensorFillLinearFunc() { } + + /// Constructs functor + TensorFillLinearFunc( + TensorView const &view_, + Array const & v_, + Element s_ = Element(0) + ): + view(view_), v(v_), s(s_) { } + + /// Updates the tensor + void operator()(Coord const & coord) const { + + Element sum(s); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Layout::kRank; ++i) { + sum += Element(coord[i]) * v[i]; + } + + view.at(coord) = sum; + } +}; + +} // namespace detail + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Fills tensor with a linear combination of its coordinate and another vector +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorFillLinear( + TensorView dst, ///< destination tensor + Array const & v, + Element s = Element(0)) { + + detail::TensorFillLinearFunc func( + dst, + v, + s + ); + + TensorForEach( + dst.extent(), + func + ); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Fills tensor with a linear combination of its coordinate and another vector +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorFillSequential( + TensorView dst, ///< 
destination tensor + Element s = Element(0)) { + + Array stride; + + stride[0] = Element(1); + + CUTLASS_PRAGMA_UNROLL + for (int i = 1; i < Layout::kRank; ++i) { + stride[i] = stride[i - 1] * Element(dst.extent()[i - 1]); + } + + TensorFillLinear(dst, stride, s); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Fills a tensor with random values from a distribution. +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorFillRandom( + TensorView view, ///< destination tensor + uint64_t seed, + Distribution dist, + bool exclude_zero = false ///< If true, excludes 0. + /// Note that setting this flag will result in more 1's, + /// as we use a simple mechanism to replace 0's by adding/subtracting 1's. +) { + + using Real = typename RealType::Type; + + if (dist.kind == Distribution::Gaussian) { + TensorFillRandomGaussian( + view, + seed, + dist.gaussian.mean, + dist.gaussian.stddev, + dist.int_scale, + dist.gaussian.pnz, + exclude_zero); + } else if (dist.kind == Distribution::Uniform) { + TensorFillRandomUniform( + view, + seed, + dist.uniform.max, + dist.uniform.min, + dist.int_scale, + dist.uniform.pnan, + exclude_zero); + } +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Fills a block of data with sequential elements +template < + typename Element +> +void BlockFillSequential( + Element *ptr, + int64_t capacity, + Element v = Element(1), + Element s = Element(0)) { + int i = 0; + + while (i < capacity) { + cutlass::ReferenceFactory::value < + 8)>::get(ptr, i) = s; + + s = Element(s + v); + ++i; + } +} + +/// Fills a block of data with sequential elements +template < + typename Element +> +void BlockFillSequentialModN( + Element *ptr, + int64_t capacity, + int64_t mod, + int64_t v = int64_t(1), + int64_t s = int64_t(0)) { + int i = 0; + + while (i < capacity) { + cutlass::ReferenceFactory::value < + 8)>::get(ptr, i) = Element(s); + + s = int64_t(s + v) % mod; + ++i; + } +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Fills a block of data with sequential elements +template < + typename Element +> +void BlockFillRandom( + Element *ptr, + size_t capacity, + uint64_t seed, + Distribution dist) { + + if (dist.kind == Distribution::Gaussian) { + BlockFillRandomGaussian( + ptr, + capacity, + seed, + dist.gaussian.mean, + dist.gaussian.stddev, + dist.int_scale, + dist.gaussian.pnz); + } + else if (dist.kind == Distribution::Uniform) { + BlockFillRandomUniform( + ptr, + capacity, + seed, + dist.uniform.max, + dist.uniform.min, + dist.int_scale, + dist.uniform.pnan); + } +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +template +struct RandomSparseMetaFunc { + + uint64_t seed; + int range; + int MetaSizeInBits; + + // + // Methods + // + + RandomSparseMetaFunc( + uint64_t seed_ = 0, + int MetaSizeInBits_ = 2 + ): + seed(seed_), MetaSizeInBits(MetaSizeInBits_) { + 
std::srand((unsigned)seed); + if (MetaSizeInBits_ == 2) { + range = 6; + } + else if (MetaSizeInBits_ == 4) { + range = 2; + } + else { + throw std::invalid_argument("Invalid MetaSizeInBits"); + } + } + + /// Compute random value and update RNG state + Element operator()() const { + Element FourToTwoMeta[6] = {0x4, 0x8, 0x9, 0xc, 0xd, 0xe}; + Element TwoToOneMeta[2] = {0x4, 0xe}; + + Element * MetaArray = (MetaSizeInBits == 2) ? FourToTwoMeta : TwoToOneMeta; + + Element result = 0x0; + + for (int i = 0; i < cutlass::sizeof_bits::value / 4; ++i) { + int rnd = std::rand() % range; + Element meta = MetaArray[rnd]; + + result = (Element)(result | ((Element)(meta << (i * 4)))); + } + + return result; + } +}; + +/// Computes a random sparse meta +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +struct TensorFillRandomSparseMetaFunc { + + using TensorView = TensorView; + + // + // Data members + // + + TensorView view; + RandomSparseMetaFunc func; + + // + // Methods + // + + /// Construction of Gaussian RNG functor. + TensorFillRandomSparseMetaFunc( + TensorView view_ = TensorView(), + RandomSparseMetaFunc func_ = RandomSparseMetaFunc() + ): + view(view_), func(func_) { + + } + + /// Compute random value and update RNG state + void operator()(Coord const &coord) const { + + view.at(coord) = func(); + } +}; + +} // namespace detail + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Fills a tensor with random values with a uniform random distribution. +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorFillRandomSparseMeta( + TensorView dst, ///< destination tensor + uint64_t seed, ///< seed for RNG + int MetaSizeInBits) { ///< 2 bit or 4 bit + + detail::RandomSparseMetaFunc random_func(seed, MetaSizeInBits); + + detail::TensorFillRandomSparseMetaFunc func( + dst, + random_func + ); + + TensorForEach( + dst.extent(), + func + ); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Fills a tensor with random values with a uniform random distribution. +template < + typename Element ///< Element type +> +void BlockFillRandomSparseMeta( + Element *ptr, + size_t capacity, + uint64_t seed, ///< seed for RNG + int MetaSizeInBits) { ///< 2 bit or 4bit + + detail::RandomSparseMetaFunc random_func(seed, MetaSizeInBits); + + for (size_t i = 0; i < capacity; ++i) { + ptr[i] = random_func(); + } +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Fills a ell block index matrix with random values with a uniform random distribution. 
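// Note on the metadata produced by RandomSparseMetaFunc / BlockFillRandomSparseMeta
// above (an informal illustration, not part of the CUTLASS sources): with
// MetaSizeInBits == 2, every 4-bit nibble packs two 2-bit indices naming which 2 of 4
// elements of a group survive 2:4 pruning. The six legal nibbles are exactly the six
// ways of choosing 2 positions out of 4:
//
//   0x4 = 0b0100 -> keep positions 0,1      0xc = 0b1100 -> keep positions 0,3
//   0x8 = 0b1000 -> keep positions 0,2      0xd = 0b1101 -> keep positions 1,3
//   0x9 = 0b1001 -> keep positions 1,2      0xe = 0b1110 -> keep positions 2,3
//
// A 16-bit metadata element therefore covers four consecutive groups of four values,
// which is why operator() above emits sizeof_bits<Element>::value / 4 nibbles.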
+template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorFillRandomEllIdx( + TensorView dst, ///< destination tensor + uint64_t seed, ///< seed for RNG + int rows, int ell_cols, int cols) { ///< dimension of the matrix + + std::srand((unsigned)seed); + + for (int i = 0; i < rows; ++i) { + int col_idx = std::rand() % cols; + + for (int j = 0; j < ell_cols; ++j) { + dst.at({i, j}) = col_idx; + + if (col_idx != -1) { + if (col_idx == (cols - 1)) { + col_idx = -1; + } else { + col_idx = std::rand() % (cols - col_idx - 1) + col_idx + 1; + } + } + } + } +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Copies a diagonal in from host memory without modifying off-diagonal elements. +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorCopyDiagonalIn( + TensorView dst, ///< destination tensor + Element const *ptr) { ///< dense buffer of elements + + typename Layout::Index extent = dst.extent().min(); + + for (typename Layout::Index i = 0; i < extent; ++i) { + Coord coord(i); + dst.at(coord) = ReferenceFactory::get(ptr, i); + } +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Copies the diagonal of a tensor into a dense buffer in host memory. +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorCopyDiagonalOut( + Element *ptr, ///< dense buffer of elements + TensorView src) { ///< source tensor + + typename Layout::Index extent = src.extent().min(); + + for (typename Layout::Index i = 0; i < extent; ++i) { + Coord coord(i); + ReferenceFactory::get(ptr, i) = src.at(coord); + } +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace host +} // namespace reference +} // namespace cutlass diff --git a/csrc/quantization/cutlass_test/example/util/reference/host/tensor_fill.hpp b/csrc/quantization/cutlass_test/example/util/reference/host/tensor_fill.hpp new file mode 100644 index 0000000000000..86a54e2ee06b7 --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/host/tensor_fill.hpp @@ -0,0 +1,432 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Provides several functions for filling tensors with data. +*/ + +#pragma once + +// Standard Library includes +#include +#include +#include + +// Cute includes +#include "cute/tensor.hpp" + +// Cutlass includes +#include "cutlass/cutlass.h" +#include "cutlass/complex.h" +#include "cutlass/quaternion.h" +#include "cutlass/array.h" +#include "cutlass/numeric_types.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace reference { +namespace host { + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Uniform and procedural tensor fills +// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Fills a tensor with a scalar element +template +void TensorFill(Tensor dst, typename Tensor::value_type element) { + + for (int64_t idx = 0; idx < cute::size(dst); ++idx) { + dst(idx) = element; + } +} + +/// Fills a tensor with the contents of its layout +template +void TensorFillSequential(Tensor dst) { + + auto layout = dst.layout(); + + for (int64_t idx = 0; idx < cute::size(dst); ++idx) { + dst(idx) = layout(idx); + } +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Random uniform values +// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +template +struct RandomUniformFunc { + + using Real = typename RealType::Type; + + uint64_t seed; + double range; + double min; + int int_scale; + + // + // Methods + // + + RandomUniformFunc( + uint64_t seed_ = 0, + double max = 1, + double min_ = 0, + int int_scale_ = -1 + ): + seed(seed_), range(max - min_), min(min_), int_scale(int_scale_) { + std::srand((unsigned)seed); + } + + + /// Compute random value and update RNG state + Element operator()() const { + + double rnd = double(std::rand()) / double(RAND_MAX); + + rnd = min + range * rnd; + + // Random values are cast to integer after scaling by a power of two to facilitate error + // testing + Element result; + + if (int_scale >= 0) { + rnd = double(int64_t(rnd * double(1 << int_scale))) / double(1 << int_scale); + result = static_cast(Real(rnd)); + } + else { + result = static_cast(Real(rnd)); + } + + return result; + } +}; + +/// Partial specialization for initializing a complex value. 
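// A minimal usage sketch for the CuTe-based fill overloads above (illustrative only;
// assumes a host buffer wrapped with cute::make_tensor, which builds a compact
// column-major layout from a plain shape):
//
//   std::vector<float> buf(64 * 64);
//   auto view = cute::make_tensor(buf.data(), cute::make_shape(64, 64));
//   cutlass::reference::host::TensorFill(view, 0.5f);      // every element becomes 0.5f
//   cutlass::reference::host::TensorFillSequential(view);  // each element is set to the offset its layout assigns to that linear index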
+template +struct RandomUniformFunc > { + + using Real = typename RealType::Type; + + uint64_t seed; + double range; + double min; + int int_scale; + + // + // Methods + // + + RandomUniformFunc( + uint64_t seed_ = 0, + double max = 1, + double min_ = 0, + int int_scale_ = -1 + ): + seed(seed_), range(max - min_), min(min_), int_scale(int_scale_) { + std::srand((unsigned)seed); + } + + + /// Compute random value and update RNG state + complex operator()() const { + + Element reals[2]; + + for (int i = 0; i < 2; ++i) { + double rnd = double(std::rand()) / double(RAND_MAX); + + rnd = min + range * rnd; + + // Random values are cast to integer after scaling by a power of two to facilitate error + // testing + + if (int_scale >= 0) { + rnd = double(int(rnd * double(1 << int_scale))); + reals[i] = from_real(Real(rnd / double(1 << int_scale))); + } + else { + reals[i] = from_real(Real(rnd)); + } + } + + return complex(reals[0], reals[1]); + } +}; + +/// Partial specialization for initializing a Quaternion value. +template +struct RandomUniformFunc > { + + using Real = typename RealType::Type; + + uint64_t seed; + double range; + double min; + int int_scale; + + // + // Methods + // + + RandomUniformFunc( + uint64_t seed_ = 0, + double max = 1, + double min_ = 0, + int int_scale_ = -1 + ): + seed(seed_), range(max - min_), min(min_), int_scale(int_scale_) { + std::srand((unsigned)seed); + } + + + /// Compute random value and update RNG state + Quaternion operator()() const { + + Element reals[4]; + + for (int i = 0; i < 4; ++i) { + double rnd = double(std::rand()) / double(RAND_MAX); + + rnd = min + range * rnd; + + // Random values are cast to integer after scaling by a power of two to facilitate error + // testing + + if (int_scale >= 0) { + rnd = double(int(rnd * double(1 << int_scale))); + reals[i] = from_real(Real(rnd / double(1 << int_scale))); + } + else { + reals[i] = from_real(Real(rnd)); + } + } + + return make_Quaternion(reals[0], reals[1], reals[2], reals[3]); + } +}; + +} // namespace detail + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Fills a tensor with random values with a uniform random distribution. +template ///< Tensor object +void TensorFillRandomUniform( + Tensor dst, ///< destination tensor + uint64_t seed, ///< seed for RNG + double max = 1, ///< upper bound of distribution + double min = 0, ///< lower bound for distribution + int bits = -1) { ///< If non-negative, specifies number of fractional bits that + /// are not truncated to zero. Permits reducing precision of + /// data. + + detail::RandomUniformFunc random_func(seed, max, min, bits); + + for (int64_t idx = 0; idx < cute::size(dst); ++idx) { + dst(idx) = random_func(); + } +} + +/// Fills a block with random values with a uniform random distribution. +template < + typename Element ///< Element type +> +void BlockFillRandomUniform( + Element *ptr, + size_t capacity, + uint64_t seed, ///< seed for RNG + double max = 1, ///< upper bound of distribution + double min = 0, ///< lower bound for distribution + int bits = -1) { ///< If non-negative, specifies number of fractional bits that + /// are not truncated to zero. Permits reducing precision of + /// data. 
+ detail::RandomUniformFunc random_func(seed, max, min, bits); + + for (size_t i = 0; i < capacity; ++i) { + ptr[i] = random_func(); + } +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Random Gaussian +// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +template +struct RandomGaussianFunc { + + uint64_t seed; + double mean; + double stddev; + int int_scale; + double pi; + + // + // Methods + // + RandomGaussianFunc( + uint64_t seed_ = 0, + double mean_ = 0, + double stddev_ = 1, + int int_scale_ = -1 + ): + seed(seed_), mean(mean_), stddev(stddev_), int_scale(int_scale_), pi(std::acos(-1)) { + std::srand((unsigned)seed); + } + + /// Compute random value and update RNG state + Element operator()() const { + + // Box-Muller transform to generate random numbers with Normal distribution + double u1 = double(std::rand()) / double(RAND_MAX); + double u2 = double(std::rand()) / double(RAND_MAX); + + // Compute Gaussian random value + double rnd = std::sqrt(-2 * std::log(u1)) * std::cos(2 * pi * u2); + rnd = mean + stddev * rnd; + + // Scale and convert final result + Element result; + + if (int_scale >= 0) { + rnd = double(int64_t(rnd * double(1 << int_scale))) / double(1 << int_scale); + result = static_cast(rnd); + } + else { + result = static_cast(rnd); + } + + return result; + } +}; + +} // namespace detail + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Fills a tensor with random values with a Gaussian distribution. +template < + typename Tensor +> +void TensorFillRandomGaussian( + Tensor dst, ///< destination tensor + uint64_t seed, ///< seed for RNG + double mean = 0, ///< Gaussian distribution's mean + double stddev = 1, ///< Gaussian distribution's standard deviation + int bits = -1) { ///< If non-negative, specifies number of fractional bits that + /// are not truncated to zero. Permits reducing precision of + /// data. + + detail::RandomGaussianFunc random_func(seed, mean, stddev, bits); + + for (int64_t idx = 0; idx < cute::size(dst); ++idx) { + dst(idx) = random_func(); + } +} + +/// Fills a block with random values with a Gaussian distribution. +template < + typename Element ///< Element type +> +void BlockFillRandomGaussian( + Element *ptr, ///< destination buffer + size_t capacity, ///< number of elements + uint64_t seed, ///< seed for RNG + double mean = 0, ///< Gaussian distribution's mean + double stddev = 1, ///< Gaussian distribution's standard deviation + int bits = -1) { ///< If non-negative, specifies number of fractional bits that + /// are not truncated to zero. Permits reducing precision of + /// data. 
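  // (Both Gaussian helpers draw samples with the Box-Muller transform implemented in
  //  RandomGaussianFunc above: z = sqrt(-2 * ln(u1)) * cos(2 * pi * u2) with u1, u2
  //  uniform in [0, 1], and the emitted value is mean + stddev * z.)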
+ + detail::RandomGaussianFunc random_func(seed, mean, stddev, bits); + + for (size_t i = 0; i < capacity; ++i) { + ptr[i] = random_func(); + } +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Fills a block of data with sequential elements +template < + typename Element +> +void BlockFillSequential( + Element *ptr, + int64_t capacity, + Element v = Element(1), + Element s = Element(0)) { + int i = 0; + + while (i < capacity) { + + ptr[i] = Element(s + v); + ++i; + } +} + +/// Fills a block of data with sequential elements +template < + typename Element +> +void BlockFillSequentialModN( + Element *ptr, + int64_t capacity, + int64_t mod, + int64_t v = int64_t(1), + int64_t s = int64_t(0)) { + int i = 0; + + while (i < capacity) { + + ptr[i] = static_cast(int32_t(int64_t(s + v) % mod)); + ++i; + } +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace host +} // namespace reference +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/quantization/cutlass_test/example/util/reference/host/tensor_foreach.h b/csrc/quantization/cutlass_test/example/util/reference/host/tensor_foreach.h new file mode 100644 index 0000000000000..43ff17362c21b --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/host/tensor_foreach.h @@ -0,0 +1,134 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include +#include "cutlass/cutlass.h" + +namespace cutlass { +namespace reference { +namespace host { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines several helpers +namespace detail { + +/// Helper to perform for-each operation +template +struct TensorForEachHelper { + + /// Index of the active rank + static int const kActiveRank = Rank - RankRemaining - 1; + + /// Constructor for general rank + TensorForEachHelper( + Func &func, + Coord const &extent, + Coord &coord) { + + for (int i = 0; i < extent.at(kActiveRank); ++i) { + coord[kActiveRank] = i; + TensorForEachHelper(func, extent, coord); + } + } +}; + +/// Helper to perform for-each operation +template +struct TensorForEachHelper { + + /// Index of the active rank + static int const kActiveRank = Rank - 1; + + /// Constructor for fastest changing rank + TensorForEachHelper( + Func &func, + Coord const &extent, + Coord &coord) { + + for (int i = 0; i < extent.at(kActiveRank); ++i) { + coord[kActiveRank] = i; + func(coord); + } + } +}; + +} // namespace detail + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Iterates over the index space of a tensor +template < + typename Func, ///< function applied to each point in a tensor's index space + int Rank> ///< rank of index space +void TensorForEach(Coord extent, Func & func) { + Coord coord; + detail::TensorForEachHelper(func, extent, coord); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Iterates over the index space of a tensor and calls a C++ lambda +template < + typename Func, ///< function applied to each point in a tensor's index space + int Rank> ///< rank of index space +void TensorForEachLambda(Coord extent, Func func) { + Coord coord; + detail::TensorForEachHelper(func, extent, coord); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct BlockForEach { + + /// Constructor performs the operation. + BlockForEach( + Element *ptr, + size_t capacity, + typename Func::Params params = typename Func::Params()) { + + Func func(params); + + for (size_t index = 0; index < capacity; ++index) { + ptr[index] = func(); + } + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace host +} // namespace reference +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/quantization/cutlass_test/example/util/reference/host/tensor_norm.h b/csrc/quantization/cutlass_test/example/util/reference/host/tensor_norm.h new file mode 100644 index 0000000000000..8a7240665550d --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/host/tensor_norm.h @@ -0,0 +1,42 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + + +#include "cutlass/cutlass.h" + +// The contents of this file have been moved to 'tensor_reduce' to cover other types of reductions. + +#include "cutlass/util/reference/host/tensor_reduce.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/csrc/quantization/cutlass_test/example/util/reference/host/tensor_reduce.h b/csrc/quantization/cutlass_test/example/util/reference/host/tensor_reduce.h new file mode 100644 index 0000000000000..048352ae29514 --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/host/tensor_reduce.h @@ -0,0 +1,203 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/complex.h" +#include "cutlass/tensor_ref.h" + +#include "cutlass/util/reference/detail/linear_to_coordinate.h" +#include "cutlass/core_io.h" + +namespace cutlass { +namespace reference { +namespace host { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Transform-reduce operation over the elements of a tensor. This helper allocates the device-side +/// workspace +template < + typename Element, + typename Layout, + typename ComputeType, + typename ReduceOp, + typename TransformOp +> +ComputeType TensorTransformReduce( + TensorView view, + ComputeType identity, + ReduceOp reduce, + TransformOp transform +) { + + for (int64_t idx = 0; idx < int64_t(view.size()); ++idx) { + typename Layout::TensorCoord coord; + cutlass::reference::detail::LinearToCoordinate()(coord, idx, view.extent()); + + if (view.contains(coord)) { + Element x = view.at(coord); + identity = reduce(identity, transform(x)); + } + } + + return identity; +} + +/// Transform-reduce operation over the elements of a tensor. This helper allocates the device-side +/// workspace +template < + typename Element, + typename Layout, + typename ComputeType, + typename ReduceOp, + typename TransformOp +> +ComputeType TensorTransformReduce( + TensorView view_A, + TensorView view_B, + ComputeType identity, + ReduceOp reduce, + TransformOp transform) { + + if (view_A.extent() != view_B.extent()) { + throw std::runtime_error("Tensor extents must match."); + } + + for (int64_t idx = 0; idx < int64_t(view_A.size()); ++idx) { + + typename Layout::TensorCoord coord; + cutlass::reference::detail::LinearToCoordinate()(coord, idx, view_A.extent()); + + if (view_A.contains(coord)) { + Element a = view_A.at(coord); + Element b = view_B.at(coord); + identity = reduce(identity, transform(a, b)); + } + } + + return identity; +} + +/// Helper to compute the sum of the elements of a tensor +template < + typename Element, + typename Layout, + typename ComputeType = Element +> +ComputeType TensorSum( + TensorView view, + ComputeType identity = ComputeType() +) { + + plus reduce; + NumericConverter transform; + + return TensorTransformReduce( + view, identity, reduce, transform); +} + +/// Helper to compute the sum of the squares of the elements of a tensor +template < + typename Element, + typename Layout, + typename ComputeType = Element +> +ComputeType TensorSumSq( + TensorView view, + ComputeType identity = ComputeType() +) { + + plus reduce; + magnitude_squared transform; + + return TensorTransformReduce( + view, identity, reduce, transform); +} + +/// Helper to compute the norm of the elements of a tensor. 
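// A typical verification pattern built from these reduction helpers and the
// TensorNorm / TensorNormDiff functions defined next (an illustrative sketch, not
// part of the CUTLASS sources; `reference` and `computed` are assumed HostTensor
// objects holding the reference and kernel outputs):
//
//   double ref_norm  = cutlass::reference::host::TensorNorm(reference.host_view());
//   double diff_norm = cutlass::reference::host::TensorNormDiff(reference.host_view(),
//                                                               computed.host_view());
//   bool passed = diff_norm <= 1e-5 * ref_norm;   // relative-error check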
+template < + typename Element, + typename Layout, + typename ComputeType = double +> +ComputeType TensorNorm( + TensorView view, + ComputeType identity = ComputeType() +) { + + return std::sqrt(TensorSumSq(view, identity)); +} + +/// Helper to compute the sum of the squares of the differences of two tensors +template < + typename Element, + typename Layout, + typename ComputeType = double +> +ComputeType TensorSumSqDiff( + TensorView view_A, + TensorView view_B, + ComputeType identity = ComputeType() +) { + + plus reduce; + magnitude_squared_difference transform; + + return TensorTransformReduce( + view_A, view_B, identity, reduce, transform); +} + + +/// Helper to compute the norm of the tensor computed as the difference of two tensors in memory +template < + typename Element, + typename Layout, + typename ComputeType = double +> +ComputeType TensorNormDiff( + TensorView view_A, + TensorView view_B, + ComputeType identity = ComputeType() +) { + + return std::sqrt(TensorSumSqDiff(view_A, view_B, identity)); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace host +} // namespace reference +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/quantization/cutlass_test/example/util/reference/host/tensor_reduce.hpp b/csrc/quantization/cutlass_test/example/util/reference/host/tensor_reduce.hpp new file mode 100644 index 0000000000000..5ea5154107fcb --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/host/tensor_reduce.hpp @@ -0,0 +1,203 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +/* \file + \brief Provides several functions for filling tensors with data. +*/ + +#pragma once + +// Standard Library includes +#include +#include +#include + +// Cute includes +#include "cute/tensor.hpp" + +// Cutlass includes +#include "cutlass/cutlass.h" +#include "cutlass/complex.h" +#include "cutlass/functional.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/quaternion.h" +#include "cutlass/array.h" +#include "cutlass/numeric_types.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace reference { +namespace host { + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Tensor reductions +// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Transform-reduce operation over the elements of a tensor. This helper allocates the device-side +/// workspace +template < + typename Tensor, + typename ComputeType, + typename ReduceOp, + typename TransformOp +> +ComputeType TensorTransformReduce( + Tensor view, + ComputeType identity, + ReduceOp reduce, + TransformOp transform +) { + + for (int64_t idx = 0; idx < cute::size(view); ++idx) { + identity = reduce(identity, transform(view(idx))); + } + + return identity; +} + +/// Transform-reduce operation over the elements of a tensor. This helper allocates the device-side +/// workspace +template < + typename TensorA, + typename TensorB, + typename ComputeType, + typename ReduceOp, + typename TransformOp +> +ComputeType TensorTransformReduce( + TensorA view_A, + TensorB view_B, + ComputeType identity, + ReduceOp reduce, + TransformOp transform) { + + if (cute::size(view_A) != cute::size(view_B)) { + throw std::runtime_error("Tensor sizes must match."); + } + + for (int64_t idx = 0; idx < cute::size(view_A); ++idx) { + identity = reduce(identity, transform(view_A(idx), view_B(idx))); + } + + return identity; +} + +/// Helper to compute the sum of the elements of a tensor +template < + typename Tensor, + typename ComputeType = typename Tensor::value_type +> +ComputeType TensorSum( + Tensor view, + ComputeType identity = ComputeType() +) { + + plus reduce; + NumericConverter transform; + + return TensorTransformReduce( + view, identity, reduce, transform); +} + +/// Helper to compute the sum of the squares of the elements of a tensor +template < + typename Tensor, + typename ComputeType = typename Tensor::value_type +> +ComputeType TensorSumSq( + Tensor view, + ComputeType identity = ComputeType() +) { + + plus reduce; + magnitude_squared transform; + + return TensorTransformReduce( + view, identity, reduce, transform); +} + +/// Helper to compute the norm of the elements of a tensor. 
+template < + typename Tensor, + typename ComputeType = double +> +ComputeType TensorNorm( + Tensor view, + ComputeType identity = ComputeType() +) { + + return std::sqrt(TensorSumSq(view, identity)); +} + +/// Helper to compute the sum of the squares of the differences of two tensors +template < + typename TensorA, + typename TensorB, + typename ComputeType = double +> +ComputeType TensorSumSqDiff( + TensorA view_A, + TensorB view_B, + ComputeType identity = ComputeType() +) { + + plus reduce; + magnitude_squared_difference transform; + + return TensorTransformReduce( + view_A, view_B, identity, reduce, transform); +} + + +/// Helper to compute the norm of the tensor computed as the difference of two tensors in memory +template < + typename TensorA, + typename TensorB, + typename ComputeType = double +> +ComputeType TensorNormDiff( + TensorA view_A, + TensorB view_B, + ComputeType identity = ComputeType() +) { + + return std::sqrt(TensorSumSqDiff(view_A, view_B, identity)); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace host +} // namespace reference +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/quantization/cutlass_test/example/util/reference/host/trmm.h b/csrc/quantization/cutlass_test/example/util/reference/host/trmm.h new file mode 100644 index 0000000000000..08b979254278c --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/host/trmm.h @@ -0,0 +1,215 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Reference implementation for TRMM in host-side code. 
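   Informally: for SideMode::kLeft, compute_trmm below evaluates D = alpha * A * B with
   A an M-by-M triangular matrix (lower or upper per FillModeA); for SideMode::kRight it
   evaluates D = alpha * B * A with A N-by-N. DiagType::kUnit treats A's diagonal as all
   ones regardless of the stored values.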
+ + +*/ + +#pragma once + +#include "cutlass/blas3.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/tensor_view.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/arch/mma.h" +#include "cutlass/util/host_tensor.h" + +#include "cutlass/util/reference/host/gemm.h" + +namespace cutlass { +namespace reference { +namespace host { + +/// Computes a Triangular Matrix Multiplication (tensors of rank=2) pointed to by TensorRef +/// objects. +template < + typename ElementA, + typename LayoutA, + SideMode SideModeA, + FillMode FillModeA, + DiagType DiagTypeA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ScalarType, + typename ComputeType, + typename InnerProductOp = multiply_add, + typename ConvertOp = NumericConverter +> +void compute_trmm( + gemm::GemmCoord problem_size, + ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, + TensorRef tensor_d, + ComputeType initial_accum) { + + static_assert( + LayoutA::kRank == 2 && + LayoutC::kRank == 2, "Tensors must be of rank 2"); + + static_assert(SideModeA != SideMode::kInvalid + , "Side Mode can either be Left or Right."); + + static_assert(FillModeA == FillMode::kLower || FillModeA == FillMode::kUpper + , "Fill Mode can either be Lower or Upper."); + + using CompareOp = typename TrMatrixCompareOp::Type; + + // Note: batch is ignored. + int const M = problem_size.m(); + int const N = problem_size.n(); + // Assuming correct k-dimension value is passed + int const K = problem_size.k(); + + // Blocking necessary to speedup reference implementation + int const Mblock = 16; + int const Nblock = 16; + + ConvertOp convert_op; + InnerProductOp inner_product_op; + CompareOp compare_op; + + for (int row_block = 0; row_block < M; row_block += Mblock) { + for (int col_block = 0; col_block < N; col_block += Nblock) { + + ComputeType accum[Mblock][Nblock]; + + for (int j = 0; j < Nblock; j++) { + for (int i = 0; i < Mblock; i++) { + accum[i][j] = initial_accum; + } + } + + for (int k_block = 0; k_block < K; ++k_block) { + for (int j = 0; j < Nblock; j++) { + for (int i = 0; i < Mblock; i++) { + int row = row_block + i; + int col = col_block + j; + + if (row < M && col < N) { + ElementA a = ElementA(); + ElementB b = ElementB(); + + if (SideModeA == SideMode::kLeft) { + a = (compare_op(row, k_block)) ? + (tensor_a.at(MatrixCoord(row, k_block))) : ElementA(0); + if (row == k_block && DiagTypeA == DiagType::kUnit) { + a = ElementA(1); + } + b = tensor_b.at(MatrixCoord(k_block, col)); + } else if (SideModeA == SideMode::kRight) { + a = tensor_b.at(MatrixCoord(row, k_block)); + b = (compare_op(k_block, col)) ? 
+ tensor_a.at(MatrixCoord(k_block, col)) : ElementA(0); + if (k_block == col && DiagTypeA == DiagType::kUnit) { + b = ElementA(1); + } + } + + ComputeType compute_a(cast_if_scalar(a)); + ComputeType compute_b(cast_if_scalar(b)); + + accum[i][j] = inner_product_op(compute_a, compute_b, accum[i][j]); + } + } + } + } + + for (int j = 0; j < Nblock; j++) { + for (int i = 0; i < Mblock; i++) { + int row = row_block + i; + int col = col_block + j; + + MatrixCoord coord = MatrixCoord(row, col); + + if (row < M && col < N) { + tensor_d.at(coord) = convert_op( + alpha * ScalarType(accum[i][j])); + } + } + } + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + SideMode SideModeA, + FillMode FillModeA, + DiagType DiagTypeA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ScalarType, + typename ComputeType, + typename InnerProductOp = cutlass::arch::OpMultiplyAdd +> +struct Trmm; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for multiply-add +template +struct Trmm { + + void operator()(gemm::GemmCoord problem_size, ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, + TensorRef tensor_d, + ComputeType initial_accum = ComputeType(0)) { + static_assert( + LayoutA::kRank == 2 && LayoutC::kRank == 2, + "Tensors must be of rank 2"); + + compute_trmm>( + problem_size, alpha, tensor_a, tensor_b, tensor_d, initial_accum); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace host +} // namespace reference +} // namespace cutlass diff --git a/csrc/quantization/cutlass_test/example/util/reference/host/trmm_complex.h b/csrc/quantization/cutlass_test/example/util/reference/host/trmm_complex.h new file mode 100644 index 0000000000000..86e58a035b481 --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/reference/host/trmm_complex.h @@ -0,0 +1,262 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Reference implementation for complex-valued TRMM in host-side code. + + +*/ + +#pragma once + +#include "cutlass/blas3.h" +#include "cutlass/complex.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/tensor_view.h" +#include "cutlass/gemm/gemm.h" + +#include "cutlass/util/reference/host/gemm.h" + +namespace cutlass { +namespace reference { +namespace host { + +/// Computes a Triangular Matrix Multiplication (tensors of rank=2) pointed to by TensorRef +/// objects. +template < + typename ElementA, + typename LayoutA, + ComplexTransform TransformA, + SideMode SideModeA, + FillMode FillModeA, + DiagType DiagTypeA, + typename ElementB, + typename LayoutB, + ComplexTransform TransformB, + typename ElementC, + typename LayoutC, + typename ScalarType, + typename ComputeType, + typename InnerProductOp = multiply_add, + typename ConvertOp = NumericConverter +> +void compute_trmm_complex( + gemm::GemmCoord problem_size, + ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, + TensorRef tensor_d, + ComputeType initial_accum) { + + static_assert( + LayoutA::kRank == 2 && + LayoutC::kRank == 2, "Tensors must be of rank 2"); + + static_assert(SideModeA != SideMode::kInvalid + , "Side Mode can either be Left or Right."); + + static_assert(FillModeA == FillMode::kLower || FillModeA == FillMode::kUpper + , "Fill Mode can either be Lower or Upper."); + + using CompareOp = typename TrMatrixCompareOp::Type; + + // Note: batch is ignored. + int const M = problem_size.m(); + int const N = problem_size.n(); + // Assuming correct k-dimension value is passed + int const K = problem_size.k(); + + // Blocking necessary to speedup reference implementation + int const Mblock = 16; + int const Nblock = 16; + + ConvertOp convert_op; + InnerProductOp inner_product_op; + CompareOp compare_op; + + for (int row_block = 0; row_block < M; row_block += Mblock) { + for (int col_block = 0; col_block < N; col_block += Nblock) { + + ComputeType accum[Mblock][Nblock]; + + for (int j = 0; j < Nblock; j++) { + for (int i = 0; i < Mblock; i++) { + accum[i][j] = initial_accum; + } + } + + for (int k_block = 0; k_block < K; ++k_block) { + for (int j = 0; j < Nblock; j++) { + for (int i = 0; i < Mblock; i++) { + int row = row_block + i; + int col = col_block + j; + + if (row < M && col < N) { + ElementA a = ElementA(); + ElementB b = ElementB(); + + if (SideModeA == SideMode::kLeft) { + a = (compare_op(row, k_block)) ? + (tensor_a.at(MatrixCoord(row, k_block))) : ElementA(0); + if (row == k_block && DiagTypeA == DiagType::kUnit) { + a = ElementA(1); + } + b = tensor_b.at(MatrixCoord(k_block, col)); + } else if (SideModeA == SideMode::kRight) { + a = tensor_b.at(MatrixCoord(row, k_block)); + b = (compare_op(k_block, col)) ? 
+ tensor_a.at(MatrixCoord(k_block, col)) : ElementA(0); + if (k_block == col && DiagTypeA == DiagType::kUnit) { + b = ElementA(1); + } + } + + ComputeType a_ik = ComputeType(a); + ComputeType b_kj = ComputeType(b); + + // Conjugate, and hence hermitian, is only allowed for the triangular matrix + if (SideModeA == SideMode::kLeft && TransformA == ComplexTransform::kConjugate) { + a_ik = conj(a_ik); + } else if (SideModeA == SideMode::kRight && TransformA == ComplexTransform::kConjugate) { + b_kj = conj(b_kj); + } + + accum[i][j] = inner_product_op(a_ik, b_kj, accum[i][j]); + } + } + } + } + + for (int j = 0; j < Nblock; j++) { + for (int i = 0; i < Mblock; i++) { + int row = row_block + i; + int col = col_block + j; + + MatrixCoord coord = MatrixCoord(row, col); + + if (row < M && col < N) { + tensor_d.at(coord) = convert_op( + alpha * ScalarType(accum[i][j])); + } + } + } + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + ComplexTransform TransformA, + SideMode SideModeA, + FillMode FillModeA, + DiagType DiagTypeA, + typename ElementB, + typename LayoutB, + ComplexTransform TransformB, + typename ElementC, + typename LayoutC, + typename ScalarType, + typename ComputeType, + typename InnerProductOp = cutlass::arch::OpMultiplyAddComplex +> +struct TrmmComplex; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for multiply-add +template +struct TrmmComplex { + + void operator()(gemm::GemmCoord problem_size, ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, + TensorRef tensor_d, + ComputeType initial_accum = ComputeType(0)) { + static_assert( + LayoutA::kRank == 2 && LayoutC::kRank == 2, + "Tensors must be of rank 2"); + + compute_trmm_complex>( + problem_size, alpha, tensor_a, tensor_b, tensor_d, initial_accum); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for gaussian multiply-add +template +struct TrmmComplex { + + void operator()(gemm::GemmCoord problem_size, ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, + TensorRef tensor_d, + ComputeType initial_accum = ComputeType(0)) { + static_assert( + LayoutA::kRank == 2 && LayoutC::kRank == 2, + "Tensors must be of rank 2"); + + compute_trmm_complex>( + problem_size, alpha, tensor_a, tensor_b, tensor_d, initial_accum); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace host +} // namespace reference +} // namespace cutlass diff --git a/csrc/quantization/cutlass_test/example/util/tensor_view_io.h b/csrc/quantization/cutlass_test/example/util/tensor_view_io.h new file mode 100644 index 0000000000000..4f6bdd686b8f0 --- /dev/null +++ b/csrc/quantization/cutlass_test/example/util/tensor_view_io.h @@ -0,0 +1,270 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +**************************************************************************************************/ +#pragma once + +#include "cutlass/core_io.h" +#include "cutlass/tensor_view.h" +#include "cutlass/tensor_view_planar_complex.h" +#include "cutlass/complex.h" + +namespace cutlass { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +/// Helper to write the least significant rank of a TensorView +template < + typename Element, + typename Layout +> +inline std::ostream & TensorView_WriteLeastSignificantRank( + std::ostream& out, + TensorView const& view, + Coord const &start_coord, + int rank, + std::streamsize width) { + + for (int idx = 0; idx < view.extent(rank); ++idx) { + + Coord coord(start_coord); + coord[rank] = idx; + + if (idx) { + out.width(0); + out << ", "; + } + if (idx || coord) { + out.width(width); + } + out << ScalarIO(view.at(coord)); + } + + return out; +} + +/// Helper to write a rank of a TensorView +template < + typename Element, + typename Layout +> +inline std::ostream & TensorView_WriteRank( + std::ostream& out, + TensorView const& view, + Coord const &start_coord, + int rank, + std::streamsize width) { + + // If called on the least significant rank, write the result as a row + if (rank + 1 == Layout::kRank) { + return TensorView_WriteLeastSignificantRank(out, view, start_coord, rank, width); + } + + // Otherwise, write a sequence of rows and newlines + for (int idx = 0; idx < view.extent(rank); ++idx) { + + Coord coord(start_coord); + coord[rank] = idx; + + if (rank + 2 == Layout::kRank) { + // Write least significant ranks asa matrix with rows delimited by "\n" + if (idx) { + out << ",\n"; + } + TensorView_WriteLeastSignificantRank(out, view, coord, rank + 1, width); + } + else { + // Higher ranks are separated by newlines + if (idx) { + out << ",\n\n"; + } + TensorView_WriteRank(out, view, coord, rank + 1, width); + } + } + + return out; +} + +/// Helper to write the least significant rank of a TensorView +template < + typename Element, + typename Layout +> +inline std::ostream & TensorViewPlanarComplex_WriteLeastSignificantRank( + std::ostream& out, + TensorViewPlanarComplex const& view, + Coord const &start_coord, + int rank, + std::streamsize width) { + + for (int idx = 0; idx < 
view.extent(rank); ++idx) { + + Coord coord(start_coord); + coord[rank] = idx; + + if (idx) { + out.width(0); + out << ", "; + } + if (idx || coord) { + out.width(width); + } + + complex x = view.at(coord); + out << x; + } + + return out; +} + +/// Helper to write a rank of a TensorView +template < + typename Element, + typename Layout +> +inline std::ostream & TensorViewPlanarComplex_WriteRank( + std::ostream& out, + TensorViewPlanarComplex const& view, + Coord const &start_coord, + int rank, + std::streamsize width) { + + // If called on the least significant rank, write the result as a row + if (rank + 1 == Layout::kRank) { + return TensorViewPlanarComplex_WriteLeastSignificantRank(out, view, start_coord, rank, width); + } + + // Otherwise, write a sequence of rows and newlines + for (int idx = 0; idx < view.extent(rank); ++idx) { + + Coord coord(start_coord); + coord[rank] = idx; + + if (rank + 2 == Layout::kRank) { + // Write least significant ranks asa matrix with rows delimited by ";\n" + if (idx) { + out << ";\n"; + } + TensorViewPlanarComplex_WriteLeastSignificantRank(out, view, coord, rank + 1, width); + } + else { + // Higher ranks are separated by newlines + if (idx) { + out << "\n"; + } + TensorViewPlanarComplex_WriteRank(out, view, coord, rank + 1, width); + } + } + + return out; +} + +} // namespace detail + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Prints human-readable representation of a TensorView to an ostream +template < + typename Element, + typename Layout +> +inline std::ostream& TensorViewWrite( + std::ostream& out, + TensorView const& view) { + + // Prints a TensorView according to the following conventions: + // - least significant rank is printed as rows separated by ";\n" + // - all greater ranks are delimited with newlines + // + // The result is effectively a whitespace-delimited series of 2D matrices. + + return detail::TensorView_WriteRank(out, view, Coord(), 0, out.width()); +} + +/// Prints human-readable representation of a TensorView to an ostream +template < + typename Element, + typename Layout +> +inline std::ostream& operator<<( + std::ostream& out, + TensorView const& view) { + + // Prints a TensorView according to the following conventions: + // - least significant rank is printed as rows separated by ";\n" + // - all greater ranks are delimited with newlines + // + // The result is effectively a whitespace-delimited series of 2D matrices. + + return TensorViewWrite(out, view); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Prints human-readable representation of a TensorView to an ostream +template < + typename Element, + typename Layout +> +inline std::ostream& TensorViewWrite( + std::ostream& out, + TensorViewPlanarComplex const& view) { + + // Prints a TensorView according to the following conventions: + // - least significant rank is printed as rows separated by ";\n" + // - all greater ranks are delimited with newlines + // + // The result is effectively a whitespace-delimited series of 2D matrices. 
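+  // Illustrative usage (comment-only sketch, not part of the upstream header):
+  // given a tensor whose host_view() yields a TensorView or a
+  // TensorViewPlanarComplex, either form below prints it with the conventions
+  // described above. The tensor name and element/layout types are assumptions
+  // made for this example.
+  //
+  //   std::cout << tensor.host_view() << "\n";                  // operator<<
+  //   cutlass::TensorViewWrite(std::cout, tensor.host_view());  // explicit call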
+ + return detail::TensorViewPlanarComplex_WriteRank(out, view, Coord(), 0, out.width()); +} + +/// Prints human-readable representation of a TensorView to an ostream +template < + typename Element, + typename Layout +> +inline std::ostream& operator<<( + std::ostream& out, + TensorViewPlanarComplex const& view) { + + // Prints a TensorView according to the following conventions: + // - least significant rank is printed as rows separated by ";\n" + // - all greater ranks are delimited with newlines + // + // The result is effectively a whitespace-delimited series of 2D matrices. + + return TensorViewWrite(out, view); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass diff --git a/csrc/quantization/cutlass_test/exceptions.h b/csrc/quantization/cutlass_test/exceptions.h new file mode 100644 index 0000000000000..54c62fdbb6f5d --- /dev/null +++ b/csrc/quantization/cutlass_test/exceptions.h @@ -0,0 +1,69 @@ +/****************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +#pragma once + +/** + * \file + * \brief C++ exception semantics for CUDA error codes + */ + +#include +#include +#include + +#include "cutlass/platform/platform.h" + +namespace cutlass { + +/// C++ exception wrapper for CUDA \p cudaError_t +class cuda_exception : public std::exception { + public: + /// Constructor + cuda_exception(const char* msg = "", cudaError_t err = cudaErrorUnknown) : msg(msg), err(err) {} + + /// Returns the underlying CUDA \p cudaError_t + cudaError_t cudaError() const { return err; } + + protected: + /// Explanatory string + const char* msg; + + /// Underlying CUDA \p cudaError_t + cudaError_t err; +}; + +/// Writes a cuda_exception instance to an output stream +inline std::ostream& operator<<(std::ostream& out, cuda_exception const& e) { + return out << e.what() << ": " << cudaGetErrorString(e.cudaError()); +} + +} // namespace cutlass diff --git a/csrc/quantization/cutlass_test/helper.h b/csrc/quantization/cutlass_test/helper.h new file mode 100644 index 0000000000000..f333fab9cac53 --- /dev/null +++ b/csrc/quantization/cutlass_test/helper.h @@ -0,0 +1,94 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include "cuda_runtime.h" +#include + +/** + * Panic wrapper for unwinding CUDA runtime errors + */ +#define CUDA_CHECK(status) \ + { \ + cudaError_t error = status; \ + if (error != cudaSuccess) { \ + std::cerr << "Got bad cuda status: " << cudaGetErrorString(error) \ + << " at line: " << __LINE__ << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } + + +/** + * GPU timer for recording the elapsed time across kernel(s) launched in GPU stream + */ +struct GpuTimer +{ + cudaStream_t _stream_id; + cudaEvent_t _start; + cudaEvent_t _stop; + + /// Constructor + GpuTimer() : _stream_id(0) + { + CUDA_CHECK(cudaEventCreate(&_start)); + CUDA_CHECK(cudaEventCreate(&_stop)); + } + + /// Destructor + ~GpuTimer() + { + CUDA_CHECK(cudaEventDestroy(_start)); + CUDA_CHECK(cudaEventDestroy(_stop)); + } + + /// Start the timer for a given stream (defaults to the default stream) + void start(cudaStream_t stream_id = 0) + { + _stream_id = stream_id; + CUDA_CHECK(cudaEventRecord(_start, _stream_id)); + } + + /// Stop the timer + void stop() + { + CUDA_CHECK(cudaEventRecord(_stop, _stream_id)); + } + + /// Return the elapsed time (in milliseconds) + float elapsed_millis() + { + float elapsed = 0.0; + CUDA_CHECK(cudaEventSynchronize(_stop)); + CUDA_CHECK(cudaEventElapsedTime(&elapsed, _start, _stop)); + return elapsed; + } +}; diff --git a/csrc/quantization/cutlass_test/host_tensor.h b/csrc/quantization/cutlass_test/host_tensor.h new file mode 100644 index 0000000000000..3f061875b48dc --- /dev/null +++ b/csrc/quantization/cutlass_test/host_tensor.h @@ -0,0 +1,541 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +/*! \file + \brief HostTensor contributes management for both host and device memory. + + HostTensor allocates host and device memory upon construction. Basic element-wise operations on + host memory synchronize device memory automatically. Explicit copy operations provide abstractions + for CUDA memcpy operations. + + Call {host, device}_{data, ref, view}() for accessing host or device memory. + + See cutlass/tensor_ref.h and cutlass/tensor_view.h for more details. +*/ + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/fast_math.h" + +#include "device_memory.h" + +namespace cutlass { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Host tensor +template < + /// Data type of element stored within tensor (concept: NumericType) + typename Element_, + /// Defines a mapping from logical coordinate to linear memory (concept: Layout) + typename Layout_ +> +class HostTensor { +public: + + /// Data type of individual access + using Element = Element_; + + /// Mapping function from logical coordinate to linear memory + using Layout = Layout_; + + /// Logical rank of tensor index space + static int const kRank = Layout::kRank; + + /// Index type + using Index = typename Layout::Index; + + /// Long index used for pointer offsets + using LongIndex = typename Layout::LongIndex; + + /// Coordinate in logical tensor space + using TensorCoord = typename Layout::TensorCoord; + + /// Layout's stride vector + using Stride = typename Layout::Stride; + + /// Tensor reference to device memory + using TensorRef = TensorRef; + + /// Tensor reference to constant device memory + using ConstTensorRef = typename TensorRef::ConstTensorRef; + + /// Tensor reference to device memory + using TensorView = TensorView; + + /// Tensor reference to constant device memory + using ConstTensorView = typename TensorView::ConstTensorView; + + /// Reference to element in tensor + using Reference = typename TensorRef::Reference; + + /// Constant reference to element in tensor + using ConstReference = typename ConstTensorRef::Reference; + +private: + using StorageUnit = typename platform::conditional_t, uint8_t, // Avoid the std::vector specialization + typename platform::conditional_t::value % 8 == 0, // Handle subbyte types + Element, uint8_t>>; + using StorageContainerCalculator = cutlass::detail::StorageContainerCalculator; + static constexpr int kContainerTypeNumBits = StorageContainerCalculator::kContainerTypeNumBits; + static constexpr int kContainerTypeNumLogicalElements = StorageContainerCalculator::kContainerTypeNumLogicalElements; + static constexpr int kContainerTypeNumBytes = StorageContainerCalculator::kContainerTypeNumBytes; + static constexpr int kContainerTypeNumStorageUnit = StorageContainerCalculator::kContainerTypeNumStorageUnit; + + // + // Data members + // + + /// Extent of tensor in logical dimensions + TensorCoord extent_; + + /// Layout object + Layout layout_; + + /// Host-side memory allocation + std::vector host_; + + /// Device-side memory + device_memory::allocation device_; + + /// number of containers + size_t count_to_container_storage_unit_count(size_t count) { + return (count + kContainerTypeNumLogicalElements - 1) / kContainerTypeNumLogicalElements * kContainerTypeNumStorageUnit; + } + +public: + // + // Device and Host Methods + // + + 
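+  // Typical lifecycle (illustrative sketch only; the element type, layout, and
+  // extent below are placeholders chosen for the example, not requirements of
+  // this class):
+  //
+  //   cutlass::HostTensor<float, cutlass::layout::RowMajor> A({rows, cols});
+  //   A.at({0, 0}) = 1.0f;   // element-wise writes land in host memory
+  //   A.sync_device();       // explicit host -> device copy before a kernel
+  //   /* ... launch a kernel that consumes A.device_ref() ... */
+  //   A.sync_host();         // explicit device -> host copy to read results
+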
/// Default constructor + HostTensor() {} + + /// Constructs a tensor given an extent. Assumes a packed layout + HostTensor( + TensorCoord const &extent, + bool device_backed = true + ) { + + this->reset(extent, Layout::packed(extent), device_backed); + } + + /// Constructs a tensor given an extent and layout + HostTensor( + TensorCoord const &extent, + Layout const &layout, + bool device_backed = true + ) { + + this->reset(extent, layout, device_backed); + } + + ~HostTensor() { } + + /// Clears the HostTensor allocation to size/capacity = 0 + void reset() { + extent_ = TensorCoord(); + layout_ = Layout::packed(extent_); + + host_.clear(); + device_.reset(); + } + + /// Resizes internal memory allocations without affecting layout or extent + void reserve( + size_t count, ///< size of tensor in elements + bool device_backed_ = true) { ///< if true, device memory is also allocated +#if (CUTLASS_DEBUG_TRACE_LEVEL > 1) + CUTLASS_TRACE_HOST("cutlass::HostTensor::reserve(count=" << count << ", device_backed_=" << (device_backed_ ? "true" : "false") << ")"); +#endif + + device_.reset(); + host_.clear(); + + size_t count_container = count_to_container_storage_unit_count(count); +#if (CUTLASS_DEBUG_TRACE_LEVEL > 1) + CUTLASS_TRACE_HOST("cutlass::HostTensor::reserve: host_.resize(" << count_container << ")"); +#endif + host_.resize(count_container); + + // Allocate memory + StorageUnit* device_memory = nullptr; + if (device_backed_) { +#if (CUTLASS_DEBUG_TRACE_LEVEL > 1) + CUTLASS_TRACE_HOST("cutlass::HostTensor::reserve: device_memory::allocate(" << count_container << ")"); +#endif + device_memory = device_memory::allocate(count_container); + } + device_.reset(device_memory, device_backed_ ? count_container : 0); + } + + /// Updates the extent and layout of the HostTensor. Allocates memory according to the new + /// extent and layout. + void reset( + TensorCoord const &extent, ///< extent of logical tensor + Layout const &layout, ///< layout object of tensor + bool device_backed_ = true) { ///< if true, device memory is also allocated. + + extent_ = extent; + layout_ = layout; + + reserve(size_t(layout_.capacity(extent_)), device_backed_); + } + + /// Updates the extent and layout of the HostTensor. Allocates memory according to the new + /// extent and layout. Assumes a packed tensor configuration. + void reset( + TensorCoord const &extent, ///< extent of logical tensor + bool device_backed_ = true) { ///< if true, device memory is also allocated. + + reset(extent, Layout::packed(extent), device_backed_); + } + + /// Changes the size of the logical tensor. Only allocates memory if new capacity exceeds reserved capacity. + /// To force allocation, call reset(). + void resize( + TensorCoord const &extent, ///< extent of logical tensor + Layout const &layout, ///< layout object of tensor + bool device_backed_ = true) { ///< if true, device memory is also allocated. + + extent_ = extent; + layout_ = layout; + + LongIndex new_size = size_t(layout_.capacity(extent_)); + LongIndex new_size_container = count_to_container_storage_unit_count((layout_.capacity(extent_))); + + if (static_cast(new_size_container) > host_.size()) { + reserve(new_size, device_backed_); + } + } + + /// Changes the size of the logical tensor. Only allocates memory if new capacity exceeds reserved capacity. + /// To force allocation, call reset(). Note, this form of resize() assumes a packed tensor configuration. 
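+  // Descriptive note: unlike reset(), which always releases and reallocates
+  // both host and device storage, the resize() overloads only call reserve()
+  // when the new packed capacity exceeds the storage already held, so
+  // shrinking or same-size changes reuse the existing allocations.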
+ void resize( + TensorCoord const &extent, ///< extent of logical tensor + bool device_backed_ = true) { ///< if true, device memory is also allocated. + + resize(extent, Layout::packed(extent), device_backed_); + } + + /// Returns the logical number of elements stored in the host tensor + size_t size() const { + return layout_.capacity(extent_); + } + + /// Returns the logical capacity in terms of number of elements. May be larger than the size(). + LongIndex capacity() const { + return host_.size() / kContainerTypeNumStorageUnit * kContainerTypeNumLogicalElements; + } + + /// Gets pointer to host data + Element * host_data() { return reinterpret_cast(host_.data()); } + + /// Gets pointer to host data with a pointer offset + Element * host_data_ptr_offset(LongIndex ptr_element_offset) { return &ReferenceFactory::get(host_data(), ptr_element_offset); } + + /// Gets a reference to an element in host memory + Reference host_data(LongIndex idx) { + return ReferenceFactory::get(host_data(), idx); + } + + /// Gets pointer to host data + Element const * host_data() const { return reinterpret_cast(host_.data()); } + + /// Gets pointer to host data with a pointer offset + Element const * host_data_ptr_offset(LongIndex ptr_element_offset) const { return &ReferenceFactory::get(host_data(), ptr_element_offset); } + + /// Gets a constant reference to an element in host memory + ConstReference host_data(LongIndex idx) const { + return ReferenceFactory::get(host_data(), idx); + } + + /// Gets pointer to device data + Element * device_data() { return reinterpret_cast(device_.get()); } + + /// Gets pointer to device data + Element const * device_data() const { return reinterpret_cast(device_.get()); } + + /// Gets pointer to device data with a pointer offset + Element * device_data_ptr_offset(LongIndex ptr_element_offset) { return &ReferenceFactory::get(device_data(), ptr_element_offset); } + + /// Gets pointer to device data with a pointer offset + Element const * device_data_ptr_offset(LongIndex ptr_element_offset) const { return &ReferenceFactory::get(device_data(), ptr_element_offset); } + + /// Accesses the tensor reference pointing to data + TensorRef host_ref(LongIndex ptr_element_offset=0) { return TensorRef(host_data_ptr_offset(ptr_element_offset), layout_); } + + /// Accesses the tensor reference pointing to data + ConstTensorRef host_ref(LongIndex ptr_element_offset=0) const { return ConstTensorRef(host_data_ptr_offset(ptr_element_offset), layout_); } + + /// Accesses the tensor reference pointing to data + TensorRef device_ref(LongIndex ptr_element_offset=0) { + return TensorRef(device_data_ptr_offset(ptr_element_offset), layout_); + } + + /// Accesses the tensor reference pointing to data + ConstTensorRef device_ref(LongIndex ptr_element_offset=0) const { + return TensorRef(device_data_ptr_offset(ptr_element_offset), layout_); + } + + /// Accesses the tensor reference pointing to data + TensorView host_view(LongIndex ptr_element_offset=0) { + return TensorView(host_data_ptr_offset(ptr_element_offset), layout_, extent_); + } + + /// Accesses the tensor reference pointing to data + ConstTensorView host_view(LongIndex ptr_element_offset=0) const { + return ConstTensorView(host_data_ptr_offset(ptr_element_offset), layout_, extent_); + } + + /// Accesses the tensor reference pointing to data + TensorView device_view(LongIndex ptr_element_offset=0) { + return TensorView(device_data_ptr_offset(ptr_element_offset), layout_, extent_); + } + + /// Accesses the tensor reference pointing to data + 
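+  // Descriptive note: the *_ref() accessors return TensorRef objects (pointer
+  // plus layout, no extent), while the *_view() accessors additionally carry
+  // the logical extent, which is what the printing helpers in
+  // tensor_view_io.h expect.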
ConstTensorView device_view(LongIndex ptr_element_offset=0) const { + return ConstTensorView(device_data_ptr_offset(ptr_element_offset), layout_, extent_); + } + + /// Returns true if device memory is allocated + bool device_backed() const { + return (device_.get() == nullptr) ? false : true; + } + + + /// Returns the layout object + Layout & layout() { + return layout_; + } + + /// Returns the layout object + Layout layout() const { + return layout_; + } + + /// Returns the layout object's stride vector + Stride stride() const { + return layout_.stride(); + } + + /// Returns the layout object's stride vector + Stride & stride() { + return layout_.stride(); + } + + /// Returns the layout object's stride in a given physical dimension + LongIndex stride(int dim) const { + return layout_.stride().at(dim); + } + + /// Returns the layout object's stride in a given physical dimension + LongIndex & stride(int dim) { + return layout_.stride().at(dim); + } + + /// Computes the offset of an index from the origin of the tensor + LongIndex offset(TensorCoord const& coord) const { + return layout_(coord); + } + + /// Returns a reference to the element at the logical Coord in host memory + Reference at(TensorCoord const& coord) { + return host_data(offset(coord)); + } + + /// Returns a const reference to the element at the logical Coord in host memory + ConstReference at(TensorCoord const& coord) const { + return host_data(offset(coord)); + } + + /// Returns the extent of the tensor + TensorCoord extent() const { + return extent_; + } + + /// Returns the extent of the tensor + TensorCoord & extent() { + return extent_; + } + + /// Copies data from device to host + void sync_host() { + if (device_backed()) { + device_memory::copy_to_host( + host_.data(), device_.get(), device_.size()); + } + } + + /// Copies data from host to device + void sync_device() { + if (device_backed()) { + device_memory::copy_to_device( + device_.get(), host_.data(), host_.size()); + } + } + + /// Copy data from a caller-supplied device pointer into host memory. + void copy_in_device_to_host( + Element const* ptr_device, ///< source device memory + LongIndex count = -1) { ///< number of elements to transfer; if negative, entire tensor is overwritten. + + if (count < 0) { + count = capacity(); + } + else { + count = __NV_STD_MIN(capacity(), count); + } + size_t container_count = count_to_container_storage_unit_count(count); + device_memory::copy_to_host( + host_.data(), reinterpret_cast(ptr_device), container_count); + } + + /// Copy data from a caller-supplied device pointer into host memory. + void copy_in_device_to_device( + Element const* ptr_device, ///< source device memory + LongIndex count = -1) { ///< number of elements to transfer; if negative, entire tensor is overwritten. + + if (count < 0) { + count = capacity(); + } + else { + count = __NV_STD_MIN(capacity(), count); + } + size_t container_count = count_to_container_storage_unit_count(count); + device_memory::copy_device_to_device( + device_.get(), reinterpret_cast(ptr_device), container_count); + } + + /// Copy data from a caller-supplied device pointer into host memory. + void copy_in_host_to_device( + Element const* ptr_host, ///< source host memory + LongIndex count = -1) { ///< number of elements to transfer; if negative, entire tensor is overwritten. 
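+    // Descriptive note: the logical element count is first clamped to
+    // capacity() and then converted to a count of StorageUnit containers, so
+    // the raw copy below moves the correct number of bytes even for sub-byte
+    // element types.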
+ + if (count < 0) { + count = capacity(); + } + else { + count = __NV_STD_MIN(capacity(), count); + } + size_t container_count = count_to_container_storage_unit_count(count); + device_memory::copy_to_device( + device_.get(), reinterpret_cast(ptr_host), container_count); + } + + /// Copy data from a caller-supplied device pointer into host memory. + void copy_in_host_to_host( + Element const* ptr_host, ///< source host memory + LongIndex count = -1) { ///< number of elements to transfer; if negative, entire tensor is overwritten. + + if (count < 0) { + count = capacity(); + } + else { + count = __NV_STD_MIN(capacity(), count); + } + size_t container_count = count_to_container_storage_unit_count(count); + device_memory::copy_host_to_host( + host_.data(), reinterpret_cast(ptr_host), container_count); + } + + /// Copy data from a caller-supplied device pointer into host memory. + void copy_out_device_to_host( + Element * ptr_host, ///< source device memory + LongIndex count = -1) const { ///< number of elements to transfer; if negative, entire tensor is overwritten. + + if (count < 0) { + count = capacity(); + } + else { + count = __NV_STD_MIN(capacity(), count); + } + size_t container_count = count_to_container_storage_unit_count(count); + device_memory::copy_to_host( + reinterpret_cast(ptr_host), device_.get(), container_count); + } + + /// Copy data from a caller-supplied device pointer into host memory. + void copy_out_device_to_device( + Element * ptr_device, ///< source device memory + LongIndex count = -1) const { ///< number of elements to transfer; if negative, entire tensor is overwritten. + + if (count < 0) { + count = capacity(); + } + else { + count = __NV_STD_MIN(capacity(), count); + } + size_t container_count = count_to_container_storage_unit_count(count); + device_memory::copy_device_to_device( + reinterpret_cast(ptr_device), device_.get(), container_count); + } + + /// Copy data from a caller-supplied device pointer into host memory. + void copy_out_host_to_device( + Element * ptr_device, ///< source host memory + LongIndex count = -1) const { ///< number of elements to transfer; if negative, entire tensor is overwritten. + + if (count < 0) { + count = capacity(); + } + else { + count = __NV_STD_MIN(capacity(), count); + } + size_t container_count = count_to_container_storage_unit_count(count); + device_memory::copy_to_device( + reinterpret_cast(ptr_device), host_.data(), container_count); + } + + /// Copy data from a caller-supplied device pointer into host memory. + void copy_out_host_to_host( + Element * ptr_host, ///< source host memory + LongIndex count = -1) const { ///< number of elements to transfer; if negative, entire tensor is overwritten. + + if (count < 0) { + count = capacity(); + } + else { + count = __NV_STD_MIN(capacity(), count); + } + size_t container_count = count_to_container_storage_unit_count(count); + device_memory::copy_host_to_host( + reinterpret_cast(ptr_host), host_.data(), container_count); + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass diff --git a/csrc/quantization/cutlass_test/packed_stride.hpp b/csrc/quantization/cutlass_test/packed_stride.hpp new file mode 100644 index 0000000000000..e9a243a1322cc --- /dev/null +++ b/csrc/quantization/cutlass_test/packed_stride.hpp @@ -0,0 +1,570 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Utilities for packing constructing canonical CuTe stride types for 3.x mainloop params. +*/ + +#pragma once + +#include "cute/layout.hpp" +#include "cute/container/array.hpp" // cute::array +#include "cutlass/conv/convolution.h" // cutlass::conv::Operator + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Strides without batch mode + +template +CUTLASS_HOST_DEVICE +cute::Stride> +make_cute_packed_stride(cute::Stride> s, cute::Shape shape_MKL) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + auto s_copy = s; + cute::get<0>(s_copy) = static_cast(cute::get<1>(shape_MKL)); + return s_copy; +} + +template +CUTLASS_HOST_DEVICE +cute::Stride, IntT> +make_cute_packed_stride(cute::Stride, IntT> s, cute::Shape shape_MKL) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + auto s_copy = s; + cute::get<1>(s_copy) = static_cast(cute::get<0>(shape_MKL)); + return s_copy; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Strides with batch mode + +template +CUTLASS_HOST_DEVICE +cute::Stride, int64_t> +make_cute_packed_stride(cute::Stride, int64_t> s, cute::Shape shape_MKL) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. 
Static strides not supported."); + auto s_copy = s; + cute::get<0>(s_copy) = static_cast(cute::get<1>(shape_MKL)); + int batch_count = cute::get<2>(shape_MKL); + if (batch_count > 1) { + cute::get<2>(s_copy) = static_cast(cute::get<0>(shape_MKL) * cute::get<1>(shape_MKL)); + } + else { + cute::get<2>(s_copy) = static_cast(0); + } + return s_copy; +} + +template +CUTLASS_HOST_DEVICE +cute::Stride, IntT, int64_t> +make_cute_packed_stride(cute::Stride, IntT, int64_t> s, cute::Shape shape_MKL) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + auto s_copy = s; + cute::get<1>(s_copy) = static_cast(cute::get<0>(shape_MKL)); + int batch_count = cute::get<2>(shape_MKL); + if (batch_count > 1) { + cute::get<2>(s_copy) = static_cast(cute::get<0>(shape_MKL) * cute::get<1>(shape_MKL)); + } + else { + cute::get<2>(s_copy) = static_cast(0); + } + return s_copy; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Strides with group mode + +template +CUTLASS_HOST_DEVICE +cute::Stride, cute::Int<0>> +make_cute_packed_stride(cute::Stride, cute::Int<0>> s, cute::Shape shape_MKL) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + auto s_copy = s; + cute::get<0>(s_copy) = static_cast(cute::get<1>(shape_MKL)); + return s_copy; +} + +template +CUTLASS_HOST_DEVICE +cute::Stride, StrideIntT, cute::Int<0>> +make_cute_packed_stride(cute::Stride, StrideIntT, cute::Int<0>> s, cute::Shape shape_MKL) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + auto s_copy = s; + cute::get<1>(s_copy) = static_cast(cute::get<0>(shape_MKL)); + return s_copy; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Strides for convolutions + +// Output cutlass::layout::TensorNDHWC -> rank-3 stride (InT,_1,_0) +// Note: For fprop/dgrad kernel, strides are assumed to be layout right in NZPQK/NDHWC order +// and therefore can be coalesced to just q/w. For wgrad kernel, strides are assumed to be layout +// right in KTRSC order and can be coalesced to just k. +// We enforce this condition here with asserts. +template +CUTLASS_HOST_DEVICE +cute::Stride, cute::Int<0>> +make_cute_packed_stride( + cute::Stride, cute::Int<0>> s, + cute::array shape_output, + cute::array stride_output, + cutlass::conv::Operator conv_op) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + static_assert(RankT_ >= 3u); + constexpr static int RankT = static_cast(RankT_); + + assert(stride_output[RankT-1] == 1); + cute::for_each(cute::make_seq{}, [&](auto i) { + assert(stride_output[i] == shape_output[i+1] * stride_output[i+1]); + }); + + auto s_copy = s; + cute::get<0>(s_copy) = (conv_op == cutlass::conv::Operator::kWgrad) ? + stride_output[0] : + stride_output[RankT-2]; + return s_copy; +} + +// +// Activation tensor ((w, h, d, n), _1) for fprop kernel +// + +// Activation cutlass::layout::TensorNWC -> rank-2 stride ((W,N),_1) +template +CUTLASS_HOST_DEVICE +cute::Stride, cute::Int<1>> +make_cute_packed_stride( + cute::Stride, cute::Int<1>> s, + cute::array stride_nwc, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. 
Static strides not supported."); + assert(stride_nwc[2] == 1); + auto s_copy = s; + cute::get<0,0>(s_copy) = stride_nwc[1]; + cute::get<0,1>(s_copy) = stride_nwc[0]; + return s_copy; +} + +// Activation cutlass::layout::TensorNHWC -> rank-2 stride ((W,H,N),_1) +template +CUTLASS_HOST_DEVICE +cute::Stride, cute::Int<1>> +make_cute_packed_stride( + cute::Stride, cute::Int<1>> s, + cute::array stride_nhwc, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + assert(stride_nhwc[3] == 1); + auto s_copy = s; + cute::for_each(cute::make_seq<3>{}, [&](auto i) { + cute::get<0,i>(s_copy) = stride_nhwc[2-i]; + }); + return s_copy; +} + +// Activation cutlass::layout::TensorNDHWC -> rank-2 stride ((W,H,D,N),_1) +template +CUTLASS_HOST_DEVICE +cute::Stride, cute::Int<1>> +make_cute_packed_stride( + cute::Stride, cute::Int<1>> s, + cute::array stride_ndhwc, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + + assert(stride_ndhwc[4] == 1); + auto s_copy = s; + cute::for_each(cute::make_seq<4>{}, [&](auto i) { + cute::get<0,i>(s_copy) = stride_ndhwc[3-i]; + }); + return s_copy; +} + +// +// Filter tensor (k, (_1, s, r, t)) for fprop kernel +// + +// Filter cutlass::layout::TensorNWC -> rank-2 stride (k, (_1, s)) +template +CUTLASS_HOST_DEVICE +cute::Stride, IntT>> +make_cute_packed_stride( + cute::Stride, IntT>> s, + cute::array stride_ksc, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + + assert(stride_ksc[2] == 1); + auto s_copy = s; + cute::get<0,0>(s_copy) = stride_ksc[0]; + cute::get<1,1>(s_copy) = stride_ksc[1]; + return s_copy; +} + +// Filter cutlass::layout::TensorNHWC -> rank-2 stride (k, (_1, s, r)) +template +CUTLASS_HOST_DEVICE +cute::Stride, IntT, IntT>> +make_cute_packed_stride( + cute::Stride, IntT, IntT>> s, + cute::array stride_krsc, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + + assert(stride_krsc[3] == 1); + auto s_copy = s; + cute::get<0,0>(s_copy) = stride_krsc[0]; + cute::for_each(cute::make_seq<2>{}, [&](auto i) { + cute::get<1,2-i>(s_copy) = stride_krsc[i+1]; + }); + return s_copy; +} + +// Filter cutlass::layout::TensorNDHWC -> rank-2 stride (k, (_1, s, r, t)) +template +CUTLASS_HOST_DEVICE +cute::Stride, IntT, IntT, IntT>> +make_cute_packed_stride( + cute::Stride, IntT, IntT, IntT>> s, + cute::array stride_ktrsc, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. 
Static strides not supported."); + + assert(stride_ktrsc[4] == 1); + auto s_copy = s; + cute::get<0,0>(s_copy) = stride_ktrsc[0]; + cute::for_each(cute::make_seq<3>{}, [&](auto i) { + cute::get<1,3-i>(s_copy) = stride_ktrsc[i+1]; + }); + return s_copy; +} + +// +// Activation tensor (_1, (w, h, d, n)) for wgrad kernel +// +// It is also Filter tensor ((_1), (k, s, r, t)) for dgrad kernel +// + +// Activation cutlass::layout::TensorNWC -> rank-2 stride (_1, (W,N)) in wgrad +// Filter cutlass::layout::TensorNWC -> rank-2 stride ((_1), (k, s)) in dgrad +template +CUTLASS_HOST_DEVICE +cute::Stride, cute::Stride> +make_cute_packed_stride( + cute::Stride, cute::Stride> s, + cute::array stride_nwc, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + + assert(stride_nwc[2] == 1); + auto s_copy = s; + if (ConvOp == cutlass::conv::Operator::kWgrad) { + cute::get<1,0>(s_copy) = stride_nwc[1]; + cute::get<1,1>(s_copy) = stride_nwc[0]; + } + else if (ConvOp == cutlass::conv::Operator::kDgrad) { + // stride_nwc in dgrad is ksc. + cute::get<1,0>(s_copy) = stride_nwc[0]; + cute::get<1,1>(s_copy) = stride_nwc[1]; + } + return s_copy; +} + +// Activation cutlass::layout::TensorNHWC -> rank-2 stride (_1, (W,H,N)) in wgrad +// Filter cutlass::layout::TensorNHWC -> rank-2 stride ((_1), (k, s, r)) in dgrad +template +CUTLASS_HOST_DEVICE +cute::Stride, cute::Stride> +make_cute_packed_stride( + cute::Stride, cute::Stride> s, + cute::array stride_nhwc, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + + assert(stride_nhwc[3] == 1); + auto s_copy = s; + if (ConvOp == cutlass::conv::Operator::kWgrad) { + cute::for_each(cute::make_seq<3>{}, [&](auto i) { + cute::get<1,i>(s_copy) = stride_nhwc[2-i]; + }); + } + else if (ConvOp == cutlass::conv::Operator::kDgrad) { + // stride_nhwc in dgrad is krsc. + cute::get<1,0>(s_copy) = stride_nhwc[0]; + cute::for_each(cute::make_seq<2>{}, [&](auto i) { + cute::get<1,2-i>(s_copy) = stride_nhwc[i+1]; + }); + } + return s_copy; +} + +// Activation cutlass::layout::TensorNDHWC -> rank-2 stride (_1, (W,H,D,N)) in wgrad +// Filter cutlass::layout::TensorNDHWC -> rank-2 stride ((_1), (k, s, r, t)) in dgrad +template +CUTLASS_HOST_DEVICE +cute::Stride, cute::Stride> +make_cute_packed_stride( + cute::Stride, cute::Stride> s, + cute::array stride_ndhwc, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + + assert(stride_ndhwc[4] == 1); + auto s_copy = s; + if (ConvOp == cutlass::conv::Operator::kWgrad) { + cute::for_each(cute::make_seq<4>{}, [&](auto i) { + cute::get<1,i>(s_copy) = stride_ndhwc[3-i]; + }); + } + else if (ConvOp == cutlass::conv::Operator::kDgrad) { + // stride_ndhwc in dgrad is ktrsc. + cute::get<1,0>(s_copy) = stride_ndhwc[0]; + cute::for_each(cute::make_seq<3>{}, [&](auto i) { + cute::get<1,3-i>(s_copy) = stride_ndhwc[i+1]; + }); + } + return s_copy; +} + +// +// NZPQ tensor (_1, nzpq) for wgrad kernel +// + +// cutlass::layout::TensorNWC -> rank-2 stride (_1, nzpq) +template +CUTLASS_HOST_DEVICE +cute::Stride, IntT> +make_cute_packed_stride( + cute::Stride, IntT> s, + cute::array stride_nqk, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. 
Static strides not supported."); + + assert(stride_nqk[2] == 1); + auto s_copy = s; + cute::get<1>(s_copy) = stride_nqk[1]; + return s_copy; +} + +// cutlass::layout::TensorNHWC -> rank-2 stride (_1, nzpq) +template +CUTLASS_HOST_DEVICE +cute::Stride, IntT> +make_cute_packed_stride( + cute::Stride, IntT> s, + cute::array stride_npqk, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + + assert(stride_npqk[3] == 1); + auto s_copy = s; + cute::get<1>(s_copy) = stride_npqk[2]; + return s_copy; +} + +// cutlass::layout::TensorNDHWC -> rank-2 stride (_1, nzpq) +template +CUTLASS_HOST_DEVICE +cute::Stride, IntT> +make_cute_packed_stride( + cute::Stride, IntT> s, + cute::array stride_nzpqk, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + + assert(stride_nzpqk[4] == 1); + auto s_copy = s; + cute::get<1>(s_copy) = stride_nzpqk[3]; + return s_copy; +} + + + +// +// Wgrad output tensor (k, (_1, s, r, t), _0) +// + +// Filter cutlass::layout::TensorKCS -> rank-3 stride (k, (_1, s), _0) +template +CUTLASS_HOST_DEVICE +cute::Stride, IntT>, cute::Int<0>> +make_cute_packed_stride( + cute::Stride, IntT>, cute::Int<0>> s, + [[maybe_unused]] cute::array shape_output, + cute::array stride_ksc, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + + assert(stride_ksc[2] == 1); + auto s_copy = s; + cute::get<0,0>(s_copy) = stride_ksc[0]; + cute::get<1,1>(s_copy) = stride_ksc[1]; + return s_copy; +} + +// Filter cutlass::layout::TensorKCSR -> rank-3 stride (k, (_1, s, r), _0) +template +CUTLASS_HOST_DEVICE +cute::Stride, IntT, IntT>, cute::Int<0>> +make_cute_packed_stride( + cute::Stride, IntT, IntT>, cute::Int<0>> s, + [[maybe_unused]] cute::array shape_output, + cute::array stride_krsc, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + + assert(stride_krsc[3] == 1); + auto s_copy = s; + cute::get<0,0>(s_copy) = stride_krsc[0]; + cute::for_each(cute::make_seq<2>{}, [&](auto i) { + cute::get<1,2-i>(s_copy) = stride_krsc[i+1]; + }); + return s_copy; +} + +// Filter cutlass::layout::TensorKCSRT -> rank-3 stride (k, (_1, s, r, t), _0) +template +CUTLASS_HOST_DEVICE +cute::Stride, IntT, IntT, IntT>, cute::Int<0>> +make_cute_packed_stride( + cute::Stride, IntT, IntT, IntT>, cute::Int<0>> s, + [[maybe_unused]] cute::array shape_output, + cute::array stride_ktrsc, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. 
Static strides not supported."); + + assert(stride_ktrsc[4] == 1); + auto s_copy = s; + cute::get<0,0>(s_copy) = stride_ktrsc[0]; + cute::for_each(cute::make_seq<3>{}, [&](auto i) { + cute::get<1,3-i>(s_copy) = stride_ktrsc[i+1]; + }); + return s_copy; +} + + +// +// Wgrad output tensor ((_1, s, r, t), k, _0) +// + +// Filter cutlass::layout::TensorCSK -> rank-3 stride ((_1, s), k, _0) +template +CUTLASS_HOST_DEVICE +cute::Stride, IntT>, IntT, cute::Int<0>> +make_cute_packed_stride( + cute::Stride, IntT>, IntT, cute::Int<0>> s, + [[maybe_unused]] cute::array shape_output, + cute::array stride_ksc, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + + assert(stride_ksc[2] == 1); + auto s_copy = s; + cute::get<1,0>(s_copy) = stride_ksc[0]; + cute::get<0,1>(s_copy) = stride_ksc[1]; + return s_copy; +} + +// Filter cutlass::layout::TensorCSRK -> rank-3 stride ((_1, s, r), k, _0) +template +CUTLASS_HOST_DEVICE +cute::Stride, IntT, IntT>, IntT, cute::Int<0>> +make_cute_packed_stride( + cute::Stride, IntT, IntT>, IntT, cute::Int<0>> s, + [[maybe_unused]] cute::array shape_output, + cute::array stride_krsc, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + + assert(stride_krsc[3] == 1); + auto s_copy = s; + cute::get<1,0>(s_copy) = stride_krsc[0]; + cute::for_each(cute::make_seq<2>{}, [&](auto i) { + cute::get<0,2-i>(s_copy) = stride_krsc[i+1]; + }); + return s_copy; +} + +// Filter cutlass::layout::TensorCSRTK -> rank-3 stride ((_1, s, r, t), k, _0) +template +CUTLASS_HOST_DEVICE +cute::Stride, IntT, IntT, IntT>, IntT, cute::Int<0>> +make_cute_packed_stride( + cute::Stride, IntT, IntT, IntT>, IntT, cute::Int<0>> s, + [[maybe_unused]] cute::array shape_output, + cute::array stride_ktrsc, + conv::Operator ConvOp) { + static_assert(std::is_integral_v, + "Stride must have an integral type so it can be set dynamically. Static strides not supported."); + + assert(stride_ktrsc[4] == 1); + auto s_copy = s; + cute::get<1,0>(s_copy) = stride_ktrsc[0]; + cute::for_each(cute::make_seq<3>{}, [&](auto i) { + cute::get<0,3-i>(s_copy) = stride_ktrsc[i+1]; + }); + return s_copy; +} +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass diff --git a/csrc/quantization/cutlass_test/test_mm_c3x.cu b/csrc/quantization/cutlass_test/test_mm_c3x.cu new file mode 100644 index 0000000000000..b544e01a2913a --- /dev/null +++ b/csrc/quantization/cutlass_test/test_mm_c3x.cu @@ -0,0 +1,205 @@ +// clang-format will break include orders +// clang-format off +#include + +#if defined CUDA_VERSION && CUDA_VERSION >= 12000 + +#include + +#include + +#include +#include +#include + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" +#include "cute/atom/mma_atom.hpp" +#include "cutlass/numeric_types.h" + +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" + +#include "broadcast_load_epilogue_c3x.hpp" +#include "common.hpp" +// clang-format on + +#include "common_gemm.cuh" + +template typename Epilogue, + typename... 
EpilogueArgs> +void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& e, + torch::Tensor const& b, + EpilogueArgs&&... args) { + static_assert(std::is_same()); + TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn); + TORCH_CHECK(e.dtype() == torch::kUInt8); + TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn); + + using Cutlass3xGemmDefault = + typename sm90_fp8_config_default::Cutlass3xGemm; + using Cutlass3xGemmM64 = + typename sm90_fp8_config_M64::Cutlass3xGemm; + using Cutlass3xGemmM128 = + typename sm90_fp8_config_M128::Cutlass3xGemm; + + uint32_t const m = a.size(0); + uint32_t const mp2 = + std::max(static_cast(64), next_pow_2(m)); // next power of 2 + + if (mp2 <= 64) { + // m in [1, 64] + return cutlass_test_gemm_caller( + out, a, e, b, std::forward(args)...); + } else if (mp2 <= 128) { + // m in (64, 128] + return cutlass_test_gemm_caller( + out, a, e, b, std::forward(args)...); + } else { + // m in (128, inf) + return cutlass_test_gemm_caller( + out, a, e, b, std::forward(args)...); + } +} + +template typename Epilogue, + typename... EpilogueArgs> +void cutlass_gemm_sm90_int8_dispatch(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& e, + torch::Tensor const& b, + EpilogueArgs&&... args) { + static_assert(std::is_same()); + TORCH_CHECK(a.dtype() == torch::kInt8); + TORCH_CHECK(e.dtype() == torch::kUInt8); + TORCH_CHECK(b.dtype() == torch::kInt8); + + using Cutlass3xGemmDefault = + typename sm90_int8_config_default::Cutlass3xGemm; + using Cutlass3xGemmM128 = + typename sm90_int8_config_M128::Cutlass3xGemm; + using Cutlass3xGemmM64 = + typename sm90_int8_config_M64::Cutlass3xGemm; + using Cutlass3xGemmM32NBig = + typename sm90_int8_config_M32_NBig::Cutlass3xGemm; + using Cutlass3xGemmM32NSmall = + typename sm90_int8_config_M32_NSmall::Cutlass3xGemm; + + uint32_t const n = out.size(1); + bool const is_small_n = n < 8192; + + uint32_t const m = a.size(0); + uint32_t const mp2 = + std::max(static_cast(32), next_pow_2(m)); // next power of 2 + + if (mp2 <= 32) { + // m in [1, 32] + if (is_small_n) { + return cutlass_test_gemm_caller( + out, a, e, b, std::forward(args)...); + } else { + return cutlass_test_gemm_caller( + out, a, e, b, std::forward(args)...); + } + } else if (mp2 <= 64) { + // m in (32, 64] + return cutlass_test_gemm_caller( + out, a, e, b, std::forward(args)...); + } else if (mp2 <= 128) { + // m in (64, 128] + return cutlass_test_gemm_caller( + out, a, e, b, std::forward(args)...); + } else { + // m in (128, inf) + return cutlass_test_gemm_caller( + out, a, e, b, std::forward(args)...); + } +} + +template