Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DRAFT] use cutlass for 24 #33

Draft
wants to merge 53 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
53 commits
Select commit Hold shift + click to select a range
5d51361
Add cutlass 2:4 infrastructure
Faraz9877 Oct 22, 2024
17f5b96
Update with test code
Faraz9877 Oct 28, 2024
471a03c
Clean up a bit; both fp8 and int8 working
Faraz9877 Oct 30, 2024
0b332fb
Add fp16 and bf16 support to sparse cutlass mm
Faraz9877 Oct 30, 2024
da31648
semi_structured for fp16 and bf16 and int8
ilmarkov Oct 1, 2024
e655f94
Fix A100 int8 tests
ilmarkov Oct 2, 2024
5fc3c1c
Add fp8 cusparseLt
ilmarkov Oct 9, 2024
9cf36d6
wip
ilmarkov Oct 9, 2024
ad09e79
Fix signatures
ilmarkov Oct 9, 2024
e75eabc
Fix compilation and tests
ilmarkov Oct 13, 2024
0306390
Update for older platforms
ilmarkov Oct 15, 2024
1021acb
Add benchmarks
ilmarkov Oct 16, 2024
19ce358
Fix typo
ilmarkov Oct 23, 2024
959408c
Added scaled_mm for fp8.
ilmarkov Oct 24, 2024
117b87b
Add docstrings
ilmarkov Oct 28, 2024
2c7e68e
Update for torch 2.5
ilmarkov Oct 30, 2024
922f4f8
Add handling contiguous dense input for int8 and fp8
ilmarkov Oct 30, 2024
beca038
Add fp8 cusparseLt
ilmarkov Oct 9, 2024
5d9cd25
Fix compilation and tests
ilmarkov Oct 13, 2024
39ad9d4
Add caching of cusparseLT meta
ilmarkov Oct 23, 2024
520eb62
Cached cusparseLt
ilmarkov Oct 25, 2024
20956e6
Fix destroy function
ilmarkov Oct 25, 2024
87c8088
Prepare for reproduce
ilmarkov Oct 25, 2024
4ea58b1
Fix cusparseLt caching
ilmarkov Oct 30, 2024
f0551ef
Make cached version default function
ilmarkov Nov 5, 2024
d7476e8
Fixes and polishing after rebase
ilmarkov Nov 6, 2024
681ea5e
add sparse 2:4 weight loading support
dsikka Oct 23, 2024
ecf878f
Some more changes!
rahul-tuli Oct 29, 2024
80952dc
Cleanup
rahul-tuli Oct 31, 2024
8462c9d
get uncompressed to work; update gemm to use contiguous; use alex's u…
dsikka Nov 1, 2024
0a3e506
patch
dsikka Nov 4, 2024
2e28972
use our decompressor
dsikka Nov 4, 2024
28f0abb
Some more work
rahul-tuli Nov 6, 2024
c7a97a8
Use new scaled_T function
rahul-tuli Nov 7, 2024
ccadad0
Add multiprocessing for kernel sweep benchmarking
Faraz9877 Nov 8, 2024
807737c
Add multi-GPU
Faraz9877 Nov 8, 2024
04c19a5
Add cutlass_scaled_sparse_mm op
Faraz9877 Nov 14, 2024
2a85c5a
Clean up
Faraz9877 Nov 14, 2024
1b381c9
Update code
Faraz9877 Nov 14, 2024
4e31076
Update code
Faraz9877 Nov 14, 2024
13fccf4
Clean up the benchmarking
Faraz9877 Nov 14, 2024
b345cc8
Clean up the cutlass benchmarking
Faraz9877 Nov 14, 2024
2d03e1d
Fix cmake errors
Faraz9877 Nov 14, 2024
e9439cc
Fix the cmake TAG
Faraz9877 Nov 14, 2024
4ba7c0f
Merge branch 'buildable' into rahul-quant-merged-rs
robertgshaw2-neuralmagic Nov 15, 2024
f74ef37
update
robertgshaw2-neuralmagic Nov 15, 2024
f5bc9eb
fixed
robertgshaw2-neuralmagic Nov 15, 2024
1316076
updated
robertgshaw2-neuralmagic Nov 15, 2024
4d2b12c
updated
robertgshaw2-neuralmagic Nov 15, 2024
fe30b53
updated, calling things properly
robertgshaw2-neuralmagic Nov 15, 2024
4c61b19
running end to end but not passing
robertgshaw2-neuralmagic Nov 15, 2024
86716f8
updated
robertgshaw2-neuralmagic Nov 15, 2024
c796ac8
Some cleanup
rahul-tuli Nov 15, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 17 additions & 9 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -202,18 +202,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
set(CUTLASS_REVISION "v3.5.1" CACHE STRING "CUTLASS revision to use")
set(CUTLASS_REVISION "v3.6.0" CACHE STRING "CUTLASS revision to use")

FetchContent_Declare(
cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
GIT_TAG v3.5.1
GIT_TAG 8aa95dbb888be6d81c6fbf7169718c5244b53227
GIT_PROGRESS TRUE

# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
# Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
# So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
GIT_SHALLOW TRUE
# GIT_SHALLOW FALSE
)
FetchContent_MakeAvailable(cutlass)

Expand All @@ -225,7 +225,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/custom_all_reduce.cu"
"csrc/permute_cols.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu")
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
"csrc/sparse/cutlass/sparse_compressor.cu")

set_gencode_flags_for_srcs(
SRCS "${VLLM_EXT_SRC}"
Expand Down Expand Up @@ -255,11 +257,14 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()

#
# The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
# The cutlass_scaled_mm, cutlass_scaled_sparse_mm, and cutlass_compressor
# kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
# CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
"csrc/sparse/cutlass/sparse_compressor.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
Expand All @@ -268,12 +273,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
message(STATUS "Not building cutlass_c3x kernels as CUDA Compiler version is "
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
"later if you intend on running FP8 quantized models on "
"later if you intend on running FP8 quantized or sparse models on "
"Hopper.")
else()
message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
message(STATUS "Not building cutlass_c3x as no compatible archs found "
"in CUDA target architectures")
endif()

Expand Down Expand Up @@ -398,6 +403,9 @@ define_gpu_extension_target(
# Setting this variable sidesteps the issue by calling the driver directly.
target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)

# include(nm_cutlass_c.cmake)
# build_nm_cutlass_c()

#
# _moe_C extension
#
Expand Down
311 changes: 311 additions & 0 deletions benchmarks/cutlass_benchmarks/sparse_mm/bench_v1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,311 @@
## Cutlass benchmark V1

from typing import Callable, Iterable

import torch
import torch.utils.benchmark as TBenchmark
from torch.utils.benchmark import Measurement as TMeasurement
from utils import make_rand_sparse_tensors

import vllm._custom_ops as ops


# bench
def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
             min_run_time: float = 1.0, **kwargs) -> TMeasurement:
    """Time ``fn(*args, **kwargs)`` with torch.utils.benchmark.

    Args:
        label: Top-level label for the measurement (e.g. the problem shape).
        sub_label: Secondary label (e.g. the MKN configuration).
        description: Name of the specific implementation being timed.
        fn: Callable to benchmark.
        *args: Positional arguments forwarded to ``fn``.
        min_run_time: Minimum total measurement time in seconds for
            ``blocked_autorange``. Keyword-only; defaults to 1.0, the value
            previously hard-coded. (Note: a ``min_run_time`` keyword is
            consumed here and NOT forwarded to ``fn``.)
        **kwargs: Keyword arguments forwarded to ``fn``.

    Returns:
        The TMeasurement produced by ``Timer.blocked_autorange``.
    """
    globals = {
        "args": args,
        "kwargs": kwargs,
        "fn": fn,
    }
    return TBenchmark.Timer(
        stmt="fn(*args, **kwargs)",
        globals=globals,
        label=label,
        sub_label=sub_label,
        description=description,
    ).blocked_autorange(min_run_time=min_run_time)


def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
               sub_label: str) -> Iterable[TMeasurement]:
    """Benchmark int8 sparse CUTLASS GEMM variants against dense torch.mm
    baselines, returning one TMeasurement per implementation."""
    assert dtype == torch.int8
    a_compressed, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k)

    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)

    # (description, fn, positional args) for every variant, in order:
    # two dense PyTorch baselines followed by the four sparse CUTLASS cases.
    cases = [
        ("pytorch_bf16_bf16_bf16_matmul-no-scales", torch.mm,
         (a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16))),
        ("pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm,
         (a.to(dtype=torch.float16), b.to(dtype=torch.float16))),
        ("cutlass_i8_i8_bf16_scaled_sparse_mm",
         ops.cutlass_scaled_sparse_mm,
         (a_compressed, e, b, scale_a, scale_b, torch.bfloat16)),
        ("cutlass_i8_i8_bf16_scaled_sparse_mm_bias",
         ops.cutlass_scaled_sparse_mm,
         (a_compressed, e, b, scale_a, scale_b, torch.bfloat16, bias)),
        ("cutlass_i8_i8_fp16_scaled_sparse_mm",
         ops.cutlass_scaled_sparse_mm,
         (a_compressed, e, b, scale_a, scale_b, torch.float16)),
        ("cutlass_i8_i8_fp16_scaled_sparse_mm_bias",
         ops.cutlass_scaled_sparse_mm,
         (a_compressed, e, b, scale_a, scale_b, torch.float16,
          bias.to(dtype=torch.float16))),
    ]

    return [
        bench_fn(label, sub_label, description, fn, *fn_args)
        for description, fn, fn_args in cases
    ]


def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
              sub_label: str) -> Iterable[TMeasurement]:
    """Benchmark fp8 sparse CUTLASS GEMM against torch._scaled_mm baselines.

    Args:
        dtype: Must be torch.float8_e4m3fn.
        m, k, n: GEMM problem shape.
        label, sub_label: Labels attached to every measurement.

    Returns:
        A list of TMeasurement, one per benchmarked implementation.
    """
    assert dtype == torch.float8_e4m3fn

    # Create tensors. Note: here the compressed operand comes first and the
    # dense operand is passed transposed to the sparse kernel, with the
    # scales swapped to match the operand order.
    b_compressed, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m,
                                                     n, k)
    aT = a.t()
    bT = b.t()
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)

    timers = []

    # pytorch impl w. bf16 (dense matmul, no scales)
    timers.append(
        bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
                 torch.mm, a.to(dtype=torch.bfloat16, device="cuda"),
                 bT.to(dtype=torch.bfloat16, device="cuda")))

    # pytorch impl: bf16 output, without fp8 fast accum
    timers.append(
        bench_fn(label,
                 sub_label,
                 "pytorch_fp8_fp8_bf16_scaled_mm",
                 torch._scaled_mm,
                 a,
                 bT,
                 scale_a=scale_a,
                 scale_b=scale_b,
                 out_dtype=torch.bfloat16))

    # pytorch impl: bf16 output, with fp8 fast accum
    timers.append(
        bench_fn(label,
                 sub_label,
                 "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum",
                 torch._scaled_mm,
                 a,
                 bT,
                 scale_a=scale_a,
                 scale_b=scale_b,
                 out_dtype=torch.bfloat16,
                 use_fast_accum=True))

    # pytorch impl: fp16 output, without fp8 fast accum
    timers.append(
        bench_fn(label,
                 sub_label,
                 "pytorch_fp8_fp8_fp16_scaled_mm",
                 torch._scaled_mm,
                 a,
                 bT,
                 scale_a=scale_a,
                 scale_b=scale_b,
                 out_dtype=torch.float16))

    # pytorch impl: fp16 output, with fp8 fast accum
    timers.append(
        bench_fn(label,
                 sub_label,
                 "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum",
                 torch._scaled_mm,
                 a,
                 bT,
                 scale_a=scale_a,
                 scale_b=scale_b,
                 out_dtype=torch.float16,
                 use_fast_accum=True))

    # cutlass impl: bf16 output
    timers.append(
        bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_sparse_mm",
                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b,
                 scale_a, torch.bfloat16))

    # cutlass impl: fp16 output
    timers.append(
        bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_sparse_mm",
                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b,
                 scale_a, torch.float16))

    return timers


def bench_fp16(dtype: torch.dtype, m: int, k: int, n: int, label: str,
               sub_label: str) -> Iterable[TMeasurement]:
    """Benchmark fp16 sparse CUTLASS GEMM variants (with and without bias).

    Args:
        dtype: Must be torch.float16.
        m, k, n: GEMM problem shape.
        label, sub_label: Labels attached to every measurement.

    Returns:
        A list of TMeasurement, one per benchmarked implementation.
    """
    assert dtype == torch.float16
    a_compressed, e, a, b = make_rand_sparse_tensors(torch.float16, m, n, k)

    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)

    timers = []

    # cutlass impl: bf16 output
    timers.append(
        bench_fn(label, sub_label, "cutlass_fp16_fp16_bf16_scaled_sparse_mm",
                 ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a,
                 scale_b, torch.bfloat16))

    # cutlass impl: fp16 output
    timers.append(
        bench_fn(label, sub_label, "cutlass_fp16_fp16_fp16_scaled_sparse_mm",
                 ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a,
                 scale_b, torch.float16))

    # cutlass impl: bf16 output, with bias
    timers.append(
        bench_fn(label, sub_label,
                 "cutlass_fp16_fp16_bf16_scaled_sparse_mm_bias",
                 ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a,
                 scale_b, torch.bfloat16, bias))

    # cutlass impl: fp16 output, with bias
    timers.append(
        bench_fn(label, sub_label,
                 "cutlass_fp16_fp16_fp16_scaled_sparse_mm_bias",
                 ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a,
                 scale_b, torch.float16, bias.to(dtype=torch.float16)))

    return timers


def bench_bf16(dtype: torch.dtype, m: int, k: int, n: int, label: str,
               sub_label: str) -> Iterable[TMeasurement]:
    """Benchmark bf16 sparse CUTLASS GEMM variants (with and without bias).

    Args:
        dtype: Must be torch.bfloat16.
        m, k, n: GEMM problem shape.
        label, sub_label: Labels attached to every measurement.

    Returns:
        A list of TMeasurement, one per benchmarked implementation.
    """
    assert dtype == torch.bfloat16
    a_compressed, e, a, b = make_rand_sparse_tensors(torch.bfloat16, m, n, k)

    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)

    timers = []

    # cutlass impl: bf16 output
    timers.append(
        bench_fn(label, sub_label, "cutlass_bf16_bf16_bf16_scaled_sparse_mm",
                 ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a,
                 scale_b, torch.bfloat16))

    # cutlass impl: fp16 output
    timers.append(
        bench_fn(label, sub_label, "cutlass_bf16_bf16_fp16_scaled_sparse_mm",
                 ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a,
                 scale_b, torch.float16))

    # cutlass impl: bf16 output, with bias
    timers.append(
        bench_fn(label, sub_label,
                 "cutlass_bf16_bf16_bf16_scaled_sparse_mm_bias",
                 ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a,
                 scale_b, torch.bfloat16, bias))

    # cutlass impl: fp16 output, with bias
    timers.append(
        bench_fn(label, sub_label,
                 "cutlass_bf16_bf16_fp16_scaled_sparse_mm_bias",
                 ops.cutlass_scaled_sparse_mm, a_compressed, e, b, scale_a,
                 scale_b, torch.float16, bias.to(dtype=torch.float16)))

    return timers


def bench_v1(dtype: torch.dtype, m: int, k: int, n: int, label: str,
             sub_label: str) -> Iterable[TMeasurement]:
    """Dispatch to the benchmark suite for ``dtype``.

    All four suites (int8, fp8, fp16, bf16) are implemented in this file;
    the non-fp8 branches were temporarily commented out and are restored
    here so every supported dtype is benchmarkable.

    Raises:
        ValueError: if ``dtype`` is not one of the supported types.
    """
    if dtype == torch.int8:
        return bench_int8(dtype, m, k, n, label, sub_label)
    if dtype == torch.float8_e4m3fn:
        return bench_fp8(dtype, m, k, n, label, sub_label)
    if dtype == torch.float16:
        return bench_fp16(dtype, m, k, n, label, sub_label)
    if dtype == torch.bfloat16:
        return bench_bf16(dtype, m, k, n, label, sub_label)
    raise ValueError(f"unsupported type: {dtype}")
Loading