Skip to content

Commit

Permalink
First steps to enable SYCL backend in Python Interface (#155)
Browse files Browse the repository at this point in the history
First implementation steps towards supporting the SYCL backend in the
CUTLASS Python Interface.

The main additions from this PR are:
* Generating a suitable GEMM template and arguments for the CUTLASS 3.x
API and Intel PVC as target.
* Calling DPC++ instead of `nvcc` to compile device and host code.
* Using the DPCTL library to transfer data and launch the kernel via
SYCL.

The support so far focuses on a simple GEMM; epilogues (e.g., with
visitor) are not yet supported.

Compilation is currently only possible with development versions of
DPC++, because the `-fsycl-rtc-mode` flag — added as part of this work
to support CUTLASS nested parameter classes in free-function kernels —
is not yet available in any release.

Activating the SYCL backend via an environment variable is a
temporary solution; a follow-up will look into a cleaner approach.

---------

Signed-off-by: Lukas Sommer <[email protected]>
Co-authored-by: Alejandro Acosta <[email protected]>
  • Loading branch information
sommerlukas and aacostadiaz authored Feb 5, 2025
1 parent b66a485 commit f95d391
Show file tree
Hide file tree
Showing 16 changed files with 406 additions and 93 deletions.
33 changes: 32 additions & 1 deletion python/cutlass/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,8 @@ def cuda_install_path():
else:
this.use_rmm = False

this._use_sycl = False


def set_log_level(level: int):
"""
Expand Down Expand Up @@ -172,6 +174,35 @@ def initialize_cuda_context():
this._device_id = int(device_id)


import dpctl

this._sycl_device: dpctl.SyclDevice = None

def initialize_sycl_context():
    """Select the SYCL device to use and cache it in module state.

    The device index is read from the ``CUTLASS_SYCL_DEVICE_ID``
    environment variable (default 0) and resolved against the Level Zero
    GPU devices reported by dpctl.  Idempotent: returns immediately once
    both ``this._device_id`` and ``this._sycl_device`` are already set.

    :raises Exception: if no Level Zero GPU with the requested index exists
    """
    if this._device_id is not None and this._sycl_device is not None:
        return

    device_id = int(os.getenv("CUTLASS_SYCL_DEVICE_ID", default=0))
    sycl_gpus = dpctl.get_devices(
        dpctl.backend_type.level_zero, dpctl.device_type.gpu)

    # Guard against both "no devices at all" and an out-of-range (or
    # negative) index; the previous message only described the first case.
    if not 0 <= device_id < len(sycl_gpus):
        raise Exception(
            f"No Level Zero GPU device with index {device_id} found "
            f"({len(sycl_gpus)} device(s) available)")

    this._device_id = device_id
    this._sycl_device = sycl_gpus[device_id]


def device_id() -> int:
    """Return the index of the device in use, lazily initializing a backend.

    If the ``CUTLASS_USE_SYCL`` environment variable is set, the SYCL
    backend is selected and a SYCL context is initialized; otherwise the
    CUDA backend is used.  The environment-variable switch is a temporary
    activation mechanism.

    Fix: the previous version called ``initialize_cuda_context()``
    unconditionally before the branch, forcing CUDA initialization even
    when the SYCL backend was requested (and failing on CUDA-less
    machines); the branch below already initializes exactly one backend.
    """
    if os.getenv("CUTLASS_USE_SYCL"):
        initialize_sycl_context()
        this._use_sycl = True
    else:
        this._use_sycl = False
        initialize_cuda_context()
    return this._device_id


def sycl_device() -> dpctl.SyclDevice:
    """Return the SYCL device in use, creating the SYCL context on first call."""
    initialize_sycl_context()
    device = this._sycl_device
    return device
40 changes: 26 additions & 14 deletions python/cutlass/backend/arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,11 @@
import cutlass
from cutlass.backend.frontend import CupyFrontend, NumpyFrontend, TorchFrontend
from cutlass.backend.memory_manager import DevicePtrWrapper
from cutlass.backend.utils.device import default_stream
from cutlass.utils.datatypes import is_cupy_tensor, is_numpy_tensor, is_torch_tensor

import dpctl


class ArgumentBase:
"""
Expand All @@ -58,7 +61,7 @@ def __init__(
# tensor_C can be interpreted as the bias with bias=True in keyword args
self.bias = kwargs.get("bias", False)

self.stream = kwargs.get("stream", cuda.CUstream(0))
self.stream = kwargs.get("stream", default_stream())

# RMM buffers used to track tensor lifetime
self.buffers = {}
def tensor_to_ptr(self, tensor, name, is_output=False):
    """Convert a frontend tensor into a device pointer usable by the kernel.

    Output tensors are additionally recorded in ``self.host_tensors`` so
    that results can be copied back to the host on ``sync``.

    :param tensor: numpy/torch/cupy tensor or a raw ``cuda.CUdeviceptr``
    :param name: key under which device buffers are tracked
    :param is_output: whether the tensor receives results from the kernel
    :raises TypeError: for unsupported tensor frontends
    """
    if is_numpy_tensor(tensor):
        if is_output:
            assert name
        buffer = NumpyFrontend.argument(tensor, is_output, self.stream)
        self.buffers[name] = buffer
        if is_output:
            self.host_tensors[name] = tensor
        return buffer.ptr
    if is_torch_tensor(tensor):
        return TorchFrontend.argument(tensor, self.stream)
    if isinstance(tensor, cuda.CUdeviceptr):
        # Already a raw device pointer; pass it through unchanged.
        return tensor
    if is_cupy_tensor(tensor):
        return CupyFrontend.argument(tensor)
    raise TypeError(
        "Unsupported Frontend. Only support numpy and torch")

def sync(self, stream_sync=True):
    """Wait for device work to finish, copy outputs back to the host, and free buffers.

    :param stream_sync: when True, block until all outstanding device work
        has completed before copying results back
    :raises RuntimeError: on a CUDA API error (CUDA backend only)
    """
    # A dpctl.SyclQueue stream means the SYCL backend is active; any other
    # stream type goes through the CUDA driver/runtime APIs.
    is_sycl = isinstance(self.stream, dpctl.SyclQueue)
    if stream_sync:
        if is_sycl:
            self.stream.wait()
        else:
            (err,) = cudart.cudaDeviceSynchronize()
            if err != cuda.CUresult.CUDA_SUCCESS:
                raise RuntimeError("CUDA Error %s" % str(err))

    # Copy every tracked output tensor from its device buffer back into
    # the original host array registered by tensor_to_ptr.
    for key in self.host_tensors.keys():
        host_tensor = self.host_tensors[key]
        if is_sycl:
            # NOTE(review): assumes SyclQueue.memcpy blocks until the copy
            # completes -- confirm against dpctl semantics.
            self.stream.memcpy(host_tensor, self.buffers[key].usm_mem,
                               host_tensor.size * host_tensor.itemsize)
        else:
            (err,) = cuda.cuMemcpyDtoH(
                host_tensor,
                self.buffers[key].ptr,
                host_tensor.size * host_tensor.itemsize,
            )
            if err != cuda.CUresult.CUDA_SUCCESS:
                raise RuntimeError("CUDA Error %s" % str(err))

    # Release device buffers now that results live on the host.
    self.free()

Expand Down
16 changes: 11 additions & 5 deletions python/cutlass/backend/c_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,8 @@ def get_mainloop_arguments_3x(
element_A,
element_B,
alignment_A: int,
alignment_B: int) -> ctypes.Structure:
alignment_B: int,
use_sycl: bool = False) -> ctypes.Structure:
"""
Returns the ctypes structure to be used for the 3.x kernel's mainloop parameters.
Expand Down Expand Up @@ -207,10 +208,15 @@ def from_generic_mainloop_args(args: GenericMainloopArguments3x_):
args.ptr_A, args.stride_A, args.ptr_B, args.stride_B,
)

# Currently all 3.x kernels (CpAsync and Tma) have the same argument structure.
# Should that become not the case, this is the place to return custom ctypes
# structures based on selected kernel schedule.
return _MainloopArgumentsTma
if use_sycl:
# For SYCL, we don't have the additional 'mma_promotion_interval' arg.
return _MainloopArgumentsMultistage
else:
# Currently all 3.x kernels (CpAsync and Tma) for Nvidia devices have
# the same argument structure. Should that become not the case, this is
# the place to return custom ctypes structures based on selected kernel
# schedule.
return _MainloopArgumentsTma


def get_gemm_arguments_3x(mainloop_arguments, epilogue_functor, scheduler_args, default_epilogue):
Expand Down
Loading

0 comments on commit f95d391

Please sign in to comment.