Skip to content

Commit

Permalink
First steps to enable SYCL backend in Python Interface (codeplaysoftw…
Browse files Browse the repository at this point in the history
…are#155)

First implementation steps towards supporting the SYCL backend in the
CUTLASS Python Interface.

The main additions from this PR are:
* Generating a suitable GEMM template and arguments for the CUTLASS 3.x
API and Intel PVC as target.
* Calling DPC++ instead of `nvcc` to compile device and host code.
* Using the DPCTL library to transfer data and launch the kernel via
SYCL.

The support so far focuses on a simple GEMM; epilogues (e.g., with
visitor) are not yet supported.

Compilation is currently only possible with development versions of
DPC++: the `-fsycl-rtc-mode` flag, which was added as part of this work
to support CUTLASS nested parameter classes in free-function kernels,
is not yet available in releases.

The activation of the SYCL backend via an environment variable is a
temporary solution; a follow-up will look into a cleaner approach.

---------

Signed-off-by: Lukas Sommer <[email protected]>
Co-authored-by: Alejandro Acosta <[email protected]>
  • Loading branch information
2 people authored and taozha2 committed Feb 10, 2025
1 parent 5e33a19 commit 045d558
Show file tree
Hide file tree
Showing 16 changed files with 406 additions and 93 deletions.
33 changes: 32 additions & 1 deletion python/cutlass/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,8 @@ def cuda_install_path():
else:
this.use_rmm = False

this._use_sycl = False


def set_log_level(level: int):
"""
Expand Down Expand Up @@ -172,6 +174,35 @@ def initialize_cuda_context():
this._device_id = int(device_id)


import dpctl

this._sycl_device: dpctl.SyclDevice = None

def initialize_sycl_context():
    """Select the SYCL device and populate the module-level context.

    Reads ``CUTLASS_SYCL_DEVICE_ID`` (default ``0``) and caches the
    corresponding Level Zero GPU reported by dpctl in
    ``this._sycl_device`` / ``this._device_id``. No-op when both are
    already initialized.

    :raises RuntimeError: if no Level Zero GPU is available, or the
        requested device id is out of range.
    """
    if this._device_id is not None and this._sycl_device is not None:
        return

    device_id = int(os.getenv("CUTLASS_SYCL_DEVICE_ID", default=0))
    sycl_gpus = dpctl.get_devices(
        dpctl.backend_type.level_zero, dpctl.device_type.gpu)

    # Distinguish "no devices at all" from "index out of range" so the
    # error message reflects what is actually wrong with the environment.
    if not sycl_gpus:
        raise RuntimeError("No LevelZero device found")
    if device_id < 0 or device_id >= len(sycl_gpus):
        raise RuntimeError(
            f"Requested SYCL device id {device_id}, but only "
            f"{len(sycl_gpus)} LevelZero device(s) found")

    this._device_id = device_id
    this._sycl_device = sycl_gpus[device_id]


def device_id() -> int:
    """Return the active device id, initializing the backend context.

    When ``CUTLASS_USE_SYCL`` is set (to any non-empty value), the SYCL
    context is initialized and the SYCL backend is enabled; otherwise the
    CUDA context is initialized. The backend choice is recorded in
    ``this._use_sycl``.
    """
    # NOTE: initialize the context only for the selected backend; calling
    # initialize_cuda_context() unconditionally would fail (or do useless
    # work) on SYCL-only systems.
    if os.getenv("CUTLASS_USE_SYCL"):
        initialize_sycl_context()
        this._use_sycl = True
    else:
        this._use_sycl = False
        initialize_cuda_context()
    return this._device_id


def sycl_device() -> dpctl.SyclDevice:
    """Return the cached dpctl SYCL device, initializing the SYCL
    context (device selection and device id) on first use."""
    initialize_sycl_context()
    return this._sycl_device
40 changes: 26 additions & 14 deletions python/cutlass/backend/arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,11 @@
import cutlass
from cutlass.backend.frontend import CupyFrontend, NumpyFrontend, TorchFrontend
from cutlass.backend.memory_manager import DevicePtrWrapper
from cutlass.backend.utils.device import default_stream
from cutlass.utils.datatypes import is_cupy_tensor, is_numpy_tensor, is_torch_tensor

import dpctl


class ArgumentBase:
"""
Expand All @@ -58,7 +61,7 @@ def __init__(
# tensor_C can be interpreted as the bias with bias=True in keyword args
self.bias = kwargs.get("bias", False)

self.stream = kwargs.get("stream", cuda.CUstream(0))
self.stream = kwargs.get("stream", default_stream())

# RMM buffers used to track tensor lifetime
self.buffers = {}
Expand All @@ -83,34 +86,43 @@ def tensor_to_ptr(self, tensor, name, is_output=False):
if is_numpy_tensor(tensor):
if is_output:
assert name
self.buffers[name] = NumpyFrontend.argument(tensor, is_output)
self.buffers[name] = NumpyFrontend.argument(tensor, is_output, self.stream)
if is_output:
self.host_tensors[name] = tensor
return self.buffers[name].ptr
elif is_torch_tensor(tensor):
return TorchFrontend.argument(tensor)
return TorchFrontend.argument(tensor, self.stream)
elif isinstance(tensor, cuda.CUdeviceptr):
return tensor
elif is_cupy_tensor(tensor):
return CupyFrontend.argument(tensor)
else:
raise TypeError("Unsupported Frontend. Only support numpy and torch")
raise TypeError(
"Unsupported Frontend. Only support numpy and torch")

def sync(self, stream_sync=True):
    """Synchronize the device, copy output tensors back to the host, and
    free the temporary device buffers.

    :param stream_sync: when True, block on a full device synchronization
        before copying results back to the host
    :raises RuntimeError: if a CUDA runtime/driver call fails
    """
    # A dpctl queue as the stream marks the SYCL path; anything else is
    # handled through the CUDA driver/runtime APIs.
    is_sycl = isinstance(self.stream, dpctl.SyclQueue)
    if stream_sync:
        if is_sycl:
            self.stream.wait()
        else:
            (err,) = cudart.cudaDeviceSynchronize()
            if err != cuda.CUresult.CUDA_SUCCESS:
                raise RuntimeError("CUDA Error %s" % str(err))

    # Copy every registered output tensor from its device buffer back
    # into the caller-visible host array.
    for key in self.host_tensors.keys():
        host_tensor = self.host_tensors[key]
        if is_sycl:
            self.stream.memcpy(host_tensor, self.buffers[key].usm_mem,
                               host_tensor.size * host_tensor.itemsize)
        else:
            (err,) = cuda.cuMemcpyDtoH(
                host_tensor,
                self.buffers[key].ptr,
                host_tensor.size * host_tensor.itemsize,
            )
            if err != cuda.CUresult.CUDA_SUCCESS:
                raise RuntimeError("CUDA Error %s" % str(err))

    self.free()

Expand Down
16 changes: 11 additions & 5 deletions python/cutlass/backend/c_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,8 @@ def get_mainloop_arguments_3x(
element_A,
element_B,
alignment_A: int,
alignment_B: int) -> ctypes.Structure:
alignment_B: int,
use_sycl: bool = False) -> ctypes.Structure:
"""
Returns the ctypes structure to be used for the 3.x kernel's mainloop parameters.
Expand Down Expand Up @@ -207,10 +208,15 @@ def from_generic_mainloop_args(args: GenericMainloopArguments3x_):
args.ptr_A, args.stride_A, args.ptr_B, args.stride_B,
)

# Currently all 3.x kernels (CpAsync and Tma) have the same argument structure.
# Should that become not the case, this is the place to return custom ctypes
# structures based on selected kernel schedule.
return _MainloopArgumentsTma
if use_sycl:
# For SYCL, we don't have the additional 'mma_promotion_interval' arg.
return _MainloopArgumentsMultistage
else:
# Currently all 3.x kernels (CpAsync and Tma) for Nvidia devices have
# the same argument structure. Should that become not the case, this is
# the place to return custom ctypes structures based on selected kernel
# schedule.
return _MainloopArgumentsTma


def get_gemm_arguments_3x(mainloop_arguments, epilogue_functor, scheduler_args, default_epilogue):
Expand Down
Loading

0 comments on commit 045d558

Please sign in to comment.