
Commit f95d391

First steps to enable SYCL backend in Python Interface (#155)
First implementation steps towards supporting the SYCL backend in the CUTLASS Python interface. The main additions from this PR are:

* Generating a suitable GEMM template and arguments for the CUTLASS 3.x API with Intel PVC as the target.
* Calling DPC++ instead of `nvcc` to compile device and host code.
* Using the DPCTL library to transfer data and launch the kernel via SYCL.

The support so far focuses on a simple GEMM; epilogues (e.g., with a visitor) are not yet supported. Compilation is currently only possible with development versions of DPC++, because the `-fsycl-rtc-mode` flag added as part of this work to support CUTLASS's nested parameter classes in free-function kernels is not yet available in releases. Activating the SYCL backend via an environment variable is a temporary solution; a follow-up will look into a cleaner mechanism.

---------

Signed-off-by: Lukas Sommer <[email protected]>
Co-authored-by: Alejandro Acosta <[email protected]>
Parent: b66a485 · Commit: f95d391
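A rough, hypothetical usage sketch (not part of this commit) of how the SYCL path described above might be exercised. The environment variables come from the diff to `python/cutlass/__init__.py` below; `cutlass.op.Gemm` and `plan.run` are the existing CUTLASS Python interface entry points, and the dtype/layout choices here are placeholders for whatever combinations the SYCL GEMM path actually supports.

```python
# Hypothetical sketch: activate the (temporary) SYCL backend switch before
# importing cutlass, so device initialization goes through SYCL instead of CUDA.
import os

os.environ["CUTLASS_USE_SYCL"] = "1"        # temporary switch added by this PR
os.environ["CUTLASS_SYCL_DEVICE_ID"] = "0"  # index into the Level Zero GPU list

import numpy as np
import cutlass

# A plain GEMM; epilogues (e.g. visitor-based) are not supported on this path yet.
plan = cutlass.op.Gemm(element=np.float32, layout=cutlass.LayoutType.RowMajor)

M, N, K = 128, 128, 128
A = np.random.rand(M, K).astype(np.float32)
B = np.random.rand(K, N).astype(np.float32)
C = np.zeros((M, N), dtype=np.float32)
D = np.zeros((M, N), dtype=np.float32)

plan.run(A, B, C, D)  # D = alpha * (A @ B) + beta * C with default alpha=1, beta=0
```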

16 files changed: +406, -93 lines


python/cutlass/__init__.py

Lines changed: 32 additions & 1 deletion
```diff
@@ -95,6 +95,8 @@ def cuda_install_path():
 else:
     this.use_rmm = False
 
+this._use_sycl = False
+
 
 def set_log_level(level: int):
     """
@@ -172,6 +174,35 @@ def initialize_cuda_context():
     this._device_id = int(device_id)
 
 
+import dpctl
+
+this._sycl_device: dpctl.SyclDevice = None
+
+def initialize_sycl_context():
+    if this._device_id is not None and this._sycl_device is not None:
+        return
+
+    device_id = int(os.getenv("CUTLASS_SYCL_DEVICE_ID", default=0))
+    sycl_gpus = dpctl.get_devices(
+        dpctl.backend_type.level_zero, dpctl.device_type.gpu)
+
+    if len(sycl_gpus) <= device_id:
+        raise Exception("No LevelZero device found")
+
+    this._device_id = device_id
+    this._sycl_device = sycl_gpus[device_id]
+
+
 def device_id() -> int:
-    initialize_cuda_context()
+    if os.getenv("CUTLASS_USE_SYCL"):
+        initialize_sycl_context()
+        this._use_sycl = True
+    else:
+        this._use_sycl = False
+        initialize_cuda_context()
     return this._device_id
+
+
+def sycl_device() -> dpctl.SyclDevice:
+    initialize_sycl_context()
+    return this._sycl_device
```
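For choosing a value for `CUTLASS_SYCL_DEVICE_ID`, the device list that `initialize_sycl_context()` indexes into can be listed directly with DPCTL. A small standalone sketch, independent of this commit and using only the public `dpctl` API:

```python
import dpctl

# Same filter used by initialize_sycl_context() above:
# Level Zero backend, GPU devices only.
gpus = dpctl.get_devices(dpctl.backend_type.level_zero, dpctl.device_type.gpu)

for idx, dev in enumerate(gpus):
    # 'idx' is the value CUTLASS_SYCL_DEVICE_ID selects.
    print(f"{idx}: {dev.name}")
```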

python/cutlass/backend/arguments.py

Lines changed: 26 additions & 14 deletions
```diff
@@ -39,8 +39,11 @@
 import cutlass
 from cutlass.backend.frontend import CupyFrontend, NumpyFrontend, TorchFrontend
 from cutlass.backend.memory_manager import DevicePtrWrapper
+from cutlass.backend.utils.device import default_stream
 from cutlass.utils.datatypes import is_cupy_tensor, is_numpy_tensor, is_torch_tensor
 
+import dpctl
+
 
 class ArgumentBase:
     """
@@ -58,7 +61,7 @@ def __init__(
         # tensor_C can be interpreted as the bias with bias=True in keyword args
         self.bias = kwargs.get("bias", False)
 
-        self.stream = kwargs.get("stream", cuda.CUstream(0))
+        self.stream = kwargs.get("stream", default_stream())
 
         # RMM buffers used to track tensor lifetime
         self.buffers = {}
@@ -83,34 +86,43 @@ def tensor_to_ptr(self, tensor, name, is_output=False):
         if is_numpy_tensor(tensor):
             if is_output:
                 assert name
-            self.buffers[name] = NumpyFrontend.argument(tensor, is_output)
+            self.buffers[name] = NumpyFrontend.argument(tensor, is_output, self.stream)
             if is_output:
                 self.host_tensors[name] = tensor
             return self.buffers[name].ptr
         elif is_torch_tensor(tensor):
-            return TorchFrontend.argument(tensor)
+            return TorchFrontend.argument(tensor, self.stream)
         elif isinstance(tensor, cuda.CUdeviceptr):
             return tensor
         elif is_cupy_tensor(tensor):
             return CupyFrontend.argument(tensor)
         else:
-            raise TypeError("Unsupported Frontend. Only support numpy and torch")
+            raise TypeError(
+                "Unsupported Frontend. Only support numpy and torch")
 
     def sync(self, stream_sync=True):
+        is_sycl = isinstance(self.stream, dpctl.SyclQueue)
         if stream_sync:
-            (err,) = cudart.cudaDeviceSynchronize()
-            if err != cuda.CUresult.CUDA_SUCCESS:
-                raise RuntimeError("CUDA Error %s" % str(err))
+            if is_sycl:
+                self.stream.wait()
+            else:
+                (err,) = cudart.cudaDeviceSynchronize()
+                if err != cuda.CUresult.CUDA_SUCCESS:
+                    raise RuntimeError("CUDA Error %s" % str(err))
 
         for key in self.host_tensors.keys():
             host_tensor = self.host_tensors[key]
-            (err,) = cuda.cuMemcpyDtoH(
-                host_tensor,
-                self.buffers[key].ptr,
-                host_tensor.size * host_tensor.itemsize,
-            )
-            if err != cuda.CUresult.CUDA_SUCCESS:
-                raise RuntimeError("CUDA Error %s" % str(err))
+            if is_sycl:
+                self.stream.memcpy(host_tensor, self.buffers[key].usm_mem,
+                                   host_tensor.size * host_tensor.itemsize)
+            else:
+                (err,) = cuda.cuMemcpyDtoH(
+                    host_tensor,
+                    self.buffers[key].ptr,
+                    host_tensor.size * host_tensor.itemsize,
+                )
+                if err != cuda.CUresult.CUDA_SUCCESS:
+                    raise RuntimeError("CUDA Error %s" % str(err))
 
         self.free()
 
```
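The CUDA/SYCL split in `sync()` boils down to "wait on the queue, then copy device USM memory back to the host through the queue". A self-contained sketch of that SYCL path, assuming (as the diff above does) a DPCTL version whose `SyclQueue.memcpy` accepts host buffers; the buffer sizes and data here are illustrative only:

```python
import numpy as np
import dpctl
import dpctl.memory as dpmem

# The queue plays the role of self.stream when the SYCL backend is active.
queue = dpctl.SyclQueue()

host = np.arange(16, dtype=np.float32)
nbytes = host.size * host.itemsize

# Device-side USM allocation, analogous to self.buffers[key].usm_mem above.
dev_buf = dpmem.MemoryUSMDevice(nbytes, queue=queue)

# Host -> device upload (what the frontends do with the stream argument).
queue.memcpy(dev_buf, host, nbytes)

# queue.wait() replaces cudaDeviceSynchronize() on the SYCL path.
queue.wait()

# Device -> host copy, mirroring the is_sycl branch of sync().
out = np.empty_like(host)
queue.memcpy(out, dev_buf, nbytes)

assert np.array_equal(host, out)
```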

python/cutlass/backend/c_types.py

Lines changed: 11 additions & 5 deletions
```diff
@@ -161,7 +161,8 @@ def get_mainloop_arguments_3x(
     element_A,
     element_B,
     alignment_A: int,
-    alignment_B: int) -> ctypes.Structure:
+    alignment_B: int,
+    use_sycl: bool = False) -> ctypes.Structure:
     """
     Returns the ctypes structure to be used for the 3.x kernel's mainloop parameters.
 
@@ -207,10 +208,15 @@ def from_generic_mainloop_args(args: GenericMainloopArguments3x_):
                 args.ptr_A, args.stride_A, args.ptr_B, args.stride_B,
             )
 
-    # Currently all 3.x kernels (CpAsync and Tma) have the same argument structure.
-    # Should that become not the case, this is the place to return custom ctypes
-    # structures based on selected kernel schedule.
-    return _MainloopArgumentsTma
+    if use_sycl:
+        # For SYCL, we don't have the additional 'mma_promotion_interval' arg.
+        return _MainloopArgumentsMultistage
+    else:
+        # Currently all 3.x kernels (CpAsync and Tma) for Nvidia devices have
+        # the same argument structure. Should that become not the case, this is
+        # the place to return custom ctypes structures based on selected kernel
+        # schedule.
+        return _MainloopArgumentsTma
 
 
 def get_gemm_arguments_3x(mainloop_arguments, epilogue_functor, scheduler_args, default_epilogue):
```