Skip to content

Commit

Permalink
First steps to enable SYCL backend in Python Interface (codeplaysoftw…
Browse files Browse the repository at this point in the history
…are#155)

First implementation steps towards supporting the SYCL backend in the
CUTLASS Python Interface.

The main additions from this PR are:
* Generating a suitable GEMM template and arguments for the CUTLASS 3.x
API and Intel PVC as target.
* Calling DPC++ instead of `nvcc` to compile device and host code.
* Using the DPCTL library to transfer data and launch the kernel via
SYCL.

The support so far focuses on a simple GEMM; epilogues (e.g., with
visitor) are not yet supported.

Compilation is currently only possible with development versions of
DPC++: the `-fsycl-rtc-mode` flag, which was added as part of this work
to support CUTLASS nested parameter classes in free-function kernels,
is not yet available in releases.

The activation of the SYCL backend via an environment variable is a
temporary solution; a follow-up will look into a cleaner approach.

---------

Signed-off-by: Lukas Sommer <[email protected]>
Co-authored-by: Alejandro Acosta <[email protected]>
  • Loading branch information
2 people authored and taozha2 committed Feb 10, 2025
1 parent 5e33a19 commit 045d558
Show file tree
Hide file tree
Showing 16 changed files with 406 additions and 93 deletions.
33 changes: 32 additions & 1 deletion python/cutlass/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,8 @@ def cuda_install_path():
else:
this.use_rmm = False

this._use_sycl = False


def set_log_level(level: int):
"""
Expand Down Expand Up @@ -172,6 +174,35 @@ def initialize_cuda_context():
this._device_id = int(device_id)


import dpctl

this._sycl_device: dpctl.SyclDevice = None

def initialize_sycl_context():
    """Select the SYCL device and populate the module-level context.

    Reads ``CUTLASS_SYCL_DEVICE_ID`` (default ``0``) and caches the
    corresponding Level Zero GPU reported by dpctl in
    ``this._sycl_device`` / ``this._device_id``. No-op when both are
    already initialized.

    :raises RuntimeError: if no Level Zero GPU is available, or the
        requested device id is out of range.
    """
    if this._device_id is not None and this._sycl_device is not None:
        return

    device_id = int(os.getenv("CUTLASS_SYCL_DEVICE_ID", default=0))
    sycl_gpus = dpctl.get_devices(
        dpctl.backend_type.level_zero, dpctl.device_type.gpu)

    # Distinguish "no devices at all" from "index out of range" so the
    # error message reflects what is actually wrong with the environment.
    if not sycl_gpus:
        raise RuntimeError("No LevelZero device found")
    if device_id < 0 or device_id >= len(sycl_gpus):
        raise RuntimeError(
            f"Requested SYCL device id {device_id}, but only "
            f"{len(sycl_gpus)} LevelZero device(s) found")

    this._device_id = device_id
    this._sycl_device = sycl_gpus[device_id]


def device_id() -> int:
    """Return the active device id, initializing the backend context.

    When ``CUTLASS_USE_SYCL`` is set (to any non-empty value), the SYCL
    context is initialized and the SYCL backend is enabled; otherwise the
    CUDA context is initialized. The backend choice is recorded in
    ``this._use_sycl``.
    """
    # NOTE: initialize the context only for the selected backend; calling
    # initialize_cuda_context() unconditionally would fail (or do useless
    # work) on SYCL-only systems.
    if os.getenv("CUTLASS_USE_SYCL"):
        initialize_sycl_context()
        this._use_sycl = True
    else:
        this._use_sycl = False
        initialize_cuda_context()
    return this._device_id


def sycl_device() -> dpctl.SyclDevice:
    """Return the cached dpctl SYCL device, initializing the SYCL
    context (device selection and device id) on first use."""
    initialize_sycl_context()
    return this._sycl_device
40 changes: 26 additions & 14 deletions python/cutlass/backend/arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,11 @@
import cutlass
from cutlass.backend.frontend import CupyFrontend, NumpyFrontend, TorchFrontend
from cutlass.backend.memory_manager import DevicePtrWrapper
from cutlass.backend.utils.device import default_stream
from cutlass.utils.datatypes import is_cupy_tensor, is_numpy_tensor, is_torch_tensor

import dpctl


class ArgumentBase:
"""
Expand All @@ -58,7 +61,7 @@ def __init__(
# tensor_C can be interpreted as the bias with bias=True in keyword args
self.bias = kwargs.get("bias", False)

self.stream = kwargs.get("stream", cuda.CUstream(0))
self.stream = kwargs.get("stream", default_stream())

# RMM buffers used to track tensor lifetime
self.buffers = {}
Expand All @@ -83,34 +86,43 @@ def tensor_to_ptr(self, tensor, name, is_output=False):
if is_numpy_tensor(tensor):
if is_output:
assert name
self.buffers[name] = NumpyFrontend.argument(tensor, is_output)
self.buffers[name] = NumpyFrontend.argument(tensor, is_output, self.stream)
if is_output:
self.host_tensors[name] = tensor
return self.buffers[name].ptr
elif is_torch_tensor(tensor):
return TorchFrontend.argument(tensor)
return TorchFrontend.argument(tensor, self.stream)
elif isinstance(tensor, cuda.CUdeviceptr):
return tensor
elif is_cupy_tensor(tensor):
return CupyFrontend.argument(tensor)
else:
raise TypeError("Unsupported Frontend. Only support numpy and torch")
raise TypeError(
"Unsupported Frontend. Only support numpy and torch")

def sync(self, stream_sync=True):
    """Synchronize the device, copy output tensors back to the host, and
    free the temporary device buffers.

    :param stream_sync: when True, block on a full device synchronization
        before copying results back to the host
    :raises RuntimeError: if a CUDA runtime/driver call fails
    """
    # A dpctl queue as the stream marks the SYCL path; anything else is
    # handled through the CUDA driver/runtime APIs.
    is_sycl = isinstance(self.stream, dpctl.SyclQueue)
    if stream_sync:
        if is_sycl:
            self.stream.wait()
        else:
            (err,) = cudart.cudaDeviceSynchronize()
            if err != cuda.CUresult.CUDA_SUCCESS:
                raise RuntimeError("CUDA Error %s" % str(err))

    # Copy every registered output tensor from its device buffer back
    # into the caller-visible host array.
    for key in self.host_tensors.keys():
        host_tensor = self.host_tensors[key]
        if is_sycl:
            self.stream.memcpy(host_tensor, self.buffers[key].usm_mem,
                               host_tensor.size * host_tensor.itemsize)
        else:
            (err,) = cuda.cuMemcpyDtoH(
                host_tensor,
                self.buffers[key].ptr,
                host_tensor.size * host_tensor.itemsize,
            )
            if err != cuda.CUresult.CUDA_SUCCESS:
                raise RuntimeError("CUDA Error %s" % str(err))

    self.free()

Expand Down
16 changes: 11 additions & 5 deletions python/cutlass/backend/c_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,8 @@ def get_mainloop_arguments_3x(
element_A,
element_B,
alignment_A: int,
alignment_B: int) -> ctypes.Structure:
alignment_B: int,
use_sycl: bool = False) -> ctypes.Structure:
"""
Returns the ctypes structure to be used for the 3.x kernel's mainloop parameters.
Expand Down Expand Up @@ -207,10 +208,15 @@ def from_generic_mainloop_args(args: GenericMainloopArguments3x_):
args.ptr_A, args.stride_A, args.ptr_B, args.stride_B,
)

# Currently all 3.x kernels (CpAsync and Tma) have the same argument structure.
# Should that become not the case, this is the place to return custom ctypes
# structures based on selected kernel schedule.
return _MainloopArgumentsTma
if use_sycl:
# For SYCL, we don't have the additional 'mma_promotion_interval' arg.
return _MainloopArgumentsMultistage
else:
# Currently all 3.x kernels (CpAsync and Tma) for Nvidia devices have
# the same argument structure. Should that become not the case, this is
# the place to return custom ctypes structures based on selected kernel
# schedule.
return _MainloopArgumentsTma


def get_gemm_arguments_3x(mainloop_arguments, epilogue_functor, scheduler_args, default_epilogue):
Expand Down
Loading

0 comments on commit 045d558

Please sign in to comment.