codeplaysoftware · AD2605 · Oct 18, 2024 · Oct 18, 2024 · Oct 18, 2024 · Oct 22, 2024
diff --git a/cmake/FindDPCPP.cmake b/cmake/FindDPCPP.cmake
@@ -53,6 +53,7 @@ if(NOT "${DPCPP_SYCL_ARCH}" STREQUAL "")
   if("${DPCPP_SYCL_TARGET}" STREQUAL "nvptx64-nvidia-cuda")
     list(APPEND DPCPP_FLAGS "-Xsycl-target-backend")
     list(APPEND DPCPP_FLAGS "--cuda-gpu-arch=${DPCPP_SYCL_ARCH}")
+    list(APPEND DPCPP_FLAGS "-fgpu-inline-threshold=1000000")  # very large inline threshold for the nvptx64-nvidia-cuda target; no trailing ';' so no empty list element is appended
     list(APPEND DPCPP_COMPILE_ONLY_FLAGS; "-mllvm;-enable-global-offset=false;")
   endif()
 endif()

diff --git a/include/cutlass/detail/helper_macros.hpp b/include/cutlass/detail/helper_macros.hpp
@@ -62,7 +62,7 @@
 
 #if defined(CUTLASS_ENABLE_SYCL)
 #define CUTLASS_HOST
-#define CUTLASS_GLOBAL
+#define CUTLASS_GLOBAL __attribute__((always_inline)) inline  // SYCL build: functions marked CUTLASS_GLOBAL become force-inlined inline functions
 #define CUTLASS_SHARED
 #else
 #define CUTLASS_HOST __host__

diff --git a/include/cutlass/device_kernel.h b/include/cutlass/device_kernel.h
@@ -108,10 +108,10 @@ void Kernel2(typename Operator::Params params) {
 
 /// Generic CUTLASS kernel template.
 template <typename Operator>
+CUTLASS_GLOBAL
 #if defined(CUTLASS_ENABLE_SYCL)
-void device_kernel(typename Operator::Params const params, sycl::local_ptr<char> smem) {
+void device_kernel(typename Operator::Params const& params, sycl::local_ptr<char> smem) {
 #else
-CUTLASS_GLOBAL
 #ifdef __CUDACC__
 // Enclosing this in __CUDACC__ suppresses MSVC warnings.
 __launch_bounds__(Operator::MaxThreadsPerBlock, Operator::MinBlocksPerMultiprocessor)

diff --git a/include/cutlass/gemm/device/gemm_universal_adapter.h b/include/cutlass/gemm/device/gemm_universal_adapter.h
@@ -433,7 +433,8 @@ class GemmUniversalAdapter<
         }, params);
 #else
         auto event = launch<device_kernel<GemmKernel>>(launch_policy{
-          sycl_grid, sycl_block, local_mem_size{static_cast<std::size_t>(smem_size)}},
+          sycl_grid, sycl_block, local_mem_size{static_cast<std::size_t>(smem_size)},
+          kernel_properties{sycl::ext::oneapi::experimental::max_linear_work_group_size<GemmKernel::MaxThreadsPerBlock>}},
           params);
 #endif
         EventManager::getInstance().addEvent(event);