Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Add Launch Bounds #144

Open
wants to merge 7 commits into
base: sycl-develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cmake/FindDPCPP.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ if(NOT "${DPCPP_SYCL_ARCH}" STREQUAL "")
if("${DPCPP_SYCL_TARGET}" STREQUAL "nvptx64-nvidia-cuda")
list(APPEND DPCPP_FLAGS "-Xsycl-target-backend")
list(APPEND DPCPP_FLAGS "--cuda-gpu-arch=${DPCPP_SYCL_ARCH}")
list(APPEND DPCPP_FLAGS "-fgpu-inline-threshold=1000000;")
aacostadiaz marked this conversation as resolved.
Show resolved Hide resolved
list(APPEND DPCPP_COMPILE_ONLY_FLAGS; "-mllvm;-enable-global-offset=false;")
endif()
endif()
Expand Down
2 changes: 1 addition & 1 deletion include/cutlass/detail/helper_macros.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@

#if defined(CUTLASS_ENABLE_SYCL)
#define CUTLASS_HOST
#define CUTLASS_GLOBAL
#define CUTLASS_GLOBAL __attribute__((always_inline)) inline
aacostadiaz marked this conversation as resolved.
Show resolved Hide resolved
#define CUTLASS_SHARED
#else
#define CUTLASS_HOST __host__
Expand Down
4 changes: 2 additions & 2 deletions include/cutlass/device_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,10 +108,10 @@ void Kernel2(typename Operator::Params params) {

/// Generic CUTLASS kernel template.
template <typename Operator>
CUTLASS_GLOBAL
#if defined(CUTLASS_ENABLE_SYCL)
void device_kernel(typename Operator::Params const params, sycl::local_ptr<char> smem) {
void device_kernel(typename Operator::Params const& params, sycl::local_ptr<char> smem) {
#else
CUTLASS_GLOBAL
#ifdef __CUDACC__
// Enclosing this in __CUDACC__ suppresses MSVC warnings.
__launch_bounds__(Operator::MaxThreadsPerBlock, Operator::MinBlocksPerMultiprocessor)
Expand Down
3 changes: 2 additions & 1 deletion include/cutlass/gemm/device/gemm_universal_adapter.h
Original file line number Diff line number Diff line change
Expand Up @@ -433,7 +433,8 @@ class GemmUniversalAdapter<
}, params);
#else
auto event = launch<device_kernel<GemmKernel>>(launch_policy{
sycl_grid, sycl_block, local_mem_size{static_cast<std::size_t>(smem_size)}},
sycl_grid, sycl_block, local_mem_size{static_cast<std::size_t>(smem_size)},
kernel_properties{sycl::ext::oneapi::experimental::max_linear_work_group_size<GemmKernel::MaxThreadsPerBlock>}},
params);
#endif
EventManager::getInstance().addEvent(event);
Expand Down