SM90 Support #126
Changes from all commits
63782d4
bb1a825
d5975e8
9ebafdc
d49a52e
d31c736
546934c
e1f5516
9495689
21fc904
628ecb4
3206cea
2ca58ae
5e3802f
e0f5a3e
f88ea00
0c9d5e1
d1badb7
d3d97ab
c9e395e
73b6c7d
dc56737
3798c3a
83f0547
386b14b
```diff
@@ -76,12 +76,20 @@
 #include "cutlass/util/tensor_view_io.h"
 #include "cutlass/util/reference/device/gemm.h"
 #include "cutlass/util/reference/device/tensor_compare.h"
+#if defined(SYCL_NVIDIA_TARGET)
+#include "cutlass/util/reference/device/sycl_tensor_fill.h"
+#else
 #include "cutlass/util/reference/device/tensor_fill.h"
+#endif

 #include "helper.h"

 using namespace cute;

+#if defined(SYCL_NVIDIA_TARGET)
+using namespace cutlass;
+#endif
```
**Comment on lines +89 to +92**

Reviewer: Why is this needed?

Author: Because types like `cudaError_t` and `cudaSuccess` are defined in the `cutlass` namespace on the non-CUDA path.
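The author's reply is the whole story here: on the non-CUDA path the CUDA error types are declared inside `namespace cutlass`, so unqualified uses of them in the example only resolve once that namespace is pulled in. A minimal sketch of the name-lookup situation, using stand-in definitions rather than the actual CUTLASS headers:

```cpp
#include <cstdio>

namespace cutlass {  // hypothetical stand-ins for the non-CUDA path
  enum cudaError_t { cudaSuccess = 0, cudaErrorUnknown = 1 };
  inline cudaError_t cudaDeviceSynchronize() { return cudaSuccess; }
}

using namespace cutlass;  // what the PR adds under SYCL_NVIDIA_TARGET

int main() {
  // Unqualified names now resolve to cutlass::cudaError_t and
  // cutlass::cudaSuccess, just as they would resolve to the real
  // global CUDA symbols on the CUDA path.
  cudaError_t status = cudaDeviceSynchronize();
  if (status != cudaSuccess) std::printf("sync failed\n");
  return 0;
}
```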
```diff
 #if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)

 /////////////////////////////////////////////////////////////////////////////////////////////////
```
```diff
@@ -379,7 +387,11 @@ bool verify(const Options &options) {
       ref_D);

   // Wait for kernel to finish
-  CUDA_CHECK(cudaDeviceSynchronize());
+#if defined(SYCL_NVIDIA_TARGET)
+  syclcompat::wait_and_throw();
+#else
+  CUDA_CHECK(cudaDeviceSynchronize());
+#endif

   // Check if output from CUTLASS kernel and reference kernel are equal or not
   bool passed = cutlass::reference::device::BlockCompareEqual(block_ref_D.get(), block_D.get(), block_D.size());
```
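Each call site now carries this two-way branch. One way to keep call sites uniform is a small helper; the sketch below is hypothetical (the `device_synchronize` name is not part of this PR) and assumes the `CUDA_CHECK` macro from `helper.h` and the syclcompat header are already in scope:

```cpp
// Hypothetical helper, not part of this PR: funnel the backend-specific
// "wait for the device" step into one place so call sites read identically.
inline void device_synchronize() {
#if defined(SYCL_NVIDIA_TARGET)
  syclcompat::wait_and_throw();         // waits on the default queue, rethrows async errors
#else
  CUDA_CHECK(cudaDeviceSynchronize());  // blocks until all preceding device work completes
#endif
}
```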
```diff
@@ -427,10 +439,10 @@ int run(Options &options)
   // Run profiling loop
   if (options.iterations > 0)
   {
+    CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
     GpuTimer timer;
     timer.start();
     for (int iter = 0; iter < options.iterations; ++iter) {
-      CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));
       CUTLASS_CHECK(gemm.run());
     }
     timer.stop();
```
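This hoists `gemm.initialize(...)` out of the timed loop, so the `GpuTimer` window now measures only `gemm.run()`. A library-free sketch of the same measurement pattern, with `std::chrono` standing in for `GpuTimer`:

```cpp
#include <chrono>

// Time only the repeated work; one-time setup stays outside the window,
// mirroring how gemm.initialize() was moved ahead of timer.start().
template <class Setup, class Work>
double avg_ms(int iterations, Setup setup, Work work) {
  setup();                                        // e.g. gemm.initialize(...)
  auto start = std::chrono::steady_clock::now();
  for (int i = 0; i < iterations; ++i) {
    work();                                       // e.g. gemm.run()
  }
  auto stop = std::chrono::steady_clock::now();
  return std::chrono::duration<double, std::milli>(stop - start).count() / iterations;
}
```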
```diff
@@ -466,6 +478,7 @@ int main(int argc, char const **args) {

   // CUTLASS must be compiled with CUDA 12.0 Toolkit to run this example
   // and must have compute capability at least 90.
+#if !defined(SYCL_NVIDIA_TARGET)
   if (__CUDACC_VER_MAJOR__ < 12) {
     std::cerr << "This example requires CUDA 12 or newer.\n";
     // Returning zero so this test passes on older Toolkits. Its actions are no-op.
```
@@ -483,6 +496,7 @@ int main(int argc, char const **args) { | |
<< "later (compute capability 90 or greater).\n"; | ||
return 0; | ||
} | ||
#endif | ||
// | ||
// Parse options | ||
// | ||
|
```diff
@@ -355,7 +355,7 @@ struct SM90_TMA_LOAD_IM2COL_3D
     uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(desc_ptr);
     uint32_t smem_int_mbar = cast_smem_ptr_to_uint(mbar_ptr);
     uint32_t smem_int_ptr  = cast_smem_ptr_to_uint(smem_ptr);
-    // Copy from global to shared::cluster.
+    // Copy from global to shared::cluster
     asm volatile (
       "cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes"
       " [%0], [%1, {%3, %4, %5}], [%2], {%6};"
```

**Comment on the changed comment line**

Reviewer: Revert?

```diff
@@ -1113,7 +1113,7 @@ CUTE_HOST_DEVICE static void
 tma_store_fence() {
 #if defined(CUTE_ARCH_TMA_SM90_ENABLED)
   asm volatile ("fence.proxy.async.shared::cta;");
-#elif defined(__CUDA_ARCH__)
+#elif defined(__CUDA_ARCH__) || (__SYCL_CUDA_ARCH__)
   CUTE_INVALID_CONTROL_PATH("Trying to use tma without CUTE_ARCH_TMA_SM90_ENABLED.");
 #endif
 }
```
```diff
@@ -33,7 +33,9 @@
 #include <cute/config.hpp>
 #include <cute/arch/mma.hpp>

 // Config
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && defined(__CUDA_ARCH_FEAT_SM90_ALL))
+#if ((defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) || \
+     (defined(__SYCL_CUDA_ARCH__) && (__SYCL_CUDA_ARCH__ >= 900))) && \
+    defined(__CUDA_ARCH_FEAT_SM90_ALL)
 #  define CUTE_ARCH_MMA_SM90A_ENABLED
 #endif
```
```diff
@@ -84,15 +86,15 @@ warpgroup_fence_operand(uint32_t& reg) {
 // MSVC emits a build error for 'asm volatile'
 // even if it only occurs in a __device__ function.
 // This prevents the error.
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__SYCL_CUDA_ARCH__)
   asm volatile("" : "+r"(reg) :: "memory");
 #endif
 }

 CUTE_HOST_DEVICE
 void
 warpgroup_fence_operand(float& reg) {
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__SYCL_CUDA_ARCH__)
   asm volatile("" : "+f"(reg) :: "memory");
 #endif
 }
```

**Comment on the `__SYCL_CUDA_ARCH__` guards**

Reviewer: This `__SYCL_CUDA_ARCH__` seems to create a lot of noise in the code. Can we wrap it up with `__CUDA_ARCH__`?

Author: No, we cannot do that yet.
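For reference, the consolidation the reviewer is asking about might look like the sketch below. The `CUTE_CUDA_DEVICE_ARCH` name is hypothetical, and per the author's reply it is not an option yet:

```cpp
// Hypothetical consolidation (rejected "for now" in this thread): define one
// device-side arch macro in a shared config header, then guard with it.
#if defined(__CUDA_ARCH__)
#  define CUTE_CUDA_DEVICE_ARCH __CUDA_ARCH__        // nvcc device pass
#elif defined(__SYCL_CUDA_ARCH__)
#  define CUTE_CUDA_DEVICE_ARCH __SYCL_CUDA_ARCH__   // SYCL NVPTX device pass
#endif

// The two fences above would then collapse to a single check:
//   #if defined(CUTE_CUDA_DEVICE_ARCH)
//     asm volatile("" : "+r"(reg) :: "memory");
//   #endif
```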
```diff
@@ -762,7 +762,7 @@ print_latex_copy(LayoutS const& S, ThrIDS const& TS,  // (m,n) -> (tid,vid)
 #include <cute/atom/copy_traits_sm90.hpp>

 // Config
-#if (__CUDACC_VER_MAJOR__ >= 12)
+#if (__CUDACC_VER_MAJOR__ >= 12) || defined(SYCL_NVIDIA_TARGET)
 #  define CUTE_COPY_ATOM_TMA_SM90_ENABLED
 #endif
```

**Comment on the config change**

Reviewer: Can we use the PTX version for SYCL instead of SYCL_NVIDIA_TARGET, since SYCL_NVIDIA_TARGET is more generic than versioning?
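Later hunks in this PR already gate the SM90 MMA macros on `__PTX_VERSION__ >= 80` for the SYCL path, so a guard in the spirit of the reviewer's suggestion might read as follows. This is a sketch, assuming `__PTX_VERSION__` is exposed on the SYCL NVPTX path; PTX ISA 8.0 is the first version with sm_90 support:

```cpp
// Sketch of the reviewer's suggestion: key on the PTX ISA version rather than
// the broad SYCL_NVIDIA_TARGET macro (which also matches pre-SM90 targets).
#if (__CUDACC_VER_MAJOR__ >= 12) || \
    (defined(__PTX_VERSION__) && (__PTX_VERSION__ >= 80))
#  define CUTE_COPY_ATOM_TMA_SM90_ENABLED
#endif
```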
```diff
@@ -78,8 +78,8 @@ struct TMA_LOAD_Unpack
 #if 0
     auto [c0,c1,c2,c3,c4] = append<5>(src_coord, 0);
     printf("THR (%d,%d,%d) BLK (%d,%d,%d) TMACRD (%d,%d,%d,%d,%d) SMEMADDR (%p)\n",
-           threadIdx.x, threadIdx.y, threadIdx.z,
-           blockIdx.x, blockIdx.y, blockIdx.z,
+           ThreadIdxX(), ThreadIdxY(), ThreadIdxZ(),
+           BlockIdxX(), BlockIdxY(), BlockIdxZ(),
            int32_t(c0), int32_t(c1), int32_t(c2), int32_t(c3), int32_t(c4), dst_ptr);
 #endif
     return detail::explode_tuple(detail::CallCOPY<CopyOp>{},
```

```diff
@@ -314,8 +314,8 @@ struct TMA_STORE_Unpack
 #if 0
     auto [c0,c1,c2,c3,c4] = append<5>(dst_coord, 0);
     printf("THR (%d,%d,%d) BLK (%d,%d,%d) TMACRD (%d,%d,%d,%d,%d) SMEMADDR (%p)\n",
-           threadIdx.x, threadIdx.y, threadIdx.z,
-           blockIdx.x, blockIdx.y, blockIdx.z,
+           ThreadIdxX(), ThreadIdxY(), ThreadIdxZ(),
+           BlockDimX(), BlockDimY(), BlockDimZ(),
            int32_t(c0), int32_t(c1), int32_t(c2), int32_t(c3), int32_t(c4), src_ptr);
 #endif
     return detail::explode_tuple(detail::CallCOPY<SM90_TMA_STORE>{},
```

```diff
@@ -375,8 +375,8 @@ struct Copy_Traits<SM90_TMA_STORE, NumBitsPerTMA, AuxParams_>
 #if 0
     auto [c0,c1,c2,c3,c4] = append<5>(dst_coord, 0);
     printf("THR (%d,%d,%d) BLK (%d,%d,%d) TMACRD (%d,%d,%d,%d,%d) SMEMADDR (%p)\n",
-           threadIdx.x, threadIdx.y, threadIdx.z,
-           blockIdx.x, blockIdx.y, blockIdx.z,
+           ThreadIdxX(), ThreadIdxY(), ThreadIdxZ(),
+           BlockIdxX(), BlockIdxY(), BlockIdxZ(),
            int32_t(c0), int32_t(c1), int32_t(c2), int32_t(c3), int32_t(c4), src_ptr);
 #endif
     return detail::explode_tuple(detail::CallCOPY<SM90_TMA_STORE>{},
```

```diff
@@ -457,8 +457,8 @@ struct Copy_Traits<SM90_TMA_REDUCE_ADD, NumBitsPerTMA, AuxParams_>
 #if 0
     auto [c0,c1,c2,c3,c4] = append<5>(dst_coord, 0);
     printf("THR (%d,%d,%d) BLK (%d,%d,%d) TMACRD (%d,%d,%d,%d,%d) SMEMADDR (%p)\n",
-           threadIdx.x, threadIdx.y, threadIdx.z,
-           blockIdx.x, blockIdx.y, blockIdx.z,
+           ThreadIdxX(), ThreadIdxY(), ThreadIdxZ(),
+           BlockIdxX(), BlockIdxY(), BlockIdxZ(),
            int32_t(c0), int32_t(c1), int32_t(c2), int32_t(c3), int32_t(c4), src_ptr);
 #endif
```
```diff
@@ -974,7 +974,8 @@ make_tma_copy_desc(Tensor<GEngine,GLayout> const& gtensor,  // The origin
   // TMA general info
   //

-#if (__CUDACC_VER_MAJOR__ >= 12) && !defined(__CUDACC_RTC__)
+#if ((__CUDACC_VER_MAJOR__ >= 12) && !defined(__CUDACC_RTC__)) || \
+    defined(SYCL_NVIDIA_TARGET)

   CUtensorMapDataType tma_format = TMA::to_CUtensorMapDataType<TmaInternalType>();
   CUtensorMapInterleave tma_interleave = CU_TENSOR_MAP_INTERLEAVE_NONE;
```

```diff
@@ -984,7 +985,7 @@ make_tma_copy_desc(Tensor<GEngine,GLayout> const& gtensor,  // The origin
   // TMA smem swizzle type
   CUtensorMapSwizzle smem_swizzle = TMA::to_CUtensorMapSwizzle(get_tma_swizzle_bits(swizzle));
   CUresult result = cuTensorMapEncodeTiled(
-      &tma_desc,
+      reinterpret_cast<CUtensorMap*>(&tma_desc),
       tma_format,
       tma_dim,
       gmem_address,
```

**Comment on lines -987 to +988**

Reviewer: Is this needed?

Author: Yes, cuTensorMapEncodeTiled accepts a pointer to CUtensorMap.

Author: I must clarify that this change is only temporary.
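A cast like this only works if the library-side descriptor and the driver's `CUtensorMap` agree on size and alignment. A self-contained illustration of the pattern, with stand-in types rather than the actual cute/driver definitions:

```cpp
#include <cstdint>

// Stand-in for the driver's opaque CUtensorMap (128 bytes, 64-byte aligned).
struct alignas(64) DriverTensorMap { std::uint8_t opaque[128]; };

// Stand-in for a library-side descriptor declared when the CUDA headers are
// not visible; the cast below is sound only while the layouts stay in sync.
struct alignas(64) TmaDescriptorStandIn { std::uint8_t bytes[128]; };

// Fake encode function standing in for cuTensorMapEncodeTiled.
int encode(DriverTensorMap* out) { return out == nullptr; }

int make_desc() {
  TmaDescriptorStandIn tma_desc{};
  // The PR performs this kind of cast at the single driver-API call site;
  // the author flags it as a temporary measure.
  return encode(reinterpret_cast<DriverTensorMap*>(&tma_desc));
}
```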
```diff
@@ -46,25 +46,31 @@

 ////////////////////////////////////////////////////////////////////////////////

-#if ((__CUDACC_VER_MAJOR__ > 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 8))
+#if ((__CUDACC_VER_MAJOR__ > 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 8)) || \
+    defined(SYCL_NVIDIA_TARGET)
 #define CUTLASS_ARCH_MMA_SM90_F64_MMA_SUPPORTED
 #if (!defined(CUTLASS_ARCH_MMA_SM90_F64_MMA_ENABLED))
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) || \
+    (defined(__SYCL_CUDA_ARCH__) && (__SYCL_CUDA_ARCH__ >= 900) && \
+     defined(__PTX_VERSION__) && (__PTX_VERSION__ >= 80))
 #define CUTLASS_ARCH_MMA_SM90_F64_MMA_ENABLED
 #endif
 #endif
 #endif

-#if (__CUDACC_VER_MAJOR__ >= 12)
+#if (__CUDACC_VER_MAJOR__ >= 12) || defined(SYCL_NVIDIA_TARGET)
 #define CUTLASS_ARCH_MMA_SM90_SUPPORTED
 #if (!defined(CUTLASS_ARCH_MMA_SM90_ENABLED))
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) || \
+    (defined(__SYCL_CUDA_ARCH__) && (__SYCL_CUDA_ARCH__ >= 900) && \
+     defined(__PTX_VERSION__) && (__PTX_VERSION__ >= 80))
 #define CUTLASS_ARCH_MMA_SM90_ENABLED
 #endif
 #endif
 #endif

-#if ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))
+#if ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))) || \
+    defined(SYCL_NVIDIA_TARGET)
 #define CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_SUPPORTED
 #endif
```

**Comment on the CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_SUPPORTED guard**

Reviewer: Here as well, SYCL_NVIDIA_TARGET covers a wide range of targets, including SM80. We need to use the PTX version here, or at least make sure that the NVIDIA target is >= 900.
**Comment**

Reviewer: This flag was moved to line 58.

Author: There was a TODO comment, which I thought I had added as part of 0c9d5e1, about investigating why this line is needed. I was aware of this change, but for some reason I was still seeing a `*_with_offset` kernel, hence I added this as a temporary fix. This is also partly why this PR is a draft.