From a6c8e5355076248cc6bcbcdd650ce343e23679fb Mon Sep 17 00:00:00 2001 From: jiyang1011 Date: Wed, 12 Mar 2025 00:34:56 -0700 Subject: [PATCH 01/21] spirv APIs --- cmake/FindDPCPP.cmake | 7 +- include/cute/arch/copy_xe.hpp | 7 +- include/cute/arch/mma_xe.hpp | 208 +++++++-- include/cute/arch/xe_config.hpp | 251 ++++++++++ include/cute/arch/xe_copy_1B.hpp | 567 +++++++++++++--------- include/cute/arch/xe_copy_2B.hpp | 621 +++++++++++++++---------- include/cute/arch/xe_copy_4B.hpp | 596 ++++++++++++++---------- include/cute/arch/xe_copy_8B.hpp | 90 ++-- test/unit/cute/intel_xe/copy_block.cpp | 2 + 9 files changed, 1562 insertions(+), 787 deletions(-) create mode 100644 include/cute/arch/xe_config.hpp diff --git a/cmake/FindDPCPP.cmake b/cmake/FindDPCPP.cmake index f14a62b267..0a78b26519 100644 --- a/cmake/FindDPCPP.cmake +++ b/cmake/FindDPCPP.cmake @@ -62,8 +62,11 @@ endif() if("${DPCPP_SYCL_TARGET}" STREQUAL "intel_gpu_pvc" OR "${DPCPP_SYCL_TARGET}" STREQUAL "spir64" OR "${DPCPP_SYCL_TARGET}" STREQUAL "intel_gpu_bmg_g21") - list(APPEND DPCPP_FLAGS "-Xspirv-translator;-spirv-ext=+SPV_INTEL_split_barrier") - + if (CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 2025.2) + list(APPEND DPCPP_FLAGS "-Xspirv-translator;-spirv-ext=+SPV_INTEL_split_barrier") + else() + list(APPEND DPCPP_FLAGS "-Xspirv-translator;-spirv-ext=+SPV_INTEL_split_barrier,+SPV_INTEL_2d_block_io,+SPV_INTEL_subgroup_matrix_multiply_accumulate") + endif() if(DPCPP_DISABLE_ITT_FOR_CUTLASS) list(APPEND DPCPP_FLAGS "-fno-sycl-instrument-device-code") endif() diff --git a/include/cute/arch/copy_xe.hpp b/include/cute/arch/copy_xe.hpp index 64087c62c6..d40ea9cf09 100644 --- a/include/cute/arch/copy_xe.hpp +++ b/include/cute/arch/copy_xe.hpp @@ -33,11 +33,6 @@ #include #include #include -#ifdef __SYCL_DEVICE_ONLY__ -#define SYCL_DEVICE_BUILTIN(x) SYCL_EXTERNAL extern "C" x -#else -#define SYCL_DEVICE_BUILTIN(x) inline x { assert(false); } -#endif // prefetch 
SYCL_DEVICE_BUILTIN(void __builtin_IB_lsc_prefetch_global_uchar( @@ -70,7 +65,7 @@ SYCL_DEVICE_BUILTIN(void __builtin_IB_lsc_prefetch_global_ulong4( SYCL_DEVICE_BUILTIN(void __builtin_IB_lsc_prefetch_global_ulong8( const __attribute__((opencl_global)) uint64_t *base, int immElemOff, enum CacheControl cacheOpt)); -#undef SYCL_DEVICE_BUILTIN + #ifdef __SYCL_DEVICE_ONLY__ SYCL_EXTERNAL __attribute__((convergent)) void __spirv_ControlBarrierWaitINTEL(int execution_scope, int memory_scope, int memory_semantics); diff --git a/include/cute/arch/mma_xe.hpp b/include/cute/arch/mma_xe.hpp index 600e604d40..1f17617e15 100644 --- a/include/cute/arch/mma_xe.hpp +++ b/include/cute/arch/mma_xe.hpp @@ -32,13 +32,7 @@ #include #include -#include - -#ifdef __SYCL_DEVICE_ONLY__ -#define SYCL_DEVICE_OCL(x) SYCL_EXTERNAL x -#else -#define SYCL_DEVICE_OCL(x) inline x { CUTE_INVALID_CONTROL_PATH("Trying to use XE built-in on non-XE hardware"); } -#endif +#include // mma_bf16 SYCL_DEVICE_OCL(cute::intel::float8 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short8 a, cute::intel::int8 b, cute::intel::float8 acc)); @@ -66,7 +60,125 @@ SYCL_DEVICE_OCL(cute::intel::float4 intel_sub_group_tf32_tf32_matrix_mad_k8(cute SYCL_DEVICE_OCL(cute::intel::float2 intel_sub_group_tf32_tf32_matrix_mad_k8(float a, cute::intel::float8 b, cute::intel::float2 acc)); SYCL_DEVICE_OCL(float intel_sub_group_tf32_tf32_matrix_mad_k8(float a, cute::intel::float8 b, float acc)); -#undef SYCL_DEVICE_OCL +#if defined(CUTE_ARCH_MMA_XE_SPIRV_ENABLED) +namespace cute::detail +{ +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { +#ifdef __SYCL_DEVICE_ONLY__ + return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(16, a, b, c, SPIRV_MMAOperands::SPIRV_MatrixABf16 | SPIRV_MMAOperands::SPIRV_MatrixBBf16 ); +#endif + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto 
operator()(ARegisters a, BRegisters b, CRegisters c) { +#ifdef __SYCL_DEVICE_ONLY__ + return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(16, a, b, c, SPIRV_MMAOperands::SPIRV_MatrixAFp16 | SPIRV_MMAOperands::SPIRV_MatrixBFp16); +#endif + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { +#ifdef __SYCL_DEVICE_ONLY__ + return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(32, a, b, c, SPIRV_MMAOperands::SPIRV_MatrixASigned | SPIRV_MMAOperands::SPIRV_MatrixBSigned | SPIRV_MMAOperands::SPIRV_MatrixAInt8 | SPIRV_MMAOperands::SPIRV_MatrixBInt8); +#endif + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { +#ifdef __SYCL_DEVICE_ONLY__ + return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(32, a, b, c, SPIRV_MMAOperands::SPIRV_MatrixAInt8 | SPIRV_MMAOperands::SPIRV_MatrixBInt8); +#endif + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { +#ifdef __SYCL_DEVICE_ONLY__ + return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(8, a, b, c, SPIRV_MMAOperands::SPIRV_MatrixATf32 | SPIRV_MMAOperands::SPIRV_MatrixBTf32); +#endif + } +}; +} // namespace cute::detail end +#endif + +#if defined(CUTE_ARCH_MMA_XE_BUILTIN_ENABLED) +namespace cute::detail +{ +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { +#ifdef __SYCL_DEVICE_ONLY__ + return intel_sub_group_bf16_bf16_matrix_mad_k16(a, b, c); +#endif + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { +#ifdef __SYCL_DEVICE_ONLY__ + return intel_sub_group_f16_f16_matrix_mad_k16(a, b, c); +#endif + } +}; + +template<> +struct 
XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { +#ifdef __SYCL_DEVICE_ONLY__ + return intel_sub_group_i8_i8_matrix_mad_k32(a, b, c); +#endif + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { +#ifdef __SYCL_DEVICE_ONLY__ + return intel_sub_group_u8_u8_matrix_mad_k32(a, b, c); +#endif + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { +#ifdef __SYCL_DEVICE_ONLY__ + return intel_sub_group_tf32_tf32_matrix_mad_k8(a, b, c); +#endif + } +}; +} // namespace cute::detail end +#endif namespace cute { //MxNxK_D,A,B,C @@ -86,8 +198,8 @@ struct XE_8x16x16_F32BF16BF16F32_TT intel::int8 const& b, intel::float8 const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_bf16_bf16_matrix_mad_k16(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x16_F32BF16BF16F32_TT on non-PVC hardware"); #endif @@ -106,8 +218,8 @@ struct XE_4x16x16_F32BF16BF16F32_TT intel::int8 const& b, intel::float4 const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_bf16_bf16_matrix_mad_k16(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x16_F32BF16BF16F32_TT on non-PVC hardware"); #endif @@ -126,8 +238,8 @@ struct XE_2x16x16_F32BF16BF16F32_TT intel::int8 const& b, intel::float2 const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_bf16_bf16_matrix_mad_k16(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x16_F32BF16BF16F32_TT on non-PVC 
hardware"); #endif @@ -147,8 +259,8 @@ struct XE_1x16x16_F32BF16BF16F32_TT intel::int8 const& b, float const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_bf16_bf16_matrix_mad_k16(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_1x16x16_F32BF16BF16F32_TT on non-PVC hardware"); #endif @@ -172,8 +284,8 @@ struct XE_8x16x16_F32F16F16F32_TT intel::int8 const& b, intel::float8 const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_f16_f16_matrix_mad_k16(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x16_F32F16F16F32_TT on non-PVC hardware"); #endif @@ -193,8 +305,8 @@ struct XE_4x16x16_F32F16F16F32_TT intel::int8 const& b, intel::float4 const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_f16_f16_matrix_mad_k16(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_4x16x16_F32F16F16F32_TT on non-PVC hardware"); #endif @@ -214,8 +326,8 @@ struct XE_2x16x16_F32F16F16F32_TT intel::int8 const& b, intel::float2 const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_f16_f16_matrix_mad_k16(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_2x16x16_F32F16F16F32_TT on non-PVC hardware"); #endif @@ -235,8 +347,8 @@ struct XE_1x16x16_F32F16F16F32_TT intel::int8 const& b, float const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_f16_f16_matrix_mad_k16(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_1x16x16_F32F16F16F32_TT on non-PVC hardware"); #endif @@ 
-260,8 +372,8 @@ struct XE_8x16x32_S32S8S8S32_TT intel::int8 const& b, intel::int8 const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_i8_i8_matrix_mad_k32(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x32_S32S8S8S32_TT on non-PVC hardware"); #endif @@ -281,8 +393,8 @@ struct XE_4x16x32_S32S8S8S32_TT intel::int8 const& b, intel::int4 const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_i8_i8_matrix_mad_k32(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_4x16x32_S32S8S8S32_TT on non-PVC hardware"); #endif @@ -302,8 +414,8 @@ struct XE_2x16x32_S32S8S8S32_TT intel::int8 const& b, intel::int2 const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_i8_i8_matrix_mad_k32(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_2x16x32_S32S8S8S32_TT on non-PVC hardware"); #endif @@ -323,8 +435,8 @@ struct XE_1x16x32_S32S8S8S32_TT intel::int8 const& b, int const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_i8_i8_matrix_mad_k32(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_1x16x32_S32S8S8S32_TT on non-PVC hardware"); #endif @@ -344,8 +456,8 @@ struct XE_8x16x32_S32U8U8S32_TT intel::uint8 const& b, intel::int8 const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_u8_u8_matrix_mad_k32(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x32_S32U8U8S32_TT on non-PVC hardware"); #endif @@ -365,8 +477,8 @@ struct XE_4x16x32_S32U8U8S32_TT intel::uint8 
const& b, intel::int4 const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_u8_u8_matrix_mad_k32(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_4x16x32_S32U8U8S32_TT on non-PVC hardware"); #endif @@ -386,8 +498,8 @@ struct XE_2x16x32_S32U8U8S32_TT intel::uint8 const& b, intel::int2 const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_u8_u8_matrix_mad_k32(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_2x16x32_S32U8U8S32_TT on non-PVC hardware"); #endif @@ -407,8 +519,8 @@ struct XE_1x16x32_S32U8U8S32_TT intel::uint8 const& b, int const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_u8_u8_matrix_mad_k32(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_1x16x32_S32U8U8S32_TT on non-PVC hardware"); #endif @@ -428,8 +540,8 @@ struct XE_8x16x8_F32TF32TF32F32_TT intel::float8 const& b, intel::float8 const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_tf32_tf32_matrix_mad_k8(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x8_F32TF32TF32F32_TT on non-PVC hardware"); #endif @@ -449,8 +561,8 @@ struct XE_4x16x8_F32TF32TF32F32_TT intel::float8 const& b, intel::float4 const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_tf32_tf32_matrix_mad_k8(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_4x16x8_F32TF32TF32F32_TT on non-PVC hardware"); #endif @@ -470,8 +582,8 @@ struct XE_2x16x8_F32TF32TF32F32_TT intel::float8 const& b, intel::float2 
const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_tf32_tf32_matrix_mad_k8(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_2x16x8_F32TF32TF32F32_TT on non-PVC hardware"); #endif @@ -491,8 +603,8 @@ struct XE_1x16x8_F32TF32TF32F32_TT intel::float8 const& b, float const& c) { -#if defined(SYCL_INTEL_TARGET) - d = intel_sub_group_tf32_tf32_matrix_mad_k8(a, b, c); +#if defined(CUTE_ARCH_MMA_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_1x16x8_F32TF32TF32F32_TT on non-PVC hardware"); #endif diff --git a/include/cute/arch/xe_config.hpp b/include/cute/arch/xe_config.hpp new file mode 100644 index 0000000000..7156f43360 --- /dev/null +++ b/include/cute/arch/xe_config.hpp @@ -0,0 +1,251 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once +#include + +#ifdef __SYCL_DEVICE_ONLY__ +#define SYCL_DEVICE_BUILTIN(x) SYCL_EXTERNAL extern "C" x +#else +#define SYCL_DEVICE_BUILTIN(x) \ + inline x { \ + CUTE_INVALID_CONTROL_PATH( \ + "Attempting to use a device built-in in host code."); \ + } +#endif + +#ifdef __SYCL_DEVICE_ONLY__ +#define SYCL_DEVICE_OCL(x) SYCL_EXTERNAL x +#else +#define SYCL_DEVICE_OCL(x) \ + inline x { \ + CUTE_INVALID_CONTROL_PATH( \ + "Attempting to use a device built-in in host code."); \ + } +#endif + + +#undef __global +#define __global __attribute__((opencl_global)) + + +#if defined(__SYCL_DEVICE_ONLY__) && defined(SYCL_INTEL_TARGET) +#define CUTE_ARCH_COPY_XE_ENABLED +#define CUTE_ARCH_MMA_XE_ENABLED +#endif + +#if defined(CUTE_ARCH_COPY_XE_ENABLED) && defined(__INTEL_LLVM_COMPILER) && (__INTEL_LLVM_COMPILER < 20250200) +#define CUTE_ARCH_COPY_XE_BUILTIN_ENABLED +#define CUTE_ARCH_MMA_XE_BUILTIN_ENABLED +#elif defined(CUTE_ARCH_COPY_XE_ENABLED) +#define CUTE_ARCH_COPY_XE_SPIRV_ENABLED +#define CUTE_ARCH_MMA_XE_SPIRV_ENABLED +// #define CUTE_ARCH_MMA_XE_BUILTIN_ENABLED +#endif + 
+// SPIRV copy definitions +#if defined(CUTE_ARCH_COPY_XE_SPIRV_ENABLED) +SYCL_EXTERNAL __attribute__((convergent)) void __spirv_Subgroup2DBlockLoadINTEL( + int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, + const void* src_base_pointer, int memory_width, int memory_height, + int memory_pitch, cute::intel::coord_t coordinate, void* dst_pointer); +SYCL_EXTERNAL __attribute__((convergent)) void __spirv_Subgroup2DBlockLoadTransformINTEL( + int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, + const void* src_base_pointer, int memory_width, int memory_height, + int memory_pitch, cute::intel::coord_t coordinate, void* dst_pointer); +SYCL_EXTERNAL __attribute__((convergent)) void __spirv_Subgroup2DBlockLoadTransposeINTEL( + int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, + const void* src_base_pointer, int memory_width, int memory_height, + int memory_pitch, cute::intel::coord_t coordinate, void* dst_pointer); +SYCL_EXTERNAL __attribute__((convergent)) void __spirv_Subgroup2DBlockStoreINTEL( + int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, + void* src_pointer, const void* dst_base_pointer, int memory_width, + int memory_height, int memory_pitch, cute::intel::coord_t coordinate); +SYCL_EXTERNAL __attribute__((convergent)) void __spirv_Subgroup2DBlockPrefetchINTEL( + int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, + const void* src_base_pointer, int memory_width, int memory_height, + int memory_pitch, cute::intel::coord_t coordinate); + +namespace cute::detail { +template +struct XeSubgroup2DBlockLoad { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { +#ifdef __SYCL_DEVICE_ONLY__ + __spirv_Subgroup2DBlockLoadINTEL(ElementSize, BlockWidth, BlockHeight, BlockCount, + srcBasePointer, memoryWidth, memoryHeight, memoryPitch, coordinate, + static_cast(dstPointer)); 
+#endif + } +}; + +template +struct XeSubgroup2DBlockTransform { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { +#ifdef __SYCL_DEVICE_ONLY__ + __spirv_Subgroup2DBlockLoadTransformINTEL(ElementSize, BlockWidth, BlockHeight, BlockCount, + srcBasePointer, memoryWidth, memoryHeight, memoryPitch, coordinate, + static_cast(dstPointer)); +#endif + } +}; + +template +struct XeSubgroup2DBlockTranspose { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { +#ifdef __SYCL_DEVICE_ONLY__ + __spirv_Subgroup2DBlockLoadTransposeINTEL(ElementSize, BlockWidth, BlockHeight, BlockCount, + srcBasePointer, memoryWidth, memoryHeight, memoryPitch, coordinate, + static_cast(dstPointer)); +#endif + } +}; + +// template +// struct XeSubgroup2DBlockPrefetch { +// CUTE_HOST_DEVICE +// void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, +// cute::intel::coord_t coordinate) { +// #ifdef __SYCL_DEVICE_ONLY__ +// __spirv_Subgroup2DBlockPrefetchINTEL(ElementSize, BlockWidth, BlockHeight, BlockCount, +// srcBasePointer, memoryWidth, memoryHeight, memoryPitch, coordinate); +// #endif +// } +// }; + + +template +struct XeSubgroup2DBlockStore { + template + CUTE_HOST_DEVICE + void operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { +#ifdef __SYCL_DEVICE_ONLY__ + __spirv_Subgroup2DBlockStoreINTEL(ElementSize, BlockWidth, BlockHeight, BlockCount, + (void*)(srcPointer), dstBasePointer, memoryWidth, memoryHeight, memoryPitch, coordinate); +#endif + } +}; +} // namespace cute::detail end +#endif + +namespace cute::detail +{ +template +struct XeSubgroup2DBlockPrefetch { + // static_assert(dependent_false<>, "Unsupported 
2D Block Load Configuration."); +}; + +#if defined(CUTE_ARCH_COPY_XE_BUILTIN_ENABLED) +template +struct XeSubgroup2DBlockLoad { + static_assert(dependent_false<>, "Unsupported 2D Block Load Configuration."); +}; +template +struct XeSubgroup2DBlockTransform { + static_assert(dependent_false<>, "Unsupported 2D Block Load Configuration."); +}; +template +struct XeSubgroup2DBlockTranspose { + static_assert(dependent_false<>, "Unsupported 2D Block Load Configuration."); +}; +template +struct XeSubgroup2DBlockStore { + static_assert(dependent_false<>, "Unsupported 2D Block Load Configuration."); +}; +#endif +} +enum class CacheControl { + kDefault = 0, + kL1UC_L3UC = 1, // Override to L1 uncached and L3 uncached + kL1UC_L3C = 2, // Override to L1 uncached and L3 cached + kL1C_L3UC = 3, // Override to L1 cached and L3 uncached + kL1C_L3C = 4, // Override to L1 cached and L3 cached + kL1S_L3UC = 5, // Override to L1 streaming load and L3 uncached + kL1S_L3C = 6, // Override to L1 streaming load and L3 cached + kL1IAR_L3C = 7, // Override to L1 invalidate-after-read, and L3 cached +}; + +#if defined(CUTE_ARCH_MMA_XE_SPIRV_ENABLED) +// @brief spirv APIs for mma +// @param dims K +// @param ARegisters +// @param BRegisters +// @param AccRegisters +// @param Operands code +// @return DRegisters +SYCL_EXTERNAL cute::intel::float8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short8, cute::intel::int8, cute::intel::float8, int32_t); +SYCL_EXTERNAL cute::intel::float4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short4, cute::intel::int8, cute::intel::float4, int32_t); +SYCL_EXTERNAL cute::intel::float2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short2, cute::intel::int8, cute::intel::float2, int32_t); +SYCL_EXTERNAL float __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, short, cute::intel::int8, float, int32_t); + +SYCL_EXTERNAL cute::intel::int8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, 
cute::intel::short8, cute::intel::int8, cute::intel::int8, int32_t); +SYCL_EXTERNAL cute::intel::int4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short4, cute::intel::int8, cute::intel::int4, int32_t); +SYCL_EXTERNAL cute::intel::int2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short2, cute::intel::int8, cute::intel::int2, int32_t); +SYCL_EXTERNAL int __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, short, cute::intel::int8, int, int32_t); + +SYCL_EXTERNAL cute::intel::int8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::ushort8, cute::intel::uint8, cute::intel::int8, int32_t); +SYCL_EXTERNAL cute::intel::int4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::ushort4, cute::intel::uint8, cute::intel::int4, int32_t); +SYCL_EXTERNAL cute::intel::int2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::ushort2, cute::intel::uint8, cute::intel::int2, int32_t); +SYCL_EXTERNAL int __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, ushort, cute::intel::uint8, int, int32_t); + +SYCL_EXTERNAL cute::intel::float8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::float4, cute::intel::float8, cute::intel::float8, int32_t); +SYCL_EXTERNAL cute::intel::float4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::float2, cute::intel::float8, cute::intel::float4, int32_t); +SYCL_EXTERNAL cute::intel::float2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, float, cute::intel::float8, cute::intel::float2, int32_t); +SYCL_EXTERNAL float __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, float, cute::intel::float8, float, int32_t); + +struct SPIRV_MMAOperands { + static constexpr int SPIRV_MatrixASigned = 0x1; + static constexpr int SPIRV_MatrixBSigned = 0x2; + static constexpr int SPIRV_MatrixAInt8 = 0x10; + static constexpr int SPIRV_MatrixBInt8 = 0x20; + static constexpr int SPIRV_MatrixAFp16 = 0x400; + static constexpr int 
SPIRV_MatrixBFp16 = 0x800; + static constexpr int SPIRV_MatrixABf16 = 0x1000; + static constexpr int SPIRV_MatrixBBf16 = 0x2000; + static constexpr int SPIRV_MatrixATf32 = 0x100; + static constexpr int SPIRV_MatrixBTf32 = 0x200; +}; +#endif + +namespace cute::detail{ + template + struct XeSubgroupMatrixMultiplyAccumulate { + static_assert(dependent_false<>, "Unsupported MMA Configuration."); + }; +} // namespace cute::detail end diff --git a/include/cute/arch/xe_copy_1B.hpp b/include/cute/arch/xe_copy_1B.hpp index 77da428802..944ae7072c 100644 --- a/include/cute/arch/xe_copy_1B.hpp +++ b/include/cute/arch/xe_copy_1B.hpp @@ -32,30 +32,9 @@ #include #include -#include #include #include "cute/pointer.hpp" -#ifdef __SYCL_DEVICE_ONLY__ -#define SYCL_DEVICE_BUILTIN(x) SYCL_EXTERNAL extern "C" x -#else -#define SYCL_DEVICE_BUILTIN(x) \ - inline x { \ - CUTE_INVALID_CONTROL_PATH( \ - "Attempting to use a device built-in in host code."); \ - } -#endif - -#ifdef __SYCL_DEVICE_ONLY__ -#define SYCL_DEVICE_OCL(x) SYCL_EXTERNAL x -#else -#define SYCL_DEVICE_OCL(x) \ - inline x { \ - CUTE_INVALID_CONTROL_PATH( \ - "Attempting to use a device built-in in host code."); \ - } -#endif - // 8bits No transform No transpose SYCL_DEVICE_BUILTIN(cute::intel::ushort __builtin_IB_subgroup_block_read_flat_u8_m1k32v1( intptr_t baseoffset, int width_minus_one, int height_minus_one, @@ -138,87 +117,8 @@ SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u8_m8k16v2( intptr_t baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uchar8)); -// U8 prefetch -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u8_m1k32v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u8_m2k32v1( - intptr_t baseoffset, int width_minus_one, int 
height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u8_m4k32v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u8_m8k32v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); - -#undef SYCL_DEVICE_BUILTIN - -#undef __global -#define __global __attribute__((opencl_global)) -// 8 bits No transform No transpose -SYCL_DEVICE_OCL(cute::intel::ushort intel_sub_group_block_read_8b_1r32c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort2 intel_sub_group_block_read_8b_2r32c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort4 intel_sub_group_block_read_8b_4r32c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort8 intel_sub_group_block_read_8b_8r32c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort16 intel_sub_group_block_read_8b_16r32c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); - -SYCL_DEVICE_OCL(cute::intel::ushort2 intel_sub_group_block_read_8b_1r32x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort4 intel_sub_group_block_read_8b_2r32x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort8 
intel_sub_group_block_read_8b_4r32x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort16 intel_sub_group_block_read_8b_8r32x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort32 intel_sub_group_block_read_8b_16r32x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort64 intel_sub_group_block_read_8b_32r32x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); - -// 8bits VNNI transform No transpose -SYCL_DEVICE_OCL(cute::intel::uint8 intel_sub_group_block_read_transform_8b_32r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint16 intel_sub_group_block_read_transform_8b_32r16x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint32 intel_sub_group_block_read_transform_8b_32r16x4c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -// 8bits store -SYCL_DEVICE_OCL(void intel_sub_group_block_write_8b_1r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord, cute::intel::uchar data)); -SYCL_DEVICE_OCL(void intel_sub_group_block_write_8b_2r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord, cute::intel::uchar2 data)); -SYCL_DEVICE_OCL(void intel_sub_group_block_write_8b_4r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord, cute::intel::uchar4 data)); -SYCL_DEVICE_OCL(void intel_sub_group_block_write_8b_8r16c( - const __global void *base_address, int width, int height, int 
pitch, - cute::intel::coord_t coord, cute::intel::uchar8 data)); - - -// 2D prefetch +// // 2D prefetch SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_8b_1r32x2c( __global void* base_address, int width, int height, int pitch, cute::intel::coord_t coord)); @@ -235,10 +135,314 @@ SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_8b_32r16x1c( __global void* base_address, int width, int height, int pitch, cute::intel::coord_t coord)); -#undef SYCL_DEVICE_OCL +namespace cute::detail +{ +#if defined(CUTE_ARCH_COPY_XE_BUILTIN_ENABLED) +template<> +struct XeSubgroup2DBlockLoad<1, 32, 1, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m1k32v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<1, 32, 2, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m2k32v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<1, 32, 4, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m4k32v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<1, 32, 8, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int 
memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m8k32v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<1, 32, 16, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m16k32v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<1, 32, 32, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m32k32v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<1, 32, 1, 2> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m1k32v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<1, 32, 2, 2> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m2k32v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct 
XeSubgroup2DBlockLoad<1, 32, 4, 2> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m4k32v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<1, 32, 8, 2> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m8k32v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<1, 32, 16, 2> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m16k32v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<1, 32, 32, 2> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m32k32v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTransform<1, 16, 32, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = 
__builtin_IB_subgroup_block_read_flat_transform_u8_k32( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTransform<1, 16, 32, 2> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transform_u8_k32v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTransform<1, 16, 32, 4> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transform_u8_k32v4( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockStore<1, 16, 1, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u8_m1k16v1( + (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uchar *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockStore<1, 16, 2, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u8_m2k16v1( + (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uchar2 *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockStore<1, 16, 4, 1> { + template + CUTE_HOST_DEVICE + void 
operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u8_m4k16v1( + (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uchar4 *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockStore<1, 16, 8, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u8_m8k16v1( + (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uchar8 *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockStore<1, 16, 8, 2> { + template + CUTE_HOST_DEVICE + void operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u8_m8k16v2( + (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uchar8 *)(srcPointer)); + } +}; +#endif + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 1, 2> { + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_8b_1r32x2c( + (__global void*)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 2, 2> { + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_8b_2r32x2c( + (__global void*)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 
4, 2> { + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_8b_4r32x2c( + (__global void*)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 8, 2> { + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_8b_8r32x2c( + (__global void*)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 16, 32, 1> { + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_8b_32r16x1c( + (__global void*)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; +} // namespace cute::detail end namespace cute { +struct XE_2D_U8x1x32_LD_N { + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockLoad<1, 32, 1, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); +#endif + } +}; + +struct XE_2D_U8x2x32_LD_N { + using BlockShape = Shape<_2, _32>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockLoad<1, 32, 2, 1>{}(baseoffset, width, height, pitch, coord, dst); 
+#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); +#endif + } +}; + struct XE_2D_U8x2x32_ST_N { using BlockShape = Shape<_2, _32>; @@ -246,11 +450,9 @@ struct XE_2D_U8x2x32_ST_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *src) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); - __builtin_IB_subgroup_block_write_flat_u16_m2k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord, - *(intel::ushort2 *)(src)); + detail::XeSubgroup2DBlockStore<2, 16, 2, 1>{}(baseoffset, width, height, pitch, coord, src); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -326,17 +528,14 @@ struct XE_2D_U8x2x32_LD_N { struct XE_2D_U8x4x32_LD_N { using BlockShape = Shape<_4, _32>; - using inst_dtype = int8_t; template CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u8_m4k32v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<1, 32, 4, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -360,17 +559,14 @@ struct XE_2D_U8x4x32_LD_N { struct XE_2D_U8x8x32_LD_N { using BlockShape = Shape<_8, _32>; - using inst_dtype = int8_t; template CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - 
__builtin_IB_subgroup_block_read_flat_u8_m8k32v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<1, 32, 8, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -399,11 +595,9 @@ struct XE_2D_U8x16x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u8_m16k32v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<1, 32, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -413,10 +607,8 @@ struct XE_2D_U8x16x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v2( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - CacheControl::kL1C_L3C); +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 16, 1>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( "Trying to use block prefetch on non-PVC hardware"); @@ -432,11 +624,9 @@ struct XE_2D_U8x32x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u8_m32k32v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<1, 32, 
32, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -591,11 +781,9 @@ struct XE_2D_U8x1x64_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u8_m1k32v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<1, 32, 1, 2>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -605,9 +793,8 @@ struct XE_2D_U8x1x64_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - intel_sub_group_2d_block_prefetch_8b_1r32x2c( - (__global void*)baseoffset, width - 1, height - 1, pitch - 1, coord); +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<1, 32, 1, 2>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( "Trying to use block prefetch on non-PVC hardware"); @@ -623,11 +810,9 @@ struct XE_2D_U8x2x64_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u8_m2k32v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<1, 32, 2, 2>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -637,9 +822,8 @@ struct XE_2D_U8x2x64_LD_N { 
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - intel_sub_group_2d_block_prefetch_8b_2r32x2c( - (__global void*)baseoffset, width - 1, height - 1, pitch - 1, coord); +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<1, 32, 2, 2>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( "Trying to use block prefetch on non-PVC hardware"); @@ -655,11 +839,9 @@ struct XE_2D_U8x4x64_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u8_m4k32v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<1, 32, 4, 2>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -669,9 +851,8 @@ struct XE_2D_U8x4x64_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - intel_sub_group_2d_block_prefetch_8b_4r32x2c( - (__global void*)baseoffset, width - 1, height - 1, pitch - 1, coord); +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<1, 32, 4, 2>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( "Trying to use block prefetch on non-PVC hardware"); @@ -687,11 +868,9 @@ struct XE_2D_U8x8x64_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - 
__builtin_IB_subgroup_block_read_flat_u8_m8k32v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<1, 32, 8, 2>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -701,9 +880,8 @@ struct XE_2D_U8x8x64_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - intel_sub_group_2d_block_prefetch_8b_8r32x2c( - (__global void*)baseoffset, width - 1, height - 1, pitch - 1, coord); +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<1, 32, 8, 2>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( "Trying to use block prefetch on non-PVC hardware"); @@ -719,11 +897,9 @@ struct XE_2D_U8x16x64_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u8_m16k32v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<1, 32, 16, 2>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -733,10 +909,8 @@ struct XE_2D_U8x16x64_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v2( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - CacheControl::kL1C_L3C); +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 16, 2>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( "Trying to 
use block prefetch on non-PVC hardware"); @@ -752,11 +926,9 @@ struct XE_2D_U8x32x64_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u8_m32k32v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<1, 32, 32, 2>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -766,10 +938,8 @@ struct XE_2D_U8x32x64_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v2( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - CacheControl::kL1C_L3C); +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 32, 2>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( "Trying to use block prefetch on non-PVC hardware"); @@ -787,11 +957,9 @@ struct XE_2D_U8x32x16_LD_V { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transform_u8_k32( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockTransform<1, 16, 32, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -801,9 +969,8 @@ struct XE_2D_U8x32x16_LD_V { CUTE_HOST_DEVICE static void copy(const void 
*baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - intel_sub_group_2d_block_prefetch_8b_32r16x1c( - (__global void*)baseoffset, width - 1, height - 1, pitch - 1, coord); +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<1, 16, 32, 1>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( "Trying to use block prefetch on non-PVC hardware"); @@ -819,11 +986,9 @@ struct XE_2D_U8x32x32_LD_V { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transform_u8_k32v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockTransform<1, 16, 32, 2>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -837,11 +1002,9 @@ struct XE_2D_U8x32x64_LD_V { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transform_u8_k32v4( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockTransform<1, 16, 32, 4>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -855,11 +1018,9 @@ struct XE_2D_U8x1x16_ST_N { CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, int pitch, intel::coord_t coord, const T *src) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) 
static_assert(sizeof(T) == 1, "Expected T to have size 1"); - __builtin_IB_subgroup_block_write_flat_u8_m1k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord, - *(intel::uchar *)(src)); + detail::XeSubgroup2DBlockStore<1, 16, 1, 1>{}(baseoffset, width, height, pitch, coord, src); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -873,11 +1034,9 @@ struct XE_2D_U8x2x16_ST_N { CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, int pitch, intel::coord_t coord, const T *src) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); - __builtin_IB_subgroup_block_write_flat_u8_m2k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord, - *(intel::uchar2 *)(src)); + detail::XeSubgroup2DBlockStore<1, 16, 2, 1>{}(baseoffset, width, height, pitch, coord, src); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -891,11 +1050,9 @@ struct XE_2D_U8x4x16_ST_N { CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, int pitch, intel::coord_t coord, const T *src) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); - __builtin_IB_subgroup_block_write_flat_u8_m4k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord, - *(intel::uchar4 *)(src)); + detail::XeSubgroup2DBlockStore<1, 16, 4, 1>{}(baseoffset, width, height, pitch, coord, src); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -907,11 +1064,9 @@ struct XE_2D_U8x8x16_ST_N { CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, int pitch, intel::coord_t coord, const T *src) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); - 
__builtin_IB_subgroup_block_write_flat_u8_m8k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord, - *(intel::uchar8 *)(src)); + detail::XeSubgroup2DBlockStore<1, 16, 8, 1>{}(baseoffset, width, height, pitch, coord, src); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -923,11 +1078,9 @@ struct XE_2D_U8x8x32_ST_N { CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, int pitch, intel::coord_t coord, const T *src) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); - __builtin_IB_subgroup_block_write_flat_u8_m8k16v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord, - *(intel::uchar8 *)(src)); + detail::XeSubgroup2DBlockStore<1, 16, 8, 2>{}(baseoffset, width, height, pitch, coord, src); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif diff --git a/include/cute/arch/xe_copy_2B.hpp b/include/cute/arch/xe_copy_2B.hpp index 9473601e7a..8794922096 100644 --- a/include/cute/arch/xe_copy_2B.hpp +++ b/include/cute/arch/xe_copy_2B.hpp @@ -32,29 +32,8 @@ #include #include -#include #include -#ifdef __SYCL_DEVICE_ONLY__ -#define SYCL_DEVICE_BUILTIN(x) SYCL_EXTERNAL extern "C" x -#else -#define SYCL_DEVICE_BUILTIN(x) \ - inline x { \ - CUTE_INVALID_CONTROL_PATH( \ - "Attempting to use a device built-in in host code."); \ - } -#endif - -#ifdef __SYCL_DEVICE_ONLY__ -#define SYCL_DEVICE_OCL(x) SYCL_EXTERNAL x -#else -#define SYCL_DEVICE_OCL(x) \ - inline x { \ - CUTE_INVALID_CONTROL_PATH( \ - "Attempting to use a device built-in in host code."); \ - } -#endif - SYCL_DEVICE_BUILTIN(cute::intel::ushort16 intel_subgroup_block_read_u16_m8k16v2( intptr_t baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, cute::intel::coord_t coord)); @@ -164,76 +143,6 @@ SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u16_m4k16v1( 
SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u16_m8k16v1( intptr_t baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, cute::intel::coord_t coord, cute::intel::ushort8 data)); -#undef SYCL_DEVICE_BUILTIN - -#undef __global__ -#define __global __attribute__((opencl_global)) -// 16bits No transform No transpose -SYCL_DEVICE_OCL(cute::intel::ushort intel_sub_group_block_read_16b_1r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort2 intel_sub_group_block_read_16b_2r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort4 intel_sub_group_block_read_16b_4r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort8 intel_sub_group_block_read_16b_8r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort16 intel_sub_group_block_read_16b_16r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort32 intel_sub_group_block_read_16b_32r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); - -SYCL_DEVICE_OCL(cute::intel::ushort2 intel_sub_group_block_read_16b_1r16x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort4 intel_sub_group_block_read_16b_2r16x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort8 intel_sub_group_block_read_16b_4r16x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); 
-SYCL_DEVICE_OCL(cute::intel::ushort16 intel_sub_group_block_read_16b_8r16x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort32 intel_sub_group_block_read_16b_16r16x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ushort64 intel_sub_group_block_read_16b_32r16x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); - -// 16bits VNNI transform No transpose -SYCL_DEVICE_OCL(cute::intel::uint8 intel_sub_group_block_read_transform_16b_16r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint16 intel_sub_group_block_read_transform_16b_32r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint16 intel_sub_group_block_read_transform_16b_16r16x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint32 intel_sub_group_block_read_transform_16b_32r16x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); - -// 16bits store -SYCL_DEVICE_OCL(void intel_sub_group_block_write_16b_1r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord, cute::intel::ushort data)); -SYCL_DEVICE_OCL(void intel_sub_group_block_write_16b_2r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord, cute::intel::ushort2 data)); -SYCL_DEVICE_OCL(void intel_sub_group_block_write_16b_4r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord, cute::intel::ushort4 data)); -SYCL_DEVICE_OCL(void intel_sub_group_block_write_16b_8r16c( 
- const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord, cute::intel::ushort8 data)); // 2D prefetch SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_16b_1r16x2c( @@ -245,13 +154,322 @@ SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_16b_2r16x2c( SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_16b_4r16x2c( __global void* base_address, int width, int height, int pitch, cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_16b_8r16x2c( - __global void* base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_16b_16r16x1c( - __global void* base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -#undef SYCL_DEVICE_OCL + +namespace cute::detail { +#if defined(CUTE_ARCH_COPY_XE_BUILTIN_ENABLED) +template<> +struct XeSubgroup2DBlockLoad<2, 16, 1, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m1k16v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 2, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m2k16v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 4, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + 
*reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m4k16v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 8, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m8k16v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 16, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m16k16v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 32, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m32k16v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 1, 2> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m1k16v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 2, 2> { + template + CUTE_HOST_DEVICE + void 
operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m2k16v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 4, 2> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m4k16v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 8, 2> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m8k16v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 16, 2> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m16k16v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 32, 2> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m32k16v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight 
- 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTransform<2, 16, 16, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transform_u16_k16( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTransform<2, 16, 32, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transform_u16_k32( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTransform<2, 16, 16, 2> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transform_u16_k16v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTransform<2, 16, 32, 2> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transform_u16_k32v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockStore<2, 16, 1, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + 
cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u16_m1k16v1( + (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(ushort *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockStore<2, 16, 2, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u16_m2k16v1( + (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::ushort2 *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockStore<2, 16, 4, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u16_m4k16v1( + (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::ushort4 *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockStore<2, 16, 8, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u16_m8k16v1( + (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::ushort8 *)(srcPointer)); + } +}; + +#endif + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 8, 1> { + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 16, 1> { + 
CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 32, 1> { + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 1, 2> { + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_16b_1r16x2c( + (__global void*)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 2, 2> { + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_16b_2r16x2c( + (__global void*)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 4, 2> { + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_16b_4r16x2c( + (__global void*)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 8, 2> { + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int 
memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 16, 2> { + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 32, 2> { + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); + } +}; +} namespace cute { @@ -262,11 +480,9 @@ struct XE_2D_U16x1x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u16_m1k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<2, 16, 1, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -280,11 +496,9 @@ struct XE_2D_U16x2x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) 
static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u16_m2k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<2, 16, 2, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -298,11 +512,9 @@ struct XE_2D_U16x4x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u16_m4k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<2, 16, 4, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -316,11 +528,9 @@ struct XE_2D_U16x8x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u16_m8k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<2, 16, 8, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -330,10 +540,8 @@ struct XE_2D_U16x8x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v1( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - 
CacheControl::kL1C_L3C); +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 8, 1>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( "Trying to use block prefetch on non-PVC hardware"); @@ -349,11 +557,9 @@ struct XE_2D_U16x16x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u16_m16k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<2, 16, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -363,10 +569,8 @@ struct XE_2D_U16x16x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v1( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - CacheControl::kL1C_L3C); +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 16, 1>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( "Trying to use block prefetch on non-PVC hardware"); @@ -382,11 +586,9 @@ struct XE_2D_U16x32x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u16_m32k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<2, 16, 32, 1>{}(baseoffset, width, height, pitch, coord, 
dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -396,10 +598,8 @@ struct XE_2D_U16x32x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v1( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - CacheControl::kL1C_L3C); +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 32, 1>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( "Trying to use block prefetch on non-PVC hardware"); @@ -415,11 +615,9 @@ struct XE_2D_U16x1x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u16_m1k16v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<2, 16, 1, 2>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -429,9 +627,8 @@ struct XE_2D_U16x1x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - intel_sub_group_2d_block_prefetch_16b_1r16x2c( - (__global void*)baseoffset, width - 1, height - 1, pitch - 1, coord); +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 1, 2>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( "Trying to use block prefetch on non-PVC hardware"); @@ -447,11 +644,9 @@ struct XE_2D_U16x2x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t 
coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u16_m2k16v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<2, 16, 2, 2>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -461,9 +656,8 @@ struct XE_2D_U16x2x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - intel_sub_group_2d_block_prefetch_16b_2r16x2c( - (__global void*)baseoffset, width - 1, height - 1, pitch - 1, coord); +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 2, 2>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( "Trying to use block prefetch on non-PVC hardware"); @@ -479,11 +673,9 @@ struct XE_2D_U16x4x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u16_m4k16v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<2, 16, 4, 2>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -493,9 +685,8 @@ struct XE_2D_U16x4x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - intel_sub_group_2d_block_prefetch_16b_4r16x2c( - (__global void*)baseoffset, width - 1, height - 1, pitch - 1, coord); +#if 
defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 4, 2>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( "Trying to use block prefetch on non-PVC hardware"); @@ -511,11 +702,9 @@ struct XE_2D_U16x8x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u16_m8k16v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<2, 16, 8, 2>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -525,10 +714,8 @@ struct XE_2D_U16x8x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v2( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - CacheControl::kL1C_L3C); +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 8, 2>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( "Trying to use block prefetch on non-PVC hardware"); @@ -544,11 +731,9 @@ struct XE_2D_U16x16x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u16_m16k16v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<2, 16, 16, 2>{}(baseoffset, width, height, pitch, coord, dst); #else 
CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -558,10 +743,8 @@ struct XE_2D_U16x16x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v2( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - CacheControl::kL1C_L3C); +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 16, 2>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( "Trying to use block prefetch on non-PVC hardware"); @@ -577,11 +760,9 @@ struct XE_2D_U16x32x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u16_m32k16v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<2, 16, 32, 2>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -591,11 +772,9 @@ struct XE_2D_U16x32x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) // __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v2( - __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v2( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - CacheControl::kL1C_L3C); + detail::XeSubgroup2DBlockPrefetch<2, 16, 32, 2>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( "Trying to use block prefetch on non-PVC hardware"); @@ -611,11 +790,9 @@ struct XE_2D_U16x16x16_LD_V { CUTE_HOST_DEVICE static void 
copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transform_u16_k16( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockTransform<2, 16, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -625,10 +802,8 @@ struct XE_2D_U16x16x16_LD_V { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v1( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - CacheControl::kL1C_L3C); +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 16, 1>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( "Trying to use block prefetch on non-PVC hardware"); @@ -644,11 +819,9 @@ struct XE_2D_U16x32x16_LD_V { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transform_u16_k32( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockTransform<2, 16, 32, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -658,10 +831,8 @@ struct XE_2D_U16x32x16_LD_V { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - 
__builtin_IB_subgroup_block_read_prefetch_u16_m32k16v1( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - CacheControl::kL1C_L3C); +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 32, 1>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( "Trying to use block prefetch on non-PVC hardware"); @@ -677,11 +848,9 @@ struct XE_2D_U16x16x32_LD_V { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transform_u16_k16v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockTransform<2, 16, 16, 2>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -691,10 +860,8 @@ struct XE_2D_U16x16x32_LD_V { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v2( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - CacheControl::kL1C_L3C); +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 16, 2>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( "Trying to use block prefetch on non-PVC hardware"); @@ -710,11 +877,9 @@ struct XE_2D_U16x32x32_LD_V { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transform_u16_k32v2( - 
(intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockTransform<2, 16, 32, 2>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -724,10 +889,8 @@ struct XE_2D_U16x32x32_LD_V { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v2( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - CacheControl::kL1C_L3C); +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 32, 2>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( "Trying to use block prefetch on non-PVC hardware"); @@ -746,11 +909,9 @@ struct XE_2D_U16x16x8_LD_T { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transpose_u32_k4( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockTranspose<4, 4, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -767,11 +928,9 @@ struct XE_2D_U16x16x16_LD_T { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 2"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transpose_u32_k8( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockTranspose<4, 8, 16, 1>{}(baseoffset, 
width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -785,11 +944,9 @@ struct XE_2D_U16x1x16_ST_N { CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, int pitch, intel::coord_t coord, const T *src) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) // static_assert(sizeof(T) == 2, "Expected T to have size 2"); - __builtin_IB_subgroup_block_write_flat_u16_m1k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord, - *(cute::intel::ushort *)(src)); + detail::XeSubgroup2DBlockStore<2, 16, 1, 1>{}(baseoffset, width, height, pitch, coord, src); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -803,11 +960,9 @@ struct XE_2D_U16x2x16_ST_N { CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, int pitch, intel::coord_t coord, const T *src) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) // static_assert(sizeof(T) == 2, "Expected T to have size 2"); - __builtin_IB_subgroup_block_write_flat_u16_m2k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord, - *(intel::ushort2 *)(src)); + detail::XeSubgroup2DBlockStore<2, 16, 2, 1>{}(baseoffset, width, height, pitch, coord, src); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -821,11 +976,9 @@ struct XE_2D_U16x4x16_ST_N { CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, int pitch, intel::coord_t coord, const T *src) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) // static_assert(sizeof(T) == 2, "Expected T to have size 2"); - __builtin_IB_subgroup_block_write_flat_u16_m4k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord, - *(intel::ushort4 *)(src)); + detail::XeSubgroup2DBlockStore<2, 16, 4, 1>{}(baseoffset, width, height, pitch, coord, src); #else 
CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -839,11 +992,9 @@ struct XE_2D_U16x8x16_ST_N { CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, int pitch, intel::coord_t coord, const T *src) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) // static_assert(sizeof(T) == 2, "Expected T to have size 2"); - __builtin_IB_subgroup_block_write_flat_u16_m8k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord, - *(intel::ushort8 *)(src)); + detail::XeSubgroup2DBlockStore<2, 16, 8, 1>{}(baseoffset, width, height, pitch, coord, src); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif diff --git a/include/cute/arch/xe_copy_4B.hpp b/include/cute/arch/xe_copy_4B.hpp index 9074ec1ea7..63effca597 100644 --- a/include/cute/arch/xe_copy_4B.hpp +++ b/include/cute/arch/xe_copy_4B.hpp @@ -32,38 +32,7 @@ #include #include -#include - -#ifdef __SYCL_DEVICE_ONLY__ -#define SYCL_DEVICE_BUILTIN(x) SYCL_EXTERNAL extern "C" x -#else -#define SYCL_DEVICE_BUILTIN(x) \ - inline x { \ - CUTE_INVALID_CONTROL_PATH( \ - "Attempting to use a device built-in in host code."); \ - } -#endif - -#ifdef __SYCL_DEVICE_ONLY__ -#define SYCL_DEVICE_OCL(x) SYCL_EXTERNAL x -#else -#define SYCL_DEVICE_OCL(x) \ - inline x { \ - CUTE_INVALID_CONTROL_PATH( \ - "Attempting to use a device built-in in host code."); \ - } -#endif - -enum class CacheControl { - kDefault = 0, - kL1UC_L3UC = 1, // Override to L1 uncached and L3 uncached - kL1UC_L3C = 2, // Override to L1 uncached and L3 cached - kL1C_L3UC = 3, // Override to L1 cached and L3 uncached - kL1C_L3C = 4, // Override to L1 cached and L3 cached - kL1S_L3UC = 5, // Override to L1 streaming load and L3 uncached - kL1S_L3C = 6, // Override to L1 streaming load and L3 cached - kL1IAR_L3C = 7, // Override to L1 invalidate-after-read, and L3 cached -}; +#include // 32bits specific for tf32 No transform No transpose 
SYCL_DEVICE_BUILTIN( @@ -172,100 +141,310 @@ SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u32_m8k16v1( intptr_t baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uint8 data)); -#undef SYCL_DEVICE_BUILTIN - -#undef __global -#define __global __attribute__((opencl_global)) -// 32bits specific for tf32 No transform No transpose -SYCL_DEVICE_OCL(cute::intel::uint intel_sub_group_block_read_32b_1r8c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint intel_sub_group_block_read_32b_2r8c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint2 intel_sub_group_block_read_32b_4r8c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint4 intel_sub_group_block_read_32b_8r8c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint8 intel_sub_group_block_read_32b_16r8c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint16 intel_sub_group_block_read_32b_32r8c( - const __global void *base_address, int width, int height, int pitch, +SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_32b_16r8x1c( + __global void* base_address, int width, int height, int pitch, cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint2 intel_sub_group_block_read_32b_1r8x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint2 intel_sub_group_block_read_32b_2r8x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint4 
intel_sub_group_block_read_32b_4r8x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint8 intel_sub_group_block_read_32b_8r8x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint16 intel_sub_group_block_read_32b_16r8x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint32 intel_sub_group_block_read_32b_32r8x2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); +namespace cute::detail { +#if defined(CUTE_ARCH_COPY_XE_BUILTIN_ENABLED) +template<> +struct XeSubgroup2DBlockLoad<4, 16, 1, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m1k16v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; -// 32bits No transform No transpose -SYCL_DEVICE_OCL(cute::intel::uint intel_sub_group_block_read_32b_1r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint2 intel_sub_group_block_read_32b_2r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint4 intel_sub_group_block_read_32b_4r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint8 intel_sub_group_block_read_32b_8r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint16 
intel_sub_group_block_read_32b_16r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint32 intel_sub_group_block_read_32b_32r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); +template<> +struct XeSubgroup2DBlockLoad<4, 16, 2, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m2k16v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; -// 32bits No transform Transpose -SYCL_DEVICE_OCL(cute::intel::uint intel_sub_group_block_read_transpose_32b_16r1c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint2 intel_sub_group_block_read_transpose_32b_16r2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint4 intel_sub_group_block_read_transpose_32b_16r4c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::uint8 intel_sub_group_block_read_transpose_32b_16r8c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); +template<> +struct XeSubgroup2DBlockLoad<4, 16, 4, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m4k16v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; -// 32bits store 
-SYCL_DEVICE_OCL(void intel_sub_group_block_write_32b_1r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord, cute::intel::uint data)); -SYCL_DEVICE_OCL(void intel_sub_group_block_write_32b_2r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord, cute::intel::uint2 data)); -SYCL_DEVICE_OCL(void intel_sub_group_block_write_32b_4r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord, cute::intel::uint4 data)); -SYCL_DEVICE_OCL(void intel_sub_group_block_write_32b_8r16c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord, cute::intel::uint8 data)); -SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_32b_16r8x1c( - __global void* base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -#undef SYCL_DEVICE_OCL +template<> +struct XeSubgroup2DBlockLoad<4, 16, 8, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m8k16v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 16, 16, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m16k16v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 16, 32, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, 
+ cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m32k16v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 1, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m1k8v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 2, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m2k8v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 4, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m4k8v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 8, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m8k8v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct 
XeSubgroup2DBlockLoad<4, 8, 16, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m16k8v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 32, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m32k8v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 1, 2> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m1k8v2( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 2, 2> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m2k8v2( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 4, 2> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = 
__builtin_IB_subgroup_block_read_flat_u32_m4k8v2( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 8, 2> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m8k8v2( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 16, 2> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m16k8v2( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 32, 2> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m32k8v2( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTranspose<4, 1, 16, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u32_k1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTranspose<4, 2, 16, 1> { + template + CUTE_HOST_DEVICE + void 
operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u32_k2( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTranspose<4, 4, 16, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u32_k4( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTranspose<4, 8, 16, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u32_k8( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockStore<4, 16, 1, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u32_m1k16v1( + reinterpret_cast(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(uint *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockStore<4, 16, 2, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u32_m2k16v1( + reinterpret_cast(dstBasePointer), memoryWidth - 
1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uint2 *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockStore<4, 16, 4, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u32_m4k16v1( + reinterpret_cast(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uint4 *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockStore<4, 16, 8, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u32_m8k16v1( + reinterpret_cast(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uint8 *)(srcPointer)); + } +}; +#endif + +template<> +struct XeSubgroup2DBlockPrefetch<4, 8, 16, 1> { + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_32b_16r8x1c( + (__global void*)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +} // namespace cute::detail end namespace cute { @@ -276,11 +455,9 @@ struct XE_2D_U32x1x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m1k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<4, 16, 1, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads 
on non-PVC hardware"); #endif @@ -294,11 +471,9 @@ struct XE_2D_U32x2x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m2k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<4, 16, 2, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -312,11 +487,9 @@ struct XE_2D_U32x4x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m4k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<4, 16, 4, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -330,11 +503,9 @@ struct XE_2D_U32x8x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m8k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<4, 16, 8, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -348,11 +519,9 @@ struct XE_2D_U32x16x16_LD_N { 
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m16k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<4, 16, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -366,11 +535,9 @@ struct XE_2D_U32x32x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m32k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<4, 16, 32, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -384,11 +551,9 @@ struct XE_2D_TF32x1x8_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m1k8v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<4, 8, 1, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -403,11 +568,9 @@ struct XE_2D_TF32x2x8_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int 
pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m2k8v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<4, 8, 2, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -422,11 +585,9 @@ struct XE_2D_TF32x4x8_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m4k8v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<4, 8, 4, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -441,11 +602,9 @@ struct XE_2D_TF32x8x8_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m8k8v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<4, 8, 8, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -460,11 +619,9 @@ struct XE_2D_TF32x16x8_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if 
defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m16k8v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<4, 8, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -479,11 +636,9 @@ struct XE_2D_TF32x32x8_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m32k8v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<4, 8, 32, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -497,11 +652,9 @@ struct XE_2D_TF32x1x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m1k8v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<4, 8, 1, 2>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -516,11 +669,9 @@ struct XE_2D_TF32x2x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have 
size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m2k8v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<4, 8, 2, 2>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -535,11 +686,9 @@ struct XE_2D_TF32x4x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m4k8v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<4, 8, 4, 2>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -554,11 +703,9 @@ struct XE_2D_TF32x8x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m8k8v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<4, 8, 8, 2>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -573,29 +720,13 @@ struct XE_2D_TF32x16x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m16k8v2( 
- (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<4, 8, 16, 2>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - intel_sub_group_2d_block_prefetch_32b_16r8x1c( - (__global void*)baseoffset, width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; }; struct XE_2D_TF32x32x16_LD_N { @@ -606,11 +737,9 @@ struct XE_2D_TF32x32x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u32_m32k8v2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<4, 8, 32, 2>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -625,11 +754,9 @@ struct XE_2D_U32x16x1_LD_T { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transpose_u32_k1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockTranspose<4, 1, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -645,11 +772,9 @@ 
struct XE_2D_U32x16x2_LD_T { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transpose_u32_k2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockTranspose<4, 2, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -665,11 +790,9 @@ struct XE_2D_U32x16x4_LD_T { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transpose_u32_k4( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockTranspose<4, 4, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -685,11 +808,9 @@ struct XE_2D_U32x16x8_LD_T { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transpose_u32_k8( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockTranspose<4, 8, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -699,9 +820,8 @@ struct XE_2D_U32x16x8_LD_T { CUTE_HOST_DEVICE static void 
copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - intel_sub_group_2d_block_prefetch_32b_16r8x1c( - (__global void*)baseoffset, width - 1, height - 1, pitch - 1, coord); +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<4, 8, 16, 1>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( "Trying to use block prefetch on non-PVC hardware"); @@ -717,11 +837,9 @@ struct XE_2D_U32x1x16_ST_N { CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, int pitch, intel::coord_t coord, const T *src) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) // static_assert(sizeof(T) == 4, "Expected T to have size 4"); - __builtin_IB_subgroup_block_write_flat_u32_m1k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord, - *(cute::intel::uint *)(src)); + detail::XeSubgroup2DBlockStore<4, 16, 1, 1>{}(baseoffset, width, height, pitch, coord, src); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -735,11 +853,9 @@ struct XE_2D_U32x2x16_ST_N { CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, int pitch, intel::coord_t coord, const T *src) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); - __builtin_IB_subgroup_block_write_flat_u32_m2k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord, - *(intel::uint2 *)(src)); + detail::XeSubgroup2DBlockStore<4, 16, 2, 1>{}(baseoffset, width, height, pitch, coord, src); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -753,11 +869,9 @@ struct XE_2D_U32x4x16_ST_N { CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, int pitch, intel::coord_t coord, const T *src) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) 
static_assert(sizeof(T) == 4, "Expected T to have size 4"); - __builtin_IB_subgroup_block_write_flat_u32_m4k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord, - *(intel::uint4 *)(src)); + detail::XeSubgroup2DBlockStore<4, 16, 4, 1>{}(baseoffset, width, height, pitch, coord, src); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -771,11 +885,9 @@ struct XE_2D_U32x8x16_ST_N { CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, int pitch, intel::coord_t coord, const T *src) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) // static_assert(sizeof(T) == 4, "Expected T to have size 4"); - __builtin_IB_subgroup_block_write_flat_u32_m8k16v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord, - *(intel::uint8 *)(src)); + detail::XeSubgroup2DBlockStore<4, 16, 8, 1>{}(baseoffset, width, height, pitch, coord, src); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif diff --git a/include/cute/arch/xe_copy_8B.hpp b/include/cute/arch/xe_copy_8B.hpp index e07aef9608..7203bbf3a2 100644 --- a/include/cute/arch/xe_copy_8B.hpp +++ b/include/cute/arch/xe_copy_8B.hpp @@ -32,27 +32,7 @@ #include #include -#include - -#ifdef __SYCL_DEVICE_ONLY__ -#define SYCL_DEVICE_BUILTIN(x) SYCL_EXTERNAL extern "C" x -#else -#define SYCL_DEVICE_BUILTIN(x) \ - inline x { \ - CUTE_INVALID_CONTROL_PATH( \ - "Attempting to use a device built-in in host code."); \ - } -#endif - -#ifdef __SYCL_DEVICE_ONLY__ -#define SYCL_DEVICE_OCL(x) SYCL_EXTERNAL x -#else -#define SYCL_DEVICE_OCL(x) \ - inline x { \ - CUTE_INVALID_CONTROL_PATH( \ - "Attempting to use a device built-in in host code."); \ - } -#endif +#include // 64bits No transform Transpose SYCL_DEVICE_BUILTIN( @@ -67,23 +47,45 @@ SYCL_DEVICE_BUILTIN( cute::intel::ulong4 __builtin_IB_subgroup_block_read_flat_transpose_u64_k4( intptr_t baseoffset, int width_minus_one, int height_minus_one, int 
pitch_minus_one, cute::intel::coord_t coord)); -#undef SYCL_DEVICE_BUILTIN -#undef __global -#define __global __attribute__((opencl_global)) -// 64bits No transform Transpose -SYCL_DEVICE_OCL(cute::intel::ulong intel_sub_group_block_read_transpose_64b_8r1c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ulong2 intel_sub_group_block_read_transpose_64b_8r2c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(cute::intel::ulong4 intel_sub_group_block_read_transpose_64b_8r4c( - const __global void *base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -#undef SYCL_DEVICE_OCL +#if defined(CUTE_ARCH_COPY_XE_BUILTIN_ENABLED) +namespace cute::detail +{ +template<> +struct XeSubgroup2DBlockTranspose<8, 1, 8, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u64_k1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; +template<> +struct XeSubgroup2DBlockTranspose<8, 2, 8, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u64_k2( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTranspose<8, 4, 8, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = 
__builtin_IB_subgroup_block_read_flat_transpose_u64_k4( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; +} +#endif namespace cute { struct XE_2D_U64x8x1_LD_T { @@ -93,11 +95,9 @@ struct XE_2D_U64x8x1_LD_T { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 8, "Expected T to have size 8"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transpose_u64_k1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockTranspose<8, 1, 8, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -111,11 +111,9 @@ struct XE_2D_U64x8x2_LD_T { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 8, "Expected T to have size 8"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transpose_u64_k2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockTranspose<8, 2, 8, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -129,11 +127,9 @@ struct XE_2D_U64x8x4_LD_T { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 8, "Expected T to have size 8"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transpose_u64_k4( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + 
detail::XeSubgroup2DBlockTranspose<8, 4, 8, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif diff --git a/test/unit/cute/intel_xe/copy_block.cpp b/test/unit/cute/intel_xe/copy_block.cpp index 8d50704881..9ab4e1061b 100644 --- a/test/unit/cute/intel_xe/copy_block.cpp +++ b/test/unit/cute/intel_xe/copy_block.cpp @@ -352,7 +352,9 @@ TEST(PVC_CuTe_Xe, block_2d_16bits_vnni) { } TEST(PVC_CuTe_Xe, block_2d_32bits_transpose) { + #if defined(CUTE_ARCH_COPY_XE_BUILTIN_ENABLED) copy_op{}(); copy_op{}(); + #endif copy_op{}(); } From 73bef6e4be50857aa0354842a828b1a131b9fce6 Mon Sep 17 00:00:00 2001 From: jiyang1011 Date: Sun, 6 Apr 2025 18:40:49 -0700 Subject: [PATCH 02/21] mma spirv api --- include/cute/arch/mma_xe.hpp | 351 +++++++++--------- include/cute/arch/xe_config.hpp | 313 +++++++++++++--- include/cute/arch/xe_copy_1B.hpp | 201 +++++----- include/cute/arch/xe_copy_2B.hpp | 75 ++-- include/cute/arch/xe_copy_4B.hpp | 59 ++- include/cute/arch/xe_copy_8B.hpp | 8 +- include/cute/atom/mma_traits_xe.hpp | 123 ++++++ include/cute/util/sycl_vec.hpp | 4 + .../epilogue/collective/xe_epilogue.hpp | 6 +- test/unit/cute/intel_xe/mma.cpp | 44 ++- test/unit/cute/intel_xe/utils.hpp | 17 +- 11 files changed, 792 insertions(+), 409 deletions(-) diff --git a/include/cute/arch/mma_xe.hpp b/include/cute/arch/mma_xe.hpp index 1f17617e15..9b67928a74 100644 --- a/include/cute/arch/mma_xe.hpp +++ b/include/cute/arch/mma_xe.hpp @@ -34,152 +34,6 @@ #include #include -// mma_bf16 -SYCL_DEVICE_OCL(cute::intel::float8 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short8 a, cute::intel::int8 b, cute::intel::float8 acc)); -SYCL_DEVICE_OCL(cute::intel::float4 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short4 a, cute::intel::int8 b, cute::intel::float4 acc)); -SYCL_DEVICE_OCL(cute::intel::float2 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short2 a, cute::intel::int8 b, 
cute::intel::float2 acc)); -SYCL_DEVICE_OCL(float intel_sub_group_bf16_bf16_matrix_mad_k16(short a, cute::intel::int8 b, float acc)); -// mma_half -SYCL_DEVICE_OCL(cute::intel::float8 intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short8 a, cute::intel::int8 b, cute::intel::float8 acc)); -SYCL_DEVICE_OCL(cute::intel::float4 intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short4 a, cute::intel::int8 b, cute::intel::float4 acc)); -SYCL_DEVICE_OCL(cute::intel::float2 intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short2 a, cute::intel::int8 b, cute::intel::float2 acc)); -SYCL_DEVICE_OCL(float intel_sub_group_f16_f16_matrix_mad_k16(short a, cute::intel::int8 b, float acc)); -// mma_s8 -SYCL_DEVICE_OCL(cute::intel::int8 intel_sub_group_i8_i8_matrix_mad_k32(cute::intel::short8 a, cute::intel::int8 b, cute::intel::int8 acc)); -SYCL_DEVICE_OCL(cute::intel::int4 intel_sub_group_i8_i8_matrix_mad_k32(cute::intel::short4 a, cute::intel::int8 b, cute::intel::int4 acc)); -SYCL_DEVICE_OCL(cute::intel::int2 intel_sub_group_i8_i8_matrix_mad_k32(cute::intel::short2 a, cute::intel::int8 b, cute::intel::int2 acc)); -SYCL_DEVICE_OCL(int intel_sub_group_i8_i8_matrix_mad_k32(short a, cute::intel::int8 b, int acc)); -// mma_u8 -SYCL_DEVICE_OCL(cute::intel::int8 intel_sub_group_u8_u8_matrix_mad_k32(cute::intel::ushort8 a, cute::intel::uint8 b, cute::intel::int8 acc)); -SYCL_DEVICE_OCL(cute::intel::int4 intel_sub_group_u8_u8_matrix_mad_k32(cute::intel::ushort4 a, cute::intel::uint8 b, cute::intel::int4 acc)); -SYCL_DEVICE_OCL(cute::intel::int2 intel_sub_group_u8_u8_matrix_mad_k32(cute::intel::ushort2 a, cute::intel::uint8 b, cute::intel::int2 acc)); -SYCL_DEVICE_OCL(int intel_sub_group_u8_u8_matrix_mad_k32(cute::intel::ushort a, cute::intel::uint8 b, int acc)); -// mma_tf32 -SYCL_DEVICE_OCL(cute::intel::float8 intel_sub_group_tf32_tf32_matrix_mad_k8(cute::intel::float4 a, cute::intel::float8 b, cute::intel::float8 acc)); -SYCL_DEVICE_OCL(cute::intel::float4 
intel_sub_group_tf32_tf32_matrix_mad_k8(cute::intel::float2 a, cute::intel::float8 b, cute::intel::float4 acc)); -SYCL_DEVICE_OCL(cute::intel::float2 intel_sub_group_tf32_tf32_matrix_mad_k8(float a, cute::intel::float8 b, cute::intel::float2 acc)); -SYCL_DEVICE_OCL(float intel_sub_group_tf32_tf32_matrix_mad_k8(float a, cute::intel::float8 b, float acc)); - -#if defined(CUTE_ARCH_MMA_XE_SPIRV_ENABLED) -namespace cute::detail -{ -template<> -struct XeSubgroupMatrixMultiplyAccumulate { - template - CUTE_HOST_DEVICE - auto operator()(ARegisters a, BRegisters b, CRegisters c) { -#ifdef __SYCL_DEVICE_ONLY__ - return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(16, a, b, c, SPIRV_MMAOperands::SPIRV_MatrixABf16 | SPIRV_MMAOperands::SPIRV_MatrixBBf16 ); -#endif - } -}; - -template<> -struct XeSubgroupMatrixMultiplyAccumulate { - template - CUTE_HOST_DEVICE - auto operator()(ARegisters a, BRegisters b, CRegisters c) { -#ifdef __SYCL_DEVICE_ONLY__ - return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(16, a, b, c, SPIRV_MMAOperands::SPIRV_MatrixAFp16 | SPIRV_MMAOperands::SPIRV_MatrixBFp16); -#endif - } -}; - -template<> -struct XeSubgroupMatrixMultiplyAccumulate { - template - CUTE_HOST_DEVICE - auto operator()(ARegisters a, BRegisters b, CRegisters c) { -#ifdef __SYCL_DEVICE_ONLY__ - return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(32, a, b, c, SPIRV_MMAOperands::SPIRV_MatrixASigned | SPIRV_MMAOperands::SPIRV_MatrixBSigned | SPIRV_MMAOperands::SPIRV_MatrixAInt8 | SPIRV_MMAOperands::SPIRV_MatrixBInt8); -#endif - } -}; - -template<> -struct XeSubgroupMatrixMultiplyAccumulate { - template - CUTE_HOST_DEVICE - auto operator()(ARegisters a, BRegisters b, CRegisters c) { -#ifdef __SYCL_DEVICE_ONLY__ - return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(32, a, b, c, SPIRV_MMAOperands::SPIRV_MatrixAInt8 | SPIRV_MMAOperands::SPIRV_MatrixBInt8); -#endif - } -}; - -template<> -struct XeSubgroupMatrixMultiplyAccumulate { - template - CUTE_HOST_DEVICE - auto operator()(ARegisters 
a, BRegisters b, CRegisters c) { -#ifdef __SYCL_DEVICE_ONLY__ - return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(8, a, b, c, SPIRV_MMAOperands::SPIRV_MatrixATf32 | SPIRV_MMAOperands::SPIRV_MatrixBTf32); -#endif - } -}; -} // namespace cute::detail end -#endif - -#if defined(CUTE_ARCH_MMA_XE_BUILTIN_ENABLED) -namespace cute::detail -{ -template<> -struct XeSubgroupMatrixMultiplyAccumulate { - template - CUTE_HOST_DEVICE - auto operator()(ARegisters a, BRegisters b, CRegisters c) { -#ifdef __SYCL_DEVICE_ONLY__ - return intel_sub_group_bf16_bf16_matrix_mad_k16(a, b, c); -#endif - } -}; - -template<> -struct XeSubgroupMatrixMultiplyAccumulate { - template - CUTE_HOST_DEVICE - auto operator()(ARegisters a, BRegisters b, CRegisters c) { -#ifdef __SYCL_DEVICE_ONLY__ - return intel_sub_group_f16_f16_matrix_mad_k16(a, b, c); -#endif - } -}; - -template<> -struct XeSubgroupMatrixMultiplyAccumulate { - template - CUTE_HOST_DEVICE - auto operator()(ARegisters a, BRegisters b, CRegisters c) { -#ifdef __SYCL_DEVICE_ONLY__ - return intel_sub_group_i8_i8_matrix_mad_k32(a, b, c); -#endif - } -}; - -template<> -struct XeSubgroupMatrixMultiplyAccumulate { - template - CUTE_HOST_DEVICE - auto operator()(ARegisters a, BRegisters b, CRegisters c) { -#ifdef __SYCL_DEVICE_ONLY__ - return intel_sub_group_u8_u8_matrix_mad_k32(a, b, c); -#endif - } -}; - -template<> -struct XeSubgroupMatrixMultiplyAccumulate { - template - CUTE_HOST_DEVICE - auto operator()(ARegisters a, BRegisters b, CRegisters c) { -#ifdef __SYCL_DEVICE_ONLY__ - return intel_sub_group_tf32_tf32_matrix_mad_k8(a, b, c); -#endif - } -}; -} // namespace cute::detail end -#endif - namespace cute { //MxNxK_D,A,B,C //# of vector component of a x subgroup-size x function name @@ -198,7 +52,7 @@ struct XE_8x16x16_F32BF16BF16F32_TT intel::int8 const& b, intel::float8 const& c) { -#if defined(CUTE_ARCH_MMA_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else 
CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x16_F32BF16BF16F32_TT on non-PVC hardware"); @@ -218,7 +72,7 @@ struct XE_4x16x16_F32BF16BF16F32_TT intel::int8 const& b, intel::float4 const& c) { -#if defined(CUTE_ARCH_MMA_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x16_F32BF16BF16F32_TT on non-PVC hardware"); @@ -238,7 +92,7 @@ struct XE_2x16x16_F32BF16BF16F32_TT intel::int8 const& b, intel::float2 const& c) { -#if defined(CUTE_ARCH_MMA_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x16_F32BF16BF16F32_TT on non-PVC hardware"); @@ -259,7 +113,7 @@ struct XE_1x16x16_F32BF16BF16F32_TT intel::int8 const& b, float const& c) { -#if defined(CUTE_ARCH_MMA_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_1x16x16_F32BF16BF16F32_TT on non-PVC hardware"); @@ -267,7 +121,87 @@ struct XE_1x16x16_F32BF16BF16F32_TT } }; -//MxNxK_D,A,B,C +struct XE_8x16x16_BF16BF16BF16BF16_TT +{ + using DRegisters = intel::short8[1]; + using ARegisters = intel::short8[1]; + using BRegisters = intel::int8[1]; + using CRegisters = intel::short8[1]; + + CUTE_HOST_DEVICE static void + fma(intel::short8 & d, + intel::short8 const& a, + intel::int8 const& b, + intel::short8 const& c) + { +#if defined(CUTE_ARCH_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x16_BF16BF16BF16BF16_TT on non-PVC hardware"); +#endif + } +}; +struct XE_4x16x16_BF16BF16BF16BF16_TT +{ + using DRegisters = intel::short4[1]; + using ARegisters = intel::short4[1]; + using BRegisters = intel::int8[1]; + using CRegisters = intel::short4[1]; + + CUTE_HOST_DEVICE static void + fma(intel::short4 & d, + 
intel::short4 const& a, + intel::int8 const& b, + intel::short4 const& c) + { +#if defined(CUTE_ARCH_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use XE_4x16x16_BF16BF16BF16BF16_TT on non-PVC hardware"); +#endif + } +}; +struct XE_2x16x16_BF16BF16BF16BF16_TT +{ + using DRegisters = intel::short2[1]; + using ARegisters = intel::short2[1]; + using BRegisters = intel::int8[1]; + using CRegisters = intel::short2[1]; + + CUTE_HOST_DEVICE static void + fma(intel::short2 & d, + intel::short2 const& a, + intel::int8 const& b, + intel::short2 const& c) + { +#if defined(CUTE_ARCH_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use XE_2x16x16_BF16BF16BF16BF16_TT on non-PVC hardware"); +#endif + } +}; +struct XE_1x16x16_BF16BF16BF16BF16_TT +{ + using DRegisters = short[1]; + using ARegisters = short[1]; + using BRegisters = intel::int8[1]; + using CRegisters = short[1]; + + CUTE_HOST_DEVICE static void + fma(short & d, + short const& a, + intel::int8 const& b, + short const& c) + { +#if defined(CUTE_ARCH_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use XE_1x16x16_BF16BF16BF16BF16_TT on non-PVC hardware"); +#endif + } +}; +//MxNxK_A,B,C,D //# of vector component of a x subgroup-size x function name //float8 intel_sub_group_f16_f16_matrix_mad_k16(short8 a, int8 b, int8 acc); //TODO: Is A really not transposed? 
Maybe better a macro than separate define for 1,2,4,8 @@ -284,7 +218,7 @@ struct XE_8x16x16_F32F16F16F32_TT intel::int8 const& b, intel::float8 const& c) { -#if defined(CUTE_ARCH_MMA_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x16_F32F16F16F32_TT on non-PVC hardware"); @@ -305,7 +239,7 @@ struct XE_4x16x16_F32F16F16F32_TT intel::int8 const& b, intel::float4 const& c) { -#if defined(CUTE_ARCH_MMA_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_4x16x16_F32F16F16F32_TT on non-PVC hardware"); @@ -326,7 +260,7 @@ struct XE_2x16x16_F32F16F16F32_TT intel::int8 const& b, intel::float2 const& c) { -#if defined(CUTE_ARCH_MMA_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_2x16x16_F32F16F16F32_TT on non-PVC hardware"); @@ -347,7 +281,7 @@ struct XE_1x16x16_F32F16F16F32_TT intel::int8 const& b, float const& c) { -#if defined(CUTE_ARCH_MMA_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_1x16x16_F32F16F16F32_TT on non-PVC hardware"); @@ -355,6 +289,89 @@ struct XE_1x16x16_F32F16F16F32_TT } }; +struct XE_8x16x16_F16F16F16F16_TT +{ + using DRegisters = intel::half8[1]; + using ARegisters = intel::short8[1]; + using BRegisters = intel::int8[1]; + using CRegisters = intel::half8[1]; + + CUTE_HOST_DEVICE static void + fma(intel::half8 & d, + intel::short8 const& a, + intel::int8 const& b, + intel::half8 const& c) + { +#if defined(CUTE_ARCH_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x16_F16F16F16F16_TT on non-PVC hardware"); +#endif + } +}; + 
+struct XE_4x16x16_F16F16F16F16_TT +{ + using DRegisters = intel::half4[1]; + using ARegisters = intel::short4[1]; + using BRegisters = intel::int8[1]; + using CRegisters = intel::half4[1]; + + CUTE_HOST_DEVICE static void + fma(intel::half4 & d, + intel::short4 const& a, + intel::int8 const& b, + intel::half4 const& c) + { +#if defined(CUTE_ARCH_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use XE_4x16x16_F16F16F16F16_TT on non-PVC hardware"); +#endif + } +}; + +struct XE_2x16x16_F16F16F16F16_TT +{ + using DRegisters = sycl::half2[1]; + using ARegisters = intel::short2[1]; + using BRegisters = intel::int8[1]; + using CRegisters = sycl::half2[1]; + + CUTE_HOST_DEVICE static void + fma(sycl::half2 & d, + intel::short2 const& a, + intel::int8 const& b, + sycl::half2 const& c) + { +#if defined(CUTE_ARCH_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use XE_2x16x16_F16F16F16F16_TT on non-PVC hardware"); +#endif + } +}; + +struct XE_1x16x16_F16F16F16F16_TT +{ + using DRegisters = half_t[1]; + using ARegisters = short[1]; + using BRegisters = intel::int8[1]; + using CRegisters = half_t[1]; + + CUTE_HOST_DEVICE static void + fma(half_t & d, + short const& a, + intel::int8 const& b, + half_t const& c) + { +#if defined(CUTE_ARCH_XE_ENABLED) + d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); +#else + CUTE_INVALID_CONTROL_PATH("Attempting to use XE_1x16x16_F16F16F16F16_TT on non-PVC hardware"); +#endif + } +}; //MxNxK_A,B,C,D //# of vector component of a x subgroup-size x function name //float8 intel_sub_group_i8_i8_matrix_mad_k16(short8 a, int8 b, float8 acc); @@ -372,7 +389,7 @@ struct XE_8x16x32_S32S8S8S32_TT intel::int8 const& b, intel::int8 const& c) { -#if defined(CUTE_ARCH_MMA_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else 
CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x32_S32S8S8S32_TT on non-PVC hardware"); @@ -393,7 +410,7 @@ struct XE_4x16x32_S32S8S8S32_TT intel::int8 const& b, intel::int4 const& c) { -#if defined(CUTE_ARCH_MMA_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_4x16x32_S32S8S8S32_TT on non-PVC hardware"); @@ -414,7 +431,7 @@ struct XE_2x16x32_S32S8S8S32_TT intel::int8 const& b, intel::int2 const& c) { -#if defined(CUTE_ARCH_MMA_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_2x16x32_S32S8S8S32_TT on non-PVC hardware"); @@ -435,7 +452,7 @@ struct XE_1x16x32_S32S8S8S32_TT intel::int8 const& b, int const& c) { -#if defined(CUTE_ARCH_MMA_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_1x16x32_S32S8S8S32_TT on non-PVC hardware"); @@ -456,7 +473,7 @@ struct XE_8x16x32_S32U8U8S32_TT intel::uint8 const& b, intel::int8 const& c) { -#if defined(CUTE_ARCH_MMA_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x32_S32U8U8S32_TT on non-PVC hardware"); @@ -477,7 +494,7 @@ struct XE_4x16x32_S32U8U8S32_TT intel::uint8 const& b, intel::int4 const& c) { -#if defined(CUTE_ARCH_MMA_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_4x16x32_S32U8U8S32_TT on non-PVC hardware"); @@ -498,7 +515,7 @@ struct XE_2x16x32_S32U8U8S32_TT intel::uint8 const& b, intel::int2 const& c) { -#if defined(CUTE_ARCH_MMA_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else 
CUTE_INVALID_CONTROL_PATH("Attempting to use XE_2x16x32_S32U8U8S32_TT on non-PVC hardware"); @@ -519,7 +536,7 @@ struct XE_1x16x32_S32U8U8S32_TT intel::uint8 const& b, int const& c) { -#if defined(CUTE_ARCH_MMA_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_1x16x32_S32U8U8S32_TT on non-PVC hardware"); @@ -540,7 +557,7 @@ struct XE_8x16x8_F32TF32TF32F32_TT intel::float8 const& b, intel::float8 const& c) { -#if defined(CUTE_ARCH_MMA_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x8_F32TF32TF32F32_TT on non-PVC hardware"); @@ -561,7 +578,7 @@ struct XE_4x16x8_F32TF32TF32F32_TT intel::float8 const& b, intel::float4 const& c) { -#if defined(CUTE_ARCH_MMA_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_4x16x8_F32TF32TF32F32_TT on non-PVC hardware"); @@ -582,7 +599,7 @@ struct XE_2x16x8_F32TF32TF32F32_TT intel::float8 const& b, intel::float2 const& c) { -#if defined(CUTE_ARCH_MMA_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_2x16x8_F32TF32TF32F32_TT on non-PVC hardware"); @@ -603,7 +620,7 @@ struct XE_1x16x8_F32TF32TF32F32_TT intel::float8 const& b, float const& c) { -#if defined(CUTE_ARCH_MMA_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_1x16x8_F32TF32TF32F32_TT on non-PVC hardware"); diff --git a/include/cute/arch/xe_config.hpp b/include/cute/arch/xe_config.hpp index 7156f43360..60a2cb196a 100644 --- a/include/cute/arch/xe_config.hpp +++ b/include/cute/arch/xe_config.hpp @@ -57,21 +57,41 @@ #if 
defined(__SYCL_DEVICE_ONLY__) && defined(SYCL_INTEL_TARGET) -#define CUTE_ARCH_COPY_XE_ENABLED -#define CUTE_ARCH_MMA_XE_ENABLED +#define CUTE_ARCH_XE_ENABLED #endif -#if defined(CUTE_ARCH_COPY_XE_ENABLED) && defined(__INTEL_LLVM_COMPILER) && (__INTEL_LLVM_COMPILER < 20250200) -#define CUTE_ARCH_COPY_XE_BUILTIN_ENABLED -#define CUTE_ARCH_MMA_XE_BUILTIN_ENABLED -#elif defined(CUTE_ARCH_COPY_XE_ENABLED) -#define CUTE_ARCH_COPY_XE_SPIRV_ENABLED -#define CUTE_ARCH_MMA_XE_SPIRV_ENABLED -// #define CUTE_ARCH_MMA_XE_BUILTIN_ENABLED +#if defined(CUTE_ARCH_XE_ENABLED) && defined(__INTEL_LLVM_COMPILER) && (__INTEL_LLVM_COMPILER < 20250200) +#define CUTE_ARCH_XE_BUILTIN_ENABLED +#elif defined(CUTE_ARCH_XE_ENABLED) +#define CUTE_ARCH_XE_SPIRV_ENABLED #endif +#if defined(CUTE_ARCH_XE_BUILTIN_ENABLED) +namespace cute::detail +{ +template +struct XeSubgroup2DBlockPrefetch { + static_assert(dependent_false<>, "Unsupported 2D Block Load Configuration."); +}; +template +struct XeSubgroup2DBlockLoad { + static_assert(dependent_false<>, "Unsupported 2D Block Load Configuration."); +}; +template +struct XeSubgroup2DBlockTransform { + static_assert(dependent_false<>, "Unsupported 2D Block Load Configuration."); +}; +template +struct XeSubgroup2DBlockTranspose { + static_assert(dependent_false<>, "Unsupported 2D Block Load Configuration."); +}; +template +struct XeSubgroup2DBlockStore { + static_assert(dependent_false<>, "Unsupported 2D Block Load Configuration."); +}; +} +#endif // SPIRV copy definitions -#if defined(CUTE_ARCH_COPY_XE_SPIRV_ENABLED) SYCL_EXTERNAL __attribute__((convergent)) void __spirv_Subgroup2DBlockLoadINTEL( int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, const void* src_base_pointer, int memory_width, int memory_height, @@ -92,7 +112,8 @@ SYCL_EXTERNAL __attribute__((convergent)) void __spirv_Subgroup2DBlockPrefetchIN int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, const void* src_base_pointer, int memory_width, int 
memory_height, int memory_pitch, cute::intel::coord_t coordinate); - + +#if defined(CUTE_ARCH_XE_SPIRV_ENABLED) namespace cute::detail { template struct XeSubgroup2DBlockLoad { @@ -136,18 +157,17 @@ struct XeSubgroup2DBlockTranspose { } }; -// template -// struct XeSubgroup2DBlockPrefetch { -// CUTE_HOST_DEVICE -// void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, -// cute::intel::coord_t coordinate) { -// #ifdef __SYCL_DEVICE_ONLY__ -// __spirv_Subgroup2DBlockPrefetchINTEL(ElementSize, BlockWidth, BlockHeight, BlockCount, -// srcBasePointer, memoryWidth, memoryHeight, memoryPitch, coordinate); -// #endif -// } -// }; - +template +struct XeSubgroup2DBlockPrefetch { + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { +#ifdef __SYCL_DEVICE_ONLY__ + __spirv_Subgroup2DBlockPrefetchINTEL(ElementSize, BlockWidth, BlockHeight, BlockCount, + srcBasePointer, memoryWidth, memoryHeight, memoryPitch, coordinate); +#endif + } +}; template struct XeSubgroup2DBlockStore { @@ -164,32 +184,6 @@ struct XeSubgroup2DBlockStore { } // namespace cute::detail end #endif -namespace cute::detail -{ -template -struct XeSubgroup2DBlockPrefetch { - // static_assert(dependent_false<>, "Unsupported 2D Block Load Configuration."); -}; - -#if defined(CUTE_ARCH_COPY_XE_BUILTIN_ENABLED) -template -struct XeSubgroup2DBlockLoad { - static_assert(dependent_false<>, "Unsupported 2D Block Load Configuration."); -}; -template -struct XeSubgroup2DBlockTransform { - static_assert(dependent_false<>, "Unsupported 2D Block Load Configuration."); -}; -template -struct XeSubgroup2DBlockTranspose { - static_assert(dependent_false<>, "Unsupported 2D Block Load Configuration."); -}; -template -struct XeSubgroup2DBlockStore { - static_assert(dependent_false<>, "Unsupported 2D Block Load Configuration."); -}; -#endif -} enum class CacheControl { kDefault = 0, kL1UC_L3UC 
= 1, // Override to L1 uncached and L3 uncached @@ -201,7 +195,132 @@ enum class CacheControl { kL1IAR_L3C = 7, // Override to L1 invalidate-after-read, and L3 cached }; -#if defined(CUTE_ARCH_MMA_XE_SPIRV_ENABLED) +namespace cute::detail{ + template + struct XeSubgroupMatrixMultiplyAccumulate { + static_assert(dependent_false<>, "Unsupported MMA Configuration."); + }; +} // namespace cute::detail end + +// mma_bf16 +SYCL_DEVICE_OCL(cute::intel::float8 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short8 a, cute::intel::int8 b, cute::intel::float8 acc)); +SYCL_DEVICE_OCL(cute::intel::float4 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short4 a, cute::intel::int8 b, cute::intel::float4 acc)); +SYCL_DEVICE_OCL(cute::intel::float2 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short2 a, cute::intel::int8 b, cute::intel::float2 acc)); +SYCL_DEVICE_OCL( float intel_sub_group_bf16_bf16_matrix_mad_k16( short a, cute::intel::int8 b, float acc)); +// mma_half +SYCL_DEVICE_OCL(cute::intel::float8 intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short8 a, cute::intel::int8 b, cute::intel::float8 acc)); +SYCL_DEVICE_OCL(cute::intel::float4 intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short4 a, cute::intel::int8 b, cute::intel::float4 acc)); +SYCL_DEVICE_OCL(cute::intel::float2 intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short2 a, cute::intel::int8 b, cute::intel::float2 acc)); +SYCL_DEVICE_OCL( float intel_sub_group_f16_f16_matrix_mad_k16( short a, cute::intel::int8 b, float acc)); +// mma_s8 +SYCL_DEVICE_OCL(cute::intel::int8 intel_sub_group_i8_i8_matrix_mad_k32(cute::intel::short8 a, cute::intel::int8 b, cute::intel::int8 acc)); +SYCL_DEVICE_OCL(cute::intel::int4 intel_sub_group_i8_i8_matrix_mad_k32(cute::intel::short4 a, cute::intel::int8 b, cute::intel::int4 acc)); +SYCL_DEVICE_OCL(cute::intel::int2 intel_sub_group_i8_i8_matrix_mad_k32(cute::intel::short2 a, cute::intel::int8 b, cute::intel::int2 acc)); +SYCL_DEVICE_OCL( int 
intel_sub_group_i8_i8_matrix_mad_k32( short a, cute::intel::int8 b, int acc)); +// mma_u8 +SYCL_DEVICE_OCL(cute::intel::int8 intel_sub_group_u8_u8_matrix_mad_k32(cute::intel::ushort8 a, cute::intel::uint8 b, cute::intel::int8 acc)); +SYCL_DEVICE_OCL(cute::intel::int4 intel_sub_group_u8_u8_matrix_mad_k32(cute::intel::ushort4 a, cute::intel::uint8 b, cute::intel::int4 acc)); +SYCL_DEVICE_OCL(cute::intel::int2 intel_sub_group_u8_u8_matrix_mad_k32(cute::intel::ushort2 a, cute::intel::uint8 b, cute::intel::int2 acc)); +SYCL_DEVICE_OCL( int intel_sub_group_u8_u8_matrix_mad_k32( ushort a, cute::intel::uint8 b, int acc)); +// mma_tf32 +SYCL_DEVICE_OCL(cute::intel::float8 intel_sub_group_tf32_tf32_matrix_mad_k8(cute::intel::float4 a, cute::intel::float8 b, cute::intel::float8 acc)); +SYCL_DEVICE_OCL(cute::intel::float4 intel_sub_group_tf32_tf32_matrix_mad_k8(cute::intel::float2 a, cute::intel::float8 b, cute::intel::float4 acc)); +SYCL_DEVICE_OCL(cute::intel::float2 intel_sub_group_tf32_tf32_matrix_mad_k8( float a, cute::intel::float8 b, cute::intel::float2 acc)); +SYCL_DEVICE_OCL( float intel_sub_group_tf32_tf32_matrix_mad_k8( float a, cute::intel::float8 b, float acc)); +// mma_bfloat16 with bfloat16 accumulator: +SYCL_DEVICE_OCL(cute::intel::short8 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short8 a, cute::intel::int8 b, cute::intel::short8 acc)); +SYCL_DEVICE_OCL(cute::intel::short4 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short4 a, cute::intel::int8 b, cute::intel::short4 acc)); +SYCL_DEVICE_OCL(cute::intel::short2 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short2 a, cute::intel::int8 b, cute::intel::short2 acc)); +SYCL_DEVICE_OCL( short intel_sub_group_bf16_bf16_matrix_mad_k16( short a, cute::intel::int8 b, short acc)); +// mma_half with half accumulator: +SYCL_DEVICE_OCL(cute::intel::half8 intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short8 a, cute::intel::int8 b, cute::intel::half8 acc)); +SYCL_DEVICE_OCL(cute::intel::half4 
intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short4 a, cute::intel::int8 b, cute::intel::half4 acc)); +SYCL_DEVICE_OCL(cute::intel::half2 intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short2 a, cute::intel::int8 b, cute::intel::half2 acc)); +SYCL_DEVICE_OCL( sycl::half intel_sub_group_f16_f16_matrix_mad_k16( short a, cute::intel::int8 b, sycl::half acc)); + +#if defined(CUTE_ARCH_XE_BUILTIN_ENABLED) +namespace cute::detail +{ +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { +#ifdef __SYCL_DEVICE_ONLY__ + return intel_sub_group_bf16_bf16_matrix_mad_k16(a, b, c); +#endif + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { +#ifdef __SYCL_DEVICE_ONLY__ + return intel_sub_group_f16_f16_matrix_mad_k16(a, b, c); +#endif + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { +#ifdef __SYCL_DEVICE_ONLY__ + return intel_sub_group_bf16_bf16_matrix_mad_k16(a, b, c); +#endif + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { +#ifdef __SYCL_DEVICE_ONLY__ + return intel_sub_group_f16_f16_matrix_mad_k16(a, b, c); +#endif + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { +#ifdef __SYCL_DEVICE_ONLY__ + return intel_sub_group_i8_i8_matrix_mad_k32(a, b, c); +#endif + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { +#ifdef __SYCL_DEVICE_ONLY__ + return intel_sub_group_u8_u8_matrix_mad_k32(a, b, c); +#endif + } +}; + +template<> +struct 
XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { +#ifdef __SYCL_DEVICE_ONLY__ + return intel_sub_group_tf32_tf32_matrix_mad_k8(a, b, c); +#endif + } +}; +} // namespace cute::detail end +#endif + + // @brief spirv APIs for mma // @param dims K // @param ARegisters @@ -229,6 +348,16 @@ SYCL_EXTERNAL cute::intel::float4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL( SYCL_EXTERNAL cute::intel::float2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, float, cute::intel::float8, cute::intel::float2, int32_t); SYCL_EXTERNAL float __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, float, cute::intel::float8, float, int32_t); +SYCL_EXTERNAL cute::intel::short8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short8, cute::intel::int8, cute::intel::short8, int32_t); +SYCL_EXTERNAL cute::intel::short4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short4, cute::intel::int8, cute::intel::short4, int32_t); +SYCL_EXTERNAL cute::intel::short2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short2, cute::intel::int8, cute::intel::short2, int32_t); +SYCL_EXTERNAL short __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, short, cute::intel::int8, short, int32_t); + +SYCL_EXTERNAL cute::intel::half8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short8, cute::intel::int8, cute::intel::half8, int32_t); +SYCL_EXTERNAL cute::intel::half4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short4, cute::intel::int8, cute::intel::half4, int32_t); +SYCL_EXTERNAL cute::intel::half2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short2, cute::intel::int8, cute::intel::half2, int32_t); +SYCL_EXTERNAL sycl::half __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, short, cute::intel::int8, sycl::half, int32_t); + struct SPIRV_MMAOperands { static constexpr int SPIRV_MatrixASigned = 
0x1; static constexpr int SPIRV_MatrixBSigned = 0x2; @@ -238,14 +367,88 @@ struct SPIRV_MMAOperands { static constexpr int SPIRV_MatrixBFp16 = 0x800; static constexpr int SPIRV_MatrixABf16 = 0x1000; static constexpr int SPIRV_MatrixBBf16 = 0x2000; + static constexpr int SPIRV_MatrixCBf16 = 0xC; static constexpr int SPIRV_MatrixATf32 = 0x100; static constexpr int SPIRV_MatrixBTf32 = 0x200; }; +#if defined(CUTE_ARCH_XE_SPIRV_ENABLED) +namespace cute::detail +{ +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { +#ifdef __SYCL_DEVICE_ONLY__ + return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(16, a, b, c, SPIRV_MMAOperands::SPIRV_MatrixABf16 | SPIRV_MMAOperands::SPIRV_MatrixBBf16 ); #endif + } +}; -namespace cute::detail{ - template - struct XeSubgroupMatrixMultiplyAccumulate { - static_assert(dependent_false<>, "Unsupported MMA Configuration."); - }; +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { +#ifdef __SYCL_DEVICE_ONLY__ + return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(16, a, b, c, SPIRV_MMAOperands::SPIRV_MatrixAFp16 | SPIRV_MMAOperands::SPIRV_MatrixBFp16); +#endif + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { +#ifdef __SYCL_DEVICE_ONLY__ + return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(16, a, b, c, SPIRV_MMAOperands::SPIRV_MatrixABf16 | SPIRV_MMAOperands::SPIRV_MatrixBBf16 |SPIRV_MMAOperands::SPIRV_MatrixCBf16); +#endif + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { +#ifdef __SYCL_DEVICE_ONLY__ + return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(16, a, b, c, SPIRV_MMAOperands::SPIRV_MatrixAFp16 | 
SPIRV_MMAOperands::SPIRV_MatrixBFp16); +#endif + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { +#ifdef __SYCL_DEVICE_ONLY__ + return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(32, a, b, c, SPIRV_MMAOperands::SPIRV_MatrixASigned | SPIRV_MMAOperands::SPIRV_MatrixBSigned | SPIRV_MMAOperands::SPIRV_MatrixAInt8 | SPIRV_MMAOperands::SPIRV_MatrixBInt8); +#endif + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { +#ifdef __SYCL_DEVICE_ONLY__ + return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(32, a, b, c, SPIRV_MMAOperands::SPIRV_MatrixAInt8 | SPIRV_MMAOperands::SPIRV_MatrixBInt8); +#endif + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { +#ifdef __SYCL_DEVICE_ONLY__ + return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(8, a, b, c, SPIRV_MMAOperands::SPIRV_MatrixATf32 | SPIRV_MMAOperands::SPIRV_MatrixBTf32); +#endif + } +}; } // namespace cute::detail end +#endif diff --git a/include/cute/arch/xe_copy_1B.hpp b/include/cute/arch/xe_copy_1B.hpp index 944ae7072c..26840134f5 100644 --- a/include/cute/arch/xe_copy_1B.hpp +++ b/include/cute/arch/xe_copy_1B.hpp @@ -117,7 +117,18 @@ SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u8_m8k16v2( intptr_t baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uchar8)); - +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u8_m1k32v1( + long baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u8_m2k32v1( + long baseoffset, int width_minus_one, int 
height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u8_m4k32v1( + long baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u8_m8k32v1( + long baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); // // 2D prefetch SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_8b_1r32x2c( __global void* base_address, int width, int height, int pitch, @@ -137,7 +148,7 @@ SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_8b_32r16x1c( namespace cute::detail { -#if defined(CUTE_ARCH_COPY_XE_BUILTIN_ENABLED) +#if defined(CUTE_ARCH_XE_BUILTIN_ENABLED) template<> struct XeSubgroup2DBlockLoad<1, 32, 1, 1> { template @@ -357,7 +368,45 @@ struct XeSubgroup2DBlockStore<1, 16, 8, 2> { (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uchar8 *)(srcPointer)); } }; -#endif + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 1, 1> { + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u8_m1k32v1( + (intptr_t)srcBasePointer, memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 2, 1> { + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u8_m2k32v1( + (intptr_t)srcBasePointer, memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); + } +}; +template<> +struct 
XeSubgroup2DBlockPrefetch<1, 32, 4, 1> { + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u8_m4k32v1( + (intptr_t)srcBasePointer, memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 8, 1> { + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u8_m8k32v1( + (intptr_t)srcBasePointer, memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); + } +}; template<> struct XeSubgroup2DBlockPrefetch<1, 32, 1, 2> { @@ -408,6 +457,7 @@ struct XeSubgroup2DBlockPrefetch<1, 16, 32, 1> { (__global void*)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); } }; +#endif } // namespace cute::detail end namespace cute @@ -418,60 +468,38 @@ struct XE_2D_U8x1x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); detail::XeSubgroup2DBlockLoad<1, 32, 1, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif } -}; - -struct XE_2D_U8x2x32_LD_N { - using BlockShape = Shape<_2, _32>; - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - detail::XeSubgroup2DBlockLoad<1, 32, 2, 1>{}(baseoffset, width, height, pitch, coord, dst); + struct PREFETCH { + 
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<1, 32, 1, 1>{}(baseoffset, width, height, pitch, coord); #else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-PVC hardware"); #endif - } + } + }; }; -struct XE_2D_U8x2x32_ST_N { +struct XE_2D_U8x2x32_LD_N { using BlockShape = Shape<_2, _32>; - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *src) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - detail::XeSubgroup2DBlockStore<2, 16, 2, 1>{}(baseoffset, width, height, pitch, coord, src); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); -#endif - } -}; - -struct XE_2D_U8x1x32_LD_N { - using BlockShape = Shape<_1, _32>; - using inst_dtype = int8_t; - template CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(SYCL_INTEL_TARGET) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u8_m1k32v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockLoad<1, 32, 2, 1>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif @@ -481,9 +509,8 @@ struct XE_2D_U8x1x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u8_m1k32v1( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, 
CacheControl::kL1C_L3C); +#if defined(CUTE_ARCH_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<1, 32, 2, 1>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( "Trying to use block prefetch on non-PVC hardware"); @@ -492,38 +519,20 @@ struct XE_2D_U8x1x32_LD_N { }; }; -struct XE_2D_U8x2x32_LD_N { +struct XE_2D_U8x2x32_ST_N { using BlockShape = Shape<_2, _32>; - using inst_dtype = int8_t; template CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) + T *src) { +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u8_m2k32v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); + detail::XeSubgroup2DBlockStore<2, 16, 2, 1>{}(baseoffset, width, height, pitch, coord, src); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); #endif } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u8_m2k32v1( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, CacheControl::kL1C_L3C); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-PVC hardware"); -#endif - } - }; - }; struct XE_2D_U8x4x32_LD_N { @@ -533,7 +542,7 @@ struct XE_2D_U8x4x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); detail::XeSubgroup2DBlockLoad<1, 32, 4, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -545,10 +554,8 @@ struct XE_2D_U8x4x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, 
int width, int height, int pitch, intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u8_m4k32v1( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - CacheControl::kL1C_L3C); +#if defined(CUTE_ARCH_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<1, 32, 4, 1>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( "Trying to use block prefetch on non-PVC hardware"); @@ -564,7 +571,7 @@ struct XE_2D_U8x8x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); detail::XeSubgroup2DBlockLoad<1, 32, 8, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -576,10 +583,8 @@ struct XE_2D_U8x8x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - __builtin_IB_subgroup_block_read_prefetch_u8_m8k32v1( - (intptr_t)baseoffset, width - 1, height - 1, pitch - 1, coord, - CacheControl::kL1C_L3C); +#if defined(CUTE_ARCH_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<1, 32, 8, 1>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( "Trying to use block prefetch on non-PVC hardware"); @@ -595,7 +600,7 @@ struct XE_2D_U8x16x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); detail::XeSubgroup2DBlockLoad<1, 32, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -607,7 +612,7 @@ struct XE_2D_U8x16x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if 
defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) detail::XeSubgroup2DBlockPrefetch<2, 16, 16, 1>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( @@ -624,7 +629,7 @@ struct XE_2D_U8x32x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); detail::XeSubgroup2DBlockLoad<1, 32, 32, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -781,7 +786,7 @@ struct XE_2D_U8x1x64_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); detail::XeSubgroup2DBlockLoad<1, 32, 1, 2>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -793,7 +798,7 @@ struct XE_2D_U8x1x64_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) detail::XeSubgroup2DBlockPrefetch<1, 32, 1, 2>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( @@ -810,7 +815,7 @@ struct XE_2D_U8x2x64_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); detail::XeSubgroup2DBlockLoad<1, 32, 2, 2>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -822,7 +827,7 @@ struct XE_2D_U8x2x64_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if 
defined(CUTE_ARCH_XE_ENABLED) detail::XeSubgroup2DBlockPrefetch<1, 32, 2, 2>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( @@ -839,7 +844,7 @@ struct XE_2D_U8x4x64_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); detail::XeSubgroup2DBlockLoad<1, 32, 4, 2>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -851,7 +856,7 @@ struct XE_2D_U8x4x64_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) detail::XeSubgroup2DBlockPrefetch<1, 32, 4, 2>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( @@ -868,7 +873,7 @@ struct XE_2D_U8x8x64_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); detail::XeSubgroup2DBlockLoad<1, 32, 8, 2>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -880,7 +885,7 @@ struct XE_2D_U8x8x64_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) detail::XeSubgroup2DBlockPrefetch<1, 32, 8, 2>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( @@ -897,7 +902,7 @@ struct XE_2D_U8x16x64_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); 
detail::XeSubgroup2DBlockLoad<1, 32, 16, 2>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -909,7 +914,7 @@ struct XE_2D_U8x16x64_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) detail::XeSubgroup2DBlockPrefetch<2, 16, 16, 2>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( @@ -926,7 +931,7 @@ struct XE_2D_U8x32x64_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); detail::XeSubgroup2DBlockLoad<1, 32, 32, 2>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -938,7 +943,7 @@ struct XE_2D_U8x32x64_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) detail::XeSubgroup2DBlockPrefetch<2, 16, 32, 2>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( @@ -957,7 +962,7 @@ struct XE_2D_U8x32x16_LD_V { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); detail::XeSubgroup2DBlockTransform<1, 16, 32, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -969,7 +974,7 @@ struct XE_2D_U8x32x16_LD_V { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) detail::XeSubgroup2DBlockPrefetch<1, 16, 32, 1>{}(baseoffset, width, height, pitch, coord); #else 
CUTE_INVALID_CONTROL_PATH( @@ -986,7 +991,7 @@ struct XE_2D_U8x32x32_LD_V { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); detail::XeSubgroup2DBlockTransform<1, 16, 32, 2>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -1002,7 +1007,7 @@ struct XE_2D_U8x32x64_LD_V { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); detail::XeSubgroup2DBlockTransform<1, 16, 32, 4>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -1018,7 +1023,7 @@ struct XE_2D_U8x1x16_ST_N { CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, int pitch, intel::coord_t coord, const T *src) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); detail::XeSubgroup2DBlockStore<1, 16, 1, 1>{}(baseoffset, width, height, pitch, coord, src); #else @@ -1034,7 +1039,7 @@ struct XE_2D_U8x2x16_ST_N { CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, int pitch, intel::coord_t coord, const T *src) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); detail::XeSubgroup2DBlockStore<1, 16, 2, 1>{}(baseoffset, width, height, pitch, coord, src); #else @@ -1050,7 +1055,7 @@ struct XE_2D_U8x4x16_ST_N { CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, int pitch, intel::coord_t coord, const T *src) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); 
detail::XeSubgroup2DBlockStore<1, 16, 4, 1>{}(baseoffset, width, height, pitch, coord, src); #else @@ -1064,7 +1069,7 @@ struct XE_2D_U8x8x16_ST_N { CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, int pitch, intel::coord_t coord, const T *src) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); detail::XeSubgroup2DBlockStore<1, 16, 8, 1>{}(baseoffset, width, height, pitch, coord, src); #else @@ -1078,7 +1083,7 @@ struct XE_2D_U8x8x32_ST_N { CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, int pitch, intel::coord_t coord, const T *src) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); detail::XeSubgroup2DBlockStore<1, 16, 8, 2>{}(baseoffset, width, height, pitch, coord, src); #else diff --git a/include/cute/arch/xe_copy_2B.hpp b/include/cute/arch/xe_copy_2B.hpp index 8794922096..4bbd053015 100644 --- a/include/cute/arch/xe_copy_2B.hpp +++ b/include/cute/arch/xe_copy_2B.hpp @@ -156,7 +156,7 @@ SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_16b_4r16x2c( cute::intel::coord_t coord)); namespace cute::detail { -#if defined(CUTE_ARCH_COPY_XE_BUILTIN_ENABLED) +#if defined(CUTE_ARCH_XE_BUILTIN_ENABLED) template<> struct XeSubgroup2DBlockLoad<2, 16, 1, 1> { template @@ -378,8 +378,6 @@ struct XeSubgroup2DBlockStore<2, 16, 8, 1> { } }; -#endif - template<> struct XeSubgroup2DBlockPrefetch<2, 16, 8, 1> { CUTE_HOST_DEVICE @@ -469,6 +467,7 @@ struct XeSubgroup2DBlockPrefetch<2, 16, 32, 2> { (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); } }; +#endif } namespace cute @@ -480,7 +479,7 @@ struct XE_2D_U16x1x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if 
defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 2"); detail::XeSubgroup2DBlockLoad<2, 16, 1, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -496,7 +495,7 @@ struct XE_2D_U16x2x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 2"); detail::XeSubgroup2DBlockLoad<2, 16, 2, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -512,7 +511,7 @@ struct XE_2D_U16x4x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 2"); detail::XeSubgroup2DBlockLoad<2, 16, 4, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -528,7 +527,7 @@ struct XE_2D_U16x8x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 2"); detail::XeSubgroup2DBlockLoad<2, 16, 8, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -540,7 +539,7 @@ struct XE_2D_U16x8x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) detail::XeSubgroup2DBlockPrefetch<2, 16, 8, 1>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( @@ -557,7 +556,7 @@ struct XE_2D_U16x16x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if 
defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 2"); detail::XeSubgroup2DBlockLoad<2, 16, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -569,7 +568,7 @@ struct XE_2D_U16x16x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) detail::XeSubgroup2DBlockPrefetch<2, 16, 16, 1>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( @@ -586,7 +585,7 @@ struct XE_2D_U16x32x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 2"); detail::XeSubgroup2DBlockLoad<2, 16, 32, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -598,7 +597,7 @@ struct XE_2D_U16x32x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) detail::XeSubgroup2DBlockPrefetch<2, 16, 32, 1>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( @@ -615,7 +614,7 @@ struct XE_2D_U16x1x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 2"); detail::XeSubgroup2DBlockLoad<2, 16, 1, 2>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -627,7 +626,7 @@ struct XE_2D_U16x1x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) 
detail::XeSubgroup2DBlockPrefetch<2, 16, 1, 2>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( @@ -644,7 +643,7 @@ struct XE_2D_U16x2x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 2"); detail::XeSubgroup2DBlockLoad<2, 16, 2, 2>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -656,7 +655,7 @@ struct XE_2D_U16x2x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) detail::XeSubgroup2DBlockPrefetch<2, 16, 2, 2>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( @@ -673,7 +672,7 @@ struct XE_2D_U16x4x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 2"); detail::XeSubgroup2DBlockLoad<2, 16, 4, 2>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -685,7 +684,7 @@ struct XE_2D_U16x4x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) detail::XeSubgroup2DBlockPrefetch<2, 16, 4, 2>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( @@ -702,7 +701,7 @@ struct XE_2D_U16x8x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 2"); 
detail::XeSubgroup2DBlockLoad<2, 16, 8, 2>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -714,7 +713,7 @@ struct XE_2D_U16x8x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) detail::XeSubgroup2DBlockPrefetch<2, 16, 8, 2>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( @@ -731,7 +730,7 @@ struct XE_2D_U16x16x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 2"); detail::XeSubgroup2DBlockLoad<2, 16, 16, 2>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -743,7 +742,7 @@ struct XE_2D_U16x16x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) detail::XeSubgroup2DBlockPrefetch<2, 16, 16, 2>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( @@ -760,7 +759,7 @@ struct XE_2D_U16x32x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 2"); detail::XeSubgroup2DBlockLoad<2, 16, 32, 2>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -772,7 +771,7 @@ struct XE_2D_U16x32x32_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) // __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v2( detail::XeSubgroup2DBlockPrefetch<2, 16, 32, 
2>{}(baseoffset, width, height, pitch, coord); #else @@ -790,7 +789,7 @@ struct XE_2D_U16x16x16_LD_V { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 2"); detail::XeSubgroup2DBlockTransform<2, 16, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -802,7 +801,7 @@ struct XE_2D_U16x16x16_LD_V { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) detail::XeSubgroup2DBlockPrefetch<2, 16, 16, 1>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( @@ -819,7 +818,7 @@ struct XE_2D_U16x32x16_LD_V { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 2"); detail::XeSubgroup2DBlockTransform<2, 16, 32, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -831,7 +830,7 @@ struct XE_2D_U16x32x16_LD_V { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) detail::XeSubgroup2DBlockPrefetch<2, 16, 32, 1>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( @@ -848,7 +847,7 @@ struct XE_2D_U16x16x32_LD_V { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 2"); detail::XeSubgroup2DBlockTransform<2, 16, 16, 2>{}(baseoffset, width, height, 
pitch, coord, dst); #else @@ -860,7 +859,7 @@ struct XE_2D_U16x16x32_LD_V { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) detail::XeSubgroup2DBlockPrefetch<2, 16, 16, 2>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( @@ -877,7 +876,7 @@ struct XE_2D_U16x32x32_LD_V { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 2"); detail::XeSubgroup2DBlockTransform<2, 16, 32, 2>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -889,7 +888,7 @@ struct XE_2D_U16x32x32_LD_V { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) detail::XeSubgroup2DBlockPrefetch<2, 16, 32, 2>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( @@ -909,7 +908,7 @@ struct XE_2D_U16x16x8_LD_T { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 4"); detail::XeSubgroup2DBlockTranspose<4, 4, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -928,7 +927,7 @@ struct XE_2D_U16x16x16_LD_T { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 2, "Expected T to have size 2"); detail::XeSubgroup2DBlockTranspose<4, 8, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ 
-944,7 +943,7 @@ struct XE_2D_U16x1x16_ST_N { CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, int pitch, intel::coord_t coord, const T *src) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) // static_assert(sizeof(T) == 2, "Expected T to have size 2"); detail::XeSubgroup2DBlockStore<2, 16, 1, 1>{}(baseoffset, width, height, pitch, coord, src); #else @@ -960,7 +959,7 @@ struct XE_2D_U16x2x16_ST_N { CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, int pitch, intel::coord_t coord, const T *src) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) // static_assert(sizeof(T) == 2, "Expected T to have size 2"); detail::XeSubgroup2DBlockStore<2, 16, 2, 1>{}(baseoffset, width, height, pitch, coord, src); #else @@ -976,7 +975,7 @@ struct XE_2D_U16x4x16_ST_N { CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, int pitch, intel::coord_t coord, const T *src) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) // static_assert(sizeof(T) == 2, "Expected T to have size 2"); detail::XeSubgroup2DBlockStore<2, 16, 4, 1>{}(baseoffset, width, height, pitch, coord, src); #else @@ -992,7 +991,7 @@ struct XE_2D_U16x8x16_ST_N { CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, int pitch, intel::coord_t coord, const T *src) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) // static_assert(sizeof(T) == 2, "Expected T to have size 2"); detail::XeSubgroup2DBlockStore<2, 16, 8, 1>{}(baseoffset, width, height, pitch, coord, src); #else diff --git a/include/cute/arch/xe_copy_4B.hpp b/include/cute/arch/xe_copy_4B.hpp index 63effca597..38f57c1596 100644 --- a/include/cute/arch/xe_copy_4B.hpp +++ b/include/cute/arch/xe_copy_4B.hpp @@ -146,7 +146,7 @@ SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_32b_16r8x1c( cute::intel::coord_t coord)); namespace cute::detail { -#if 
defined(CUTE_ARCH_COPY_XE_BUILTIN_ENABLED) +#if defined(CUTE_ARCH_XE_BUILTIN_ENABLED) template<> struct XeSubgroup2DBlockLoad<4, 16, 1, 1> { template @@ -432,7 +432,6 @@ struct XeSubgroup2DBlockStore<4, 16, 8, 1> { reinterpret_cast(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uint8 *)(srcPointer)); } }; -#endif template<> struct XeSubgroup2DBlockPrefetch<4, 8, 16, 1> { @@ -443,7 +442,7 @@ struct XeSubgroup2DBlockPrefetch<4, 8, 16, 1> { (__global void*)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); } }; - +#endif } // namespace cute::detail end namespace cute @@ -455,7 +454,7 @@ struct XE_2D_U32x1x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); detail::XeSubgroup2DBlockLoad<4, 16, 1, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -471,7 +470,7 @@ struct XE_2D_U32x2x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); detail::XeSubgroup2DBlockLoad<4, 16, 2, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -487,7 +486,7 @@ struct XE_2D_U32x4x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); detail::XeSubgroup2DBlockLoad<4, 16, 4, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -503,7 +502,7 @@ struct XE_2D_U32x8x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int 
pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); detail::XeSubgroup2DBlockLoad<4, 16, 8, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -519,7 +518,7 @@ struct XE_2D_U32x16x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); detail::XeSubgroup2DBlockLoad<4, 16, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -535,7 +534,7 @@ struct XE_2D_U32x32x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); detail::XeSubgroup2DBlockLoad<4, 16, 32, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -551,7 +550,7 @@ struct XE_2D_TF32x1x8_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); detail::XeSubgroup2DBlockLoad<4, 8, 1, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -568,7 +567,7 @@ struct XE_2D_TF32x2x8_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); detail::XeSubgroup2DBlockLoad<4, 8, 2, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -585,7 +584,7 @@ struct XE_2D_TF32x4x8_LD_N { CUTE_HOST_DEVICE static void copy(const void 
*baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); detail::XeSubgroup2DBlockLoad<4, 8, 4, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -602,7 +601,7 @@ struct XE_2D_TF32x8x8_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); detail::XeSubgroup2DBlockLoad<4, 8, 8, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -619,7 +618,7 @@ struct XE_2D_TF32x16x8_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); detail::XeSubgroup2DBlockLoad<4, 8, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -636,7 +635,7 @@ struct XE_2D_TF32x32x8_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); detail::XeSubgroup2DBlockLoad<4, 8, 32, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -652,7 +651,7 @@ struct XE_2D_TF32x1x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); detail::XeSubgroup2DBlockLoad<4, 8, 1, 2>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -669,7 +668,7 @@ struct XE_2D_TF32x2x16_LD_N { 
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); detail::XeSubgroup2DBlockLoad<4, 8, 2, 2>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -686,7 +685,7 @@ struct XE_2D_TF32x4x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); detail::XeSubgroup2DBlockLoad<4, 8, 4, 2>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -703,7 +702,7 @@ struct XE_2D_TF32x8x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); detail::XeSubgroup2DBlockLoad<4, 8, 8, 2>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -720,7 +719,7 @@ struct XE_2D_TF32x16x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); detail::XeSubgroup2DBlockLoad<4, 8, 16, 2>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -737,7 +736,7 @@ struct XE_2D_TF32x32x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); detail::XeSubgroup2DBlockLoad<4, 8, 32, 2>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -754,7 
+753,7 @@ struct XE_2D_U32x16x1_LD_T { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); detail::XeSubgroup2DBlockTranspose<4, 1, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -772,7 +771,7 @@ struct XE_2D_U32x16x2_LD_T { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); detail::XeSubgroup2DBlockTranspose<4, 2, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -790,7 +789,7 @@ struct XE_2D_U32x16x4_LD_T { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); detail::XeSubgroup2DBlockTranspose<4, 4, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -808,7 +807,7 @@ struct XE_2D_U32x16x8_LD_T { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); detail::XeSubgroup2DBlockTranspose<4, 8, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -820,7 +819,7 @@ struct XE_2D_U32x16x8_LD_T { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) detail::XeSubgroup2DBlockPrefetch<4, 8, 16, 1>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( 
@@ -837,7 +836,7 @@ struct XE_2D_U32x1x16_ST_N { CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, int pitch, intel::coord_t coord, const T *src) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) // static_assert(sizeof(T) == 4, "Expected T to have size 4"); detail::XeSubgroup2DBlockStore<4, 16, 1, 1>{}(baseoffset, width, height, pitch, coord, src); #else @@ -853,7 +852,7 @@ struct XE_2D_U32x2x16_ST_N { CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, int pitch, intel::coord_t coord, const T *src) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); detail::XeSubgroup2DBlockStore<4, 16, 2, 1>{}(baseoffset, width, height, pitch, coord, src); #else @@ -869,7 +868,7 @@ struct XE_2D_U32x4x16_ST_N { CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, int pitch, intel::coord_t coord, const T *src) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 4, "Expected T to have size 4"); detail::XeSubgroup2DBlockStore<4, 16, 4, 1>{}(baseoffset, width, height, pitch, coord, src); #else @@ -885,7 +884,7 @@ struct XE_2D_U32x8x16_ST_N { CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, int pitch, intel::coord_t coord, const T *src) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) // static_assert(sizeof(T) == 4, "Expected T to have size 4"); detail::XeSubgroup2DBlockStore<4, 16, 8, 1>{}(baseoffset, width, height, pitch, coord, src); #else diff --git a/include/cute/arch/xe_copy_8B.hpp b/include/cute/arch/xe_copy_8B.hpp index 7203bbf3a2..ad2ae2cfd3 100644 --- a/include/cute/arch/xe_copy_8B.hpp +++ b/include/cute/arch/xe_copy_8B.hpp @@ -49,7 +49,7 @@ SYCL_DEVICE_BUILTIN( int pitch_minus_one, cute::intel::coord_t coord)); -#if defined(CUTE_ARCH_COPY_XE_BUILTIN_ENABLED) +#if 
defined(CUTE_ARCH_XE_BUILTIN_ENABLED) namespace cute::detail { template<> @@ -95,7 +95,7 @@ struct XE_2D_U64x8x1_LD_T { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 8, "Expected T to have size 8"); detail::XeSubgroup2DBlockTranspose<8, 1, 8, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -111,7 +111,7 @@ struct XE_2D_U64x8x2_LD_T { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 8, "Expected T to have size 8"); detail::XeSubgroup2DBlockTranspose<8, 2, 8, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -127,7 +127,7 @@ struct XE_2D_U64x8x4_LD_T { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_COPY_XE_ENABLED) +#if defined(CUTE_ARCH_XE_ENABLED) static_assert(sizeof(T) == 8, "Expected T to have size 8"); detail::XeSubgroup2DBlockTranspose<8, 4, 8, 1>{}(baseoffset, width, height, pitch, coord, dst); #else diff --git a/include/cute/atom/mma_traits_xe.hpp b/include/cute/atom/mma_traits_xe.hpp index 862483a7ca..8f3a7b704f 100644 --- a/include/cute/atom/mma_traits_xe.hpp +++ b/include/cute/atom/mma_traits_xe.hpp @@ -101,6 +101,69 @@ struct MMA_Traits using CLayout = Layout, Stride<_1, _1>>; }; +template <> +struct MMA_Traits +{ + using ValTypeD = bfloat16_t; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = bfloat16_t; + + using Shape_MNK = Shape<_8,_16,_16>; + using ThrID = Layout<_16>; + + using ALayout = Layout, Stride<_8, _1>>; + using BLayout = Layout, Stride<_1, _16>>; + using CLayout = Layout, Stride<_8, _1>>; +}; + +template <> +struct MMA_Traits +{ + using 
ValTypeD = bfloat16_t; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = bfloat16_t; + + using Shape_MNK = Shape<_4,_16,_16>; + using ThrID = Layout<_16>; + + using ALayout = Layout, Stride<_4, _1>>; + using BLayout = Layout, Stride<_1, _16>>; + using CLayout = Layout, Stride<_4, _1>>; +}; + +template <> +struct MMA_Traits +{ + using ValTypeD = bfloat16_t; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = bfloat16_t; + + using Shape_MNK = Shape<_2,_16,_16>; + using ThrID = Layout<_16>; + + using ALayout = Layout, Stride<_2, _1>>; + using BLayout = Layout, Stride<_1, _16>>; + using CLayout = Layout, Stride<_2, _1>>; +}; + +template <> +struct MMA_Traits +{ + using ValTypeD = bfloat16_t; + using ValTypeA = bfloat16_t; + using ValTypeB = bfloat16_t; + using ValTypeC = bfloat16_t; + + using Shape_MNK = Shape<_1,_16,_16>; + using ThrID = Layout<_16>; + + using ALayout = Layout, Stride<_1, _1>>; + using BLayout = Layout, Stride<_1, _16>>; + using CLayout = Layout, Stride<_1, _1>>; +}; template <> struct MMA_Traits @@ -162,6 +225,66 @@ struct MMA_Traits using CLayout = Layout, Stride<_1, _1>>; }; +template <> +struct MMA_Traits +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using Shape_MNK = Shape<_8,_16,_16>; + using ThrID = Layout<_16>; + using ALayout = Layout, Stride<_8, _1>>; + using BLayout = Layout, Stride<_1, _16>>; + using CLayout = Layout, Stride<_8, _1>>; +}; + +template <> +struct MMA_Traits +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using Shape_MNK = Shape<_4,_16,_16>; + using ThrID = Layout<_16>; + using ALayout = Layout, Stride<_4, _1>>; + using BLayout = Layout, Stride<_1, _16>>; + using CLayout = Layout, Stride<_4, _1>>; +}; + +template <> +struct MMA_Traits +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC 
= half_t; + + using Shape_MNK = Shape<_2,_16,_16>; + using ThrID = Layout<_16>; + using ALayout = Layout, Stride<_2, _1>>; + using BLayout = Layout, Stride<_1, _16>>; + using CLayout = Layout, Stride<_2, _1>>; +}; + +template <> +struct MMA_Traits +{ + using ValTypeD = half_t; + using ValTypeA = half_t; + using ValTypeB = half_t; + using ValTypeC = half_t; + + using Shape_MNK = Shape<_1,_16,_16>; + using ThrID = Layout<_16>; + using ALayout = Layout, Stride<_1, _1>>; + using BLayout = Layout, Stride<_1, _16>>; + using CLayout = Layout, Stride<_1, _1>>; +}; + template <> struct MMA_Traits { diff --git a/include/cute/util/sycl_vec.hpp b/include/cute/util/sycl_vec.hpp index fdaba345a0..8428274fef 100644 --- a/include/cute/util/sycl_vec.hpp +++ b/include/cute/util/sycl_vec.hpp @@ -57,6 +57,10 @@ using float2 = vector_t; using float4 = vector_t; using float8 = vector_t; +using half2 = vector_t<_Float16, 2>; +using half4 = vector_t<_Float16, 4>; +using half8 = vector_t<_Float16, 8>; + using short2 = vector_t; using short4 = vector_t; using short8 = vector_t; diff --git a/include/cutlass/epilogue/collective/xe_epilogue.hpp b/include/cutlass/epilogue/collective/xe_epilogue.hpp index 8133326ff7..f55acdc4ad 100644 --- a/include/cutlass/epilogue/collective/xe_epilogue.hpp +++ b/include/cutlass/epilogue/collective/xe_epilogue.hpp @@ -295,8 +295,6 @@ class CollectiveEpilogue< auto sg_m_coord = m_coord * ATOM_M + sg_local_m_coord; auto sg_n_coord = n_coord * ATOM_N + sg_local_n_coord; auto sg_coord = make_coord(sg_m_coord, sg_n_coord, k_coord, l_coord); - - bool is_C_load_needed = is_source_supported && fusion_callbacks.is_C_load_needed(); // Represent the full output tensor Tensor mD_mnl = cute::get_pvc_tensor(make_shape(M,N,L)); @@ -363,12 +361,12 @@ class CollectiveEpilogue< CUTLASS_PRAGMA_UNROLL for (int epi_m = 0; epi_m < FragsM; epi_m++) { - if (is_C_load_needed) { + if (is_source_supported && fusion_callbacks.is_C_load_needed()) { //cordinates for C and D are the same 
copy(params.xe_load_c, tCgD(_, epi_m, epi_n), trC); } - cst_callbacks.previsit(epi_m, epi_n, 0, is_C_load_needed); + cst_callbacks.previsit(epi_m, epi_n, 0, is_source_supported && fusion_callbacks.is_C_load_needed()); auto acc_frag_mn = acc_frag(_, epi_m, epi_n); diff --git a/test/unit/cute/intel_xe/mma.cpp b/test/unit/cute/intel_xe/mma.cpp index 1c0e3d8a61..5589310f61 100755 --- a/test/unit/cute/intel_xe/mma.cpp +++ b/test/unit/cute/intel_xe/mma.cpp @@ -263,6 +263,26 @@ TEST(PVC_CuTe_Xe, MMA_XE_1x16x16_F32BF16BF16F32_TT) { bfloat16_t, float>(512, 512, 256); } +TEST(PVC_CuTe_Xe, MMA_XE_8x16x16_BF16BF16BF16BF16_TT) { + MMA_Test(512, 512, 256); +} + +TEST(PVC_CuTe_Xe, MMA_XE_4x16x16_BF16BF16BF16BF16_TT) { + MMA_Test(512, 512, 256); +} + +TEST(PVC_CuTe_Xe, MMA_XE_2x16x16_BF16BF16BF16BF16_TT) { + MMA_Test(512, 512, 256); +} + +TEST(PVC_CuTe_Xe, MMA_XE_1x16x16_BF16BF16BF16BF16_TT) { + MMA_Test(512, 512, 256); +} + TEST(PVC_CuTe_Xe, MMA_XE_8x16x16_F32F16F16F32_TT) { MMA_Test(512, 512, 256); @@ -279,8 +299,28 @@ TEST(PVC_CuTe_Xe, MMA_XE_2x16x16_F32F16F16F32_TT) { } TEST(PVC_CuTe_Xe, MMA_XE_1x16x16_F32F16F16F32_TT) { - MMA_Test( - 512, 512, 256); + MMA_Test + (512, 512, 256); +} +#if defined(CUTE_ARCH_MMA_XE_SPIRV_ENABLED) +TEST(PVC_CuTe_Xe, MMA_XE_8x16x16_F16F16F16F16_TT) { + MMA_Test + (512, 512, 256); +} + +TEST(PVC_CuTe_Xe, MMA_XE_4x16x16_F16F16F16F16_TT) { + MMA_Test + (512, 512, 256); +} + +TEST(PVC_CuTe_Xe, MMA_XE_2x16x16_F16F16F16F16_TT) { + MMA_Test + (512, 512, 256); +} +#endif +TEST(PVC_CuTe_Xe, MMA_XE_1x16x16_F16F16F16F16_TT) { + MMA_Test + (512, 512, 256); } TEST(PVC_CuTe_Xe, FMA_XE_UniversalFMA_F32F32F32F32) { diff --git a/test/unit/cute/intel_xe/utils.hpp b/test/unit/cute/intel_xe/utils.hpp index e109d9fe27..48973a0de9 100755 --- a/test/unit/cute/intel_xe/utils.hpp +++ b/test/unit/cute/intel_xe/utils.hpp @@ -59,10 +59,10 @@ void verify(uint32_t m, uint32_t n, uint32_t k, atype *A, btype *B, ctype *C, bool row_a = true, bool row_b = true) { int cnt = 0; bool 
is_normal = true; - + using accum_type = conditional_t == 32, ctype, float>; for (int i = 0; i < m; i++) { for (int j = 0; j < n; j++) { - ctype expect = ctype(0); + accum_type expect = accum_type(0); for (int z = 0; z < k; z++) { auto a = row_a ? A[i * k + z] : A[i + z * m]; auto b = row_b ? B[z * n + j] : B[z + j * k]; @@ -71,15 +71,10 @@ void verify(uint32_t m, uint32_t n, uint32_t k, atype *A, btype *B, ctype *C, ctype val = C[i * n + j]; - if constexpr(std::is_floating_point_v) { - if (isnormal(val) && isnormal(expect)) { - auto error = std::abs((expect - val) / val); - if (error > 0.01f) { - cnt++; - } - } else { - // TODO(codeplay): Assert that at least some values are non-zero. - if(!(expect == 0 && val == 0)) is_normal = false; + if (isnormal(val) && isnormal(expect)) { + auto error = std::abs((expect - val) / val); + if (error > 0.02f) { + cnt++; } } else { if (val != expect) { From d9f83034a7c388b8ed9bf3cca27f2ea734778f5b Mon Sep 17 00:00:00 2001 From: "Yang, Ji" Date: Tue, 29 Apr 2025 17:04:38 +0800 Subject: [PATCH 03/21] remove -1 from OCL API --- include/cute/arch/xe_copy_1B.hpp | 12 ++++++------ include/cute/arch/xe_copy_2B.hpp | 6 +++--- include/cute/arch/xe_copy_4B.hpp | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/cute/arch/xe_copy_1B.hpp b/include/cute/arch/xe_copy_1B.hpp index 26840134f5..b03b8a0e3d 100644 --- a/include/cute/arch/xe_copy_1B.hpp +++ b/include/cute/arch/xe_copy_1B.hpp @@ -404,7 +404,7 @@ struct XeSubgroup2DBlockPrefetch<1, 32, 8, 1> { void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, cute::intel::coord_t coordinate) { __builtin_IB_subgroup_block_read_prefetch_u8_m8k32v1( - (intptr_t)srcBasePointer, memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); + (intptr_t)srcBasePointer, memoryWidth, memoryHeight, memoryPitch, coordinate, CacheControl::kL1C_L3C); } }; @@ -414,7 +414,7 @@ struct XeSubgroup2DBlockPrefetch<1, 32, 
1, 2> { void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, cute::intel::coord_t coordinate) { intel_sub_group_2d_block_prefetch_8b_1r32x2c( - (__global void*)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); } }; @@ -424,7 +424,7 @@ struct XeSubgroup2DBlockPrefetch<1, 32, 2, 2> { void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, cute::intel::coord_t coordinate) { intel_sub_group_2d_block_prefetch_8b_2r32x2c( - (__global void*)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); } }; @@ -434,7 +434,7 @@ struct XeSubgroup2DBlockPrefetch<1, 32, 4, 2> { void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, cute::intel::coord_t coordinate) { intel_sub_group_2d_block_prefetch_8b_4r32x2c( - (__global void*)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); } }; @@ -444,7 +444,7 @@ struct XeSubgroup2DBlockPrefetch<1, 32, 8, 2> { void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, cute::intel::coord_t coordinate) { intel_sub_group_2d_block_prefetch_8b_8r32x2c( - (__global void*)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); } }; @@ -454,7 +454,7 @@ struct XeSubgroup2DBlockPrefetch<1, 16, 32, 1> { void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, cute::intel::coord_t coordinate) { intel_sub_group_2d_block_prefetch_8b_32r16x1c( - (__global void*)(srcBasePointer), memoryWidth - 1, 
memoryHeight - 1, memoryPitch - 1, coordinate); + (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); } }; #endif diff --git a/include/cute/arch/xe_copy_2B.hpp b/include/cute/arch/xe_copy_2B.hpp index 4bbd053015..326d326258 100644 --- a/include/cute/arch/xe_copy_2B.hpp +++ b/include/cute/arch/xe_copy_2B.hpp @@ -414,7 +414,7 @@ struct XeSubgroup2DBlockPrefetch<2, 16, 1, 2> { void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, cute::intel::coord_t coordinate) { intel_sub_group_2d_block_prefetch_16b_1r16x2c( - (__global void*)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); } }; @@ -424,7 +424,7 @@ struct XeSubgroup2DBlockPrefetch<2, 16, 2, 2> { void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, cute::intel::coord_t coordinate) { intel_sub_group_2d_block_prefetch_16b_2r16x2c( - (__global void*)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); } }; @@ -434,7 +434,7 @@ struct XeSubgroup2DBlockPrefetch<2, 16, 4, 2> { void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, cute::intel::coord_t coordinate) { intel_sub_group_2d_block_prefetch_16b_4r16x2c( - (__global void*)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); } }; diff --git a/include/cute/arch/xe_copy_4B.hpp b/include/cute/arch/xe_copy_4B.hpp index 38f57c1596..c8b31ce807 100644 --- a/include/cute/arch/xe_copy_4B.hpp +++ b/include/cute/arch/xe_copy_4B.hpp @@ -439,7 +439,7 @@ struct XeSubgroup2DBlockPrefetch<4, 8, 16, 1> { void operator()(const void* srcBasePointer, int memoryWidth, int 
memoryHeight, int memoryPitch, cute::intel::coord_t coordinate) { intel_sub_group_2d_block_prefetch_32b_16r8x1c( - (__global void*)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); } }; #endif From 5537fd7e8782f83079076c32c8adeb1ae4543475 Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Tue, 6 May 2025 13:51:38 +0100 Subject: [PATCH 04/21] rebase --- include/cute/arch/xe_copy_4B.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/cute/arch/xe_copy_4B.hpp b/include/cute/arch/xe_copy_4B.hpp index ab081b028b..827f6a006b 100644 --- a/include/cute/arch/xe_copy_4B.hpp +++ b/include/cute/arch/xe_copy_4B.hpp @@ -731,10 +731,10 @@ struct XE_2D_TF32x16x16_LD_N { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord) { -#if defined(SYCL_INTEL_TARGET) - detail::XeSubgroup2DBlockPrefetch<4, 8, 16, 1>{}(baseoffset, width, height, pitch, coord);#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); +#if defined(CUTE_ARCH_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<4, 8, 16, 1>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block prefetch on non-Xe hardware"); #endif } }; From c89a8759f555b7446d65efb6834f8e2445afd3ac Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Tue, 6 May 2025 16:56:54 +0100 Subject: [PATCH 05/21] Disable spirv functions for PVC --- CMakeLists.txt | 6 ++++++ include/cute/arch/xe_config.hpp | 2 +- python/cutlass/backend/compiler.py | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8d913fed5e..cbe7e09923 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -108,6 +108,8 @@ option(CUTLASS_SYCL_PROFILING_ENABLED "Use SYCL events to calculate device execu option(CUTLASS_SYCL_RUNNING_CI "Enable this option 
when building in a CI environment. It activates CI specific configurations, such as additional checks or selectively disabling tests that cannot run in CI." OFF) +option(CUTLASS_SYCL_BUILTIN_ENABLE "Enable this option to use builtin functions instead of SPIR-V" OFF) + if (CUTLASS_ENABLE_SYCL) set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) @@ -130,6 +132,10 @@ if (CUTLASS_ENABLE_SYCL) add_compile_definitions(SYCLCOMPAT_PROFILING_ENABLED) endif() + if (CUTLASS_SYCL_BUILTIN_ENABLE) + add_compile_definitions(CUTLASS_SYCL_BUILTIN_ENABLE) + endif() + include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/onemkl.cmake) endif() find_package(Doxygen QUIET) diff --git a/include/cute/arch/xe_config.hpp b/include/cute/arch/xe_config.hpp index 60a2cb196a..9d9189811c 100644 --- a/include/cute/arch/xe_config.hpp +++ b/include/cute/arch/xe_config.hpp @@ -60,7 +60,7 @@ #define CUTE_ARCH_XE_ENABLED #endif -#if defined(CUTE_ARCH_XE_ENABLED) && defined(__INTEL_LLVM_COMPILER) && (__INTEL_LLVM_COMPILER < 20250200) +#if defined(CUTE_ARCH_XE_ENABLED) && ((defined(__INTEL_LLVM_COMPILER) && (__INTEL_LLVM_COMPILER < 20250200)) || defined(CUTLASS_SYCL_BUILTIN_ENABLE)) #define CUTE_ARCH_XE_BUILTIN_ENABLED #elif defined(CUTE_ARCH_XE_ENABLED) #define CUTE_ARCH_XE_SPIRV_ENABLED diff --git a/python/cutlass/backend/compiler.py b/python/cutlass/backend/compiler.py index 62585afc78..412295c600 100644 --- a/python/cutlass/backend/compiler.py +++ b/python/cutlass/backend/compiler.py @@ -163,6 +163,7 @@ def __init__(self) -> None: "-DCUTLASS_ENABLE_SYCL", "-fsycl-rtc-mode", "-DSYCL_INTEL_TARGET", + "-DCUTLASS_SYCL_BUILTIN_ENABLE" # TODO(Codeplay): remove this when the spirv functions are available for PVC "-shared", "-fPIC", "-fno-sycl-dead-args-optimization", "-Xspirv-translator -spirv-ext=+SPV_INTEL_split_barrier", From 5e26dd35f0ee669a032796e43a3b308a5fb11292 Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Tue, 6 May 2025 17:31:45 +0100 Subject: [PATCH 06/21] move spirv definitions --- 
include/cute/arch/xe_config.hpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/include/cute/arch/xe_config.hpp b/include/cute/arch/xe_config.hpp index 9d9189811c..37e8b4b8a7 100644 --- a/include/cute/arch/xe_config.hpp +++ b/include/cute/arch/xe_config.hpp @@ -91,6 +91,9 @@ struct XeSubgroup2DBlockStore { }; } #endif + +#if defined(CUTE_ARCH_XE_SPIRV_ENABLED) + // SPIRV copy definitions SYCL_EXTERNAL __attribute__((convergent)) void __spirv_Subgroup2DBlockLoadINTEL( int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, @@ -105,15 +108,14 @@ SYCL_EXTERNAL __attribute__((convergent)) void __spirv_Subgroup2DBlockLoadTransp const void* src_base_pointer, int memory_width, int memory_height, int memory_pitch, cute::intel::coord_t coordinate, void* dst_pointer); SYCL_EXTERNAL __attribute__((convergent)) void __spirv_Subgroup2DBlockStoreINTEL( - int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, + int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, void* src_pointer, const void* dst_base_pointer, int memory_width, - int memory_height, int memory_pitch, cute::intel::coord_t coordinate); + int memory_height, int memory_pitch, cute::intel::coord_t coordinate); SYCL_EXTERNAL __attribute__((convergent)) void __spirv_Subgroup2DBlockPrefetchINTEL( int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, const void* src_base_pointer, int memory_width, int memory_height, int memory_pitch, cute::intel::coord_t coordinate); -#if defined(CUTE_ARCH_XE_SPIRV_ENABLED) namespace cute::detail { template struct XeSubgroup2DBlockLoad { @@ -202,6 +204,8 @@ namespace cute::detail{ }; } // namespace cute::detail end +#if defined(CUTE_ARCH_XE_BUILTIN_ENABLED) + // mma_bf16 SYCL_DEVICE_OCL(cute::intel::float8 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short8 a, cute::intel::int8 b, cute::intel::float8 acc)); SYCL_DEVICE_OCL(cute::intel::float4 
intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short4 a, cute::intel::int8 b, cute::intel::float4 acc)); @@ -238,7 +242,6 @@ SYCL_DEVICE_OCL(cute::intel::half4 intel_sub_group_f16_f16_matrix_mad_k16(cute:: SYCL_DEVICE_OCL(cute::intel::half2 intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short2 a, cute::intel::int8 b, cute::intel::half2 acc)); SYCL_DEVICE_OCL( sycl::half intel_sub_group_f16_f16_matrix_mad_k16( short a, cute::intel::int8 b, sycl::half acc)); -#if defined(CUTE_ARCH_XE_BUILTIN_ENABLED) namespace cute::detail { template<> @@ -328,6 +331,8 @@ struct XeSubgroupMatrixMultiplyAccumulate // @param AccRegisters // @param Operands code // @return DRegisters +#if defined(CUTE_ARCH_XE_SPIRV_ENABLED) + SYCL_EXTERNAL cute::intel::float8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short8, cute::intel::int8, cute::intel::float8, int32_t); SYCL_EXTERNAL cute::intel::float4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short4, cute::intel::int8, cute::intel::float4, int32_t); SYCL_EXTERNAL cute::intel::float2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short2, cute::intel::int8, cute::intel::float2, int32_t); @@ -371,7 +376,7 @@ struct SPIRV_MMAOperands { static constexpr int SPIRV_MatrixATf32 = 0x100; static constexpr int SPIRV_MatrixBTf32 = 0x200; }; -#if defined(CUTE_ARCH_XE_SPIRV_ENABLED) + namespace cute::detail { template<> From 8c679479c4e5654ad0fd86f9afd1147af92af975 Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Tue, 6 May 2025 18:02:59 +0100 Subject: [PATCH 07/21] fix --- python/cutlass/backend/compiler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cutlass/backend/compiler.py b/python/cutlass/backend/compiler.py index 412295c600..c559273942 100644 --- a/python/cutlass/backend/compiler.py +++ b/python/cutlass/backend/compiler.py @@ -159,11 +159,12 @@ def __init__(self) -> None: "--expt-relaxed-constexpr", "-Xcudafe 
--diag_suppress=esa_on_defaulted_function_ignored", ] + # TODO(Codeplay): remove CUTLASS_SYCL_BUILTIN_ENABLE when the spirv functions are available for PVC self._dpcpp_compile_options = ["-fsycl", "-std=c++17", "-DCUTLASS_ENABLE_SYCL", "-fsycl-rtc-mode", "-DSYCL_INTEL_TARGET", - "-DCUTLASS_SYCL_BUILTIN_ENABLE" # TODO(Codeplay): remove this when the spirv functions are available for PVC + "-DCUTLASS_SYCL_BUILTIN_ENABLE", "-shared", "-fPIC", "-fno-sycl-dead-args-optimization", "-Xspirv-translator -spirv-ext=+SPV_INTEL_split_barrier", From 879eb3529333a4c7d14885d14fafc938654a5114 Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Thu, 8 May 2025 18:34:18 +0100 Subject: [PATCH 08/21] Refactor --- include/cute/arch/copy_xe.hpp | 95 +- include/cute/arch/copy_xe_U16.hpp | 565 +++++++ include/cute/arch/copy_xe_U32.hpp | 496 ++++++ include/cute/arch/copy_xe_U4.hpp | 173 ++ .../arch/{xe_copy_8B.hpp => copy_xe_U64.hpp} | 65 +- include/cute/arch/copy_xe_U8.hpp | 529 +++++++ include/cute/arch/copy_xe_builtin.hpp | 1409 +++++++++++++++++ include/cute/arch/copy_xe_spirv.hpp | 134 ++ include/cute/arch/mma_xe.hpp | 68 +- include/cute/arch/mma_xe_builtin.hpp | 140 ++ include/cute/arch/mma_xe_spirv.hpp | 157 ++ include/cute/arch/xe_config.hpp | 459 ------ include/cute/arch/xe_copy_1B.hpp | 1098 ------------- include/cute/arch/xe_copy_2B.hpp | 1002 ------------ include/cute/arch/xe_copy_4B.hpp | 908 ----------- include/cute/atom/copy_traits_xe.hpp | 33 +- 16 files changed, 3670 insertions(+), 3661 deletions(-) create mode 100644 include/cute/arch/copy_xe_U16.hpp create mode 100644 include/cute/arch/copy_xe_U32.hpp create mode 100644 include/cute/arch/copy_xe_U4.hpp rename include/cute/arch/{xe_copy_8B.hpp => copy_xe_U64.hpp} (57%) create mode 100644 include/cute/arch/copy_xe_U8.hpp create mode 100644 include/cute/arch/copy_xe_builtin.hpp create mode 100644 include/cute/arch/copy_xe_spirv.hpp create mode 100644 include/cute/arch/mma_xe_builtin.hpp create mode 100644 
include/cute/arch/mma_xe_spirv.hpp delete mode 100644 include/cute/arch/xe_config.hpp delete mode 100644 include/cute/arch/xe_copy_1B.hpp delete mode 100644 include/cute/arch/xe_copy_2B.hpp delete mode 100644 include/cute/arch/xe_copy_4B.hpp diff --git a/include/cute/arch/copy_xe.hpp b/include/cute/arch/copy_xe.hpp index eebc97ee67..c36befa7c8 100644 --- a/include/cute/arch/copy_xe.hpp +++ b/include/cute/arch/copy_xe.hpp @@ -29,43 +29,22 @@ * **************************************************************************************************/ #pragma once -#include -#include -#include -#include -// prefetch -SYCL_DEVICE_BUILTIN(void __builtin_IB_lsc_prefetch_global_uchar( - const __attribute__((opencl_global)) uint8_t *base, int immElemOff, - enum CacheControl cacheOpt)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_lsc_prefetch_global_ushort( - const __attribute__((opencl_global)) uint16_t *base, int immElemOff, - enum CacheControl cacheOpt)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_lsc_prefetch_global_uint( - const __attribute__((opencl_global)) uint32_t *base, int immElemOff, - enum CacheControl cacheOpt)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_lsc_prefetch_global_uint2( - const __attribute__((opencl_global)) uint32_t *base, int immElemOff, - enum CacheControl cacheOpt)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_lsc_prefetch_global_uint4( - const __attribute__((opencl_global)) uint32_t *base, int immElemOff, - enum CacheControl cacheOpt)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_lsc_prefetch_global_uint8( - const __attribute__((opencl_global)) uint32_t *base, int immElemOff, - enum CacheControl cacheOpt)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_lsc_prefetch_global_ulong( - const __attribute__((opencl_global)) uint64_t *base, int immElemOff, - enum CacheControl cacheOpt)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_lsc_prefetch_global_ulong2( - const __attribute__((opencl_global)) uint64_t *base, int immElemOff, - enum CacheControl cacheOpt)); -SYCL_DEVICE_BUILTIN(void 
__builtin_IB_lsc_prefetch_global_ulong4( - const __attribute__((opencl_global)) uint64_t *base, int immElemOff, - enum CacheControl cacheOpt)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_lsc_prefetch_global_ulong8( - const __attribute__((opencl_global)) uint64_t *base, int immElemOff, - enum CacheControl cacheOpt)); +#if defined(__SYCL_DEVICE_ONLY__) && defined(SYCL_INTEL_TARGET) +#define CUTE_ARCH_COPY_XE_ENABLED +#endif + +#if defined(CUTE_ARCH_COPY_XE_ENABLED) && ((defined(__INTEL_LLVM_COMPILER) && (__INTEL_LLVM_COMPILER < 20250200)) || defined(CUTLASS_SYCL_BUILTIN_ENABLE)) +#include +#elif defined(CUTE_ARCH_COPY_XE_ENABLED) +#include +#endif +#include +#include +#include +#include +#include #ifdef __SYCL_DEVICE_ONLY__ SYCL_EXTERNAL __attribute__((convergent)) void __spirv_ControlBarrierWaitINTEL(int execution_scope, int memory_scope, int memory_semantics); @@ -137,49 +116,6 @@ struct XE_1D_LDSM { } }; -template -struct PREFETCH { - using SRegisters = S[1]; - using DRegisters = D[1]; - - template - CUTE_HOST_DEVICE static void copy(const S_ &src, D_ &dst) { -#if defined(SYCL_INTEL_TARGET) - if constexpr(sizeof(D) == 1) { - __builtin_IB_lsc_prefetch_global_uchar( - (const __attribute__((opencl_global)) uint8_t *)(&*&src), 0, CacheControl::kL1C_L3C); - } - else if constexpr(sizeof(D) == 2) { - __builtin_IB_lsc_prefetch_global_ushort( - (const __attribute__((opencl_global)) uint16_t *)(&*&src), 0, CacheControl::kL1C_L3C); - } - else if constexpr(sizeof(D) == 4) { - __builtin_IB_lsc_prefetch_global_uint( - (const __attribute__((opencl_global)) uint32_t *)(&*&src), 0, CacheControl::kL1C_L3C); - } - else if constexpr(sizeof(D) == 8) { - __builtin_IB_lsc_prefetch_global_uint2( - (const __attribute__((opencl_global)) uint32_t *)(&*&src), 0, CacheControl::kL1C_L3C); - } - else if constexpr(sizeof(D) == 16) { - __builtin_IB_lsc_prefetch_global_uint4( - (const __attribute__((opencl_global)) uint32_t *)(&*&src), 0, CacheControl::kL1C_L3C); - } - else if constexpr(sizeof(D) == 
32) { - __builtin_IB_lsc_prefetch_global_uint8( - (const __attribute__((opencl_global)) uint32_t *)(&*&src), 0, CacheControl::kL1C_L3C); - } - else if constexpr(sizeof(D) == 64) { - __builtin_IB_lsc_prefetch_global_ulong8( - (const __attribute__((opencl_global)) uint64_t *)(&*&src), 0, CacheControl::kL1C_L3C); - } -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } -}; - template struct XE_1D_LOAD_GLOBAL { using SRegisters = S[1]; @@ -207,9 +143,6 @@ struct XE_1D_LOAD_GLOBAL { CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); #endif } - - using PREFETCH = PREFETCH; - }; template diff --git a/include/cute/arch/copy_xe_U16.hpp b/include/cute/arch/copy_xe_U16.hpp new file mode 100644 index 0000000000..5deb8434d8 --- /dev/null +++ b/include/cute/arch/copy_xe_U16.hpp @@ -0,0 +1,565 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include +#include "cute/config.hpp" + +namespace cute +{ +struct XE_2D_U16x1x16_LD_N { + using BlockShape = Shape<_1, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockLoad<2, 16, 1, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U16x2x16_LD_N { + using BlockShape = Shape<_2, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockLoad<2, 16, 2, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + 
+struct XE_2D_U16x4x16_LD_N { + using BlockShape = Shape<_4, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockLoad<2, 16, 4, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U16x8x16_LD_N { + using BlockShape = Shape<_8, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockLoad<2, 16, 8, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 8, 1>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U16x16x16_LD_N { + using BlockShape = Shape<_16, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockLoad<2, 16, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + 
int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 16, 1>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U16x32x16_LD_N { + using BlockShape = Shape<_32, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockLoad<2, 16, 32, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 32, 1>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U16x1x32_LD_N { + using BlockShape = Shape<_1, _32>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockLoad<2, 16, 1, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 1, 2>{}(baseoffset, width, height, pitch, coord); +#else + 
CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U16x2x32_LD_N { + using BlockShape = Shape<_2, _32>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockLoad<2, 16, 2, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 2, 2>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U16x4x32_LD_N { + using BlockShape = Shape<_4, _32>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockLoad<2, 16, 4, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 4, 2>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U16x8x32_LD_N { + using BlockShape = Shape<_8, _32>; + + template + CUTE_HOST_DEVICE static void 
copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockLoad<2, 16, 8, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 8, 2>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U16x16x32_LD_N { + using BlockShape = Shape<_16, _32>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockLoad<2, 16, 16, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 16, 2>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U16x32x32_LD_N { + using BlockShape = Shape<_32, _32>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); + 
detail::XeSubgroup2DBlockLoad<2, 16, 32, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + // __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v2( + detail::XeSubgroup2DBlockPrefetch<2, 16, 32, 2>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U16x16x16_LD_V { + using BlockShape = Shape<_16, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockTransform<2, 16, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 16, 1>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U16x32x16_LD_V { + using BlockShape = Shape<_32, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockTransform<2, 16, 32, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying 
to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 32, 1>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U16x16x32_LD_V { + using BlockShape = Shape<_16, _32>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockTransform<2, 16, 16, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 16, 2>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U16x32x32_LD_V { + using BlockShape = Shape<_32, _32>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockTransform<2, 16, 32, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { 
+#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 32, 2>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U16x16x8_LD_T { + using BlockShape = Shape<_8, _16>; + using inst_dtype = uint32_t; + + static constexpr bool is_transpose = true; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); /* message now agrees with the checked size; inst_dtype above is the separate 32-bit type */ + detail::XeSubgroup2DBlockTranspose<4, 4, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U16x16x16_LD_T { + using BlockShape = Shape<_16, _16>; + using inst_dtype = uint32_t; + + static constexpr bool is_transpose = true; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockTranspose<4, 8, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U16x1x16_ST_N { + using BlockShape = Shape<_1, _16>; + + template + CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, + int pitch, intel::coord_t coord, + const T *src) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + // static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockStore<2, 16, 1, 1>{}(baseoffset, width, height, pitch, coord, src); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U16x2x16_ST_N { + using BlockShape = 
Shape<_2, _16>; + + template + CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, + int pitch, intel::coord_t coord, + const T *src) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + // static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockStore<2, 16, 2, 1>{}(baseoffset, width, height, pitch, coord, src); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U16x4x16_ST_N { + using BlockShape = Shape<_4, _16>; + + template + CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, + int pitch, intel::coord_t coord, + const T *src) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + // static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockStore<2, 16, 4, 1>{}(baseoffset, width, height, pitch, coord, src); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U16x8x16_ST_N { + using BlockShape = Shape<_8, _16>; + + template + CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, + int pitch, intel::coord_t coord, + const T *src) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + // static_assert(sizeof(T) == 2, "Expected T to have size 2"); + detail::XeSubgroup2DBlockStore<2, 16, 8, 1>{}(baseoffset, width, height, pitch, coord, src); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; +} // end namespace cute diff --git a/include/cute/arch/copy_xe_U32.hpp b/include/cute/arch/copy_xe_U32.hpp new file mode 100644 index 0000000000..8802b0868e --- /dev/null +++ b/include/cute/arch/copy_xe_U32.hpp @@ -0,0 +1,496 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include +#include "cute/config.hpp" + +namespace cute +{ +struct XE_2D_U32x1x16_LD_N { + using BlockShape = Shape<_1, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 16, 1, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U32x2x16_LD_N { + using BlockShape = Shape<_2, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 16, 2, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U32x4x16_LD_N { + using BlockShape = Shape<_4, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 16, 4, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U32x8x16_LD_N { + using BlockShape = Shape<_8, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T 
to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 16, 8, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U32x16x16_LD_N { + using BlockShape = Shape<_16, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 16, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U32x32x16_LD_N { + using BlockShape = Shape<_32, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 16, 32, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_TF32x1x8_LD_N { + using BlockShape = Shape<_1, _8>; /* 1 row x 8 columns, matching the struct name and the <4, 8, 1, 1> load below; was Shape<_32, _16>, copied from XE_2D_U32x32x16_LD_N */ + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 8, 1, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_TF32x2x8_LD_N { + using BlockShape = Shape<_2, _8>; + using ValueShape = Shape<_1, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if 
defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 8, 2, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_TF32x4x8_LD_N { + using BlockShape = Shape<_4, _8>; + using ValueShape = Shape<_2, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 8, 4, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_TF32x8x8_LD_N { + using BlockShape = Shape<_8, _8>; + using ValueShape = Shape<_4, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 8, 8, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_TF32x16x8_LD_N { + using BlockShape = Shape<_16, _8>; + using ValueShape = Shape<_8, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 8, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_TF32x32x8_LD_N { + using BlockShape = Shape<_32, _8>; + using 
ValueShape = Shape<_16, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 8, 32, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_TF32x1x16_LD_N { + using BlockShape = Shape<_1, _16>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 8, 1, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_TF32x2x16_LD_N { + using BlockShape = Shape<_2, _16>; + using ValueShape = Shape<_1, _32>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 8, 2, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_TF32x4x16_LD_N { + using BlockShape = Shape<_4, _16>; + using ValueShape = Shape<_2, _32>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 8, 4, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + 
CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_TF32x8x16_LD_N { + using BlockShape = Shape<_8, _16>; + using ValueShape = Shape<_4, _32>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 8, 8, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_TF32x16x16_LD_N { + using BlockShape = Shape<_16, _16>; + using ValueShape = Shape<_8, _32>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 8, 16, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<4, 8, 16, 1>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_TF32x32x16_LD_N { + using BlockShape = Shape<_32, _16>; + using ValueShape = Shape<_16, _32>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockLoad<4, 8, 32, 2>{}(baseoffset, width, height, 
pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + + +struct XE_2D_U32x16x1_LD_T { + static constexpr bool is_transpose = true; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockTranspose<4, 1, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U32x16x2_LD_T { + using BlockShape = Shape<_2, _16>; + + static constexpr bool is_transpose = true; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockTranspose<4, 2, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U32x16x4_LD_T { + using BlockShape = Shape<_4, _16>; + + static constexpr bool is_transpose = true; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockTranspose<4, 4, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U32x16x8_LD_T { + using BlockShape = Shape<_8, _16>; + + static constexpr bool is_transpose = true; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, 
intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockTranspose<4, 8, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<4, 8, 16, 1>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U32x1x16_ST_N { + using BlockShape = Shape<_1, _16>; + + template + CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, + int pitch, intel::coord_t coord, + const T *src) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + // static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockStore<4, 16, 1, 1>{}(baseoffset, width, height, pitch, coord, src); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U32x2x16_ST_N { + using BlockShape = Shape<_2, _16>; + + template + CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, + int pitch, intel::coord_t coord, + const T *src) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockStore<4, 16, 2, 1>{}(baseoffset, width, height, pitch, coord, src); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U32x4x16_ST_N { + using BlockShape = Shape<_4, _16>; + + template + CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, + int pitch, intel::coord_t coord, + const T *src) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + 
static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockStore<4, 16, 4, 1>{}(baseoffset, width, height, pitch, coord, src); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U32x8x16_ST_N { + using BlockShape = Shape<_8, _16>; + + template + CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, + int pitch, intel::coord_t coord, + const T *src) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + // static_assert(sizeof(T) == 4, "Expected T to have size 4"); + detail::XeSubgroup2DBlockStore<4, 16, 8, 1>{}(baseoffset, width, height, pitch, coord, src); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +} // end namespace cute diff --git a/include/cute/arch/copy_xe_U4.hpp b/include/cute/arch/copy_xe_U4.hpp new file mode 100644 index 0000000000..f253dd4249 --- /dev/null +++ b/include/cute/arch/copy_xe_U4.hpp @@ -0,0 +1,173 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include +#include "cute/config.hpp" +#include "cute/pointer.hpp" + +namespace cute +{ + +struct XE_2D_U4x16x16_LD_T { + using BlockShape = Shape<_16, _16>; + using inst_dtype = uint32_t; + static constexpr bool is_transpose = true; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockTranspose<4, 2, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U4x32x16_LD_T { + using BlockShape = Shape<_32, _16>; + using inst_dtype = uint32_t; + static constexpr bool is_transpose = true; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockTranspose<4, 4, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + 
CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U4x32x64_LD_N { + using BlockShape = Shape<_32, _64>; + using inst_dtype = int8_t; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockLoad<1, 32, 32, 1>{}(baseoffset, width, height, pitch, coord, dst); + + // ================= shuffle begin ================= + // FIXME: the performance of shuffle algorithm here is too bad, we are working with + // compiler/IGC team to optimize it. + + static constexpr auto subgroup_size = 16; + static constexpr auto copy_W = decltype(size<1>(BlockShape{}))::value / subgroup_size; + static constexpr auto copy_H = decltype(size<0>(BlockShape{}))::value; + + auto sg = syclcompat::get_nd_item<1>().get_sub_group(); + auto id = int(ThreadIdxX()) % subgroup_size; + + cute::subbyte_iterator dst_iter(dst); + cute::array_subbyte dst_tmp{}; + + #pragma unroll + for (int cw = 0; cw < copy_W; cw++) { + auto remote_id = (id + cw * subgroup_size) / copy_W; + + // TODO: select 'ushort32' will cause compiling error, use 'ushort16' instead, why? 
+ intel::ushort16 remote_dst[2]; + remote_dst[0] = sycl::select_from_group(sg, *(reinterpret_cast(dst)), remote_id); + remote_dst[1] = sycl::select_from_group(sg, *((reinterpret_cast(dst)) + 1), remote_id); + + cute::subbyte_iterator remote_dst_iter(remote_dst); + + #pragma unroll + for (int row = 0; row < copy_H; row++) { + dst_tmp[row + cw * copy_H] = remote_dst_iter[row * copy_W + id % copy_W].get(); + } + } + + *reinterpret_cast(cute::raw_pointer_cast(dst_iter)) = *reinterpret_cast(cute::raw_pointer_cast(dst_tmp.begin())); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U4x16x64_LD_N { + using BlockShape = Shape<_16, _64>; + using inst_dtype = int8_t; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockLoad<1, 32, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); + + // ================= shuffle begin ================= + // FIXME: the performance of shuffle algorithm here is too bad, we are working with + // compiler/IGC team to optimize it. 
+ + static constexpr auto subgroup_size = 16; + static constexpr auto copy_W = decltype(size<1>(BlockShape{}))::value / subgroup_size; + static constexpr auto copy_H = decltype(size<0>(BlockShape{}))::value; + + auto sg = syclcompat::get_nd_item<1>().get_sub_group(); + auto id = int(ThreadIdxX()) % subgroup_size; + + cute::subbyte_iterator dst_iter(dst); + cute::array_subbyte dst_tmp{}; + + #pragma unroll + for (int cw = 0; cw < copy_W; cw++) { + auto remote_id = (id + cw * subgroup_size) / copy_W; + + intel::ushort16 remote_dst; + remote_dst = sycl::select_from_group(sg, *(reinterpret_cast(dst)), remote_id); + + cute::subbyte_iterator remote_dst_iter(&remote_dst); + + + #pragma unroll + for (int row = 0; row < copy_H; row++) { + dst_tmp[row + cw * copy_H] = remote_dst_iter[row * copy_W + id % copy_W].get(); + } + } + + *reinterpret_cast(cute::raw_pointer_cast(dst_iter)) = *reinterpret_cast(cute::raw_pointer_cast(dst_tmp.begin())); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +} // end namespace cute diff --git a/include/cute/arch/xe_copy_8B.hpp b/include/cute/arch/copy_xe_U64.hpp similarity index 57% rename from include/cute/arch/xe_copy_8B.hpp rename to include/cute/arch/copy_xe_U64.hpp index 987d8a98f6..49a984a789 100644 --- a/include/cute/arch/xe_copy_8B.hpp +++ b/include/cute/arch/copy_xe_U64.hpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -30,62 +30,9 @@ **************************************************************************************************/ #pragma once -#include -#include -#include +#include +#include "cute/config.hpp" -// 64bits No transform Transpose -SYCL_DEVICE_BUILTIN( - cute::intel::ulong __builtin_IB_subgroup_block_read_flat_transpose_u64_k1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ulong2 __builtin_IB_subgroup_block_read_flat_transpose_u64_k2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ulong4 __builtin_IB_subgroup_block_read_flat_transpose_u64_k4( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); - - -#if defined(CUTE_ARCH_XE_BUILTIN_ENABLED) -namespace cute::detail -{ -template<> -struct XeSubgroup2DBlockTranspose<8, 1, 8, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u64_k1( - reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockTranspose<8, 2, 8, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u64_k2( - reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct 
XeSubgroup2DBlockTranspose<8, 4, 8, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u64_k4( - reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; -} -#endif namespace cute { struct XE_2D_U64x8x1_LD_T { @@ -95,7 +42,7 @@ struct XE_2D_U64x8x1_LD_T { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 8, "Expected T to have size 8"); detail::XeSubgroup2DBlockTranspose<8, 1, 8, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -111,7 +58,7 @@ struct XE_2D_U64x8x2_LD_T { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 8, "Expected T to have size 8"); detail::XeSubgroup2DBlockTranspose<8, 2, 8, 1>{}(baseoffset, width, height, pitch, coord, dst); #else @@ -127,7 +74,7 @@ struct XE_2D_U64x8x4_LD_T { CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, int height, int pitch, intel::coord_t coord, T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) +#if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 8, "Expected T to have size 8"); detail::XeSubgroup2DBlockTranspose<8, 4, 8, 1>{}(baseoffset, width, height, pitch, coord, dst); #else diff --git a/include/cute/arch/copy_xe_U8.hpp b/include/cute/arch/copy_xe_U8.hpp new file mode 100644 index 0000000000..f3a2d574ab --- /dev/null +++ b/include/cute/arch/copy_xe_U8.hpp @@ -0,0 +1,529 @@ 
+/*************************************************************************************************** + * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include /* NOTE(review): the header name is missing after this #include - restore the intended target before building */ +#include "cute/config.hpp" + +namespace cute +{ +struct XE_2D_U8x1x32_LD_N { /* 2D block load of 1-byte elements, BlockShape 1x32; PREFETCH issues the matching prefetch */ + using BlockShape = Shape<_1, _32>; + using inst_dtype = int8_t; + + template <class T> + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockLoad<1, 32, 1, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<1, 32, 1, 1>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U8x2x32_LD_N { /* 2D block load of 1-byte elements, BlockShape 2x32; PREFETCH issues the matching prefetch */ + using BlockShape = Shape<_2, _32>; + using inst_dtype = int8_t; + + template <class T> + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockLoad<1, 32, 2, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<1, 32, 2, 1>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); 
+#endif + } + }; +}; + +struct XE_2D_U8x2x32_ST_N { /* 2D block store of 1-byte elements, BlockShape 2x32 */ + using BlockShape = Shape<_2, _32>; + + template <class T> + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *src) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockStore<2, 16, 2, 1>{}(baseoffset, width, height, pitch, coord, src); /* NOTE(review): T is 1 byte but the store is instantiated as <2, 16, 2, 1> - presumably the 2x32 byte tile is written as 2x16 two-byte elements; confirm */ +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block stores on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U8x4x32_LD_N { /* 2D block load of 1-byte elements, BlockShape 4x32; PREFETCH issues the matching prefetch */ + using BlockShape = Shape<_4, _32>; + + template <class T> + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockLoad<1, 32, 4, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<1, 32, 4, 1>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U8x8x32_LD_N { /* 2D block load of 1-byte elements, BlockShape 8x32; PREFETCH issues the matching prefetch */ + using BlockShape = Shape<_8, _32>; + + template <class T> + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockLoad<1, 32, 8, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void 
*baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<1, 32, 8, 1>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U8x16x32_LD_N { /* 2D block load of 1-byte elements, BlockShape 16x32; PREFETCH issues the matching prefetch */ + using BlockShape = Shape<_16, _32>; + + template <class T> + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockLoad<1, 32, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 16, 1>{}(baseoffset, width, height, pitch, coord); /* NOTE(review): prefetch uses <2, 16, 16, 1> while the load uses <1, 32, 16, 1> - presumably the same byte footprint expressed as two-byte elements; confirm */ +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U8x32x32_LD_N { /* 2D block load of 1-byte elements, BlockShape 32x32; no PREFETCH member is provided */ + using BlockShape = Shape<_32, _32>; + + template <class T> + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockLoad<1, 32, 32, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U8x1x64_LD_N { /* 2D block load of 1-byte elements, BlockShape 1x64; PREFETCH issues the matching prefetch */ + using BlockShape = Shape<_1, _64>; + + template <class T> + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + 
static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockLoad<1, 32, 1, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<1, 32, 1, 2>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U8x2x64_LD_N { /* 2D block load of 1-byte elements, BlockShape 2x64; PREFETCH issues the matching prefetch */ + using BlockShape = Shape<_2, _64>; + + template <class T> + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockLoad<1, 32, 2, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<1, 32, 2, 2>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U8x4x64_LD_N { /* 2D block load of 1-byte elements, BlockShape 4x64; PREFETCH issues the matching prefetch */ + using BlockShape = Shape<_4, _64>; + + template <class T> + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockLoad<1, 32, 4, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads 
on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<1, 32, 4, 2>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U8x8x64_LD_N { /* 2D block load of 1-byte elements, BlockShape 8x64; PREFETCH issues the matching prefetch */ + using BlockShape = Shape<_8, _64>; + + template <class T> + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockLoad<1, 32, 8, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<1, 32, 8, 2>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U8x16x64_LD_N { /* 2D block load of 1-byte elements, BlockShape 16x64; PREFETCH issues the matching prefetch */ + using BlockShape = Shape<_16, _64>; + + template <class T> + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockLoad<1, 32, 16, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if 
defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 16, 2>{}(baseoffset, width, height, pitch, coord); /* NOTE(review): element size 2 / width 16 here versus <1, 32, ...> in the paired load - presumably the same byte footprint; confirm */ +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U8x32x64_LD_N { /* 2D block load of 1-byte elements, BlockShape 32x64; PREFETCH issues the matching prefetch */ + using BlockShape = Shape<_32, _64>; + + template <class T> + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockLoad<1, 32, 32, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<2, 16, 32, 2>{}(baseoffset, width, height, pitch, coord); /* NOTE(review): prefetch uses <2, 16, 32, 2> while the load uses <1, 32, 32, 2> - presumably the same byte footprint expressed as two-byte elements; confirm */ +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on non-Xe hardware"); +#endif + } + }; +}; + + + +struct XE_2D_U8x32x16_LD_V { /* 2D block load of 1-byte elements through detail::XeSubgroup2DBlockTransform, BlockShape 32x16; PREFETCH issues the matching prefetch */ + using BlockShape = Shape<_32, _16>; + + template <class T> + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockTransform<1, 16, 32, 1>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } + + struct PREFETCH { + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, + intel::coord_t coord) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + detail::XeSubgroup2DBlockPrefetch<1, 16, 32, 1>{}(baseoffset, width, height, pitch, coord); +#else + CUTE_INVALID_CONTROL_PATH( + "Trying to use block prefetch on 
non-Xe hardware"); +#endif + } + }; +}; + +struct XE_2D_U8x32x32_LD_V { /* 2D block load of 1-byte elements through detail::XeSubgroup2DBlockTransform, BlockShape 32x32 */ + using BlockShape = Shape<_32, _32>; + + template <class T> + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockTransform<1, 16, 32, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U8x32x64_LD_V { /* 2D block load of 1-byte elements through detail::XeSubgroup2DBlockTransform, BlockShape 32x64 */ + using BlockShape = Shape<_32, _64>; + + template <class T> + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockTransform<1, 16, 32, 4>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U8x1x16_ST_N { /* 2D block store of 1-byte elements, BlockShape 1x16 */ + using BlockShape = Shape<_1, _16>; + + template <class T> + CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, + int pitch, intel::coord_t coord, + const T *src) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockStore<1, 16, 1, 1>{}(baseoffset, width, height, pitch, coord, src); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block stores on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U8x2x16_ST_N { /* 2D block store of 1-byte elements, BlockShape 2x16 */ + using BlockShape = Shape<_2, _16>; + + template <class T> + CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, + int pitch, intel::coord_t coord, + const T *src) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockStore<1, 16, 2, 1>{}(baseoffset, width, height, pitch, coord, src); +#else + 
CUTE_INVALID_CONTROL_PATH("Trying to use block stores on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U8x4x16_ST_N { /* 2D block store of 1-byte elements, BlockShape 4x16 */ + using BlockShape = Shape<_4, _16>; + + template <class T> + CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, + int pitch, intel::coord_t coord, + const T *src) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockStore<1, 16, 4, 1>{}(baseoffset, width, height, pitch, coord, src); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block stores on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U8x8x16_ST_N { /* 2D block store of 1-byte elements via XeSubgroup2DBlockStore<1, 16, 8, 1>; NOTE(review): no BlockShape alias is declared here, unlike the smaller stores - confirm that is intended */ + template <class T> + CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, + int pitch, intel::coord_t coord, + const T *src) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockStore<1, 16, 8, 1>{}(baseoffset, width, height, pitch, coord, src); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block stores on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_U8x8x32_ST_N { /* 2D block store of 1-byte elements via XeSubgroup2DBlockStore<1, 16, 8, 2>; NOTE(review): no BlockShape alias is declared here either - confirm that is intended */ + template <class T> + CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, + int pitch, intel::coord_t coord, + const T *src) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockStore<1, 16, 8, 2>{}(baseoffset, width, height, pitch, coord, src); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block stores on non-Xe hardware"); +#endif + } +}; +} // end namespace cute diff --git a/include/cute/arch/copy_xe_builtin.hpp b/include/cute/arch/copy_xe_builtin.hpp new file mode 100644 index 0000000000..a6404475eb --- /dev/null +++ b/include/cute/arch/copy_xe_builtin.hpp @@ -0,0 +1,1409 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include /* NOTE(review): the header name is missing after this #include - restore the intended target before building */ +#include "cute/config.hpp" + +#ifdef __SYCL_DEVICE_ONLY__ +#define SYCL_DEVICE_BUILTIN(x) SYCL_EXTERNAL extern "C" x +#else +#define SYCL_DEVICE_BUILTIN(x) \ + inline x { \ + CUTE_INVALID_CONTROL_PATH( \ + "Attempting to use a device built-in in host code."); \ + } +#endif + +#ifdef __SYCL_DEVICE_ONLY__ +#define SYCL_DEVICE_OCL(x) SYCL_EXTERNAL x +#else +#define SYCL_DEVICE_OCL(x) \ + inline x { \ + CUTE_INVALID_CONTROL_PATH( \ + "Attempting to use a device built-in in host code."); \ + } +#endif + + +#undef __global +#define __global __attribute__((opencl_global)) + + +#if defined(__SYCL_DEVICE_ONLY__) && defined(SYCL_INTEL_TARGET) +#define CUTE_ARCH_XE_COPY_ENABLED /* NOTE(review): the copy wrappers test CUTE_ARCH_COPY_XE_ENABLED (COPY before XE); confirm which spelling is intended and where the tested macro is defined */ +#endif + +namespace cute::detail +{ /* Primary templates: each one fails a static_assert, so only explicitly specialized configurations are usable. NOTE(review): the four integer parameters are read at call sites as element size in bytes, block width, block height, block count - confirm names and order against the specializations */ +template <int ElementSize, int BlockWidth, int BlockHeight, int BlockCount> +struct XeSubgroup2DBlockPrefetch { + static_assert(dependent_false<>, "Unsupported 2D Block Prefetch Configuration."); +}; +template <int ElementSize, int BlockWidth, int BlockHeight, int BlockCount> +struct XeSubgroup2DBlockLoad { + static_assert(dependent_false<>, "Unsupported 2D Block Load Configuration."); +}; +template <int ElementSize, int BlockWidth, int BlockHeight, int BlockCount> +struct XeSubgroup2DBlockTransform { + static_assert(dependent_false<>, "Unsupported 2D Block Transform Configuration."); +}; +template <int ElementSize, int BlockWidth, int BlockHeight, int BlockCount> +struct XeSubgroup2DBlockTranspose { + static_assert(dependent_false<>, "Unsupported 2D Block Transpose Configuration."); +}; +template <int ElementSize, int BlockWidth, int BlockHeight, int BlockCount> +struct XeSubgroup2DBlockStore { + static_assert(dependent_false<>, "Unsupported 2D Block Store Configuration."); +}; +} + +enum class CacheControl { + kDefault = 0, + kL1UC_L3UC = 1, // Override to L1 uncached and L3 uncached + kL1UC_L3C = 2, // Override to L1 uncached and L3 cached + kL1C_L3UC = 3, // Override to L1 cached and L3 uncached + kL1C_L3C = 4, // Override to L1 cached and L3 cached + kL1S_L3UC = 5, // Override to L1 streaming load and L3 uncached + kL1S_L3C = 6, // Override to L1 streaming load and L3 cached + kL1IAR_L3C = 7, // Override to L1 invalidate-after-read, and L3 cached +}; + +// 8bits No 
transform No transpose +SYCL_DEVICE_BUILTIN(cute::intel::ushort __builtin_IB_subgroup_block_read_flat_u8_m1k32v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort2 __builtin_IB_subgroup_block_read_flat_u8_m2k32v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort4 __builtin_IB_subgroup_block_read_flat_u8_m4k32v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort8 __builtin_IB_subgroup_block_read_flat_u8_m8k32v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort16 __builtin_IB_subgroup_block_read_flat_u8_m16k32v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort32 __builtin_IB_subgroup_block_read_flat_u8_m32k32v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); + +SYCL_DEVICE_BUILTIN( + cute::intel::ushort2 __builtin_IB_subgroup_block_read_flat_u8_m1k32v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort4 __builtin_IB_subgroup_block_read_flat_u8_m2k32v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort8 __builtin_IB_subgroup_block_read_flat_u8_m4k32v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort16 
__builtin_IB_subgroup_block_read_flat_u8_m8k32v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort32 __builtin_IB_subgroup_block_read_flat_u8_m16k32v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort64 __builtin_IB_subgroup_block_read_flat_u8_m32k32v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); + + +// 8bits VNNI transform No transpose +SYCL_DEVICE_BUILTIN( + cute::intel::uint8 __builtin_IB_subgroup_block_read_flat_transform_u8_k32( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint16 __builtin_IB_subgroup_block_read_flat_transform_u8_k32v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint32 __builtin_IB_subgroup_block_read_flat_transform_u8_k32v4( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); + +// 8bits No transform No transpose +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u8_m1k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uchar data)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u8_m2k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uchar2 data)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u8_m4k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, 
cute::intel::uchar4)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u8_m8k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uchar8)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u8_m8k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uchar8)); + +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u8_m1k32v1( + long baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u8_m2k32v1( + long baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u8_m4k32v1( + long baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u8_m8k32v1( + long baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); +// // 2D prefetch +SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_8b_1r32x2c( + __global void* base_address, int width, int height, int pitch, + cute::intel::coord_t coord)); +SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_8b_2r32x2c( + __global void* base_address, int width, int height, int pitch, + cute::intel::coord_t coord)); +SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_8b_4r32x2c( + __global void* base_address, int width, int height, int pitch, + cute::intel::coord_t coord)); +SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_8b_8r32x2c( + __global void* base_address, int width, int 
height, int pitch, + cute::intel::coord_t coord)); +SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_8b_32r16x1c( + __global void* base_address, int width, int height, int pitch, + cute::intel::coord_t coord)); + +SYCL_DEVICE_BUILTIN(cute::intel::ushort16 intel_subgroup_block_read_u16_m8k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); + +SYCL_DEVICE_BUILTIN(cute::intel::int8 intel_subgroup_block_read_transform_u16_k16( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); + +// U16 prefetch +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, 
cute::intel::coord_t coord, enum CacheControl cache_control)); + +// 16 bits No transform No transpose +SYCL_DEVICE_BUILTIN(cute::intel::ushort __builtin_IB_subgroup_block_read_flat_u16_m1k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort2 __builtin_IB_subgroup_block_read_flat_u16_m2k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort4 __builtin_IB_subgroup_block_read_flat_u16_m4k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort8 __builtin_IB_subgroup_block_read_flat_u16_m8k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort16 __builtin_IB_subgroup_block_read_flat_u16_m16k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort32 __builtin_IB_subgroup_block_read_flat_u16_m32k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); + +SYCL_DEVICE_BUILTIN( + cute::intel::ushort2 __builtin_IB_subgroup_block_read_flat_u16_m1k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort4 __builtin_IB_subgroup_block_read_flat_u16_m2k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort8 __builtin_IB_subgroup_block_read_flat_u16_m4k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int 
pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort16 __builtin_IB_subgroup_block_read_flat_u16_m8k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort32 __builtin_IB_subgroup_block_read_flat_u16_m16k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort64 __builtin_IB_subgroup_block_read_flat_u16_m32k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); + +// 16bits VNNI transform No transpose +SYCL_DEVICE_BUILTIN( + cute::intel::uint8 __builtin_IB_subgroup_block_read_flat_transform_u16_k16( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint16 __builtin_IB_subgroup_block_read_flat_transform_u16_k32( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint16 __builtin_IB_subgroup_block_read_flat_transform_u16_k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint32 __builtin_IB_subgroup_block_read_flat_transform_u16_k32v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); + +// 16bits +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u16_m1k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, cute::intel::ushort data)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u16_m2k16v1( + intptr_t baseoffset, int width_minus_one, int 
height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, cute::intel::ushort2 data)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u16_m4k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, cute::intel::ushort4 data)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u16_m8k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, cute::intel::ushort8 data)); + +// 2D prefetch +SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_16b_1r16x2c( + __global void* base_address, int width, int height, int pitch, + cute::intel::coord_t coord)); +SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_16b_2r16x2c( + __global void* base_address, int width, int height, int pitch, + cute::intel::coord_t coord)); +SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_16b_4r16x2c( + __global void* base_address, int width, int height, int pitch, + cute::intel::coord_t coord)); + + +// 32bits specific for tf32 No transform No transpose +SYCL_DEVICE_BUILTIN( + cute::intel::uint __builtin_IB_subgroup_block_read_flat_u32_m1k8v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint __builtin_IB_subgroup_block_read_flat_u32_m2k8v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint2 __builtin_IB_subgroup_block_read_flat_u32_m4k8v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint4 __builtin_IB_subgroup_block_read_flat_u32_m8k8v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + 
cute::intel::uint8 __builtin_IB_subgroup_block_read_flat_u32_m16k8v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint16 __builtin_IB_subgroup_block_read_flat_u32_m32k8v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); + +SYCL_DEVICE_BUILTIN( + cute::intel::uint2 __builtin_IB_subgroup_block_read_flat_u32_m1k8v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint2 __builtin_IB_subgroup_block_read_flat_u32_m2k8v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint4 __builtin_IB_subgroup_block_read_flat_u32_m4k8v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint8 __builtin_IB_subgroup_block_read_flat_u32_m8k8v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint16 __builtin_IB_subgroup_block_read_flat_u32_m16k8v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint32 __builtin_IB_subgroup_block_read_flat_u32_m32k8v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); + +// 32bits No transform No transpose +SYCL_DEVICE_BUILTIN(cute::intel::uint __builtin_IB_subgroup_block_read_flat_u32_m1k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint2 
__builtin_IB_subgroup_block_read_flat_u32_m2k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint4 __builtin_IB_subgroup_block_read_flat_u32_m4k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint8 __builtin_IB_subgroup_block_read_flat_u32_m8k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint16 __builtin_IB_subgroup_block_read_flat_u32_m16k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint32 __builtin_IB_subgroup_block_read_flat_u32_m32k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); + +// 32bits No transform Transpose +SYCL_DEVICE_BUILTIN(cute::intel::uint __builtin_IB_subgroup_block_read_flat_transpose_u32_k1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint2 __builtin_IB_subgroup_block_read_flat_transpose_u32_k2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint4 __builtin_IB_subgroup_block_read_flat_transpose_u32_k4( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint8 __builtin_IB_subgroup_block_read_flat_transpose_u32_k8( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); + +// 32bits +SYCL_DEVICE_BUILTIN(void 
__builtin_IB_subgroup_block_write_flat_u32_m1k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uint data)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u32_m2k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uint2 data)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u32_m4k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uint4 data)); +SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u32_m8k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uint8 data)); + +SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_32b_16r8x1c( + __global void* base_address, int width, int height, int pitch, + cute::intel::coord_t coord)); + +// 64bits No transform Transpose +SYCL_DEVICE_BUILTIN( + cute::intel::ulong __builtin_IB_subgroup_block_read_flat_transpose_u64_k1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ulong2 __builtin_IB_subgroup_block_read_flat_transpose_u64_k2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::ulong4 __builtin_IB_subgroup_block_read_flat_transpose_u64_k4( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); + +namespace cute::detail +{ +template<> +struct XeSubgroup2DBlockLoad<1, 32, 1, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + 
*reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m1k32v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<1, 32, 2, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m2k32v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<1, 32, 4, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m4k32v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<1, 32, 8, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m8k32v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<1, 32, 16, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m16k32v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<1, 32, 32, 1> { + template + CUTE_HOST_DEVICE void + 
operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m32k32v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<1, 32, 1, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m1k32v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<1, 32, 2, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m2k32v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<1, 32, 4, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m4k32v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<1, 32, 8, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m8k32v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, 
memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<1, 32, 16, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m16k32v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<1, 32, 32, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m32k32v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTransform<1, 16, 32, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transform_u8_k32( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTransform<1, 16, 32, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transform_u8_k32v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTransform<1, 16, 32, 4> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, 
T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transform_u8_k32v4( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockStore<1, 16, 1, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u8_m1k16v1( + (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uchar *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockStore<1, 16, 2, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u8_m2k16v1( + (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uchar2 *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockStore<1, 16, 4, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u8_m4k16v1( + (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uchar4 *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockStore<1, 16, 8, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u8_m8k16v1( + (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uchar8 *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockStore<1, 16, 8, 2> { + template + 
CUTE_HOST_DEVICE void + operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u8_m8k16v2( + (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uchar8 *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 1, 1> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u8_m1k32v1( + (intptr_t)srcBasePointer, memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 2, 1> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u8_m2k32v1( + (intptr_t)srcBasePointer, memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); + } +}; +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 4, 1> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u8_m4k32v1( + (intptr_t)srcBasePointer, memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 8, 1> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + /* As with the other __builtin_IB prefetch calls here, width/height/pitch are passed minus one. */ + __builtin_IB_subgroup_block_read_prefetch_u8_m8k32v1( + (intptr_t)srcBasePointer, memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 1, 2> { +
CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_8b_1r32x2c( + (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 2, 2> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_8b_2r32x2c( + (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 4, 2> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_8b_4r32x2c( + (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 8, 2> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_8b_8r32x2c( + (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 16, 32, 1> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_8b_32r16x1c( + (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 1, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + 
*reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m1k16v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 2, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m2k16v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 4, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m4k16v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 8, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m8k16v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 16, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m16k16v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 32, 1> { + template + CUTE_HOST_DEVICE void + 
operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m32k16v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 1, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m1k16v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 2, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m2k16v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 4, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m4k16v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 8, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m8k16v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 
1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 16, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m16k16v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<2, 16, 32, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m32k16v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTransform<2, 16, 16, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transform_u16_k16( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTransform<2, 16, 32, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transform_u16_k32( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTransform<2, 16, 16, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t 
coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transform_u16_k16v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTransform<2, 16, 32, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transform_u16_k32v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockStore<2, 16, 1, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + /* Cast through cute::intel::ushort, the type the write builtin is declared with, rather than a platform-provided global ushort. */ + __builtin_IB_subgroup_block_write_flat_u16_m1k16v1( + (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::ushort *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockStore<2, 16, 2, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u16_m2k16v1( + (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::ushort2 *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockStore<2, 16, 4, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u16_m4k16v1( + (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::ushort4 *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockStore<2, 16,
8, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u16_m8k16v1( + (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::ushort8 *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 8, 1> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 16, 1> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 32, 1> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v1( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 1, 2> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_16b_1r16x2c( + (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 
16, 2, 2> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_16b_2r16x2c( + (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 4, 2> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_16b_4r16x2c( + (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 8, 2> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 16, 2> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 32, 2> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 16, 1, 1> { + template + CUTE_HOST_DEVICE void + operator()(const 
void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m1k16v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 16, 2, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m2k16v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 16, 4, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m4k16v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 16, 8, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m8k16v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 16, 16, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m16k16v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, 
memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 16, 32, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m32k16v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 1, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m1k8v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 2, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m2k8v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 4, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m4k8v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 8, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t 
coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m8k8v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 16, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m16k8v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 32, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m32k8v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 1, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m1k8v2( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 2, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m2k8v2( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 
4, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m4k8v2( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 8, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m8k8v2( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 16, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m16k8v2( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockLoad<4, 8, 32, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m32k8v2( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTranspose<4, 1, 16, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = 
__builtin_IB_subgroup_block_read_flat_transpose_u32_k1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTranspose<4, 2, 16, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u32_k2( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTranspose<4, 4, 16, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u32_k4( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTranspose<4, 8, 16, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u32_k8( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockStore<4, 16, 1, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u32_m1k16v1( + reinterpret_cast(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(uint *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockStore<4, 16, 2, 1> { + template + 
CUTE_HOST_DEVICE void + operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u32_m2k16v1( + reinterpret_cast(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uint2 *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockStore<4, 16, 4, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u32_m4k16v1( + reinterpret_cast(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uint4 *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockStore<4, 16, 8, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __builtin_IB_subgroup_block_write_flat_u32_m8k16v1( + reinterpret_cast(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uint8 *)(srcPointer)); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<4, 8, 16, 1> { + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_32b_16r8x1c( + (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTranspose<8, 1, 8, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u64_k1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, 
memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTranspose<8, 2, 8, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u64_k2( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTranspose<8, 4, 8, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u64_k4( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +} // namespace cute::detail diff --git a/include/cute/arch/copy_xe_spirv.hpp b/include/cute/arch/copy_xe_spirv.hpp new file mode 100644 index 0000000000..0a59c47a26 --- /dev/null +++ b/include/cute/arch/copy_xe_spirv.hpp @@ -0,0 +1,134 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include +#include "cute/config.hpp" + +// TODO(Codeplay): This builtin is not available on SPIRV +SYCL_EXTERNAL extern "C" +cute::intel::uint2 __builtin_IB_subgroup_block_read_flat_transpose_u32_k2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord); + +// SPIRV copy definitions +SYCL_EXTERNAL __attribute__((convergent)) void __spirv_Subgroup2DBlockLoadINTEL( + int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, + const void* src_base_pointer, int memory_width, int memory_height, + int memory_pitch, cute::intel::coord_t coordinate, void* dst_pointer); +SYCL_EXTERNAL __attribute__((convergent)) void __spirv_Subgroup2DBlockLoadTransformINTEL( + int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, + const void* src_base_pointer, int memory_width, int memory_height, + int 
memory_pitch, cute::intel::coord_t coordinate, void* dst_pointer); +SYCL_EXTERNAL __attribute__((convergent)) void __spirv_Subgroup2DBlockLoadTransposeINTEL( + int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, + const void* src_base_pointer, int memory_width, int memory_height, + int memory_pitch, cute::intel::coord_t coordinate, void* dst_pointer); +SYCL_EXTERNAL __attribute__((convergent)) void __spirv_Subgroup2DBlockStoreINTEL( + int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, + void* src_pointer, const void* dst_base_pointer, int memory_width, + int memory_height, int memory_pitch, cute::intel::coord_t coordinate); +SYCL_EXTERNAL __attribute__((convergent)) void __spirv_Subgroup2DBlockPrefetchINTEL( + int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, + const void* src_base_pointer, int memory_width, int memory_height, + int memory_pitch, cute::intel::coord_t coordinate); + +namespace cute::detail { + +template +struct XeSubgroup2DBlockLoad { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + __spirv_Subgroup2DBlockLoadINTEL(ElementSize, BlockWidth, BlockHeight, BlockCount, + srcBasePointer, memoryWidth, memoryHeight, memoryPitch, coordinate, + static_cast(dstPointer)); + } +}; + +template +struct XeSubgroup2DBlockTransform { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + __spirv_Subgroup2DBlockLoadTransformINTEL(ElementSize, BlockWidth, BlockHeight, BlockCount, + srcBasePointer, memoryWidth, memoryHeight, memoryPitch, coordinate, + static_cast(dstPointer)); + } +}; + +template +struct XeSubgroup2DBlockTranspose { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + 
cute::intel::coord_t coordinate, T* dstPointer) { + __spirv_Subgroup2DBlockLoadTransposeINTEL(ElementSize, BlockWidth, BlockHeight, BlockCount, + srcBasePointer, memoryWidth, memoryHeight, memoryPitch, coordinate, + static_cast(dstPointer)); + } +}; + +template +struct XeSubgroup2DBlockPrefetch { + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __spirv_Subgroup2DBlockPrefetchINTEL(ElementSize, BlockWidth, BlockHeight, BlockCount, + srcBasePointer, memoryWidth, memoryHeight, memoryPitch, coordinate); + } +}; + +template +struct XeSubgroup2DBlockStore { + template + CUTE_HOST_DEVICE + void operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* srcPointer) { + __spirv_Subgroup2DBlockStoreINTEL(ElementSize, BlockWidth, BlockHeight, BlockCount, + (void*)(srcPointer), dstBasePointer, memoryWidth, memoryHeight, memoryPitch, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockTranspose<4, 2, 16, 1> { + template + CUTE_HOST_DEVICE + void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u32_k2( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +} // namespace cute::detail end diff --git a/include/cute/arch/mma_xe.hpp b/include/cute/arch/mma_xe.hpp index 4755250fea..5b7371ab60 100644 --- a/include/cute/arch/mma_xe.hpp +++ b/include/cute/arch/mma_xe.hpp @@ -30,9 +30,19 @@ **************************************************************************************************/ #pragma once +#if defined(__SYCL_DEVICE_ONLY__) && defined(SYCL_INTEL_TARGET) +#define CUTE_ARCH_MMA_XE_ENABLED +#endif + +#if defined(CUTE_ARCH_MMA_XE_ENABLED) && 
((defined(__INTEL_LLVM_COMPILER) && (__INTEL_LLVM_COMPILER < 20250200)) || defined(CUTLASS_SYCL_BUILTIN_ENABLE)) +#include +#elif defined(CUTE_ARCH_MMA_XE_ENABLED) +#include +#endif + #include #include -#include +#include namespace cute { //MxNxK_D,A,B,C @@ -52,7 +62,7 @@ struct XE_8x16x16_F32BF16BF16F32_TT intel::int8 const& b, intel::float8 const& c) { -#if defined(CUTE_ARCH_XE_ENABLED) +#if defined(CUTE_ARCH_MMA_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x16_F32BF16BF16F32_TT on non-Xe hardware"); @@ -72,7 +82,7 @@ struct XE_4x16x16_F32BF16BF16F32_TT intel::int8 const& b, intel::float4 const& c) { -#if defined(CUTE_ARCH_XE_ENABLED) +#if defined(CUTE_ARCH_MMA_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x16_F32BF16BF16F32_TT on non-Xe hardware"); @@ -92,7 +102,7 @@ struct XE_2x16x16_F32BF16BF16F32_TT intel::int8 const& b, intel::float2 const& c) { -#if defined(CUTE_ARCH_XE_ENABLED) +#if defined(CUTE_ARCH_MMA_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x16_F32BF16BF16F32_TT on non-Xe hardware"); @@ -113,7 +123,7 @@ struct XE_1x16x16_F32BF16BF16F32_TT intel::int8 const& b, float const& c) { -#if defined(CUTE_ARCH_XE_ENABLED) +#if defined(CUTE_ARCH_MMA_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_1x16x16_F32BF16BF16F32_TT on non-Xe hardware"); @@ -134,7 +144,7 @@ struct XE_8x16x16_BF16BF16BF16BF16_TT intel::int8 const& b, intel::short8 const& c) { -#if defined(CUTE_ARCH_XE_ENABLED) +#if defined(CUTE_ARCH_MMA_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x16_BF16BF16BF16BF16_TT on non-PVC hardware"); @@ -154,7 +164,7 @@ struct XE_4x16x16_BF16BF16BF16BF16_TT 
intel::int8 const& b, intel::short4 const& c) { -#if defined(CUTE_ARCH_XE_ENABLED) +#if defined(CUTE_ARCH_MMA_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_4x16x16_BF16BF16BF16BF16_TT on non-PVC hardware"); @@ -174,7 +184,7 @@ struct XE_2x16x16_BF16BF16BF16BF16_TT intel::int8 const& b, intel::short2 const& c) { -#if defined(CUTE_ARCH_XE_ENABLED) +#if defined(CUTE_ARCH_MMA_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_2x16x16_BF16BF16BF16BF16_TT on non-PVC hardware"); @@ -194,7 +204,7 @@ struct XE_1x16x16_BF16BF16BF16BF16_TT intel::int8 const& b, short const& c) { -#if defined(CUTE_ARCH_XE_ENABLED) +#if defined(CUTE_ARCH_MMA_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_1x16x16_BF16BF16BF16BF16_TT on non-PVC hardware"); @@ -218,7 +228,7 @@ struct XE_8x16x16_F32F16F16F32_TT intel::int8 const& b, intel::float8 const& c) { -#if defined(CUTE_ARCH_XE_ENABLED) +#if defined(CUTE_ARCH_MMA_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x16_F32F16F16F32_TT on non-Xe hardware"); @@ -239,7 +249,7 @@ struct XE_4x16x16_F32F16F16F32_TT intel::int8 const& b, intel::float4 const& c) { -#if defined(CUTE_ARCH_XE_ENABLED) +#if defined(CUTE_ARCH_MMA_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_4x16x16_F32F16F16F32_TT on non-Xe hardware"); @@ -260,7 +270,7 @@ struct XE_2x16x16_F32F16F16F32_TT intel::int8 const& b, intel::float2 const& c) { -#if defined(CUTE_ARCH_XE_ENABLED) +#if defined(CUTE_ARCH_MMA_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_2x16x16_F32F16F16F32_TT on non-Xe hardware"); @@ -281,7 +291,7 @@ struct 
XE_1x16x16_F32F16F16F32_TT intel::int8 const& b, float const& c) { -#if defined(CUTE_ARCH_XE_ENABLED) +#if defined(CUTE_ARCH_MMA_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_1x16x16_F32F16F16F32_TT on non-Xe hardware"); @@ -302,7 +312,7 @@ struct XE_8x16x16_F16F16F16F16_TT intel::int8 const& b, intel::half8 const& c) { -#if defined(CUTE_ARCH_XE_ENABLED) +#if defined(CUTE_ARCH_MMA_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x16_F16F16F16F16_TT on non-PVC hardware"); @@ -323,7 +333,7 @@ struct XE_4x16x16_F16F16F16F16_TT intel::int8 const& b, intel::half4 const& c) { -#if defined(CUTE_ARCH_XE_ENABLED) +#if defined(CUTE_ARCH_MMA_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_4x16x16_F16F16F16F16_TT on non-PVC hardware"); @@ -344,7 +354,7 @@ struct XE_2x16x16_F16F16F16F16_TT intel::int8 const& b, sycl::half2 const& c) { -#if defined(CUTE_ARCH_XE_ENABLED) +#if defined(CUTE_ARCH_MMA_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_2x16x16_F16F16F16F16_TT on non-PVC hardware"); @@ -365,7 +375,7 @@ struct XE_1x16x16_F16F16F16F16_TT intel::int8 const& b, half_t const& c) { -#if defined(CUTE_ARCH_XE_ENABLED) +#if defined(CUTE_ARCH_MMA_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_1x16x16_F16F16F16F16_TT on non-PVC hardware"); @@ -389,7 +399,7 @@ struct XE_8x16x32_S32S8S8S32_TT intel::int8 const& b, intel::int8 const& c) { -#if defined(CUTE_ARCH_XE_ENABLED) +#if defined(CUTE_ARCH_MMA_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x32_S32S8S8S32_TT on non-Xe hardware"); @@ -410,7 +420,7 @@ struct 
XE_4x16x32_S32S8S8S32_TT intel::int8 const& b, intel::int4 const& c) { -#if defined(CUTE_ARCH_XE_ENABLED) +#if defined(CUTE_ARCH_MMA_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_4x16x32_S32S8S8S32_TT on non-Xe hardware"); @@ -431,7 +441,7 @@ struct XE_2x16x32_S32S8S8S32_TT intel::int8 const& b, intel::int2 const& c) { -#if defined(CUTE_ARCH_XE_ENABLED) +#if defined(CUTE_ARCH_MMA_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_2x16x32_S32S8S8S32_TT on non-Xe hardware"); @@ -452,7 +462,7 @@ struct XE_1x16x32_S32S8S8S32_TT intel::int8 const& b, int const& c) { -#if defined(CUTE_ARCH_XE_ENABLED) +#if defined(CUTE_ARCH_MMA_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_1x16x32_S32S8S8S32_TT on non-Xe hardware"); @@ -473,7 +483,7 @@ struct XE_8x16x32_S32U8U8S32_TT intel::uint8 const& b, intel::int8 const& c) { -#if defined(CUTE_ARCH_XE_ENABLED) +#if defined(CUTE_ARCH_MMA_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x32_S32U8U8S32_TT on non-Xe hardware"); @@ -494,7 +504,7 @@ struct XE_4x16x32_S32U8U8S32_TT intel::uint8 const& b, intel::int4 const& c) { -#if defined(CUTE_ARCH_XE_ENABLED) +#if defined(CUTE_ARCH_MMA_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_4x16x32_S32U8U8S32_TT on non-Xe hardware"); @@ -515,7 +525,7 @@ struct XE_2x16x32_S32U8U8S32_TT intel::uint8 const& b, intel::int2 const& c) { -#if defined(CUTE_ARCH_XE_ENABLED) +#if defined(CUTE_ARCH_MMA_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_2x16x32_S32U8U8S32_TT on non-Xe hardware"); @@ -536,7 +546,7 @@ struct XE_1x16x32_S32U8U8S32_TT 
intel::uint8 const& b, int const& c) { -#if defined(CUTE_ARCH_XE_ENABLED) +#if defined(CUTE_ARCH_MMA_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_1x16x32_S32U8U8S32_TT on non-Xe hardware"); @@ -557,7 +567,7 @@ struct XE_8x16x8_F32TF32TF32F32_TT intel::float8 const& b, intel::float8 const& c) { -#if defined(CUTE_ARCH_XE_ENABLED) +#if defined(CUTE_ARCH_MMA_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x8_F32TF32TF32F32_TT on non-Xe hardware"); @@ -578,7 +588,7 @@ struct XE_4x16x8_F32TF32TF32F32_TT intel::float8 const& b, intel::float4 const& c) { -#if defined(CUTE_ARCH_XE_ENABLED) +#if defined(CUTE_ARCH_MMA_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_4x16x8_F32TF32TF32F32_TT on non-Xe hardware"); @@ -599,7 +609,7 @@ struct XE_2x16x8_F32TF32TF32F32_TT intel::float8 const& b, intel::float2 const& c) { -#if defined(CUTE_ARCH_XE_ENABLED) +#if defined(CUTE_ARCH_MMA_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_2x16x8_F32TF32TF32F32_TT on non-Xe hardware"); @@ -620,7 +630,7 @@ struct XE_1x16x8_F32TF32TF32F32_TT intel::float8 const& b, float const& c) { -#if defined(CUTE_ARCH_XE_ENABLED) +#if defined(CUTE_ARCH_MMA_XE_ENABLED) d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); #else CUTE_INVALID_CONTROL_PATH("Attempting to use XE_1x16x8_F32TF32TF32F32_TT on non-Xe hardware"); diff --git a/include/cute/arch/mma_xe_builtin.hpp b/include/cute/arch/mma_xe_builtin.hpp new file mode 100644 index 0000000000..856849cc49 --- /dev/null +++ b/include/cute/arch/mma_xe_builtin.hpp @@ -0,0 +1,140 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2025 Codeplay Software Ltd. 
All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once +#include + +// mma_bf16 with float accumulator: +SYCL_EXTERNAL cute::intel::float8 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short8 a, cute::intel::int8 b, cute::intel::float8 acc); +SYCL_EXTERNAL cute::intel::float4 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short4 a, cute::intel::int8 b, cute::intel::float4 acc); +SYCL_EXTERNAL cute::intel::float2 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short2 a, cute::intel::int8 b, cute::intel::float2 acc); +SYCL_EXTERNAL float intel_sub_group_bf16_bf16_matrix_mad_k16( short a, cute::intel::int8 b, float acc); +// mma_half with float accumulator: +SYCL_EXTERNAL cute::intel::float8 intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short8 a, cute::intel::int8 b, cute::intel::float8 acc); +SYCL_EXTERNAL cute::intel::float4 intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short4 a, cute::intel::int8 b, cute::intel::float4 acc); +SYCL_EXTERNAL cute::intel::float2 intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short2 a, cute::intel::int8 b, cute::intel::float2 acc); +SYCL_EXTERNAL float intel_sub_group_f16_f16_matrix_mad_k16( short a, cute::intel::int8 b, float acc); +// mma_s8 with int accumulator: +SYCL_EXTERNAL cute::intel::int8 intel_sub_group_i8_i8_matrix_mad_k32(cute::intel::short8 a, cute::intel::int8 b, cute::intel::int8 acc); +SYCL_EXTERNAL cute::intel::int4 intel_sub_group_i8_i8_matrix_mad_k32(cute::intel::short4 a, cute::intel::int8 b, cute::intel::int4 acc); +SYCL_EXTERNAL cute::intel::int2 intel_sub_group_i8_i8_matrix_mad_k32(cute::intel::short2 a, cute::intel::int8 b, cute::intel::int2 acc); +SYCL_EXTERNAL int intel_sub_group_i8_i8_matrix_mad_k32( short a, cute::intel::int8 b, int acc); +// mma_u8 with int accumulator: +SYCL_EXTERNAL cute::intel::int8 intel_sub_group_u8_u8_matrix_mad_k32(cute::intel::ushort8 a, cute::intel::uint8 b, cute::intel::int8 acc); +SYCL_EXTERNAL cute::intel::int4
intel_sub_group_u8_u8_matrix_mad_k32(cute::intel::ushort4 a, cute::intel::uint8 b, cute::intel::int4 acc); +SYCL_EXTERNAL cute::intel::int2 intel_sub_group_u8_u8_matrix_mad_k32(cute::intel::ushort2 a, cute::intel::uint8 b, cute::intel::int2 acc); +SYCL_EXTERNAL int intel_sub_group_u8_u8_matrix_mad_k32( ushort a, cute::intel::uint8 b, int acc); +// mma_tf32 +SYCL_EXTERNAL cute::intel::float8 intel_sub_group_tf32_tf32_matrix_mad_k8(cute::intel::float4 a, cute::intel::float8 b, cute::intel::float8 acc); +SYCL_EXTERNAL cute::intel::float4 intel_sub_group_tf32_tf32_matrix_mad_k8(cute::intel::float2 a, cute::intel::float8 b, cute::intel::float4 acc); +SYCL_EXTERNAL cute::intel::float2 intel_sub_group_tf32_tf32_matrix_mad_k8( float a, cute::intel::float8 b, cute::intel::float2 acc); +SYCL_EXTERNAL float intel_sub_group_tf32_tf32_matrix_mad_k8( float a, cute::intel::float8 b, float acc); +// mma_bfloat16 with bfloat16 accumulator: +SYCL_EXTERNAL cute::intel::short8 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short8 a, cute::intel::int8 b, cute::intel::short8 acc); +SYCL_EXTERNAL cute::intel::short4 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short4 a, cute::intel::int8 b, cute::intel::short4 acc); +SYCL_EXTERNAL cute::intel::short2 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short2 a, cute::intel::int8 b, cute::intel::short2 acc); +SYCL_EXTERNAL short intel_sub_group_bf16_bf16_matrix_mad_k16( short a, cute::intel::int8 b, short acc); +// mma_half with half accumulator: +SYCL_EXTERNAL cute::intel::half8 intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short8 a, cute::intel::int8 b, cute::intel::half8 acc); +SYCL_EXTERNAL cute::intel::half4 intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short4 a, cute::intel::int8 b, cute::intel::half4 acc); +SYCL_EXTERNAL cute::intel::half2 intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short2 a, cute::intel::int8 b, cute::intel::half2 acc); +SYCL_EXTERNAL sycl::half 
intel_sub_group_f16_f16_matrix_mad_k16( short a, cute::intel::int8 b, sycl::half acc); + +namespace cute::detail +{ + +template +struct XeSubgroupMatrixMultiplyAccumulate { + static_assert(dependent_false<>, "Unsupported MMA Configuration."); +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { + return intel_sub_group_bf16_bf16_matrix_mad_k16(a, b, c); + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { + return intel_sub_group_f16_f16_matrix_mad_k16(a, b, c); + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { + return intel_sub_group_bf16_bf16_matrix_mad_k16(a, b, c); + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { + return intel_sub_group_f16_f16_matrix_mad_k16(a, b, c); + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { + return intel_sub_group_i8_i8_matrix_mad_k32(a, b, c); + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { + return intel_sub_group_u8_u8_matrix_mad_k32(a, b, c); + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { + return intel_sub_group_tf32_tf32_matrix_mad_k8(a, b, c); + } +}; +} // namespace cute::detail end diff --git a/include/cute/arch/mma_xe_spirv.hpp b/include/cute/arch/mma_xe_spirv.hpp new file mode 100644 index 0000000000..0f32e51f3a --- /dev/null +++ b/include/cute/arch/mma_xe_spirv.hpp @@ -0,0 
+1,157 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once +#include + +SYCL_EXTERNAL cute::intel::float8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short8, cute::intel::int8, cute::intel::float8, int32_t); +SYCL_EXTERNAL cute::intel::float4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short4, cute::intel::int8, cute::intel::float4, int32_t); +SYCL_EXTERNAL cute::intel::float2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short2, cute::intel::int8, cute::intel::float2, int32_t); +SYCL_EXTERNAL float __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, short, cute::intel::int8, float, int32_t); + +SYCL_EXTERNAL cute::intel::int8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short8, cute::intel::int8, cute::intel::int8, int32_t); +SYCL_EXTERNAL cute::intel::int4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short4, cute::intel::int8, cute::intel::int4, int32_t); +SYCL_EXTERNAL cute::intel::int2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short2, cute::intel::int8, cute::intel::int2, int32_t); +SYCL_EXTERNAL int __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, short, cute::intel::int8, int, int32_t); + +SYCL_EXTERNAL cute::intel::int8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::ushort8, cute::intel::uint8, cute::intel::int8, int32_t); +SYCL_EXTERNAL cute::intel::int4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::ushort4, cute::intel::uint8, cute::intel::int4, int32_t); +SYCL_EXTERNAL cute::intel::int2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::ushort2, cute::intel::uint8, cute::intel::int2, int32_t); +SYCL_EXTERNAL int __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, ushort, cute::intel::uint8, int, int32_t); + +SYCL_EXTERNAL cute::intel::float8 
__spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::float4, cute::intel::float8, cute::intel::float8, int32_t); +SYCL_EXTERNAL cute::intel::float4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::float2, cute::intel::float8, cute::intel::float4, int32_t); +SYCL_EXTERNAL cute::intel::float2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, float, cute::intel::float8, cute::intel::float2, int32_t); +SYCL_EXTERNAL float __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, float, cute::intel::float8, float, int32_t); + +SYCL_EXTERNAL cute::intel::short8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short8, cute::intel::int8, cute::intel::short8, int32_t); +SYCL_EXTERNAL cute::intel::short4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short4, cute::intel::int8, cute::intel::short4, int32_t); +SYCL_EXTERNAL cute::intel::short2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short2, cute::intel::int8, cute::intel::short2, int32_t); +SYCL_EXTERNAL short __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, short, cute::intel::int8, short, int32_t); + +SYCL_EXTERNAL cute::intel::half8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short8, cute::intel::int8, cute::intel::half8, int32_t); +SYCL_EXTERNAL cute::intel::half4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short4, cute::intel::int8, cute::intel::half4, int32_t); +SYCL_EXTERNAL cute::intel::half2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short2, cute::intel::int8, cute::intel::half2, int32_t); +SYCL_EXTERNAL sycl::half __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, short, cute::intel::int8, sycl::half, int32_t); + +struct SPIRV_MMAOperands { + static constexpr int SPIRV_MatrixASigned = 0x1; + static constexpr int SPIRV_MatrixBSigned = 0x2; + static constexpr int SPIRV_MatrixAInt8 = 0x10; + static constexpr int SPIRV_MatrixBInt8 = 
0x20; + static constexpr int SPIRV_MatrixAFp16 = 0x400; + static constexpr int SPIRV_MatrixBFp16 = 0x800; + static constexpr int SPIRV_MatrixABf16 = 0x1000; + static constexpr int SPIRV_MatrixBBf16 = 0x2000; + static constexpr int SPIRV_MatrixCBf16 = 0xC; + static constexpr int SPIRV_MatrixATf32 = 0x100; + static constexpr int SPIRV_MatrixBTf32 = 0x200; +}; + +namespace cute::detail +{ + +template +struct XeSubgroupMatrixMultiplyAccumulate { + static_assert(dependent_false<>, "Unsupported MMA Configuration."); +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { + return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(16, a, b, c, + SPIRV_MMAOperands::SPIRV_MatrixABf16 | SPIRV_MMAOperands::SPIRV_MatrixBBf16 ); + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { + return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(16, a, b, c, + SPIRV_MMAOperands::SPIRV_MatrixAFp16 | SPIRV_MMAOperands::SPIRV_MatrixBFp16); + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { + return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(16, a, b, c, + SPIRV_MMAOperands::SPIRV_MatrixABf16 | SPIRV_MMAOperands::SPIRV_MatrixBBf16 | + SPIRV_MMAOperands::SPIRV_MatrixCBf16); + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { + return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(16, a, b, c, + SPIRV_MMAOperands::SPIRV_MatrixAFp16 | SPIRV_MMAOperands::SPIRV_MatrixBFp16); + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { + return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(32, 
a, b, c, + SPIRV_MMAOperands::SPIRV_MatrixASigned | SPIRV_MMAOperands::SPIRV_MatrixBSigned | + SPIRV_MMAOperands::SPIRV_MatrixAInt8 | SPIRV_MMAOperands::SPIRV_MatrixBInt8); + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { + return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(32, a, b, c, + SPIRV_MMAOperands::SPIRV_MatrixAInt8 | SPIRV_MMAOperands::SPIRV_MatrixBInt8); + } +}; + +template<> +struct XeSubgroupMatrixMultiplyAccumulate { + template + CUTE_HOST_DEVICE + auto operator()(ARegisters a, BRegisters b, CRegisters c) { + return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(8, a, b, c, + SPIRV_MMAOperands::SPIRV_MatrixATf32 | SPIRV_MMAOperands::SPIRV_MatrixBTf32); + } +}; +} // namespace cute::detail end diff --git a/include/cute/arch/xe_config.hpp b/include/cute/arch/xe_config.hpp deleted file mode 100644 index 37e8b4b8a7..0000000000 --- a/include/cute/arch/xe_config.hpp +++ /dev/null @@ -1,459 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -#pragma once -#include - -#ifdef __SYCL_DEVICE_ONLY__ -#define SYCL_DEVICE_BUILTIN(x) SYCL_EXTERNAL extern "C" x -#else -#define SYCL_DEVICE_BUILTIN(x) \ - inline x { \ - CUTE_INVALID_CONTROL_PATH( \ - "Attempting to use a device built-in in host code."); \ - } -#endif - -#ifdef __SYCL_DEVICE_ONLY__ -#define SYCL_DEVICE_OCL(x) SYCL_EXTERNAL x -#else -#define SYCL_DEVICE_OCL(x) \ - inline x { \ - CUTE_INVALID_CONTROL_PATH( \ - "Attempting to use a device built-in in host code."); \ - } -#endif - - -#undef __global -#define __global __attribute__((opencl_global)) - - -#if defined(__SYCL_DEVICE_ONLY__) && defined(SYCL_INTEL_TARGET) -#define CUTE_ARCH_XE_ENABLED -#endif - -#if defined(CUTE_ARCH_XE_ENABLED) && ((defined(__INTEL_LLVM_COMPILER) && (__INTEL_LLVM_COMPILER < 20250200)) || defined(CUTLASS_SYCL_BUILTIN_ENABLE)) -#define CUTE_ARCH_XE_BUILTIN_ENABLED -#elif defined(CUTE_ARCH_XE_ENABLED) -#define CUTE_ARCH_XE_SPIRV_ENABLED -#endif - -#if defined(CUTE_ARCH_XE_BUILTIN_ENABLED) -namespace cute::detail -{ -template -struct XeSubgroup2DBlockPrefetch { - 
static_assert(dependent_false<>, "Unsupported 2D Block Load Configuration."); -}; -template -struct XeSubgroup2DBlockLoad { - static_assert(dependent_false<>, "Unsupported 2D Block Load Configuration."); -}; -template -struct XeSubgroup2DBlockTransform { - static_assert(dependent_false<>, "Unsupported 2D Block Load Configuration."); -}; -template -struct XeSubgroup2DBlockTranspose { - static_assert(dependent_false<>, "Unsupported 2D Block Load Configuration."); -}; -template -struct XeSubgroup2DBlockStore { - static_assert(dependent_false<>, "Unsupported 2D Block Load Configuration."); -}; -} -#endif - -#if defined(CUTE_ARCH_XE_SPIRV_ENABLED) - -// SPIRV copy definitions -SYCL_EXTERNAL __attribute__((convergent)) void __spirv_Subgroup2DBlockLoadINTEL( - int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, - const void* src_base_pointer, int memory_width, int memory_height, - int memory_pitch, cute::intel::coord_t coordinate, void* dst_pointer); -SYCL_EXTERNAL __attribute__((convergent)) void __spirv_Subgroup2DBlockLoadTransformINTEL( - int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, - const void* src_base_pointer, int memory_width, int memory_height, - int memory_pitch, cute::intel::coord_t coordinate, void* dst_pointer); -SYCL_EXTERNAL __attribute__((convergent)) void __spirv_Subgroup2DBlockLoadTransposeINTEL( - int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, - const void* src_base_pointer, int memory_width, int memory_height, - int memory_pitch, cute::intel::coord_t coordinate, void* dst_pointer); -SYCL_EXTERNAL __attribute__((convergent)) void __spirv_Subgroup2DBlockStoreINTEL( - int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, - void* src_pointer, const void* dst_base_pointer, int memory_width, - int memory_height, int memory_pitch, cute::intel::coord_t coordinate); -SYCL_EXTERNAL __attribute__((convergent)) void __spirv_Subgroup2DBlockPrefetchINTEL( - int ElementSize, int BlockWidth, int 
BlockHeight, int BlockCount, - const void* src_base_pointer, int memory_width, int memory_height, - int memory_pitch, cute::intel::coord_t coordinate); - -namespace cute::detail { -template -struct XeSubgroup2DBlockLoad { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { -#ifdef __SYCL_DEVICE_ONLY__ - __spirv_Subgroup2DBlockLoadINTEL(ElementSize, BlockWidth, BlockHeight, BlockCount, - srcBasePointer, memoryWidth, memoryHeight, memoryPitch, coordinate, - static_cast(dstPointer)); -#endif - } -}; - -template -struct XeSubgroup2DBlockTransform { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { -#ifdef __SYCL_DEVICE_ONLY__ - __spirv_Subgroup2DBlockLoadTransformINTEL(ElementSize, BlockWidth, BlockHeight, BlockCount, - srcBasePointer, memoryWidth, memoryHeight, memoryPitch, coordinate, - static_cast(dstPointer)); -#endif - } -}; - -template -struct XeSubgroup2DBlockTranspose { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { -#ifdef __SYCL_DEVICE_ONLY__ - __spirv_Subgroup2DBlockLoadTransposeINTEL(ElementSize, BlockWidth, BlockHeight, BlockCount, - srcBasePointer, memoryWidth, memoryHeight, memoryPitch, coordinate, - static_cast(dstPointer)); -#endif - } -}; - -template -struct XeSubgroup2DBlockPrefetch { - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate) { -#ifdef __SYCL_DEVICE_ONLY__ - __spirv_Subgroup2DBlockPrefetchINTEL(ElementSize, BlockWidth, BlockHeight, BlockCount, - srcBasePointer, memoryWidth, memoryHeight, memoryPitch, coordinate); -#endif - } -}; - -template -struct 
XeSubgroup2DBlockStore { - template - CUTE_HOST_DEVICE - void operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* srcPointer) { -#ifdef __SYCL_DEVICE_ONLY__ - __spirv_Subgroup2DBlockStoreINTEL(ElementSize, BlockWidth, BlockHeight, BlockCount, - (void*)(srcPointer), dstBasePointer, memoryWidth, memoryHeight, memoryPitch, coordinate); -#endif - } -}; -} // namespace cute::detail end -#endif - -enum class CacheControl { - kDefault = 0, - kL1UC_L3UC = 1, // Override to L1 uncached and L3 uncached - kL1UC_L3C = 2, // Override to L1 uncached and L3 cached - kL1C_L3UC = 3, // Override to L1 cached and L3 uncached - kL1C_L3C = 4, // Override to L1 cached and L3 cached - kL1S_L3UC = 5, // Override to L1 streaming load and L3 uncached - kL1S_L3C = 6, // Override to L1 streaming load and L3 cached - kL1IAR_L3C = 7, // Override to L1 invalidate-after-read, and L3 cached -}; - -namespace cute::detail{ - template - struct XeSubgroupMatrixMultiplyAccumulate { - static_assert(dependent_false<>, "Unsupported MMA Configuration."); - }; -} // namespace cute::detail end - -#if defined(CUTE_ARCH_XE_BUILTIN_ENABLED) - -// mma_bf16 -SYCL_DEVICE_OCL(cute::intel::float8 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short8 a, cute::intel::int8 b, cute::intel::float8 acc)); -SYCL_DEVICE_OCL(cute::intel::float4 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short4 a, cute::intel::int8 b, cute::intel::float4 acc)); -SYCL_DEVICE_OCL(cute::intel::float2 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short2 a, cute::intel::int8 b, cute::intel::float2 acc)); -SYCL_DEVICE_OCL( float intel_sub_group_bf16_bf16_matrix_mad_k16( short a, cute::intel::int8 b, float acc)); -// mma_half -SYCL_DEVICE_OCL(cute::intel::float8 intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short8 a, cute::intel::int8 b, cute::intel::float8 acc)); -SYCL_DEVICE_OCL(cute::intel::float4 
intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short4 a, cute::intel::int8 b, cute::intel::float4 acc)); -SYCL_DEVICE_OCL(cute::intel::float2 intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short2 a, cute::intel::int8 b, cute::intel::float2 acc)); -SYCL_DEVICE_OCL( float intel_sub_group_f16_f16_matrix_mad_k16( short a, cute::intel::int8 b, float acc)); -// mma_s8 -SYCL_DEVICE_OCL(cute::intel::int8 intel_sub_group_i8_i8_matrix_mad_k32(cute::intel::short8 a, cute::intel::int8 b, cute::intel::int8 acc)); -SYCL_DEVICE_OCL(cute::intel::int4 intel_sub_group_i8_i8_matrix_mad_k32(cute::intel::short4 a, cute::intel::int8 b, cute::intel::int4 acc)); -SYCL_DEVICE_OCL(cute::intel::int2 intel_sub_group_i8_i8_matrix_mad_k32(cute::intel::short2 a, cute::intel::int8 b, cute::intel::int2 acc)); -SYCL_DEVICE_OCL( int intel_sub_group_i8_i8_matrix_mad_k32( short a, cute::intel::int8 b, int acc)); -// mma_u8 -SYCL_DEVICE_OCL(cute::intel::int8 intel_sub_group_u8_u8_matrix_mad_k32(cute::intel::ushort8 a, cute::intel::uint8 b, cute::intel::int8 acc)); -SYCL_DEVICE_OCL(cute::intel::int4 intel_sub_group_u8_u8_matrix_mad_k32(cute::intel::ushort4 a, cute::intel::uint8 b, cute::intel::int4 acc)); -SYCL_DEVICE_OCL(cute::intel::int2 intel_sub_group_u8_u8_matrix_mad_k32(cute::intel::ushort2 a, cute::intel::uint8 b, cute::intel::int2 acc)); -SYCL_DEVICE_OCL( int intel_sub_group_u8_u8_matrix_mad_k32( ushort a, cute::intel::uint8 b, int acc)); -// mma_tf32 -SYCL_DEVICE_OCL(cute::intel::float8 intel_sub_group_tf32_tf32_matrix_mad_k8(cute::intel::float4 a, cute::intel::float8 b, cute::intel::float8 acc)); -SYCL_DEVICE_OCL(cute::intel::float4 intel_sub_group_tf32_tf32_matrix_mad_k8(cute::intel::float2 a, cute::intel::float8 b, cute::intel::float4 acc)); -SYCL_DEVICE_OCL(cute::intel::float2 intel_sub_group_tf32_tf32_matrix_mad_k8( float a, cute::intel::float8 b, cute::intel::float2 acc)); -SYCL_DEVICE_OCL( float intel_sub_group_tf32_tf32_matrix_mad_k8( float a, cute::intel::float8 b, float 
acc)); -// mma_bfloat16 with bfloat16 accumulator: -SYCL_DEVICE_OCL(cute::intel::short8 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short8 a, cute::intel::int8 b, cute::intel::short8 acc)); -SYCL_DEVICE_OCL(cute::intel::short4 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short4 a, cute::intel::int8 b, cute::intel::short4 acc)); -SYCL_DEVICE_OCL(cute::intel::short2 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short2 a, cute::intel::int8 b, cute::intel::short2 acc)); -SYCL_DEVICE_OCL( short intel_sub_group_bf16_bf16_matrix_mad_k16( short a, cute::intel::int8 b, short acc)); -// mma_half with half accumulator: -SYCL_DEVICE_OCL(cute::intel::half8 intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short8 a, cute::intel::int8 b, cute::intel::half8 acc)); -SYCL_DEVICE_OCL(cute::intel::half4 intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short4 a, cute::intel::int8 b, cute::intel::half4 acc)); -SYCL_DEVICE_OCL(cute::intel::half2 intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short2 a, cute::intel::int8 b, cute::intel::half2 acc)); -SYCL_DEVICE_OCL( sycl::half intel_sub_group_f16_f16_matrix_mad_k16( short a, cute::intel::int8 b, sycl::half acc)); - -namespace cute::detail -{ -template<> -struct XeSubgroupMatrixMultiplyAccumulate { - template - CUTE_HOST_DEVICE - auto operator()(ARegisters a, BRegisters b, CRegisters c) { -#ifdef __SYCL_DEVICE_ONLY__ - return intel_sub_group_bf16_bf16_matrix_mad_k16(a, b, c); -#endif - } -}; - -template<> -struct XeSubgroupMatrixMultiplyAccumulate { - template - CUTE_HOST_DEVICE - auto operator()(ARegisters a, BRegisters b, CRegisters c) { -#ifdef __SYCL_DEVICE_ONLY__ - return intel_sub_group_f16_f16_matrix_mad_k16(a, b, c); -#endif - } -}; - -template<> -struct XeSubgroupMatrixMultiplyAccumulate { - template - CUTE_HOST_DEVICE - auto operator()(ARegisters a, BRegisters b, CRegisters c) { -#ifdef __SYCL_DEVICE_ONLY__ - return intel_sub_group_bf16_bf16_matrix_mad_k16(a, b, c); -#endif - } -}; - 
-template<> -struct XeSubgroupMatrixMultiplyAccumulate { - template - CUTE_HOST_DEVICE - auto operator()(ARegisters a, BRegisters b, CRegisters c) { -#ifdef __SYCL_DEVICE_ONLY__ - return intel_sub_group_f16_f16_matrix_mad_k16(a, b, c); -#endif - } -}; - -template<> -struct XeSubgroupMatrixMultiplyAccumulate { - template - CUTE_HOST_DEVICE - auto operator()(ARegisters a, BRegisters b, CRegisters c) { -#ifdef __SYCL_DEVICE_ONLY__ - return intel_sub_group_i8_i8_matrix_mad_k32(a, b, c); -#endif - } -}; - -template<> -struct XeSubgroupMatrixMultiplyAccumulate { - template - CUTE_HOST_DEVICE - auto operator()(ARegisters a, BRegisters b, CRegisters c) { -#ifdef __SYCL_DEVICE_ONLY__ - return intel_sub_group_u8_u8_matrix_mad_k32(a, b, c); -#endif - } -}; - -template<> -struct XeSubgroupMatrixMultiplyAccumulate { - template - CUTE_HOST_DEVICE - auto operator()(ARegisters a, BRegisters b, CRegisters c) { -#ifdef __SYCL_DEVICE_ONLY__ - return intel_sub_group_tf32_tf32_matrix_mad_k8(a, b, c); -#endif - } -}; -} // namespace cute::detail end -#endif - - -// @brief spirv APIs for mma -// @param dims K -// @param ARegisters -// @param BRegisters -// @param AccRegisters -// @param Operands code -// @return DRegisters -#if defined(CUTE_ARCH_XE_SPIRV_ENABLED) - -SYCL_EXTERNAL cute::intel::float8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short8, cute::intel::int8, cute::intel::float8, int32_t); -SYCL_EXTERNAL cute::intel::float4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short4, cute::intel::int8, cute::intel::float4, int32_t); -SYCL_EXTERNAL cute::intel::float2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short2, cute::intel::int8, cute::intel::float2, int32_t); -SYCL_EXTERNAL float __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, short, cute::intel::int8, float, int32_t); - -SYCL_EXTERNAL cute::intel::int8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short8, cute::intel::int8, 
cute::intel::int8, int32_t); -SYCL_EXTERNAL cute::intel::int4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short4, cute::intel::int8, cute::intel::int4, int32_t); -SYCL_EXTERNAL cute::intel::int2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short2, cute::intel::int8, cute::intel::int2, int32_t); -SYCL_EXTERNAL int __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, short, cute::intel::int8, int, int32_t); - -SYCL_EXTERNAL cute::intel::int8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::ushort8, cute::intel::uint8, cute::intel::int8, int32_t); -SYCL_EXTERNAL cute::intel::int4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::ushort4, cute::intel::uint8, cute::intel::int4, int32_t); -SYCL_EXTERNAL cute::intel::int2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::ushort2, cute::intel::uint8, cute::intel::int2, int32_t); -SYCL_EXTERNAL int __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, ushort, cute::intel::uint8, int, int32_t); - -SYCL_EXTERNAL cute::intel::float8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::float4, cute::intel::float8, cute::intel::float8, int32_t); -SYCL_EXTERNAL cute::intel::float4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::float2, cute::intel::float8, cute::intel::float4, int32_t); -SYCL_EXTERNAL cute::intel::float2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, float, cute::intel::float8, cute::intel::float2, int32_t); -SYCL_EXTERNAL float __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, float, cute::intel::float8, float, int32_t); - -SYCL_EXTERNAL cute::intel::short8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short8, cute::intel::int8, cute::intel::short8, int32_t); -SYCL_EXTERNAL cute::intel::short4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short4, cute::intel::int8, cute::intel::short4, int32_t); -SYCL_EXTERNAL 
cute::intel::short2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short2, cute::intel::int8, cute::intel::short2, int32_t); -SYCL_EXTERNAL short __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, short, cute::intel::int8, short, int32_t); - -SYCL_EXTERNAL cute::intel::half8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short8, cute::intel::int8, cute::intel::half8, int32_t); -SYCL_EXTERNAL cute::intel::half4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short4, cute::intel::int8, cute::intel::half4, int32_t); -SYCL_EXTERNAL cute::intel::half2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short2, cute::intel::int8, cute::intel::half2, int32_t); -SYCL_EXTERNAL sycl::half __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, short, cute::intel::int8, sycl::half, int32_t); - -struct SPIRV_MMAOperands { - static constexpr int SPIRV_MatrixASigned = 0x1; - static constexpr int SPIRV_MatrixBSigned = 0x2; - static constexpr int SPIRV_MatrixAInt8 = 0x10; - static constexpr int SPIRV_MatrixBInt8 = 0x20; - static constexpr int SPIRV_MatrixAFp16 = 0x400; - static constexpr int SPIRV_MatrixBFp16 = 0x800; - static constexpr int SPIRV_MatrixABf16 = 0x1000; - static constexpr int SPIRV_MatrixBBf16 = 0x2000; - static constexpr int SPIRV_MatrixCBf16 = 0xC; - static constexpr int SPIRV_MatrixATf32 = 0x100; - static constexpr int SPIRV_MatrixBTf32 = 0x200; -}; - -namespace cute::detail -{ -template<> -struct XeSubgroupMatrixMultiplyAccumulate { - template - CUTE_HOST_DEVICE - auto operator()(ARegisters a, BRegisters b, CRegisters c) { -#ifdef __SYCL_DEVICE_ONLY__ - return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(16, a, b, c, SPIRV_MMAOperands::SPIRV_MatrixABf16 | SPIRV_MMAOperands::SPIRV_MatrixBBf16 ); -#endif - } -}; - -template<> -struct XeSubgroupMatrixMultiplyAccumulate { - template - CUTE_HOST_DEVICE - auto operator()(ARegisters a, BRegisters b, CRegisters c) { -#ifdef 
__SYCL_DEVICE_ONLY__ - return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(16, a, b, c, SPIRV_MMAOperands::SPIRV_MatrixAFp16 | SPIRV_MMAOperands::SPIRV_MatrixBFp16); -#endif - } -}; - -template<> -struct XeSubgroupMatrixMultiplyAccumulate { - template - CUTE_HOST_DEVICE - auto operator()(ARegisters a, BRegisters b, CRegisters c) { -#ifdef __SYCL_DEVICE_ONLY__ - return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(16, a, b, c, SPIRV_MMAOperands::SPIRV_MatrixABf16 | SPIRV_MMAOperands::SPIRV_MatrixBBf16 |SPIRV_MMAOperands::SPIRV_MatrixCBf16); -#endif - } -}; - -template<> -struct XeSubgroupMatrixMultiplyAccumulate { - template - CUTE_HOST_DEVICE - auto operator()(ARegisters a, BRegisters b, CRegisters c) { -#ifdef __SYCL_DEVICE_ONLY__ - return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(16, a, b, c, SPIRV_MMAOperands::SPIRV_MatrixAFp16 | SPIRV_MMAOperands::SPIRV_MatrixBFp16); -#endif - } -}; - -template<> -struct XeSubgroupMatrixMultiplyAccumulate { - template - CUTE_HOST_DEVICE - auto operator()(ARegisters a, BRegisters b, CRegisters c) { -#ifdef __SYCL_DEVICE_ONLY__ - return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(32, a, b, c, SPIRV_MMAOperands::SPIRV_MatrixASigned | SPIRV_MMAOperands::SPIRV_MatrixBSigned | SPIRV_MMAOperands::SPIRV_MatrixAInt8 | SPIRV_MMAOperands::SPIRV_MatrixBInt8); -#endif - } -}; - -template<> -struct XeSubgroupMatrixMultiplyAccumulate { - template - CUTE_HOST_DEVICE - auto operator()(ARegisters a, BRegisters b, CRegisters c) { -#ifdef __SYCL_DEVICE_ONLY__ - return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(32, a, b, c, SPIRV_MMAOperands::SPIRV_MatrixAInt8 | SPIRV_MMAOperands::SPIRV_MatrixBInt8); -#endif - } -}; - -template<> -struct XeSubgroupMatrixMultiplyAccumulate { - template - CUTE_HOST_DEVICE - auto operator()(ARegisters a, BRegisters b, CRegisters c) { -#ifdef __SYCL_DEVICE_ONLY__ - return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(8, a, b, c, SPIRV_MMAOperands::SPIRV_MatrixATf32 | SPIRV_MMAOperands::SPIRV_MatrixBTf32); 
-#endif - } -}; -} // namespace cute::detail end -#endif diff --git a/include/cute/arch/xe_copy_1B.hpp b/include/cute/arch/xe_copy_1B.hpp deleted file mode 100644 index 3ed5831e7b..0000000000 --- a/include/cute/arch/xe_copy_1B.hpp +++ /dev/null @@ -1,1098 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -#pragma once - -#include -#include -#include -#include -#include "cute/pointer.hpp" - -// 8bits No transform No transpose -SYCL_DEVICE_BUILTIN(cute::intel::ushort __builtin_IB_subgroup_block_read_flat_u8_m1k32v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort2 __builtin_IB_subgroup_block_read_flat_u8_m2k32v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort4 __builtin_IB_subgroup_block_read_flat_u8_m4k32v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort8 __builtin_IB_subgroup_block_read_flat_u8_m8k32v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort16 __builtin_IB_subgroup_block_read_flat_u8_m16k32v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort32 __builtin_IB_subgroup_block_read_flat_u8_m32k32v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, 
- int pitch_minus_one, cute::intel::coord_t coord)); - -SYCL_DEVICE_BUILTIN( - cute::intel::ushort2 __builtin_IB_subgroup_block_read_flat_u8_m1k32v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort4 __builtin_IB_subgroup_block_read_flat_u8_m2k32v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort8 __builtin_IB_subgroup_block_read_flat_u8_m4k32v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort16 __builtin_IB_subgroup_block_read_flat_u8_m8k32v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort32 __builtin_IB_subgroup_block_read_flat_u8_m16k32v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort64 __builtin_IB_subgroup_block_read_flat_u8_m32k32v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); - - -// 8bits VNNI transform No transpose -SYCL_DEVICE_BUILTIN( - cute::intel::uint8 __builtin_IB_subgroup_block_read_flat_transform_u8_k32( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint16 __builtin_IB_subgroup_block_read_flat_transform_u8_k32v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint32 __builtin_IB_subgroup_block_read_flat_transform_u8_k32v4( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int 
pitch_minus_one, cute::intel::coord_t coord)); - -// 8bits No transform No transpose -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u8_m1k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uchar data)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u8_m2k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uchar2 data)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u8_m4k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uchar4)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u8_m8k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uchar8)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u8_m8k16v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uchar8)); - -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u8_m1k32v1( - long baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u8_m2k32v1( - long baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u8_m4k32v1( - long baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u8_m8k32v1( - long baseoffset, int width_minus_one, int 
height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); -// // 2D prefetch -SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_8b_1r32x2c( - __global void* base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_8b_2r32x2c( - __global void* base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_8b_4r32x2c( - __global void* base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_8b_8r32x2c( - __global void* base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_8b_32r16x1c( - __global void* base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); - -namespace cute::detail -{ -#if defined(CUTE_ARCH_XE_BUILTIN_ENABLED) -template<> -struct XeSubgroup2DBlockLoad<1, 32, 1, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m1k32v1( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<1, 32, 2, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m2k32v1( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<1, 32, 4, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* 
srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m4k32v1( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<1, 32, 8, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m8k32v1( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<1, 32, 16, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m16k32v1( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<1, 32, 32, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m32k32v1( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<1, 32, 1, 2> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m1k32v2( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, 
coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<1, 32, 2, 2> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m2k32v2( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<1, 32, 4, 2> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m4k32v2( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<1, 32, 8, 2> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m8k32v2( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<1, 32, 16, 2> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m16k32v2( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<1, 32, 32, 2> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) 
= __builtin_IB_subgroup_block_read_flat_u8_m32k32v2( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockTransform<1, 16, 32, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transform_u8_k32( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockTransform<1, 16, 32, 2> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transform_u8_k32v2( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockTransform<1, 16, 32, 4> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transform_u8_k32v4( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockStore<1, 16, 1, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* srcPointer) { - __builtin_IB_subgroup_block_write_flat_u8_m1k16v1( - (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uchar *)(srcPointer)); - } -}; - -template<> -struct XeSubgroup2DBlockStore<1, 16, 2, 1> { - template - CUTE_HOST_DEVICE - void 
operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* srcPointer) { - __builtin_IB_subgroup_block_write_flat_u8_m2k16v1( - (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uchar2 *)(srcPointer)); - } -}; - -template<> -struct XeSubgroup2DBlockStore<1, 16, 4, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* srcPointer) { - __builtin_IB_subgroup_block_write_flat_u8_m4k16v1( - (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uchar4 *)(srcPointer)); - } -}; - -template<> -struct XeSubgroup2DBlockStore<1, 16, 8, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* srcPointer) { - __builtin_IB_subgroup_block_write_flat_u8_m8k16v1( - (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uchar8 *)(srcPointer)); - } -}; - -template<> -struct XeSubgroup2DBlockStore<1, 16, 8, 2> { - template - CUTE_HOST_DEVICE - void operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* srcPointer) { - __builtin_IB_subgroup_block_write_flat_u8_m8k16v2( - (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uchar8 *)(srcPointer)); - } -}; - -template<> -struct XeSubgroup2DBlockPrefetch<1, 32, 1, 1> { - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate) { - __builtin_IB_subgroup_block_read_prefetch_u8_m1k32v1( - (intptr_t)srcBasePointer, memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, 
CacheControl::kL1C_L3C); - } -}; - -template<> -struct XeSubgroup2DBlockPrefetch<1, 32, 2, 1> { - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate) { - __builtin_IB_subgroup_block_read_prefetch_u8_m2k32v1( - (intptr_t)srcBasePointer, memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); - } -}; -template<> -struct XeSubgroup2DBlockPrefetch<1, 32, 4, 1> { - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate) { - __builtin_IB_subgroup_block_read_prefetch_u8_m4k32v1( - (intptr_t)srcBasePointer, memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); - } -}; - -template<> -struct XeSubgroup2DBlockPrefetch<1, 32, 8, 1> { - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate) { - __builtin_IB_subgroup_block_read_prefetch_u8_m8k32v1( - (intptr_t)srcBasePointer, memoryWidth, memoryHeight, memoryPitch, coordinate, CacheControl::kL1C_L3C); - } -}; - -template<> -struct XeSubgroup2DBlockPrefetch<1, 32, 1, 2> { - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate) { - intel_sub_group_2d_block_prefetch_8b_1r32x2c( - (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockPrefetch<1, 32, 2, 2> { - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate) { - intel_sub_group_2d_block_prefetch_8b_2r32x2c( - (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockPrefetch<1, 32, 4, 2> { 
- CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate) { - intel_sub_group_2d_block_prefetch_8b_4r32x2c( - (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockPrefetch<1, 32, 8, 2> { - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate) { - intel_sub_group_2d_block_prefetch_8b_8r32x2c( - (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockPrefetch<1, 16, 32, 1> { - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate) { - intel_sub_group_2d_block_prefetch_8b_32r16x1c( - (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); - } -}; -#endif -} // namespace cute::detail end - -namespace cute -{ -struct XE_2D_U8x1x32_LD_N { - using BlockShape = Shape<_1, _32>; - using inst_dtype = int8_t; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - detail::XeSubgroup2DBlockLoad<1, 32, 1, 1>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(CUTE_ARCH_XE_ENABLED) - detail::XeSubgroup2DBlockPrefetch<1, 32, 1, 1>{}(baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - 
-struct XE_2D_U8x2x32_LD_N { - using BlockShape = Shape<_2, _32>; - using inst_dtype = int8_t; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - detail::XeSubgroup2DBlockLoad<1, 32, 2, 1>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(CUTE_ARCH_XE_ENABLED) - detail::XeSubgroup2DBlockPrefetch<1, 32, 2, 1>{}(baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U8x2x32_ST_N { - using BlockShape = Shape<_2, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *src) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - detail::XeSubgroup2DBlockStore<2, 16, 2, 1>{}(baseoffset, width, height, pitch, coord, src); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); -#endif - } -}; - -struct XE_2D_U8x4x32_LD_N { - using BlockShape = Shape<_4, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - detail::XeSubgroup2DBlockLoad<1, 32, 4, 1>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int 
width, - int height, int pitch, - intel::coord_t coord) { -#if defined(CUTE_ARCH_XE_ENABLED) - detail::XeSubgroup2DBlockPrefetch<1, 32, 4, 1>{}(baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U8x8x32_LD_N { - using BlockShape = Shape<_8, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - detail::XeSubgroup2DBlockLoad<1, 32, 8, 1>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(CUTE_ARCH_XE_ENABLED) - detail::XeSubgroup2DBlockPrefetch<1, 32, 8, 1>{}(baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U8x16x32_LD_N { - using BlockShape = Shape<_16, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - detail::XeSubgroup2DBlockLoad<1, 32, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(CUTE_ARCH_XE_ENABLED) - detail::XeSubgroup2DBlockPrefetch<2, 16, 16, 1>{}(baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying 
to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U8x32x32_LD_N { - using BlockShape = Shape<_32, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - detail::XeSubgroup2DBlockLoad<1, 32, 32, 1>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U4x16x16_LD_T { - using BlockShape = Shape<_16, _16>; - using inst_dtype = uint32_t; - static constexpr bool is_transpose = true; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transpose_u32_k2( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U4x32x16_LD_T { - using BlockShape = Shape<_32, _16>; - using inst_dtype = uint32_t; - static constexpr bool is_transpose = true; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_transpose_u32_k4( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U4x32x64_LD_N { - using BlockShape = Shape<_32, _64>; - using inst_dtype = int8_t; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - 
static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - __builtin_IB_subgroup_block_read_flat_u8_m32k32v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); - - // ================= shuffle begin ================= - // FIXME: the performance of shuffle algorithm here is too bad, we are working with - // compiler/IGC team to optimize it. - - static constexpr auto subgroup_size = 16; - static constexpr auto copy_W = decltype(size<1>(BlockShape{}))::value / subgroup_size; - static constexpr auto copy_H = decltype(size<0>(BlockShape{}))::value; - - auto sg = syclcompat::get_nd_item<1>().get_sub_group(); - auto id = int(ThreadIdxX()) % subgroup_size; - - cute::subbyte_iterator dst_iter(dst); - cute::array_subbyte dst_tmp{}; - - #pragma unroll - for (int cw = 0; cw < copy_W; cw++) { - auto remote_id = (id + cw * subgroup_size) / copy_W; - - // TODO: select 'ushort32' will cause compiling error, use 'ushort16' instead, why? - intel::ushort16 remote_dst[2]; - remote_dst[0] = sycl::select_from_group(sg, *(reinterpret_cast(dst)), remote_id); - remote_dst[1] = sycl::select_from_group(sg, *((reinterpret_cast(dst)) + 1), remote_id); - - cute::subbyte_iterator remote_dst_iter(remote_dst); - - #pragma unroll - for (int row = 0; row < copy_H; row++) { - dst_tmp[row + cw * copy_H] = remote_dst_iter[row * copy_W + id % copy_W].get(); - } - } - - *reinterpret_cast(cute::raw_pointer_cast(dst_iter)) = *reinterpret_cast(cute::raw_pointer_cast(dst_tmp.begin())); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U4x16x64_LD_N { - using BlockShape = Shape<_16, _64>; - using inst_dtype = int8_t; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(SYCL_INTEL_TARGET) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - *reinterpret_cast(dst) = - 
__builtin_IB_subgroup_block_read_flat_u8_m16k32v1( - (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); - - // ================= shuffle begin ================= - // FIXME: the performance of shuffle algorithm here is too bad, we are working with - // compiler/IGC team to optimize it. - - static constexpr auto subgroup_size = 16; - static constexpr auto copy_W = decltype(size<1>(BlockShape{}))::value / subgroup_size; - static constexpr auto copy_H = decltype(size<0>(BlockShape{}))::value; - - auto sg = syclcompat::get_nd_item<1>().get_sub_group(); - auto id = int(ThreadIdxX()) % subgroup_size; - - cute::subbyte_iterator dst_iter(dst); - cute::array_subbyte dst_tmp{}; - - #pragma unroll - for (int cw = 0; cw < copy_W; cw++) { - auto remote_id = (id + cw * subgroup_size) / copy_W; - - intel::ushort16 remote_dst; - remote_dst = sycl::select_from_group(sg, *(reinterpret_cast(dst)), remote_id); - - cute::subbyte_iterator remote_dst_iter(&remote_dst); - - - #pragma unroll - for (int row = 0; row < copy_H; row++) { - dst_tmp[row + cw * copy_H] = remote_dst_iter[row * copy_W + id % copy_W].get(); - } - } - - *reinterpret_cast(cute::raw_pointer_cast(dst_iter)) = *reinterpret_cast(cute::raw_pointer_cast(dst_tmp.begin())); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U8x1x64_LD_N { - using BlockShape = Shape<_1, _64>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - detail::XeSubgroup2DBlockLoad<1, 32, 1, 2>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if 
defined(CUTE_ARCH_XE_ENABLED) - detail::XeSubgroup2DBlockPrefetch<1, 32, 1, 2>{}(baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U8x2x64_LD_N { - using BlockShape = Shape<_2, _64>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - detail::XeSubgroup2DBlockLoad<1, 32, 2, 2>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(CUTE_ARCH_XE_ENABLED) - detail::XeSubgroup2DBlockPrefetch<1, 32, 2, 2>{}(baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U8x4x64_LD_N { - using BlockShape = Shape<_4, _64>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - detail::XeSubgroup2DBlockLoad<1, 32, 4, 2>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(CUTE_ARCH_XE_ENABLED) - detail::XeSubgroup2DBlockPrefetch<1, 32, 4, 2>{}(baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - 
-struct XE_2D_U8x8x64_LD_N { - using BlockShape = Shape<_8, _64>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - detail::XeSubgroup2DBlockLoad<1, 32, 8, 2>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(CUTE_ARCH_XE_ENABLED) - detail::XeSubgroup2DBlockPrefetch<1, 32, 8, 2>{}(baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U8x16x64_LD_N { - using BlockShape = Shape<_16, _64>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - detail::XeSubgroup2DBlockLoad<1, 32, 16, 2>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(CUTE_ARCH_XE_ENABLED) - detail::XeSubgroup2DBlockPrefetch<2, 16, 16, 2>{}(baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U8x32x64_LD_N { - using BlockShape = Shape<_32, _64>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if 
defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - detail::XeSubgroup2DBlockLoad<1, 32, 32, 2>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(CUTE_ARCH_XE_ENABLED) - detail::XeSubgroup2DBlockPrefetch<2, 16, 32, 2>{}(baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - - - -struct XE_2D_U8x32x16_LD_V { - using BlockShape = Shape<_32, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - detail::XeSubgroup2DBlockTransform<1, 16, 32, 1>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(CUTE_ARCH_XE_ENABLED) - detail::XeSubgroup2DBlockPrefetch<1, 16, 32, 1>{}(baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U8x32x32_LD_V { - using BlockShape = Shape<_32, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - detail::XeSubgroup2DBlockTransform<1, 16, 32, 2>{}(baseoffset, width, height, pitch, coord, dst); -#else - 
CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U8x32x64_LD_V { - using BlockShape = Shape<_32, _64>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - detail::XeSubgroup2DBlockTransform<1, 16, 32, 4>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U8x1x16_ST_N { - using BlockShape = Shape<_1, _16>; - - template - CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, - int pitch, intel::coord_t coord, - const T *src) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - detail::XeSubgroup2DBlockStore<1, 16, 1, 1>{}(baseoffset, width, height, pitch, coord, src); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U8x2x16_ST_N { - using BlockShape = Shape<_2, _16>; - - template - CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, - int pitch, intel::coord_t coord, - const T *src) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - detail::XeSubgroup2DBlockStore<1, 16, 2, 1>{}(baseoffset, width, height, pitch, coord, src); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U8x4x16_ST_N { - using BlockShape = Shape<_4, _16>; - - template - CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, - int pitch, intel::coord_t coord, - const T *src) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - detail::XeSubgroup2DBlockStore<1, 16, 4, 1>{}(baseoffset, width, height, pitch, 
coord, src); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U8x8x16_ST_N { - template - CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, - int pitch, intel::coord_t coord, - const T *src) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - detail::XeSubgroup2DBlockStore<1, 16, 8, 1>{}(baseoffset, width, height, pitch, coord, src); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U8x8x32_ST_N { - template - CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, - int pitch, intel::coord_t coord, - const T *src) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 1, "Expected T to have size 1"); - detail::XeSubgroup2DBlockStore<1, 16, 8, 2>{}(baseoffset, width, height, pitch, coord, src); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; -} // end namespace cute diff --git a/include/cute/arch/xe_copy_2B.hpp b/include/cute/arch/xe_copy_2B.hpp deleted file mode 100644 index d2cefeb9de..0000000000 --- a/include/cute/arch/xe_copy_2B.hpp +++ /dev/null @@ -1,1002 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. 
Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - **************************************************************************************************/ -#pragma once - -#include -#include -#include - -SYCL_DEVICE_BUILTIN(cute::intel::ushort16 intel_subgroup_block_read_u16_m8k16v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); - -SYCL_DEVICE_BUILTIN(cute::intel::int8 intel_subgroup_block_read_transform_u16_k16( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); - -// U16 prefetch -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control)); - -// 16 bits No 
transform No transpose -SYCL_DEVICE_BUILTIN(cute::intel::ushort __builtin_IB_subgroup_block_read_flat_u16_m1k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort2 __builtin_IB_subgroup_block_read_flat_u16_m2k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort4 __builtin_IB_subgroup_block_read_flat_u16_m4k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort8 __builtin_IB_subgroup_block_read_flat_u16_m8k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort16 __builtin_IB_subgroup_block_read_flat_u16_m16k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort32 __builtin_IB_subgroup_block_read_flat_u16_m32k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); - -SYCL_DEVICE_BUILTIN( - cute::intel::ushort2 __builtin_IB_subgroup_block_read_flat_u16_m1k16v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort4 __builtin_IB_subgroup_block_read_flat_u16_m2k16v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort8 __builtin_IB_subgroup_block_read_flat_u16_m4k16v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - 
cute::intel::ushort16 __builtin_IB_subgroup_block_read_flat_u16_m8k16v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort32 __builtin_IB_subgroup_block_read_flat_u16_m16k16v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::ushort64 __builtin_IB_subgroup_block_read_flat_u16_m32k16v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); - -// 16bits VNNI transform No transpose -SYCL_DEVICE_BUILTIN( - cute::intel::uint8 __builtin_IB_subgroup_block_read_flat_transform_u16_k16( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint16 __builtin_IB_subgroup_block_read_flat_transform_u16_k32( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint16 __builtin_IB_subgroup_block_read_flat_transform_u16_k16v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint32 __builtin_IB_subgroup_block_read_flat_transform_u16_k32v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); - -// 16bits -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u16_m1k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, cute::intel::ushort data)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u16_m2k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, 
cute::intel::ushort2 data)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u16_m4k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, cute::intel::ushort4 data)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u16_m8k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, cute::intel::ushort8 data)); - -// 2D prefetch -SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_16b_1r16x2c( - __global void* base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_16b_2r16x2c( - __global void* base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); -SYCL_DEVICE_OCL(void intel_sub_group_2d_block_prefetch_16b_4r16x2c( - __global void* base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); - -namespace cute::detail { -#if defined(CUTE_ARCH_XE_BUILTIN_ENABLED) -template<> -struct XeSubgroup2DBlockLoad<2, 16, 1, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m1k16v1( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<2, 16, 2, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m2k16v1( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - - -template<> -struct XeSubgroup2DBlockLoad<2, 16, 4, 1> { - template - 
CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m4k16v1( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<2, 16, 8, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m8k16v1( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<2, 16, 16, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m16k16v1( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<2, 16, 32, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m32k16v1( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<2, 16, 1, 2> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m1k16v2( - (intptr_t)(srcBasePointer), 
memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<2, 16, 2, 2> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m2k16v2( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<2, 16, 4, 2> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m4k16v2( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<2, 16, 8, 2> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m8k16v2( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<2, 16, 16, 2> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m16k16v2( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<2, 16, 32, 2> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t 
coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u16_m32k16v2( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockTransform<2, 16, 16, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transform_u16_k16( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockTransform<2, 16, 32, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transform_u16_k32( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockTransform<2, 16, 16, 2> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transform_u16_k16v2( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockTransform<2, 16, 32, 2> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transform_u16_k32v2( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct 
XeSubgroup2DBlockStore<2, 16, 1, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* srcPointer) { - __builtin_IB_subgroup_block_write_flat_u16_m1k16v1( - (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(ushort *)(srcPointer)); - } -}; - -template<> -struct XeSubgroup2DBlockStore<2, 16, 2, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* srcPointer) { - __builtin_IB_subgroup_block_write_flat_u16_m2k16v1( - (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::ushort2 *)(srcPointer)); - } -}; - -template<> -struct XeSubgroup2DBlockStore<2, 16, 4, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* srcPointer) { - __builtin_IB_subgroup_block_write_flat_u16_m4k16v1( - (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::ushort4 *)(srcPointer)); - } -}; - -template<> -struct XeSubgroup2DBlockStore<2, 16, 8, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* srcPointer) { - __builtin_IB_subgroup_block_write_flat_u16_m8k16v1( - (intptr_t)(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::ushort8 *)(srcPointer)); - } -}; - -template<> -struct XeSubgroup2DBlockPrefetch<2, 16, 8, 1> { - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate) { - __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v1( - (intptr_t)(srcBasePointer), 
memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); - } -}; - -template<> -struct XeSubgroup2DBlockPrefetch<2, 16, 16, 1> { - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate) { - __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v1( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); - } -}; - -template<> -struct XeSubgroup2DBlockPrefetch<2, 16, 32, 1> { - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate) { - __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v1( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); - } -}; - -template<> -struct XeSubgroup2DBlockPrefetch<2, 16, 1, 2> { - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate) { - intel_sub_group_2d_block_prefetch_16b_1r16x2c( - (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockPrefetch<2, 16, 2, 2> { - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate) { - intel_sub_group_2d_block_prefetch_16b_2r16x2c( - (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockPrefetch<2, 16, 4, 2> { - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate) { - intel_sub_group_2d_block_prefetch_16b_4r16x2c( - (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); - } -}; - 
-template<> -struct XeSubgroup2DBlockPrefetch<2, 16, 8, 2> { - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate) { - __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v2( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); - } -}; - -template<> -struct XeSubgroup2DBlockPrefetch<2, 16, 16, 2> { - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate) { - __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v2( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); - } -}; - -template<> -struct XeSubgroup2DBlockPrefetch<2, 16, 32, 2> { - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate) { - __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v2( - (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, CacheControl::kL1C_L3C); - } -}; -#endif -} - -namespace cute -{ -struct XE_2D_U16x1x16_LD_N { - using BlockShape = Shape<_1, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 2, "Expected T to have size 2"); - detail::XeSubgroup2DBlockLoad<2, 16, 1, 1>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U16x2x16_LD_N { - using BlockShape = Shape<_2, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - 
static_assert(sizeof(T) == 2, "Expected T to have size 2"); - detail::XeSubgroup2DBlockLoad<2, 16, 2, 1>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U16x4x16_LD_N { - using BlockShape = Shape<_4, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 2, "Expected T to have size 2"); - detail::XeSubgroup2DBlockLoad<2, 16, 4, 1>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U16x8x16_LD_N { - using BlockShape = Shape<_8, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 2, "Expected T to have size 2"); - detail::XeSubgroup2DBlockLoad<2, 16, 8, 1>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(CUTE_ARCH_XE_ENABLED) - detail::XeSubgroup2DBlockPrefetch<2, 16, 8, 1>{}(baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U16x16x16_LD_N { - using BlockShape = Shape<_16, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 2, "Expected T to have size 2"); - detail::XeSubgroup2DBlockLoad<2, 16, 16, 
1>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(CUTE_ARCH_XE_ENABLED) - detail::XeSubgroup2DBlockPrefetch<2, 16, 16, 1>{}(baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U16x32x16_LD_N { - using BlockShape = Shape<_32, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 2, "Expected T to have size 2"); - detail::XeSubgroup2DBlockLoad<2, 16, 32, 1>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(CUTE_ARCH_XE_ENABLED) - detail::XeSubgroup2DBlockPrefetch<2, 16, 32, 1>{}(baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U16x1x32_LD_N { - using BlockShape = Shape<_1, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 2, "Expected T to have size 2"); - detail::XeSubgroup2DBlockLoad<2, 16, 1, 2>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, 
int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(CUTE_ARCH_XE_ENABLED) - detail::XeSubgroup2DBlockPrefetch<2, 16, 1, 2>{}(baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U16x2x32_LD_N { - using BlockShape = Shape<_2, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 2, "Expected T to have size 2"); - detail::XeSubgroup2DBlockLoad<2, 16, 2, 2>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(CUTE_ARCH_XE_ENABLED) - detail::XeSubgroup2DBlockPrefetch<2, 16, 2, 2>{}(baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U16x4x32_LD_N { - using BlockShape = Shape<_4, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 2, "Expected T to have size 2"); - detail::XeSubgroup2DBlockLoad<2, 16, 4, 2>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(CUTE_ARCH_XE_ENABLED) - detail::XeSubgroup2DBlockPrefetch<2, 16, 4, 2>{}(baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying 
to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U16x8x32_LD_N { - using BlockShape = Shape<_8, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 2, "Expected T to have size 2"); - detail::XeSubgroup2DBlockLoad<2, 16, 8, 2>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(CUTE_ARCH_XE_ENABLED) - detail::XeSubgroup2DBlockPrefetch<2, 16, 8, 2>{}(baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U16x16x32_LD_N { - using BlockShape = Shape<_16, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 2, "Expected T to have size 2"); - detail::XeSubgroup2DBlockLoad<2, 16, 16, 2>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(CUTE_ARCH_XE_ENABLED) - detail::XeSubgroup2DBlockPrefetch<2, 16, 16, 2>{}(baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U16x32x32_LD_N { - using BlockShape = Shape<_32, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int 
height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 2, "Expected T to have size 2"); - detail::XeSubgroup2DBlockLoad<2, 16, 32, 2>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(CUTE_ARCH_XE_ENABLED) - // __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v2( - detail::XeSubgroup2DBlockPrefetch<2, 16, 32, 2>{}(baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U16x16x16_LD_V { - using BlockShape = Shape<_16, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 2, "Expected T to have size 2"); - detail::XeSubgroup2DBlockTransform<2, 16, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(CUTE_ARCH_XE_ENABLED) - detail::XeSubgroup2DBlockPrefetch<2, 16, 16, 1>{}(baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U16x32x16_LD_V { - using BlockShape = Shape<_32, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 2, "Expected T to have size 2"); - 
detail::XeSubgroup2DBlockTransform<2, 16, 32, 1>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(CUTE_ARCH_XE_ENABLED) - detail::XeSubgroup2DBlockPrefetch<2, 16, 32, 1>{}(baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U16x16x32_LD_V { - using BlockShape = Shape<_16, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 2, "Expected T to have size 2"); - detail::XeSubgroup2DBlockTransform<2, 16, 16, 2>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(CUTE_ARCH_XE_ENABLED) - detail::XeSubgroup2DBlockPrefetch<2, 16, 16, 2>{}(baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U16x32x32_LD_V { - using BlockShape = Shape<_32, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 2, "Expected T to have size 2"); - detail::XeSubgroup2DBlockTransform<2, 16, 32, 2>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - 
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(CUTE_ARCH_XE_ENABLED) - detail::XeSubgroup2DBlockPrefetch<2, 16, 32, 2>{}(baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U16x16x8_LD_T { - using BlockShape = Shape<_8, _16>; - using inst_dtype = uint32_t; - - static constexpr bool is_transpose = true; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 2, "Expected T to have size 4"); - detail::XeSubgroup2DBlockTranspose<4, 4, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U16x16x16_LD_T { - using BlockShape = Shape<_16, _16>; - using inst_dtype = uint32_t; - - static constexpr bool is_transpose = true; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 2, "Expected T to have size 2"); - detail::XeSubgroup2DBlockTranspose<4, 8, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U16x1x16_ST_N { - using BlockShape = Shape<_1, _16>; - - template - CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, - int pitch, intel::coord_t coord, - const T *src) { -#if defined(CUTE_ARCH_XE_ENABLED) - // static_assert(sizeof(T) == 2, "Expected T to have size 2"); - detail::XeSubgroup2DBlockStore<2, 16, 1, 1>{}(baseoffset, width, height, pitch, coord, src); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block 
loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U16x2x16_ST_N { - using BlockShape = Shape<_2, _16>; - - template - CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, - int pitch, intel::coord_t coord, - const T *src) { -#if defined(CUTE_ARCH_XE_ENABLED) - // static_assert(sizeof(T) == 2, "Expected T to have size 2"); - detail::XeSubgroup2DBlockStore<2, 16, 2, 1>{}(baseoffset, width, height, pitch, coord, src); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U16x4x16_ST_N { - using BlockShape = Shape<_4, _16>; - - template - CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, - int pitch, intel::coord_t coord, - const T *src) { -#if defined(CUTE_ARCH_XE_ENABLED) - // static_assert(sizeof(T) == 2, "Expected T to have size 2"); - detail::XeSubgroup2DBlockStore<2, 16, 4, 1>{}(baseoffset, width, height, pitch, coord, src); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U16x8x16_ST_N { - using BlockShape = Shape<_8, _16>; - - template - CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, - int pitch, intel::coord_t coord, - const T *src) { -#if defined(CUTE_ARCH_XE_ENABLED) - // static_assert(sizeof(T) == 2, "Expected T to have size 2"); - detail::XeSubgroup2DBlockStore<2, 16, 8, 1>{}(baseoffset, width, height, pitch, coord, src); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; -} // end namespace cute diff --git a/include/cute/arch/xe_copy_4B.hpp b/include/cute/arch/xe_copy_4B.hpp deleted file mode 100644 index 827f6a006b..0000000000 --- a/include/cute/arch/xe_copy_4B.hpp +++ /dev/null @@ -1,908 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. 
- * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - **************************************************************************************************/ -#pragma once - -#include -#include -#include - -// 32bits specific for tf32 No transform No transpose -SYCL_DEVICE_BUILTIN( - cute::intel::uint __builtin_IB_subgroup_block_read_flat_u32_m1k8v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint __builtin_IB_subgroup_block_read_flat_u32_m2k8v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint2 __builtin_IB_subgroup_block_read_flat_u32_m4k8v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint4 __builtin_IB_subgroup_block_read_flat_u32_m8k8v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint8 __builtin_IB_subgroup_block_read_flat_u32_m16k8v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint16 __builtin_IB_subgroup_block_read_flat_u32_m32k8v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); - -SYCL_DEVICE_BUILTIN( - cute::intel::uint2 __builtin_IB_subgroup_block_read_flat_u32_m1k8v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint2 __builtin_IB_subgroup_block_read_flat_u32_m2k8v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint4 
__builtin_IB_subgroup_block_read_flat_u32_m4k8v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint8 __builtin_IB_subgroup_block_read_flat_u32_m8k8v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint16 __builtin_IB_subgroup_block_read_flat_u32_m16k8v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint32 __builtin_IB_subgroup_block_read_flat_u32_m32k8v2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); - -// 32bits No transform No transpose -SYCL_DEVICE_BUILTIN(cute::intel::uint __builtin_IB_subgroup_block_read_flat_u32_m1k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint2 __builtin_IB_subgroup_block_read_flat_u32_m2k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint4 __builtin_IB_subgroup_block_read_flat_u32_m4k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint8 __builtin_IB_subgroup_block_read_flat_u32_m8k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint16 __builtin_IB_subgroup_block_read_flat_u32_m16k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint32 
__builtin_IB_subgroup_block_read_flat_u32_m32k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); - -// 32bits No transform Transpose -SYCL_DEVICE_BUILTIN(cute::intel::uint __builtin_IB_subgroup_block_read_flat_transpose_u32_k1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint2 __builtin_IB_subgroup_block_read_flat_transpose_u32_k2( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint4 __builtin_IB_subgroup_block_read_flat_transpose_u32_k4( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); -SYCL_DEVICE_BUILTIN( - cute::intel::uint8 __builtin_IB_subgroup_block_read_flat_transpose_u32_k8( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); - -// 32bits -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u32_m1k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uint data)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u32_m2k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uint2 data)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u32_m4k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uint4 data)); -SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u32_m8k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord, cute::intel::uint8 data)); - -SYCL_DEVICE_OCL(void 
intel_sub_group_2d_block_prefetch_32b_16r8x1c( - __global void* base_address, int width, int height, int pitch, - cute::intel::coord_t coord)); - -namespace cute::detail { -#if defined(CUTE_ARCH_XE_BUILTIN_ENABLED) -template<> -struct XeSubgroup2DBlockLoad<4, 16, 1, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m1k16v1( - reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<4, 16, 2, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m2k16v1( - reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<4, 16, 4, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m4k16v1( - reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<4, 16, 8, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m8k16v1( - reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<4, 16, 16, 
1> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m16k16v1( - reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<4, 16, 32, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m32k16v1( - reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<4, 8, 1, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m1k8v1( - reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<4, 8, 2, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m2k8v1( - reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<4, 8, 4, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m4k8v1( 
- reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<4, 8, 8, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m8k8v1( - reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<4, 8, 16, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m16k8v1( - reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<4, 8, 32, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m32k8v1( - reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<4, 8, 1, 2> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m1k8v2( - reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<4, 8, 2, 2> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int 
memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m2k8v2( - reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<4, 8, 4, 2> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m4k8v2( - reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<4, 8, 8, 2> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m8k8v2( - reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<4, 8, 16, 2> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m16k8v2( - reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockLoad<4, 8, 32, 2> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u32_m32k8v2( - reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } 
-}; - -template<> -struct XeSubgroup2DBlockTranspose<4, 1, 16, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u32_k1( - reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockTranspose<4, 2, 16, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u32_k2( - reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockTranspose<4, 4, 16, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u32_k4( - reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockTranspose<4, 8, 16, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u32_k8( - reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); - } -}; - -template<> -struct XeSubgroup2DBlockStore<4, 16, 1, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, 
T* srcPointer) { - __builtin_IB_subgroup_block_write_flat_u32_m1k16v1( - reinterpret_cast(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(uint *)(srcPointer)); - } -}; - -template<> -struct XeSubgroup2DBlockStore<4, 16, 2, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* srcPointer) { - __builtin_IB_subgroup_block_write_flat_u32_m2k16v1( - reinterpret_cast(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uint2 *)(srcPointer)); - } -}; - -template<> -struct XeSubgroup2DBlockStore<4, 16, 4, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* srcPointer) { - __builtin_IB_subgroup_block_write_flat_u32_m4k16v1( - reinterpret_cast(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uint4 *)(srcPointer)); - } -}; - -template<> -struct XeSubgroup2DBlockStore<4, 16, 8, 1> { - template - CUTE_HOST_DEVICE - void operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* srcPointer) { - __builtin_IB_subgroup_block_write_flat_u32_m8k16v1( - reinterpret_cast(dstBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, *(intel::uint8 *)(srcPointer)); - } -}; - -template<> -struct XeSubgroup2DBlockPrefetch<4, 8, 16, 1> { - CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate) { - intel_sub_group_2d_block_prefetch_32b_16r8x1c( - (__global void*)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate); - } -}; -#endif -} // namespace cute::detail end - -namespace cute -{ -struct XE_2D_U32x1x16_LD_N { - using BlockShape = Shape<_1, _16>; - 
- template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - detail::XeSubgroup2DBlockLoad<4, 16, 1, 1>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U32x2x16_LD_N { - using BlockShape = Shape<_2, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - detail::XeSubgroup2DBlockLoad<4, 16, 2, 1>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U32x4x16_LD_N { - using BlockShape = Shape<_4, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - detail::XeSubgroup2DBlockLoad<4, 16, 4, 1>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U32x8x16_LD_N { - using BlockShape = Shape<_8, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - detail::XeSubgroup2DBlockLoad<4, 16, 8, 1>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U32x16x16_LD_N { - using BlockShape = 
Shape<_16, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - detail::XeSubgroup2DBlockLoad<4, 16, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U32x32x16_LD_N { - using BlockShape = Shape<_32, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - detail::XeSubgroup2DBlockLoad<4, 16, 32, 1>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_TF32x1x8_LD_N { - using BlockShape = Shape<_32, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - detail::XeSubgroup2DBlockLoad<4, 8, 1, 1>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_TF32x2x8_LD_N { - using BlockShape = Shape<_2, _8>; - using ValueShape = Shape<_1, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - detail::XeSubgroup2DBlockLoad<4, 8, 2, 1>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; 
- -struct XE_2D_TF32x4x8_LD_N { - using BlockShape = Shape<_4, _8>; - using ValueShape = Shape<_2, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - detail::XeSubgroup2DBlockLoad<4, 8, 4, 1>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_TF32x8x8_LD_N { - using BlockShape = Shape<_8, _8>; - using ValueShape = Shape<_4, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - detail::XeSubgroup2DBlockLoad<4, 8, 8, 1>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_TF32x16x8_LD_N { - using BlockShape = Shape<_16, _8>; - using ValueShape = Shape<_8, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - detail::XeSubgroup2DBlockLoad<4, 8, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_TF32x32x8_LD_N { - using BlockShape = Shape<_32, _8>; - using ValueShape = Shape<_16, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - detail::XeSubgroup2DBlockLoad<4, 
8, 32, 1>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_TF32x1x16_LD_N { - using BlockShape = Shape<_1, _16>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - detail::XeSubgroup2DBlockLoad<4, 8, 1, 2>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_TF32x2x16_LD_N { - using BlockShape = Shape<_2, _16>; - using ValueShape = Shape<_1, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - detail::XeSubgroup2DBlockLoad<4, 8, 2, 2>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_TF32x4x16_LD_N { - using BlockShape = Shape<_4, _16>; - using ValueShape = Shape<_2, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - detail::XeSubgroup2DBlockLoad<4, 8, 4, 2>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_TF32x8x16_LD_N { - using BlockShape = Shape<_8, _16>; - using ValueShape = Shape<_4, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if 
defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - detail::XeSubgroup2DBlockLoad<4, 8, 8, 2>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_TF32x16x16_LD_N { - using BlockShape = Shape<_16, _16>; - using ValueShape = Shape<_8, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - detail::XeSubgroup2DBlockLoad<4, 8, 16, 2>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(CUTE_ARCH_XE_ENABLED) - detail::XeSubgroup2DBlockPrefetch<4, 8, 16, 1>{}(baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_TF32x32x16_LD_N { - using BlockShape = Shape<_32, _16>; - using ValueShape = Shape<_16, _32>; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - detail::XeSubgroup2DBlockLoad<4, 8, 32, 2>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - - -struct XE_2D_U32x16x1_LD_T { - static constexpr bool is_transpose = true; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if 
defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - detail::XeSubgroup2DBlockTranspose<4, 1, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U32x16x2_LD_T { - using BlockShape = Shape<_2, _16>; - - static constexpr bool is_transpose = true; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - detail::XeSubgroup2DBlockTranspose<4, 2, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U32x16x4_LD_T { - using BlockShape = Shape<_4, _16>; - - static constexpr bool is_transpose = true; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - detail::XeSubgroup2DBlockTranspose<4, 4, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U32x16x8_LD_T { - using BlockShape = Shape<_8, _16>; - - static constexpr bool is_transpose = true; - - template - CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, - int height, int pitch, intel::coord_t coord, - T *dst) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - detail::XeSubgroup2DBlockTranspose<4, 8, 16, 1>{}(baseoffset, width, height, pitch, coord, dst); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } - - struct PREFETCH { - CUTE_HOST_DEVICE static 
void copy(const void *baseoffset, int width, - int height, int pitch, - intel::coord_t coord) { -#if defined(CUTE_ARCH_XE_ENABLED) - detail::XeSubgroup2DBlockPrefetch<4, 8, 16, 1>{}(baseoffset, width, height, pitch, coord); -#else - CUTE_INVALID_CONTROL_PATH( - "Trying to use block prefetch on non-Xe hardware"); -#endif - } - }; -}; - -struct XE_2D_U32x1x16_ST_N { - using BlockShape = Shape<_1, _16>; - - template - CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, - int pitch, intel::coord_t coord, - const T *src) { -#if defined(CUTE_ARCH_XE_ENABLED) - // static_assert(sizeof(T) == 4, "Expected T to have size 4"); - detail::XeSubgroup2DBlockStore<4, 16, 1, 1>{}(baseoffset, width, height, pitch, coord, src); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U32x2x16_ST_N { - using BlockShape = Shape<_2, _16>; - - template - CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, - int pitch, intel::coord_t coord, - const T *src) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - detail::XeSubgroup2DBlockStore<4, 16, 2, 1>{}(baseoffset, width, height, pitch, coord, src); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U32x4x16_ST_N { - using BlockShape = Shape<_4, _16>; - - template - CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, int height, - int pitch, intel::coord_t coord, - const T *src) { -#if defined(CUTE_ARCH_XE_ENABLED) - static_assert(sizeof(T) == 4, "Expected T to have size 4"); - detail::XeSubgroup2DBlockStore<4, 16, 4, 1>{}(baseoffset, width, height, pitch, coord, src); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -struct XE_2D_U32x8x16_ST_N { - using BlockShape = Shape<_8, _16>; - - template - CUTE_HOST_DEVICE static void copy(void *baseoffset, int width, 
int height, - int pitch, intel::coord_t coord, - const T *src) { -#if defined(CUTE_ARCH_XE_ENABLED) - // static_assert(sizeof(T) == 4, "Expected T to have size 4"); - detail::XeSubgroup2DBlockStore<4, 16, 8, 1>{}(baseoffset, width, height, pitch, coord, src); -#else - CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); -#endif - } -}; - -} // end namespace cute diff --git a/include/cute/atom/copy_traits_xe.hpp b/include/cute/atom/copy_traits_xe.hpp index 03b3f842fe..240ce7553d 100644 --- a/include/cute/atom/copy_traits_xe.hpp +++ b/include/cute/atom/copy_traits_xe.hpp @@ -408,9 +408,9 @@ CUTE_HOST_DEVICE constexpr auto make_fragment_layout(TiledCopy &tiled_copy, auto [mma_atom_shape, total_mma_atom_iters_M, total_mma_atom_iters_N] = fragment_top_level_shape; auto mma_atom_shape_2d = prepend<2>(mma_atom_shape, _1{}); - Int mma_atom_size_M = + auto mma_atom_size_M = Int(mma_atom_shape_2d) : size<1>(mma_atom_shape_2d)>{}; - Int mma_atom_size_N = + auto mma_atom_size_N = Int(mma_atom_shape_2d) : size<0>(mma_atom_shape_2d)>{}; using ThreadLayout_ = Shape<_1, Int>; @@ -418,15 +418,15 @@ CUTE_HOST_DEVICE constexpr auto make_fragment_layout(TiledCopy &tiled_copy, ThreadLayout_, decltype(cute::reverse(ThreadLayout_{}))>; auto thread_copy_shape = shape_div(typename TiledCopy::BlockShape{}, ThreadLayout{}); - Int copy_size_M = size<0>(thread_copy_shape); - Int copy_size_N = size<1>(thread_copy_shape); + auto copy_size_M = size<0>(thread_copy_shape); + auto copy_size_N = size<1>(thread_copy_shape); static_assert(copy_size_M >= mma_atom_size_M, "MMA atom larger than copy atom is not currently supported."); static_assert(copy_size_N >= mma_atom_size_N, "MMA atom larger than copy atom is not currently supported."); - Int mma_atom_iters_in_copy_M = copy_size_M / mma_atom_size_M; - Int mma_atom_iters_in_copy_N = copy_size_N / mma_atom_size_N; - Int copy_iters_M = total_mma_atom_iters_M / mma_atom_iters_in_copy_M; - Int copy_iters_N = total_mma_atom_iters_N / 
mma_atom_iters_in_copy_N; + auto mma_atom_iters_in_copy_M = copy_size_M / mma_atom_size_M; + auto mma_atom_iters_in_copy_N = copy_size_N / mma_atom_size_N; + auto copy_iters_M = total_mma_atom_iters_M / mma_atom_iters_in_copy_M; + auto copy_iters_N = total_mma_atom_iters_N / mma_atom_iters_in_copy_N; auto order = std::conditional_t, Step<_2, _4>, Step<_3, _5>>, @@ -2137,23 +2137,6 @@ struct Copy_Traits> { using RefLayout = DstLayout; }; -template -struct Copy_Traits> { - // Logical thread id to thread idx - using ThrID = Layout<_16>; - // Map from (src-thr,src-val) to bit - using SrcLayout = Layout::value>>, Stride<_0, _1>>; - // Map from (dst-thr,dst-val) to bit - using DstLayout = Layout::value>>, - Stride::value>, _1>>; - // Reference map from (thr,val) to bit - using RefLayout = DstLayout; - - template - CUTE_HOST_DEVICE - Copy_Traits(Copy_Traits const& traits) {} -}; - template struct Copy_Traits> { // Logical thread id to thread idx From 9864ab2ea6d51f7aa2443bdf0814e669f78b370e Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Thu, 8 May 2025 18:36:41 +0100 Subject: [PATCH 09/21] Fix cmake --- cmake/FindDPCPP.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/FindDPCPP.cmake b/cmake/FindDPCPP.cmake index 0a78b26519..9f45285cdc 100644 --- a/cmake/FindDPCPP.cmake +++ b/cmake/FindDPCPP.cmake @@ -62,7 +62,7 @@ endif() if("${DPCPP_SYCL_TARGET}" STREQUAL "intel_gpu_pvc" OR "${DPCPP_SYCL_TARGET}" STREQUAL "spir64" OR "${DPCPP_SYCL_TARGET}" STREQUAL "intel_gpu_bmg_g21") - if (CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 2025.2) + if ((CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 2025.2) OR CUTLASS_SYCL_BUILTIN_ENABLE) list(APPEND DPCPP_FLAGS "-Xspirv-translator;-spirv-ext=+SPV_INTEL_split_barrier") else() list(APPEND DPCPP_FLAGS "-Xspirv-translator;-spirv-ext=+SPV_INTEL_split_barrier,+SPV_INTEL_2d_block_io,+SPV_INTEL_subgroup_matrix_multiply_accumulate") 
From 39e549d18625a78bdc00ed3aa96dbc31870fe3d0 Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Thu, 8 May 2025 18:41:32 +0100 Subject: [PATCH 10/21] Re-enable test --- include/cute/arch/copy_xe_spirv.hpp | 17 ++++++++++++++++- test/unit/cute/intel_xe/copy_block.cpp | 2 -- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/include/cute/arch/copy_xe_spirv.hpp b/include/cute/arch/copy_xe_spirv.hpp index 0a59c47a26..73fd72dc83 100644 --- a/include/cute/arch/copy_xe_spirv.hpp +++ b/include/cute/arch/copy_xe_spirv.hpp @@ -33,11 +33,15 @@ #include #include "cute/config.hpp" -// TODO(Codeplay): This builtin is not available on SPIRV +// TODO(Codeplay): These builtins are not available on SPIRV SYCL_EXTERNAL extern "C" cute::intel::uint2 __builtin_IB_subgroup_block_read_flat_transpose_u32_k2( intptr_t baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, cute::intel::coord_t coord); +SYCL_EXTERNAL extern "C" +cute::intel::uint2 __builtin_IB_subgroup_block_read_flat_transpose_u32_k4( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord); // SPIRV copy definitions SYCL_EXTERNAL __attribute__((convergent)) void __spirv_Subgroup2DBlockLoadINTEL( @@ -131,4 +135,15 @@ struct XeSubgroup2DBlockTranspose<4, 2, 16, 1> { } }; +template<> +struct XeSubgroup2DBlockTranspose<4, 4, 16, 1> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u32_k4( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + } // namespace cute::detail end diff --git a/test/unit/cute/intel_xe/copy_block.cpp b/test/unit/cute/intel_xe/copy_block.cpp index aa9adfa6a5..b94e56fdfe 100644 --- a/test/unit/cute/intel_xe/copy_block.cpp +++ 
b/test/unit/cute/intel_xe/copy_block.cpp @@ -356,9 +356,7 @@ TEST(PVC_CuTe_Xe, block_2d_16bits_vnni) { } TEST(PVC_CuTe_Xe, block_2d_32bits_transpose) { - #if defined(CUTE_ARCH_COPY_XE_BUILTIN_ENABLED) copy_op{}(); copy_op{}(); - #endif copy_op{}(); } From d6c9358365b9c0ca61997693a01599bbdfb7561e Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Thu, 8 May 2025 19:07:25 +0100 Subject: [PATCH 11/21] Fix mma builtin --- include/cute/arch/mma_xe_builtin.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/cute/arch/mma_xe_builtin.hpp b/include/cute/arch/mma_xe_builtin.hpp index 856849cc49..504872f656 100644 --- a/include/cute/arch/mma_xe_builtin.hpp +++ b/include/cute/arch/mma_xe_builtin.hpp @@ -33,7 +33,7 @@ // mma_bf16 SYCL_EXTERNAL cute::intel::float8 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short8 a, cute::intel::int8 b, cute::intel::float8 acc); -SYCL_DEVICE_OCL cute::intel::float4 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short4 a, cute::intel::int8 b, cute::intel::float4 acc); +SYCL_EXTERNAL cute::intel::float4 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short4 a, cute::intel::int8 b, cute::intel::float4 acc); SYCL_EXTERNAL cute::intel::float2 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short2 a, cute::intel::int8 b, cute::intel::float2 acc); SYCL_EXTERNAL float intel_sub_group_bf16_bf16_matrix_mad_k16( short a, cute::intel::int8 b, float acc); // mma_half @@ -70,7 +70,7 @@ SYCL_EXTERNAL sycl::half intel_sub_group_f16_f16_matrix_mad_k16( namespace cute::detail { -template +template struct XeSubgroupMatrixMultiplyAccumulate { static_assert(dependent_false<>, "Unsupported MMA Configuration."); }; From ec9d0a7f1331316e1b684c806b32e47f3edbbdd3 Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Thu, 8 May 2025 19:15:24 +0100 Subject: [PATCH 12/21] Fix copy builtin --- include/cute/arch/copy_xe_spirv.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/include/cute/arch/copy_xe_spirv.hpp b/include/cute/arch/copy_xe_spirv.hpp index 73fd72dc83..2c188cc4b8 100644 --- a/include/cute/arch/copy_xe_spirv.hpp +++ b/include/cute/arch/copy_xe_spirv.hpp @@ -39,7 +39,7 @@ cute::intel::uint2 __builtin_IB_subgroup_block_read_flat_transpose_u32_k2( intptr_t baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, cute::intel::coord_t coord); SYCL_EXTERNAL extern "C" -cute::intel::uint2 __builtin_IB_subgroup_block_read_flat_transpose_u32_k4( +cute::intel::uint4 __builtin_IB_subgroup_block_read_flat_transpose_u32_k4( intptr_t baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, cute::intel::coord_t coord); @@ -141,7 +141,7 @@ struct XeSubgroup2DBlockTranspose<4, 4, 16, 1> { CUTE_HOST_DEVICE void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, cute::intel::coord_t coordinate, T* dstPointer) { - *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u32_k4( + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u32_k4( reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); } }; From 7144422fa9dd594d0426efad095bcc856a53607e Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Fri, 9 May 2025 12:15:56 +0100 Subject: [PATCH 13/21] Revert minor changes --- include/cute/atom/copy_traits_xe.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/cute/atom/copy_traits_xe.hpp b/include/cute/atom/copy_traits_xe.hpp index 240ce7553d..4ce7ed0d8f 100644 --- a/include/cute/atom/copy_traits_xe.hpp +++ b/include/cute/atom/copy_traits_xe.hpp @@ -408,9 +408,9 @@ CUTE_HOST_DEVICE constexpr auto make_fragment_layout(TiledCopy &tiled_copy, auto [mma_atom_shape, total_mma_atom_iters_M, total_mma_atom_iters_N] = fragment_top_level_shape; auto mma_atom_shape_2d = prepend<2>(mma_atom_shape, _1{}); - auto mma_atom_size_M = + Int 
mma_atom_size_M = Int(mma_atom_shape_2d) : size<1>(mma_atom_shape_2d)>{}; - auto mma_atom_size_N = + Int mma_atom_size_N = Int(mma_atom_shape_2d) : size<0>(mma_atom_shape_2d)>{}; using ThreadLayout_ = Shape<_1, Int>; @@ -418,15 +418,15 @@ CUTE_HOST_DEVICE constexpr auto make_fragment_layout(TiledCopy &tiled_copy, ThreadLayout_, decltype(cute::reverse(ThreadLayout_{}))>; auto thread_copy_shape = shape_div(typename TiledCopy::BlockShape{}, ThreadLayout{}); - auto copy_size_M = size<0>(thread_copy_shape); - auto copy_size_N = size<1>(thread_copy_shape); + Int copy_size_M = size<0>(thread_copy_shape); + Int copy_size_N = size<1>(thread_copy_shape); static_assert(copy_size_M >= mma_atom_size_M, "MMA atom larger than copy atom is not currently supported."); static_assert(copy_size_N >= mma_atom_size_N, "MMA atom larger than copy atom is not currently supported."); - auto mma_atom_iters_in_copy_M = copy_size_M / mma_atom_size_M; - auto mma_atom_iters_in_copy_N = copy_size_N / mma_atom_size_N; - auto copy_iters_M = total_mma_atom_iters_M / mma_atom_iters_in_copy_M; - auto copy_iters_N = total_mma_atom_iters_N / mma_atom_iters_in_copy_N; + Int mma_atom_iters_in_copy_M = copy_size_M / mma_atom_size_M; + Int mma_atom_iters_in_copy_N = copy_size_N / mma_atom_size_N; + Int copy_iters_M = total_mma_atom_iters_M / mma_atom_iters_in_copy_M; + Int copy_iters_N = total_mma_atom_iters_N / mma_atom_iters_in_copy_N; auto order = std::conditional_t, Step<_2, _4>, Step<_3, _5>>, From 4bbaaa62c4bfa8082400a42dc3c1cf30fcea5d19 Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Mon, 12 May 2025 12:27:42 +0100 Subject: [PATCH 14/21] Use builtin for prefetch --- include/cute/arch/copy_xe_U16.hpp | 1 - include/cute/arch/copy_xe_spirv.hpp | 441 +++++++++++++++++++++++++--- include/cute/atom/mma_traits_xe.hpp | 64 ---- 3 files changed, 398 insertions(+), 108 deletions(-) diff --git a/include/cute/arch/copy_xe_U16.hpp b/include/cute/arch/copy_xe_U16.hpp index 5deb8434d8..1a14351fbb 100644 --- 
a/include/cute/arch/copy_xe_U16.hpp +++ b/include/cute/arch/copy_xe_U16.hpp @@ -335,7 +335,6 @@ struct XE_2D_U16x32x32_LD_N { int height, int pitch, intel::coord_t coord) { #if defined(CUTE_ARCH_COPY_XE_ENABLED) - // __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v2( detail::XeSubgroup2DBlockPrefetch<2, 16, 32, 2>{}(baseoffset, width, height, pitch, coord); #else CUTE_INVALID_CONTROL_PATH( diff --git a/include/cute/arch/copy_xe_spirv.hpp b/include/cute/arch/copy_xe_spirv.hpp index 2c188cc4b8..d37db7cbe0 100644 --- a/include/cute/arch/copy_xe_spirv.hpp +++ b/include/cute/arch/copy_xe_spirv.hpp @@ -38,56 +38,199 @@ SYCL_EXTERNAL extern "C" cute::intel::uint2 __builtin_IB_subgroup_block_read_flat_transpose_u32_k2( intptr_t baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, cute::intel::coord_t coord); + SYCL_EXTERNAL extern "C" cute::intel::uint4 __builtin_IB_subgroup_block_read_flat_transpose_u32_k4( intptr_t baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, cute::intel::coord_t coord); +enum class CacheControl { + kDefault = 0, + kL1UC_L3UC = 1, // Override to L1 uncached and L3 uncached + kL1UC_L3C = 2, // Override to L1 uncached and L3 cached + kL1C_L3UC = 3, // Override to L1 cached and L3 uncached + kL1C_L3C = 4, // Override to L1 cached and L3 cached + kL1S_L3UC = 5, // Override to L1 streaming load and L3 uncached + kL1S_L3C = 6, // Override to L1 streaming load and L3 cached + kL1IAR_L3C = 7, // Override to L1 invalidate-after-read, and L3 cached +}; + +// U16 prefetch +SYCL_EXTERNAL extern "C" +void __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control); + +SYCL_EXTERNAL extern "C" +void __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, 
enum CacheControl cache_control); + +SYCL_EXTERNAL extern "C" +void __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control); + +SYCL_EXTERNAL extern "C" +void __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control); + +SYCL_EXTERNAL extern "C" +void __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control); + +SYCL_EXTERNAL extern "C" +void __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control); + +SYCL_EXTERNAL extern "C" +void __builtin_IB_subgroup_block_read_prefetch_u8_m1k32v1( + long baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control); + +SYCL_EXTERNAL extern "C" +void __builtin_IB_subgroup_block_read_prefetch_u8_m2k32v1( + long baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control); + +SYCL_EXTERNAL extern "C" +void __builtin_IB_subgroup_block_read_prefetch_u8_m4k32v1( + long baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control); + +SYCL_EXTERNAL extern "C" +void __builtin_IB_subgroup_block_read_prefetch_u8_m8k32v1( + long baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control); + +SYCL_EXTERNAL +void 
intel_sub_group_2d_block_prefetch_8b_1r32x2c( + __attribute__((opencl_global)) void *base_address, int width, int height, int pitch, + cute::intel::coord_t coord); + +SYCL_EXTERNAL +void intel_sub_group_2d_block_prefetch_8b_2r32x2c( + __attribute__((opencl_global)) void *base_address, int width, int height, int pitch, + cute::intel::coord_t coord); + +SYCL_EXTERNAL +void intel_sub_group_2d_block_prefetch_8b_4r32x2c( + __attribute__((opencl_global)) void *base_address, int width, int height, int pitch, + cute::intel::coord_t coord); + +SYCL_EXTERNAL +void intel_sub_group_2d_block_prefetch_8b_8r32x2c( + __attribute__((opencl_global)) void *base_address, int width, int height, int pitch, + cute::intel::coord_t coord); + +SYCL_EXTERNAL +void intel_sub_group_2d_block_prefetch_8b_32r16x1c( + __attribute__((opencl_global)) void *base_address, int width, int height, int pitch, + cute::intel::coord_t coord); + +SYCL_EXTERNAL +void __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control); + +SYCL_EXTERNAL +void __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control); + +SYCL_EXTERNAL extern "C" +void __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control); + +SYCL_EXTERNAL extern "C" +void __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control); + +SYCL_EXTERNAL extern "C" +void __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v2( + intptr_t baseoffset, int width_minus_one, int 
height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control); + +SYCL_EXTERNAL extern "C" +void __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v2( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, enum CacheControl cache_control); + +SYCL_EXTERNAL +void intel_sub_group_2d_block_prefetch_16b_1r16x2c( + __attribute__((opencl_global)) void *base_address, int width, int height, int pitch, + cute::intel::coord_t coord); + +SYCL_EXTERNAL +void intel_sub_group_2d_block_prefetch_16b_2r16x2c( + __attribute__((opencl_global)) void *base_address, int width, int height, int pitch, + cute::intel::coord_t coord); + +SYCL_EXTERNAL +void intel_sub_group_2d_block_prefetch_16b_4r16x2c( + __attribute__((opencl_global)) void *base_address, int width, int height, int pitch, + cute::intel::coord_t coord); + +SYCL_EXTERNAL +void intel_sub_group_2d_block_prefetch_32b_16r8x1c( + __attribute__((opencl_global)) void *base_address, int width, int height, int pitch, + cute::intel::coord_t coord); + // SPIRV copy definitions SYCL_EXTERNAL __attribute__((convergent)) void __spirv_Subgroup2DBlockLoadINTEL( - int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, - const void* src_base_pointer, int memory_width, int memory_height, - int memory_pitch, cute::intel::coord_t coordinate, void* dst_pointer); + int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, + const void *src_base_pointer, int memory_width, int memory_height, + int memory_pitch, cute::intel::coord_t coordinate, void *dst_pointer); + SYCL_EXTERNAL __attribute__((convergent)) void __spirv_Subgroup2DBlockLoadTransformINTEL( - int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, - const void* src_base_pointer, int memory_width, int memory_height, - int memory_pitch, cute::intel::coord_t coordinate, void* dst_pointer); + int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, + 
const void *src_base_pointer, int memory_width, int memory_height, + int memory_pitch, cute::intel::coord_t coordinate, void *dst_pointer); + SYCL_EXTERNAL __attribute__((convergent)) void __spirv_Subgroup2DBlockLoadTransposeINTEL( - int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, - const void* src_base_pointer, int memory_width, int memory_height, - int memory_pitch, cute::intel::coord_t coordinate, void* dst_pointer); + int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, + const void *src_base_pointer, int memory_width, int memory_height, + int memory_pitch, cute::intel::coord_t coordinate, void *dst_pointer); + SYCL_EXTERNAL __attribute__((convergent)) void __spirv_Subgroup2DBlockStoreINTEL( - int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, - void* src_pointer, const void* dst_base_pointer, int memory_width, - int memory_height, int memory_pitch, cute::intel::coord_t coordinate); + int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, + void *src_pointer, const void *dst_base_pointer, int memory_width, + int memory_height, int memory_pitch, cute::intel::coord_t coordinate); + SYCL_EXTERNAL __attribute__((convergent)) void __spirv_Subgroup2DBlockPrefetchINTEL( - int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, - const void* src_base_pointer, int memory_width, int memory_height, - int memory_pitch, cute::intel::coord_t coordinate); + int ElementSize, int BlockWidth, int BlockHeight, int BlockCount, + const void *src_base_pointer, int memory_width, int memory_height, + int memory_pitch, cute::intel::coord_t coordinate); namespace cute::detail { - template struct XeSubgroup2DBlockLoad { template CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { + void operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t 
coordinate, T *dstPointer) { __spirv_Subgroup2DBlockLoadINTEL(ElementSize, BlockWidth, BlockHeight, BlockCount, - srcBasePointer, memoryWidth, memoryHeight, memoryPitch, coordinate, - static_cast(dstPointer)); - } + srcBasePointer, memoryWidth, + memoryHeight, memoryPitch, coordinate, + static_cast(dstPointer)); + } }; template struct XeSubgroup2DBlockTransform { template CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { + void operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T *dstPointer) { __spirv_Subgroup2DBlockLoadTransformINTEL(ElementSize, BlockWidth, BlockHeight, BlockCount, - srcBasePointer, memoryWidth, memoryHeight, memoryPitch, coordinate, - static_cast(dstPointer)); + srcBasePointer, memoryWidth, + memoryHeight, memoryPitch, coordinate, + static_cast(dstPointer)); } }; @@ -95,32 +238,36 @@ template struct XeSubgroup2DBlockTranspose { template CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { + void operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T *dstPointer) { __spirv_Subgroup2DBlockLoadTransposeINTEL(ElementSize, BlockWidth, BlockHeight, BlockCount, - srcBasePointer, memoryWidth, memoryHeight, memoryPitch, coordinate, - static_cast(dstPointer)); + srcBasePointer, memoryWidth, + memoryHeight, memoryPitch, coordinate, + static_cast(dstPointer)); } }; template struct XeSubgroup2DBlockPrefetch { CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate) { - __spirv_Subgroup2DBlockPrefetchINTEL(ElementSize, BlockWidth, BlockHeight, BlockCount, - srcBasePointer, memoryWidth, 
memoryHeight, memoryPitch, coordinate); - } + void operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __spirv_Subgroup2DBlockPrefetchINTEL(ElementSize, BlockWidth, BlockHeight, BlockCount, + srcBasePointer, memoryWidth, + memoryHeight, memoryPitch, coordinate); + } }; template struct XeSubgroup2DBlockStore { template CUTE_HOST_DEVICE - void operator()(const void* dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* srcPointer) { + void operator()(const void *dstBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T *srcPointer) { __spirv_Subgroup2DBlockStoreINTEL(ElementSize, BlockWidth, BlockHeight, BlockCount, - (void*)(srcPointer), dstBasePointer, memoryWidth, memoryHeight, memoryPitch, coordinate); + (void *)(srcPointer), dstBasePointer, + memoryWidth, memoryHeight, + memoryPitch, coordinate); } }; @@ -128,10 +275,10 @@ template<> struct XeSubgroup2DBlockTranspose<4, 2, 16, 1> { template CUTE_HOST_DEVICE - void operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { + void operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T *dstPointer) { *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u32_k2( - reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); } }; @@ -139,11 +286,219 @@ template<> struct XeSubgroup2DBlockTranspose<4, 4, 16, 1> { template CUTE_HOST_DEVICE void - operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, - cute::intel::coord_t coordinate, T* dstPointer) { + operator()(const void *srcBasePointer, int 
memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T *dstPointer) { *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_transpose_u32_k4( - reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 1, 1> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u8_m1k32v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, + CacheControl::kL1C_L3C); } }; +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 2, 1> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u8_m2k32v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, + CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 4, 1> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u8_m4k32v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, + CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 8, 1> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u8_m8k32v1( + reinterpret_cast(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, coordinate, + 
CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 1, 2> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_8b_1r32x2c( + (__attribute__((opencl_global)) void*)(srcBasePointer), memoryWidth, memoryHeight, + memoryPitch, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 2, 2> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_8b_2r32x2c( + (__attribute__((opencl_global)) void *)(srcBasePointer), memoryWidth, memoryHeight, + memoryPitch, coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 4, 2> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_8b_4r32x2c( + (__attribute__((opencl_global)) void *)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, + coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 32, 8, 2> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_8b_8r32x2c( + (__attribute__((opencl_global)) void *)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, + coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<1, 16, 32, 1> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_8b_32r16x1c( + (__attribute__((opencl_global)) void *)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, + coordinate); + } +}; + +template<> +struct 
XeSubgroup2DBlockPrefetch<2, 16, 8, 1> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, + CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 16, 1> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, + CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 32, 1> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v1( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, + CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 1, 2> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_16b_1r16x2c( + (__attribute__((opencl_global)) void *)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, + coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 2, 2> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_16b_2r16x2c( + (__attribute__((opencl_global)) void *)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, + coordinate); + } +}; + +template<> 
+struct XeSubgroup2DBlockPrefetch<2, 16, 4, 2> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_16b_4r16x2c( + (__attribute__((opencl_global)) void *)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, + coordinate); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 8, 2> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v2( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, + CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 16, 2> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v2( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, + CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<2, 16, 32, 2> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v2( + reinterpret_cast(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate, + CacheControl::kL1C_L3C); + } +}; + +template<> +struct XeSubgroup2DBlockPrefetch<4, 8, 16, 1> { + CUTE_HOST_DEVICE void + operator()(const void *srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate) { + intel_sub_group_2d_block_prefetch_32b_16r8x1c( + (__attribute__((opencl_global)) void *)(srcBasePointer), memoryWidth, memoryHeight, memoryPitch, + coordinate); + } +}; } // 
namespace cute::detail end diff --git a/include/cute/atom/mma_traits_xe.hpp b/include/cute/atom/mma_traits_xe.hpp index c0f8355bf6..f99e171954 100644 --- a/include/cute/atom/mma_traits_xe.hpp +++ b/include/cute/atom/mma_traits_xe.hpp @@ -165,70 +165,6 @@ struct MMA_Traits using CLayout = Layout, Stride<_1, _1>>; }; -template <> -struct MMA_Traits -{ - using ValTypeD = bfloat16_t; - using ValTypeA = bfloat16_t; - using ValTypeB = bfloat16_t; - using ValTypeC = bfloat16_t; - - using Shape_MNK = Shape<_8,_16,_16>; - using ThrID = Layout<_16>; - - using ALayout = Layout, Stride<_8, _1>>; - using BLayout = Layout, Stride<_1, _16>>; - using CLayout = Layout, Stride<_8, _1>>; -}; - -template <> -struct MMA_Traits -{ - using ValTypeD = bfloat16_t; - using ValTypeA = bfloat16_t; - using ValTypeB = bfloat16_t; - using ValTypeC = bfloat16_t; - - using Shape_MNK = Shape<_4,_16,_16>; - using ThrID = Layout<_16>; - - using ALayout = Layout, Stride<_4, _1>>; - using BLayout = Layout, Stride<_1, _16>>; - using CLayout = Layout, Stride<_4, _1>>; -}; - -template <> -struct MMA_Traits -{ - using ValTypeD = bfloat16_t; - using ValTypeA = bfloat16_t; - using ValTypeB = bfloat16_t; - using ValTypeC = bfloat16_t; - - using Shape_MNK = Shape<_2,_16,_16>; - using ThrID = Layout<_16>; - - using ALayout = Layout, Stride<_2, _1>>; - using BLayout = Layout, Stride<_1, _16>>; - using CLayout = Layout, Stride<_2, _1>>; -}; - -template <> -struct MMA_Traits -{ - using ValTypeD = bfloat16_t; - using ValTypeA = bfloat16_t; - using ValTypeB = bfloat16_t; - using ValTypeC = bfloat16_t; - - using Shape_MNK = Shape<_1,_16,_16>; - using ThrID = Layout<_16>; - - using ALayout = Layout, Stride<_1, _1>>; - using BLayout = Layout, Stride<_1, _16>>; - using CLayout = Layout, Stride<_1, _1>>; -}; - template <> struct MMA_Traits { From 304de177c69d321df0387f66dfe745fb37412edc Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Tue, 13 May 2025 15:11:09 +0100 Subject: [PATCH 15/21] Remove FP16 MMA with FP16 
accumulator --- include/cute/arch/mma_xe.hpp | 83 ---------------------------- include/cute/arch/mma_xe_builtin.hpp | 16 +----- include/cute/arch/mma_xe_spirv.hpp | 15 ----- include/cute/atom/mma_traits_xe.hpp | 60 -------------------- include/cute/util/sycl_vec.hpp | 4 -- test/unit/cute/intel_xe/mma.cpp | 20 ------- 6 files changed, 1 insertion(+), 197 deletions(-) diff --git a/include/cute/arch/mma_xe.hpp b/include/cute/arch/mma_xe.hpp index 5b7371ab60..f67777f101 100644 --- a/include/cute/arch/mma_xe.hpp +++ b/include/cute/arch/mma_xe.hpp @@ -299,89 +299,6 @@ struct XE_1x16x16_F32F16F16F32_TT } }; -struct XE_8x16x16_F16F16F16F16_TT -{ - using DRegisters = intel::half8[1]; - using ARegisters = intel::short8[1]; - using BRegisters = intel::int8[1]; - using CRegisters = intel::half8[1]; - - CUTE_HOST_DEVICE static void - fma(intel::half8 & d, - intel::short8 const& a, - intel::int8 const& b, - intel::half8 const& c) - { -#if defined(CUTE_ARCH_MMA_XE_ENABLED) - d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); -#else - CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x16_F16F16F16F16_TT on non-PVC hardware"); -#endif - } -}; - -struct XE_4x16x16_F16F16F16F16_TT -{ - using DRegisters = intel::half4[1]; - using ARegisters = intel::short4[1]; - using BRegisters = intel::int8[1]; - using CRegisters = intel::half4[1]; - - CUTE_HOST_DEVICE static void - fma(intel::half4 & d, - intel::short4 const& a, - intel::int8 const& b, - intel::half4 const& c) - { -#if defined(CUTE_ARCH_MMA_XE_ENABLED) - d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); -#else - CUTE_INVALID_CONTROL_PATH("Attempting to use XE_4x16x16_F16F16F16F16_TT on non-PVC hardware"); -#endif - } -}; - -struct XE_2x16x16_F16F16F16F16_TT -{ - using DRegisters = sycl::half2[1]; - using ARegisters = intel::short2[1]; - using BRegisters = intel::int8[1]; - using CRegisters = sycl::half2[1]; - - CUTE_HOST_DEVICE static void - fma(sycl::half2 & d, - intel::short2 const& a, - intel::int8 const& b, - 
sycl::half2 const& c) - { -#if defined(CUTE_ARCH_MMA_XE_ENABLED) - d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); -#else - CUTE_INVALID_CONTROL_PATH("Attempting to use XE_2x16x16_F16F16F16F16_TT on non-PVC hardware"); -#endif - } -}; - -struct XE_1x16x16_F16F16F16F16_TT -{ - using DRegisters = half_t[1]; - using ARegisters = short[1]; - using BRegisters = intel::int8[1]; - using CRegisters = half_t[1]; - - CUTE_HOST_DEVICE static void - fma(half_t & d, - short const& a, - intel::int8 const& b, - half_t const& c) - { -#if defined(CUTE_ARCH_MMA_XE_ENABLED) - d = detail::XeSubgroupMatrixMultiplyAccumulate{}(a, b, c); -#else - CUTE_INVALID_CONTROL_PATH("Attempting to use XE_1x16x16_F16F16F16F16_TT on non-PVC hardware"); -#endif - } -}; //MxNxK_A,B,C,D //# of vector component of a x subgroup-size x function name //float8 intel_sub_group_i8_i8_matrix_mad_k16(short8 a, int8 b, float8 acc); diff --git a/include/cute/arch/mma_xe_builtin.hpp b/include/cute/arch/mma_xe_builtin.hpp index 504872f656..1e9716a9ef 100644 --- a/include/cute/arch/mma_xe_builtin.hpp +++ b/include/cute/arch/mma_xe_builtin.hpp @@ -61,11 +61,6 @@ SYCL_EXTERNAL cute::intel::short8 intel_sub_group_bf16_bf16_matrix_mad_k16(cute: SYCL_EXTERNAL cute::intel::short4 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short4 a, cute::intel::int8 b, cute::intel::short4 acc); SYCL_EXTERNAL cute::intel::short2 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short2 a, cute::intel::int8 b, cute::intel::short2 acc); SYCL_EXTERNAL short intel_sub_group_bf16_bf16_matrix_mad_k16( short a, cute::intel::int8 b, short acc); -// mma_half with half accumulator: -SYCL_EXTERNAL cute::intel::half8 intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short8 a, cute::intel::int8 b, cute::intel::half8 acc); -SYCL_EXTERNAL cute::intel::half4 intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short4 a, cute::intel::int8 b, cute::intel::half4 acc); -SYCL_EXTERNAL cute::intel::half2 
intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short2 a, cute::intel::int8 b, cute::intel::half2 acc); -SYCL_EXTERNAL sycl::half intel_sub_group_f16_f16_matrix_mad_k16( short a, cute::intel::int8 b, sycl::half acc); namespace cute::detail { @@ -101,16 +96,7 @@ struct XeSubgroupMatrixMultiplyAccumulate -struct XeSubgroupMatrixMultiplyAccumulate { - template - CUTE_HOST_DEVICE - auto operator()(ARegisters a, BRegisters b, CRegisters c) { - return intel_sub_group_f16_f16_matrix_mad_k16(a, b, c); - } -}; - + template<> struct XeSubgroupMatrixMultiplyAccumulate { template diff --git a/include/cute/arch/mma_xe_spirv.hpp b/include/cute/arch/mma_xe_spirv.hpp index 0f32e51f3a..d565057655 100644 --- a/include/cute/arch/mma_xe_spirv.hpp +++ b/include/cute/arch/mma_xe_spirv.hpp @@ -56,11 +56,6 @@ SYCL_EXTERNAL cute::intel::short4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL( SYCL_EXTERNAL cute::intel::short2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short2, cute::intel::int8, cute::intel::short2, int32_t); SYCL_EXTERNAL short __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, short, cute::intel::int8, short, int32_t); -SYCL_EXTERNAL cute::intel::half8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short8, cute::intel::int8, cute::intel::half8, int32_t); -SYCL_EXTERNAL cute::intel::half4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short4, cute::intel::int8, cute::intel::half4, int32_t); -SYCL_EXTERNAL cute::intel::half2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short2, cute::intel::int8, cute::intel::half2, int32_t); -SYCL_EXTERNAL sycl::half __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, short, cute::intel::int8, sycl::half, int32_t); - struct SPIRV_MMAOperands { static constexpr int SPIRV_MatrixASigned = 0x1; static constexpr int SPIRV_MatrixBSigned = 0x2; @@ -114,16 +109,6 @@ struct XeSubgroupMatrixMultiplyAccumulate -struct XeSubgroupMatrixMultiplyAccumulate { 
- template - CUTE_HOST_DEVICE - auto operator()(ARegisters a, BRegisters b, CRegisters c) { - return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(16, a, b, c, - SPIRV_MMAOperands::SPIRV_MatrixAFp16 | SPIRV_MMAOperands::SPIRV_MatrixBFp16); - } -}; - template<> struct XeSubgroupMatrixMultiplyAccumulate { template diff --git a/include/cute/atom/mma_traits_xe.hpp b/include/cute/atom/mma_traits_xe.hpp index f99e171954..5c64b17965 100644 --- a/include/cute/atom/mma_traits_xe.hpp +++ b/include/cute/atom/mma_traits_xe.hpp @@ -225,66 +225,6 @@ struct MMA_Traits using CLayout = Layout, Stride<_1, _1>>; }; -template <> -struct MMA_Traits -{ - using ValTypeD = half_t; - using ValTypeA = half_t; - using ValTypeB = half_t; - using ValTypeC = half_t; - - using Shape_MNK = Shape<_8,_16,_16>; - using ThrID = Layout<_16>; - using ALayout = Layout, Stride<_8, _1>>; - using BLayout = Layout, Stride<_1, _16>>; - using CLayout = Layout, Stride<_8, _1>>; -}; - -template <> -struct MMA_Traits -{ - using ValTypeD = half_t; - using ValTypeA = half_t; - using ValTypeB = half_t; - using ValTypeC = half_t; - - using Shape_MNK = Shape<_4,_16,_16>; - using ThrID = Layout<_16>; - using ALayout = Layout, Stride<_4, _1>>; - using BLayout = Layout, Stride<_1, _16>>; - using CLayout = Layout, Stride<_4, _1>>; -}; - -template <> -struct MMA_Traits -{ - using ValTypeD = half_t; - using ValTypeA = half_t; - using ValTypeB = half_t; - using ValTypeC = half_t; - - using Shape_MNK = Shape<_2,_16,_16>; - using ThrID = Layout<_16>; - using ALayout = Layout, Stride<_2, _1>>; - using BLayout = Layout, Stride<_1, _16>>; - using CLayout = Layout, Stride<_2, _1>>; -}; - -template <> -struct MMA_Traits -{ - using ValTypeD = half_t; - using ValTypeA = half_t; - using ValTypeB = half_t; - using ValTypeC = half_t; - - using Shape_MNK = Shape<_1,_16,_16>; - using ThrID = Layout<_16>; - using ALayout = Layout, Stride<_1, _1>>; - using BLayout = Layout, Stride<_1, _16>>; - using CLayout = Layout, Stride<_1, _1>>; -}; - 
template <> struct MMA_Traits { diff --git a/include/cute/util/sycl_vec.hpp b/include/cute/util/sycl_vec.hpp index 8428274fef..fdaba345a0 100644 --- a/include/cute/util/sycl_vec.hpp +++ b/include/cute/util/sycl_vec.hpp @@ -57,10 +57,6 @@ using float2 = vector_t; using float4 = vector_t; using float8 = vector_t; -using half2 = vector_t<_Float16, 2>; -using half4 = vector_t<_Float16, 4>; -using half8 = vector_t<_Float16, 8>; - using short2 = vector_t; using short4 = vector_t; using short8 = vector_t; diff --git a/test/unit/cute/intel_xe/mma.cpp b/test/unit/cute/intel_xe/mma.cpp index 5589310f61..59d91c9df8 100755 --- a/test/unit/cute/intel_xe/mma.cpp +++ b/test/unit/cute/intel_xe/mma.cpp @@ -302,26 +302,6 @@ TEST(PVC_CuTe_Xe, MMA_XE_1x16x16_F32F16F16F32_TT) { MMA_Test (512, 512, 256); } -#if defined(CUTE_ARCH_MMA_XE_SPIRV_ENABLED) -TEST(PVC_CuTe_Xe, MMA_XE_8x16x16_F16F16F16F16_TT) { - MMA_Test - (512, 512, 256); -} - -TEST(PVC_CuTe_Xe, MMA_XE_4x16x16_F16F16F16F16_TT) { - MMA_Test - (512, 512, 256); -} - -TEST(PVC_CuTe_Xe, MMA_XE_2x16x16_F16F16F16F16_TT) { - MMA_Test - (512, 512, 256); -} -#endif -TEST(PVC_CuTe_Xe, MMA_XE_1x16x16_F16F16F16F16_TT) { - MMA_Test - (512, 512, 256); -} TEST(PVC_CuTe_Xe, FMA_XE_UniversalFMA_F32F32F32F32) { MMA_Test, 64, 64, 8, 16, 16, float, From a2c45b1af93a759647df6419593c3adb8b5882fb Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Wed, 14 May 2025 18:10:41 +0100 Subject: [PATCH 16/21] Add U8 copy operation for K16 MMA --- .../02_pvc_gemm_mixed_dtype.cpp | 2 +- include/cute/arch/copy_xe_U8.hpp | 40 ++- include/cute/arch/copy_xe_builtin.hpp | 16 +- include/cute/arch/copy_xe_spirv.hpp | 16 + include/cute/atom/copy_traits_xe.hpp | 318 ++++++++++++------ include/cute/util/sycl_vec.hpp | 1 + test/unit/cute/intel_xe/copy_block.cpp | 10 +- .../device/default_gemm_configuration.hpp | 4 +- ...f16n_f32t_mixed_input_tensor_op_f32_xe.cpp | 2 +- 9 files changed, 284 insertions(+), 125 deletions(-) diff --git 
a/examples/sycl/02_pvc_gemm_mixed_dtype/02_pvc_gemm_mixed_dtype.cpp b/examples/sycl/02_pvc_gemm_mixed_dtype/02_pvc_gemm_mixed_dtype.cpp index 617f1eca64..ae6819b77a 100644 --- a/examples/sycl/02_pvc_gemm_mixed_dtype/02_pvc_gemm_mixed_dtype.cpp +++ b/examples/sycl/02_pvc_gemm_mixed_dtype/02_pvc_gemm_mixed_dtype.cpp @@ -535,7 +535,7 @@ int main(int argc, const char** argv) using ElementScale = MmaType; // Note: XE_2D_U18x32x32_LD_N is incompatible with our bf16 MMA atoms - using GmemTiledCopyA = XE_2D_U8x32x32_LD_V; // U8 (1-byte) block copy for A (narrower type) + using GmemTiledCopyA = XE_2D_U8x32x32_LD_N; // U8 (1-byte) block copy for A (narrower type) using GmemTiledCopyB = XE_2D_U16x32x32_LD_V; // U16 (2-byte) block copy for B (wider type) static_assert(sizeof(ElementInputA) == 1, "ElementA width must match GmemTiledCopyA U8"); diff --git a/include/cute/arch/copy_xe_U8.hpp b/include/cute/arch/copy_xe_U8.hpp index f3a2d574ab..9400777659 100644 --- a/include/cute/arch/copy_xe_U8.hpp +++ b/include/cute/arch/copy_xe_U8.hpp @@ -35,7 +35,7 @@ namespace cute { -struct XE_2D_U8x1x32_LD_N { +struct XE_2D_Packed_U8x1x32_LD_N { using BlockShape = Shape<_1, _32>; using inst_dtype = int8_t; @@ -65,7 +65,7 @@ struct XE_2D_U8x1x32_LD_N { }; }; -struct XE_2D_U8x2x32_LD_N { +struct XE_2D_Packed_U8x2x32_LD_N { using BlockShape = Shape<_2, _32>; using inst_dtype = int8_t; @@ -111,7 +111,7 @@ struct XE_2D_U8x2x32_ST_N { } }; -struct XE_2D_U8x4x32_LD_N { +struct XE_2D_Packed_U8x4x32_LD_N { using BlockShape = Shape<_4, _32>; template @@ -140,7 +140,7 @@ struct XE_2D_U8x4x32_LD_N { }; }; -struct XE_2D_U8x8x32_LD_N { +struct XE_2D_Packed_U8x8x32_LD_N { using BlockShape = Shape<_8, _32>; template @@ -169,7 +169,7 @@ struct XE_2D_U8x8x32_LD_N { }; }; -struct XE_2D_U8x16x32_LD_N { +struct XE_2D_Packed_U8x16x32_LD_N { using BlockShape = Shape<_16, _32>; template @@ -198,7 +198,7 @@ struct XE_2D_U8x16x32_LD_N { }; }; -struct XE_2D_U8x32x32_LD_N { +struct XE_2D_Packed_U8x32x32_LD_N { using 
BlockShape = Shape<_32, _32>; template @@ -214,7 +214,23 @@ struct XE_2D_U8x32x32_LD_N { } }; -struct XE_2D_U8x1x64_LD_N { +struct XE_2D_U8x32x32_LD_N { + using BlockShape = Shape<_32, _32>; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(CUTE_ARCH_COPY_XE_ENABLED) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + detail::XeSubgroup2DBlockLoad<1, 16, 32, 2>{}(baseoffset, width, height, pitch, coord, dst); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); +#endif + } +}; + +struct XE_2D_Packed_U8x1x64_LD_N { using BlockShape = Shape<_1, _64>; template @@ -243,7 +259,7 @@ struct XE_2D_U8x1x64_LD_N { }; }; -struct XE_2D_U8x2x64_LD_N { +struct XE_2D_Packed_U8x2x64_LD_N { using BlockShape = Shape<_2, _64>; template @@ -272,7 +288,7 @@ struct XE_2D_U8x2x64_LD_N { }; }; -struct XE_2D_U8x4x64_LD_N { +struct XE_2D_Packed_U8x4x64_LD_N { using BlockShape = Shape<_4, _64>; template @@ -301,7 +317,7 @@ struct XE_2D_U8x4x64_LD_N { }; }; -struct XE_2D_U8x8x64_LD_N { +struct XE_2D_Packed_U8x8x64_LD_N { using BlockShape = Shape<_8, _64>; template @@ -330,7 +346,7 @@ struct XE_2D_U8x8x64_LD_N { }; }; -struct XE_2D_U8x16x64_LD_N { +struct XE_2D_Packed_U8x16x64_LD_N { using BlockShape = Shape<_16, _64>; template @@ -359,7 +375,7 @@ struct XE_2D_U8x16x64_LD_N { }; }; -struct XE_2D_U8x32x64_LD_N { +struct XE_2D_Packed_U8x32x64_LD_N { using BlockShape = Shape<_32, _64>; template diff --git a/include/cute/arch/copy_xe_builtin.hpp b/include/cute/arch/copy_xe_builtin.hpp index a6404475eb..d3e8d7ea80 100644 --- a/include/cute/arch/copy_xe_builtin.hpp +++ b/include/cute/arch/copy_xe_builtin.hpp @@ -146,7 +146,10 @@ SYCL_DEVICE_BUILTIN( cute::intel::ushort64 __builtin_IB_subgroup_block_read_flat_u8_m32k32v2( intptr_t baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, cute::intel::coord_t coord)); - 
+SYCL_DEVICE_BUILTIN( + cute::intel::uchar64 __builtin_IB_subgroup_block_read_flat_u8_m32k16v2( + long baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); // 8bits VNNI transform No transpose SYCL_DEVICE_BUILTIN( @@ -523,6 +526,17 @@ struct XeSubgroup2DBlockLoad<1, 32, 32, 1> { } }; +template<> +struct XeSubgroup2DBlockLoad<1, 16, 32, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m32k16v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate); + } +}; + template<> struct XeSubgroup2DBlockLoad<1, 32, 1, 2> { template diff --git a/include/cute/arch/copy_xe_spirv.hpp b/include/cute/arch/copy_xe_spirv.hpp index d37db7cbe0..9abdf93e5f 100644 --- a/include/cute/arch/copy_xe_spirv.hpp +++ b/include/cute/arch/copy_xe_spirv.hpp @@ -34,6 +34,11 @@ #include "cute/config.hpp" // TODO(Codeplay): These builtins are not available on SPIRV +SYCL_EXTERNAL extern "C" +cute::intel::uchar64 __builtin_IB_subgroup_block_read_flat_u8_m32k16v2( + long baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord); + SYCL_EXTERNAL extern "C" cute::intel::uint2 __builtin_IB_subgroup_block_read_flat_transpose_u32_k2( intptr_t baseoffset, int width_minus_one, int height_minus_one, @@ -271,6 +276,17 @@ struct XeSubgroup2DBlockStore { } }; +template<> +struct XeSubgroup2DBlockLoad<1, 16, 32, 2> { + template + CUTE_HOST_DEVICE void + operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch, + cute::intel::coord_t coordinate, T* dstPointer) { + *reinterpret_cast(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m32k16v2( + (intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, 
coordinate); + } +}; + template<> struct XeSubgroup2DBlockTranspose<4, 2, 16, 1> { template diff --git a/include/cute/atom/copy_traits_xe.hpp b/include/cute/atom/copy_traits_xe.hpp index 83cc6f63ae..293e7c2504 100644 --- a/include/cute/atom/copy_traits_xe.hpp +++ b/include/cute/atom/copy_traits_xe.hpp @@ -86,6 +86,130 @@ auto get_logical_layout(LayoutIn &&, BlockShape &&) { } } // end namespace detail +template +struct choose_prefetch_for_type { + static_assert(dependent_false<>, "Invalid prefetch"); +}; + +// U4 +template <> +struct choose_prefetch_for_type<4, 1> { + using Prefetch = XE_2D_Packed_U8x1x32_LD_N; +}; + +template <> +struct choose_prefetch_for_type<4, 2> { + using Prefetch = XE_2D_Packed_U8x2x32_LD_N; +}; + +template <> +struct choose_prefetch_for_type<4, 8> { + using Prefetch = XE_2D_Packed_U8x8x32_LD_N; +}; + +template <> +struct choose_prefetch_for_type<4, 16> { + using Prefetch = XE_2D_Packed_U8x16x32_LD_N; +}; + +template <> +struct choose_prefetch_for_type<4, 32> { + using Prefetch = XE_2D_Packed_U8x32x32_LD_N; +}; + +// U8 +template <> +struct choose_prefetch_for_type<8, 1> { + using Prefetch = XE_2D_Packed_U8x1x64_LD_N; +}; + +template <> +struct choose_prefetch_for_type<8, 2> { + using Prefetch = XE_2D_Packed_U8x2x64_LD_N; +}; + +template <> +struct choose_prefetch_for_type<8, 4> { + using Prefetch = XE_2D_Packed_U8x4x64_LD_N; +}; + +template <> +struct choose_prefetch_for_type<8, 8> { + using Prefetch = XE_2D_Packed_U8x8x64_LD_N; +}; + +template <> +struct choose_prefetch_for_type<8, 16> { + using Prefetch = XE_2D_Packed_U8x16x64_LD_N; +}; + +template <> +struct choose_prefetch_for_type<8, 32> { + using Prefetch = XE_2D_Packed_U8x32x64_LD_N; +}; + +// U16 +template <> +struct choose_prefetch_for_type<16, 1> { + using Prefetch = XE_2D_U16x1x32_LD_N; +}; + +template <> +struct choose_prefetch_for_type<16, 2> { + using Prefetch = XE_2D_U16x2x32_LD_N; +}; + +template <> +struct choose_prefetch_for_type<16, 4> { + using Prefetch = 
XE_2D_U16x4x32_LD_N; +}; + +template <> +struct choose_prefetch_for_type<16, 8> { + using Prefetch = XE_2D_U16x8x32_LD_N; +}; + +template <> +struct choose_prefetch_for_type<16, 16> { + using Prefetch = XE_2D_U16x16x32_LD_N; +}; + +template <> +struct choose_prefetch_for_type<16, 32> { + using Prefetch = XE_2D_U16x32x32_LD_N; +}; + +// U32 +template <> +struct choose_prefetch_for_type<32, 1> { + using Prefetch = XE_2D_U32x1x16_LD_N; +}; + +template <> +struct choose_prefetch_for_type<32, 2> { + using Prefetch = XE_2D_U32x2x16_LD_N; +}; + +template <> +struct choose_prefetch_for_type<32, 4> { + using Prefetch = XE_2D_U32x4x16_LD_N; +}; + +template <> +struct choose_prefetch_for_type<32, 8> { + using Prefetch = XE_2D_U32x8x16_LD_N; +}; + +template <> +struct choose_prefetch_for_type<32, 16> { + using Prefetch = XE_2D_U32x16x16_LD_N; +}; + +template <> +struct choose_prefetch_for_type<32, 32> { + using Prefetch = XE_2D_U32x32x16_LD_N; +}; + template CUTE_HOST_DEVICE auto prefetch_selector(Tensor const& tensor) { constexpr size_t cacheline_bytes = 64; @@ -119,48 +243,17 @@ CUTE_HOST_DEVICE auto prefetch_selector(Tensor const& tensor) { Stride, Stride<_1, Int>>> >; - #define RETURN_STATEMENT(NON_CONTIG, DTYPE_SIZE, CONTIG) \ - using PrefetchTraits = Copy_Traits; \ - using PrefetchAtom = Copy_Atom; \ - using Scalar = Int; \ - using ScalarLayout = std::conditional_t>, Layout>>; \ - using ScalarPrefetchShape = decltype(product_each(raked_product(ScalarLayout{}, \ - Layout{}).shape())); \ - using PrefetchValLayout = decltype(make_layout(shape_div(ScalarPrefetchShape{}, CopyThreadShape{}))); \ - return make_tiled_copy(PrefetchAtom{}.with(tensor), \ - PrefetchTilingLayout{}, \ - PrefetchValLayout{}); - - #define CHOOSE_PREFETCH_FOR_TYPE(NON_CONTIG) \ - if constexpr (dtype_size_bits == 4){ \ - RETURN_STATEMENT(NON_CONTIG, 8, 32); \ - } else if constexpr (dtype_size_bits == 8){ \ - RETURN_STATEMENT(NON_CONTIG, 8, 64); \ - } else if constexpr (dtype_size_bits == 16){ \ - 
RETURN_STATEMENT(NON_CONTIG, 16, 32); \ - } else if constexpr (dtype_size_bits == 32){ \ - RETURN_STATEMENT(NON_CONTIG, 32, 16); \ - } else { \ - static_assert(dependent_false && "Invalid TileShape and dtype"); \ - } + using PrefetchOp = typename choose_prefetch_for_type::Prefetch; + using PrefetchTraits = Copy_Traits; + using PrefetchAtom = Copy_Atom; + using Scalar = Int; + using ScalarLayout = std::conditional_t>, + Layout>>; + using ScalarPrefetchShape = decltype(product_each(raked_product(ScalarLayout{}, + Layout{}).shape())); + using PrefetchValLayout = decltype(make_layout(shape_div(ScalarPrefetchShape{}, CopyThreadShape{}))); + return make_tiled_copy(PrefetchAtom{}.with(tensor), PrefetchTilingLayout{}, PrefetchValLayout{}); - if constexpr (block_non_contig_size == 1){ - CHOOSE_PREFETCH_FOR_TYPE(1) - } else if constexpr (block_non_contig_size == 2) { - CHOOSE_PREFETCH_FOR_TYPE(2) - } else if constexpr (block_non_contig_size == 4) { - CHOOSE_PREFETCH_FOR_TYPE(4) - } else if constexpr (block_non_contig_size == 8) { - CHOOSE_PREFETCH_FOR_TYPE(8) - } else if constexpr (block_non_contig_size == 16) { - CHOOSE_PREFETCH_FOR_TYPE(16) - } else if constexpr (block_non_contig_size == 32) { - CHOOSE_PREFETCH_FOR_TYPE(32) - } else { - static_assert(dependent_false && "Invalid TileShape[0]"); - } - #undef CHOOSE_PREFETCH_FOR_TYPE - #undef RETURN_STATEMENT } template @@ -450,8 +543,8 @@ struct Copy_Traits_{ }; template -struct Copy_Traits_ - : XE_2D_LD_Unpack { +struct Copy_Traits_ + : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout, @@ -464,12 +557,12 @@ struct Copy_Traits_ template Copy_Traits_(ArgT... args) - : XE_2D_LD_Unpack(args...) {} + : XE_2D_LD_Unpack(args...) 
{} }; template -struct Copy_Traits_ - : XE_2D_LD_Unpack { +struct Copy_Traits_ + : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, @@ -482,12 +575,12 @@ struct Copy_Traits_ template Copy_Traits_(ArgT... args) - : XE_2D_LD_Unpack(args...) {} + : XE_2D_LD_Unpack(args...) {} }; template -struct Copy_Traits_ - : XE_2D_LD_Unpack { +struct Copy_Traits_ + : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, @@ -500,12 +593,12 @@ struct Copy_Traits_ template Copy_Traits_(ArgT... args) - : XE_2D_LD_Unpack(args...) {} + : XE_2D_LD_Unpack(args...) {} }; template -struct Copy_Traits_ - : XE_2D_LD_Unpack { +struct Copy_Traits_ + : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, @@ -518,12 +611,12 @@ struct Copy_Traits_ template Copy_Traits_(ArgT... args) - : XE_2D_LD_Unpack(args...) {} + : XE_2D_LD_Unpack(args...) {} }; template -struct Copy_Traits_ - : XE_2D_LD_Unpack { +struct Copy_Traits_ + : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, @@ -536,7 +629,25 @@ struct Copy_Traits_ template Copy_Traits_(ArgT... args) - : XE_2D_LD_Unpack(args...) {} + : XE_2D_LD_Unpack(args...) {} +}; + +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>, + Stride< _0,Stride< _1,_128,_256>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_16,Stride< _1,_128,_256>>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + + template + Copy_Traits_(ArgT... args) + : XE_2D_LD_Unpack(args...) 
{} }; template @@ -629,8 +740,8 @@ struct Copy_Traits_ }; template -struct Copy_Traits_ - : XE_2D_LD_Unpack { +struct Copy_Traits_ + : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout, @@ -643,12 +754,12 @@ struct Copy_Traits_ template Copy_Traits_(ArgT... args) - : XE_2D_LD_Unpack(args...) {} + : XE_2D_LD_Unpack(args...) {} }; template -struct Copy_Traits_ - : XE_2D_LD_Unpack { +struct Copy_Traits_ + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit @@ -663,8 +774,8 @@ struct Copy_Traits_ }; template -struct Copy_Traits_ - : XE_2D_LD_Unpack { +struct Copy_Traits_ + : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout, @@ -677,13 +788,13 @@ struct Copy_Traits_ template Copy_Traits_(ArgT... args) - : XE_2D_LD_Unpack(args...) {} + : XE_2D_LD_Unpack(args...) {} }; template -struct Copy_Traits_ - : XE_2D_LD_Unpack { +struct Copy_Traits_ + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit @@ -698,8 +809,8 @@ struct Copy_Traits_ }; template -struct Copy_Traits_ - : XE_2D_LD_Unpack { +struct Copy_Traits_ + : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (dst-thr,dst-val) to bit using SrcLayout = Layout>, @@ -712,12 +823,12 @@ struct Copy_Traits_ template Copy_Traits_(ArgT... args) - : XE_2D_LD_Unpack(args...) {} + : XE_2D_LD_Unpack(args...) 
{} }; template -struct Copy_Traits_ - : XE_2D_LD_Unpack { +struct Copy_Traits_ + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit @@ -732,8 +843,8 @@ struct Copy_Traits_ }; template -struct Copy_Traits_ - : XE_2D_LD_Unpack { +struct Copy_Traits_ + : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout, @@ -746,12 +857,12 @@ struct Copy_Traits_ template Copy_Traits_(ArgT... args) - : XE_2D_LD_Unpack(args...) {} + : XE_2D_LD_Unpack(args...) {} }; template -struct Copy_Traits_ - : XE_2D_LD_Unpack { +struct Copy_Traits_ + : XE_2D_LD_Unpack { // Logical thread id to thread idx using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit @@ -767,8 +878,8 @@ struct Copy_Traits_ template -struct Copy_Traits_ - : XE_2D_LD_Unpack { +struct Copy_Traits_ + : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout, @@ -780,12 +891,12 @@ struct Copy_Traits_ using RefLayout = DstLayout; template Copy_Traits_(ArgT... args) - : XE_2D_LD_Unpack(args...) {} + : XE_2D_LD_Unpack(args...) {} }; template -struct Copy_Traits_ - : XE_2D_LD_Unpack { +struct Copy_Traits_ + : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, @@ -798,8 +909,8 @@ struct Copy_Traits_ }; template -struct Copy_Traits_ - : XE_2D_LD_Unpack { +struct Copy_Traits_ + : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout, @@ -811,12 +922,12 @@ struct Copy_Traits_ using RefLayout = DstLayout; template Copy_Traits_(ArgT... args) - : XE_2D_LD_Unpack(args...) {} + : XE_2D_LD_Unpack(args...) 
{} }; template -struct Copy_Traits_ - : XE_2D_LD_Unpack { +struct Copy_Traits_ + : XE_2D_LD_Unpack { using ThrID = Layout<_16>; // Map from (src-thr,src-val) to bit using SrcLayout = Layout>, @@ -2205,21 +2316,22 @@ struct Copy_Traits : Copy_Traits_{ \ : Copy_Traits_(args...) {} \ }; -COPY_TRAIT_LD_DEF(XE_2D_U8x1x32_LD_N) -COPY_TRAIT_LD_DEF(XE_2D_U8x2x32_LD_N) -COPY_TRAIT_LD_DEF(XE_2D_U8x4x32_LD_N) -COPY_TRAIT_LD_DEF(XE_2D_U8x8x32_LD_N) -COPY_TRAIT_LD_DEF(XE_2D_U8x1x64_LD_N) -COPY_TRAIT_LD_DEF(XE_2D_U8x2x64_LD_N) -COPY_TRAIT_LD_DEF(XE_2D_U8x4x64_LD_N) -COPY_TRAIT_LD_DEF(XE_2D_U8x8x64_LD_N) +COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x1x32_LD_N) +COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x2x32_LD_N) +COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x4x32_LD_N) +COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x8x32_LD_N) +COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x1x64_LD_N) +COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x2x64_LD_N) +COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x4x64_LD_N) +COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x8x64_LD_N) COPY_TRAIT_LD_DEF(XE_2D_U64x8x1_LD_T) COPY_TRAIT_LD_DEF(XE_2D_U64x8x2_LD_T) COPY_TRAIT_LD_DEF(XE_2D_U64x8x4_LD_T) -COPY_TRAIT_LD_DEF(XE_2D_U8x16x32_LD_N) +COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x16x32_LD_N) +COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x32x32_LD_N) +COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x16x64_LD_N) +COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x32x64_LD_N) COPY_TRAIT_LD_DEF(XE_2D_U8x32x32_LD_N) -COPY_TRAIT_LD_DEF(XE_2D_U8x16x64_LD_N) -COPY_TRAIT_LD_DEF(XE_2D_U8x32x64_LD_N) COPY_TRAIT_LD_DEF(XE_2D_U16x1x16_LD_N) COPY_TRAIT_LD_DEF(XE_2D_U16x2x16_LD_N) COPY_TRAIT_LD_DEF(XE_2D_U16x4x16_LD_N) @@ -2266,12 +2378,12 @@ COPY_TRAIT_LD_DEF(XE_2D_U4x32x64_LD_N) COPY_TRAIT_LD_DEF(XE_2D_U4x16x64_LD_N) COPY_TRAIT_LD_DEF(XE_2D_U4x32x16_LD_T) COPY_TRAIT_LD_DEF(XE_2D_U4x16x16_LD_T) -COPY_TRAIT_LD_DEF(XE_2D_U8x1x64_LD_N::PREFETCH) -COPY_TRAIT_LD_DEF(XE_2D_U8x2x64_LD_N::PREFETCH) -COPY_TRAIT_LD_DEF(XE_2D_U8x4x64_LD_N::PREFETCH) -COPY_TRAIT_LD_DEF(XE_2D_U8x8x64_LD_N::PREFETCH) -COPY_TRAIT_LD_DEF(XE_2D_U8x16x64_LD_N::PREFETCH) 
-COPY_TRAIT_LD_DEF(XE_2D_U8x32x64_LD_N::PREFETCH) +COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x1x64_LD_N::PREFETCH) +COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x2x64_LD_N::PREFETCH) +COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x4x64_LD_N::PREFETCH) +COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x8x64_LD_N::PREFETCH) +COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x16x64_LD_N::PREFETCH) +COPY_TRAIT_LD_DEF(XE_2D_Packed_U8x32x64_LD_N::PREFETCH) COPY_TRAIT_LD_DEF(XE_2D_U16x8x16_LD_N::PREFETCH) COPY_TRAIT_LD_DEF(XE_2D_U16x1x32_LD_N::PREFETCH) COPY_TRAIT_LD_DEF(XE_2D_U16x2x32_LD_N::PREFETCH) diff --git a/include/cute/util/sycl_vec.hpp b/include/cute/util/sycl_vec.hpp index fdaba345a0..501efd0415 100644 --- a/include/cute/util/sycl_vec.hpp +++ b/include/cute/util/sycl_vec.hpp @@ -52,6 +52,7 @@ using uchar2 = vector_t; using uchar4 = vector_t; using uchar8 = vector_t; using uchar16 = vector_t; +using uchar64 = vector_t; using float2 = vector_t; using float4 = vector_t; diff --git a/test/unit/cute/intel_xe/copy_block.cpp b/test/unit/cute/intel_xe/copy_block.cpp index b94e56fdfe..b0cd2bbd40 100644 --- a/test/unit/cute/intel_xe/copy_block.cpp +++ b/test/unit/cute/intel_xe/copy_block.cpp @@ -334,11 +334,11 @@ TEST(PVC_CuTe_Xe, block_2d_32bits_n) { } TEST(PVC_CuTe_Xe, block_2d_8bits_n) { - copy_op{}(); - copy_op{}(); - copy_op{}(); - copy_op{}(); - copy_op{}(); + copy_op{}(); + copy_op{}(); + copy_op{}(); + copy_op{}(); + copy_op{}(); } TEST(PVC_CuTE_Xe, block_2d_16bits_n_v2) { diff --git a/test/unit/gemm/device/default_gemm_configuration.hpp b/test/unit/gemm/device/default_gemm_configuration.hpp index f48f291fb9..d08d9f8150 100644 --- a/test/unit/gemm/device/default_gemm_configuration.hpp +++ b/test/unit/gemm/device/default_gemm_configuration.hpp @@ -1612,7 +1612,7 @@ struct DefaultGemm_TensorOpXe_OperandB; template <> struct DefaultGemm_TensorOpXe_OperandA { - using GmemTiledCopy = XE_2D_U8x32x32_LD_N; + using GmemTiledCopy = XE_2D_Packed_U8x32x32_LD_N; }; /// Operand A - Column-major (M-major) @@ -1621,7 +1621,7 @@ struct 
DefaultGemm_TensorOpXe_OperandA { // Gmem // TODO(Codeplay): transposed version is not implemented - using GmemTiledCopy = XE_2D_U8x32x32_LD_N; + using GmemTiledCopy = XE_2D_Packed_U8x32x32_LD_N; }; /// Operand B - Row-major (N-Major) diff --git a/test/unit/gemm/device/gemm_universal_s8t_bf16n_f32t_mixed_input_tensor_op_f32_xe.cpp b/test/unit/gemm/device/gemm_universal_s8t_bf16n_f32t_mixed_input_tensor_op_f32_xe.cpp index 14faf05fdd..63f5c5cc49 100644 --- a/test/unit/gemm/device/gemm_universal_s8t_bf16n_f32t_mixed_input_tensor_op_f32_xe.cpp +++ b/test/unit/gemm/device/gemm_universal_s8t_bf16n_f32t_mixed_input_tensor_op_f32_xe.cpp @@ -76,7 +76,7 @@ TEST(XE_Device_GemmUniversal_s8t_bf16n_f32t_mixed_input_tensor_op_f32, 128x128x6 using LayoutD = cutlass::layout::RowMajor; // Note: XE_2D_U18x32x32_LD_N is incompatible with our bf16 MMA atoms - using GmemTiledCopyA = XE_2D_U8x32x32_LD_V; + using GmemTiledCopyA = XE_2D_U8x32x32_LD_N; using GmemTiledCopyB = XE_2D_U16x32x32_LD_V; static_assert(sizeof(ElementInputA) == 1, "ElementA width must match GmemTiledCopyA U8"); From b962239ed998d236fba26fd51a9adfe9ecabbbb9 Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Tue, 27 May 2025 15:29:45 +0100 Subject: [PATCH 17/21] fix merge conflict --- include/cute/arch/copy_xe_U8.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cute/arch/copy_xe_U8.hpp b/include/cute/arch/copy_xe_U8.hpp index 731a444742..cc9d3a143b 100644 --- a/include/cute/arch/copy_xe_U8.hpp +++ b/include/cute/arch/copy_xe_U8.hpp @@ -223,7 +223,7 @@ struct XE_2D_U8x32x32_LD_N { T *dst) { #if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); - detail::XeSubgroup2DBlockLoad<1, 32, 32, 1>{}(baseoffset, width, height, pitch, coord, dst); + detail::XeSubgroup2DBlockLoad<1, 16, 32, 2>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); #endif From 
d8e855e963ef577be888c18acd2d6163c3c25c0c Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Tue, 27 May 2025 15:46:24 +0100 Subject: [PATCH 18/21] Revert changes in the tests --- test/unit/cute/intel_xe/mma.cpp | 24 ++---------------------- test/unit/cute/intel_xe/utils.hpp | 17 +++++++++++------ 2 files changed, 13 insertions(+), 28 deletions(-) diff --git a/test/unit/cute/intel_xe/mma.cpp b/test/unit/cute/intel_xe/mma.cpp index 59d91c9df8..1c0e3d8a61 100755 --- a/test/unit/cute/intel_xe/mma.cpp +++ b/test/unit/cute/intel_xe/mma.cpp @@ -263,26 +263,6 @@ TEST(PVC_CuTe_Xe, MMA_XE_1x16x16_F32BF16BF16F32_TT) { bfloat16_t, float>(512, 512, 256); } -TEST(PVC_CuTe_Xe, MMA_XE_8x16x16_BF16BF16BF16BF16_TT) { - MMA_Test(512, 512, 256); -} - -TEST(PVC_CuTe_Xe, MMA_XE_4x16x16_BF16BF16BF16BF16_TT) { - MMA_Test(512, 512, 256); -} - -TEST(PVC_CuTe_Xe, MMA_XE_2x16x16_BF16BF16BF16BF16_TT) { - MMA_Test(512, 512, 256); -} - -TEST(PVC_CuTe_Xe, MMA_XE_1x16x16_BF16BF16BF16BF16_TT) { - MMA_Test(512, 512, 256); -} - TEST(PVC_CuTe_Xe, MMA_XE_8x16x16_F32F16F16F32_TT) { MMA_Test(512, 512, 256); @@ -299,8 +279,8 @@ TEST(PVC_CuTe_Xe, MMA_XE_2x16x16_F32F16F16F32_TT) { } TEST(PVC_CuTe_Xe, MMA_XE_1x16x16_F32F16F16F32_TT) { - MMA_Test - (512, 512, 256); + MMA_Test( + 512, 512, 256); } TEST(PVC_CuTe_Xe, FMA_XE_UniversalFMA_F32F32F32F32) { diff --git a/test/unit/cute/intel_xe/utils.hpp b/test/unit/cute/intel_xe/utils.hpp index 48973a0de9..e109d9fe27 100755 --- a/test/unit/cute/intel_xe/utils.hpp +++ b/test/unit/cute/intel_xe/utils.hpp @@ -59,10 +59,10 @@ void verify(uint32_t m, uint32_t n, uint32_t k, atype *A, btype *B, ctype *C, bool row_a = true, bool row_b = true) { int cnt = 0; bool is_normal = true; - using accum_type = conditional_t == 32, ctype, float>; + for (int i = 0; i < m; i++) { for (int j = 0; j < n; j++) { - accum_type expect = accum_type(0); + ctype expect = ctype(0); for (int z = 0; z < k; z++) { auto a = row_a ? A[i * k + z] : A[i + z * m]; auto b = row_b ? 
B[z * n + j] : B[z + j * k]; @@ -71,10 +71,15 @@ void verify(uint32_t m, uint32_t n, uint32_t k, atype *A, btype *B, ctype *C, ctype val = C[i * n + j]; - if (isnormal(val) && isnormal(expect)) { - auto error = std::abs((expect - val) / val); - if (error > 0.02f) { - cnt++; + if constexpr(std::is_floating_point_v) { + if (isnormal(val) && isnormal(expect)) { + auto error = std::abs((expect - val) / val); + if (error > 0.01f) { + cnt++; + } + } else { + // TODO(codeplay): Assert that at least some values are non-zero. + if(!(expect == 0 && val == 0)) is_normal = false; } } else { if (val != expect) { From d0e2c94d0376e5cc1a45828862e295566565b826 Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Tue, 27 May 2025 15:47:35 +0100 Subject: [PATCH 19/21] Update GEMM FP8 example --- examples/sycl/08_pvc_gemm_f8/08_pvc_gemm_f8.cpp | 2 +- include/cute/arch/copy_xe_U8.hpp | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/sycl/08_pvc_gemm_f8/08_pvc_gemm_f8.cpp b/examples/sycl/08_pvc_gemm_f8/08_pvc_gemm_f8.cpp index 80304eaf47..969af7d48d 100644 --- a/examples/sycl/08_pvc_gemm_f8/08_pvc_gemm_f8.cpp +++ b/examples/sycl/08_pvc_gemm_f8/08_pvc_gemm_f8.cpp @@ -346,7 +346,7 @@ int launcher(Options& options) using LayoutC = cutlass::layout::RowMajor; using LayoutD = cutlass::layout::RowMajor; - using GmemTiledCopyA = XE_2D_U8x32x32_LD_V; + using GmemTiledCopyA = XE_2D_U8x32x32_LD_N; using GmemTiledCopyB = XE_2D_U8x32x32_LD_V; using TileShape = Shape<_256, _256, _32>; diff --git a/include/cute/arch/copy_xe_U8.hpp b/include/cute/arch/copy_xe_U8.hpp index cc9d3a143b..859c6539f1 100644 --- a/include/cute/arch/copy_xe_U8.hpp +++ b/include/cute/arch/copy_xe_U8.hpp @@ -223,7 +223,10 @@ struct XE_2D_U8x32x32_LD_N { T *dst) { #if defined(CUTE_ARCH_COPY_XE_ENABLED) static_assert(sizeof(T) == 1, "Expected T to have size 1"); - detail::XeSubgroup2DBlockLoad<1, 16, 32, 2>{}(baseoffset, width, height, pitch, coord, dst); + // detail::XeSubgroup2DBlockLoad<1, 16, 
32, 2>{}(baseoffset, width, height, pitch, coord, dst); + // Use the transform (VNNI) version as it provides better performance when loading the A matrix for + // GEMM FP8 and GEMM mixed-precision types. + detail::XeSubgroup2DBlockLoadTransform<1, 16, 32, 2>{}(baseoffset, width, height, pitch, coord, dst); #else CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware"); #endif From 9dd7fa1bbaa8a003c301c5dee517aa4972a34638 Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Thu, 29 May 2025 17:28:08 +0100 Subject: [PATCH 20/21] Update include/cute/util/sycl_vec.hpp Co-authored-by: Joe Todd --- include/cute/util/sycl_vec.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/include/cute/util/sycl_vec.hpp b/include/cute/util/sycl_vec.hpp index 501efd0415..1c1d4f8cb1 100644 --- a/include/cute/util/sycl_vec.hpp +++ b/include/cute/util/sycl_vec.hpp @@ -52,6 +52,7 @@ using uchar2 = vector_t; using uchar4 = vector_t; using uchar8 = vector_t; using uchar16 = vector_t; +using uchar32 = vector_t; using uchar64 = vector_t; using float2 = vector_t; From 3b06331233288ddc7d6524450bdbcca08e5b70c6 Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Thu, 29 May 2025 18:36:34 +0100 Subject: [PATCH 21/21] Update include/cute/atom/copy_traits_xe.hpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tadej Ciglarič --- include/cute/atom/copy_traits_xe.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cute/atom/copy_traits_xe.hpp b/include/cute/atom/copy_traits_xe.hpp index 293e7c2504..93de713f8b 100644 --- a/include/cute/atom/copy_traits_xe.hpp +++ b/include/cute/atom/copy_traits_xe.hpp @@ -641,7 +641,7 @@ struct Copy_Traits_ Stride< _0,Stride< _1,_128,_256>>>; // Map from (dst-thr,dst-val) to bit using DstLayout = Layout>, - Stride<_16,Stride< _1,_128,_256>>>; + Stride<_8,Stride< _1,_128,_256>>>; // Reference map from (thr,val) to bit using RefLayout = DstLayout;