
Add U8 copy operation for K16 MMA #374

Open
wants to merge 28 commits into base: sycl-develop

Commits (28)
a6c8e53  spirv APIs (jiyang1011, Mar 12, 2025)
73bef6e  mma spirv api (jiyang1011, Apr 7, 2025)
6e12cb6  Merge branch 'sycl-develop' into jiyang/spirv_api (jiyang1011, Apr 14, 2025)
626fd13  Merge branch 'sycl-develop' into jiyang/spirv_api (jiyang1011, Apr 22, 2025)
cf6a41b  Merge branch 'sycl-develop' into jiyang/spirv_api (jiyang1011, Apr 29, 2025)
d9f8303  remove -1 from OCL API (jiyang1011, Apr 29, 2025)
c1cddb6  Merge branch 'sycl-develop' into jiyang/spirv_api (aacostadiaz, May 6, 2025)
5537fd7  rebase (aacostadiaz, May 6, 2025)
c89a875  Disable spirv functions for PVC (aacostadiaz, May 6, 2025)
5e26dd3  move spirv definitions (aacostadiaz, May 6, 2025)
8c67947  fix (aacostadiaz, May 6, 2025)
1af7011  Merge branch 'sycl-develop' into jiyang/spirv_api (aacostadiaz, May 6, 2025)
879eb35  Refactor (aacostadiaz, May 8, 2025)
9864ab2  Fix cmake (aacostadiaz, May 8, 2025)
39e549d  Re-enable test (aacostadiaz, May 8, 2025)
d6c9358  Fix mma builtin (aacostadiaz, May 8, 2025)
ec9d0a7  Fix copy builtin (aacostadiaz, May 8, 2025)
7144422  Revert minor changes (aacostadiaz, May 9, 2025)
3d30536  Merge branch 'sycl-develop' into jiyang/spirv_api (aacostadiaz, May 12, 2025)
4bbaaa6  Use builtin for prefetch (aacostadiaz, May 12, 2025)
304de17  Remove FP16 MMA with FP16 accumulator (aacostadiaz, May 13, 2025)
a2c45b1  Add U8 copy operation for K16 MMA (aacostadiaz, May 14, 2025)
1e2595a  Merge remote-tracking branch 'codeplay/sycl-develop' into aacosta/pac… (aacostadiaz, May 27, 2025)
b962239  fix merge conflict (aacostadiaz, May 27, 2025)
d8e855e  Revert changes in the tests (aacostadiaz, May 27, 2025)
d0e2c94  Update GEMM FP8 example (aacostadiaz, May 27, 2025)
d346207  Merge branch 'sycl-develop' into aacosta/packed-copy (aacostadiaz, May 27, 2025)
ba60f3a  Merge branch 'sycl-develop' into aacosta/packed-copy (joeatodd, May 29, 2025)
@@ -535,7 +535,7 @@ int main(int argc, const char** argv)
using ElementScale = MmaType;

// Note: XE_2D_U16x32x32_LD_N is incompatible with our bf16 MMA atoms
using GmemTiledCopyA = XE_2D_U8x32x32_LD_V; // U8 (1-byte) block copy for A (narrower type)
using GmemTiledCopyA = XE_2D_U8x32x32_LD_N; // U8 (1-byte) block copy for A (narrower type)
using GmemTiledCopyB = XE_2D_U16x32x32_LD_V; // U16 (2-byte) block copy for B (wider type)
static_assert(sizeof(ElementInputA) == 1, "ElementA width must match GmemTiledCopyA U8");

2 changes: 1 addition & 1 deletion examples/sycl/08_bmg_gemm_f8/08_bmg_gemm_f8.cpp
@@ -346,7 +346,7 @@ int launcher(Options& options)
using LayoutC = cutlass::layout::RowMajor;
using LayoutD = cutlass::layout::RowMajor;

using GmemTiledCopyA = XE_2D_U8x32x32_LD_V;
using GmemTiledCopyA = XE_2D_U8x32x32_LD_N;
using GmemTiledCopyB = XE_2D_U8x32x32_LD_V;

using TileShape = Shape<_256, _256, _32>;
43 changes: 31 additions & 12 deletions include/cute/arch/copy_xe_U8.hpp
@@ -35,7 +35,7 @@

namespace cute
{
struct XE_2D_U8x1x32_LD_N {
struct XE_2D_Packed_U8x1x32_LD_N {
using BlockShape = Shape<_1, _32>;
using inst_dtype = int8_t;

@@ -65,7 +65,7 @@ struct XE_2D_U8x1x32_LD_N {
};
};

struct XE_2D_U8x2x32_LD_N {
struct XE_2D_Packed_U8x2x32_LD_N {
using BlockShape = Shape<_2, _32>;
using inst_dtype = int8_t;

@@ -111,7 +111,7 @@ struct XE_2D_U8x2x32_ST_N {
}
};

struct XE_2D_U8x4x32_LD_N {
struct XE_2D_Packed_U8x4x32_LD_N {
using BlockShape = Shape<_4, _32>;

template <class T>
@@ -140,7 +140,7 @@ struct XE_2D_U8x4x32_LD_N {
};
};

struct XE_2D_U8x8x32_LD_N {
struct XE_2D_Packed_U8x8x32_LD_N {
using BlockShape = Shape<_8, _32>;

template <class T>
@@ -169,7 +169,7 @@ struct XE_2D_U8x8x32_LD_N {
};
};

struct XE_2D_U8x16x32_LD_N {
struct XE_2D_Packed_U8x16x32_LD_N {
using BlockShape = Shape<_16, _32>;

template <class T>
@@ -198,7 +198,7 @@ struct XE_2D_U8x16x32_LD_N {
};
};

struct XE_2D_U8x32x32_LD_N {
struct XE_2D_Packed_U8x32x32_LD_N {
using BlockShape = Shape<_32, _32>;

template <class T>
@@ -214,7 +214,26 @@
}
};

struct XE_2D_U8x1x64_LD_N {
struct XE_2D_U8x32x32_LD_N {
using BlockShape = Shape<_32, _32>;

template <class T>
CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
int height, int pitch, intel::coord_t coord,
T *dst) {
#if defined(CUTE_ARCH_COPY_XE_ENABLED)
static_assert(sizeof(T) == 1, "Expected T to have size 1");
// detail::XeSubgroup2DBlockLoad<1, 16, 32, 2>{}(baseoffset, width, height, pitch, coord, dst);
// Use the transform (VNNI) version as it provides better performance when loading the A matrix for
// GEMM FP8 and GEMM mixed-precision types.
Comment on lines +217 to +228

sanchitintel commented on May 27, 2025:

Hi @aacostadiaz,

Please help resolve a couple of doubts.

The DstLayout in the atom traits for this copy atom is Layout<Shape <_16,Shape <_8, _2, _32>>, Stride<_16,Stride< _1,_128,_256>>>, which seems to correspond to a plain layout. So, does this mean that when the data is copied from global memory, it is first transformed into the VNNI layout before being written to the registers, and later converted to DstLayout? If yes, can you please point out where/how this is handled in the code?

Also, I don't see any shfl-based instructions in the generated assembly dump. So is it possible that the shuffle (for the VNNI -> plain layout conversion) happens not directly via lane registers -> lane registers (I understand this isn't possible on NVIDIA GPUs, but it is apparently possible on Intel GPUs, based on the documentation), but rather via lane registers -> shared local memory -> lane registers?

Thanks!

cc @pengzhao-intel @yuankuns

aacostadiaz (Collaborator, Author) replied:
The Copy trait is used to describe how a copy operation works so that the rest of the code can understand it. It does not change how the actual copy operation works.

In this case, for the VNNI copies, the transformation happens inside the builtin/SPIR-V function; there is no transformation inside CUTLASS for that. We just use these builtin/SPIR-V functions, and the copy traits describe how they work.

detail::XeSubgroup2DBlockLoadTransform<1, 16, 32, 2>{}(baseoffset, width, height, pitch, coord, dst);
#else
CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-Xe hardware");
#endif
}
};
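
To make the discussion above concrete, here is a rough, reference-only sketch (not part of this PR) of the 8-bit VNNI packing that the transform load performs during the 2D block read: each group of four consecutive K-rows of a column is packed into one 32-bit dword, which is the operand layout the DPAS unit consumes. The function name, the host-side std::vector code, and the assumption that the row count is a multiple of 4 are illustrative only; the builtin does this packing as part of the load and distributes the dwords across subgroup lanes.

    #include <cstdint>
    #include <vector>

    // Reference-only VNNI packing for 8-bit data: row r+0 of a column lands in
    // the lowest byte of the dword, row r+3 in the highest byte.
    std::vector<uint32_t> vnni_pack_u8(const std::vector<uint8_t>& tile, int rows, int cols) {
      std::vector<uint32_t> packed((rows / 4) * cols);
      for (int r = 0; r < rows; r += 4) {        // assumes rows % 4 == 0
        for (int c = 0; c < cols; ++c) {
          uint32_t dword = 0;
          for (int i = 0; i < 4; ++i) {
            dword |= uint32_t(tile[(r + i) * cols + c]) << (8 * i);
          }
          packed[(r / 4) * cols + c] = dword;
        }
      }
      return packed;
    }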

struct XE_2D_Packed_U8x1x64_LD_N {
using BlockShape = Shape<_1, _64>;

template <class T>
@@ -243,7 +262,7 @@ struct XE_2D_U8x1x64_LD_N {
};
};

struct XE_2D_U8x2x64_LD_N {
struct XE_2D_Packed_U8x2x64_LD_N {
using BlockShape = Shape<_2, _64>;

template <class T>
@@ -272,7 +291,7 @@ struct XE_2D_U8x2x64_LD_N {
};
};

struct XE_2D_U8x4x64_LD_N {
struct XE_2D_Packed_U8x4x64_LD_N {
using BlockShape = Shape<_4, _64>;

template <class T>
@@ -301,7 +320,7 @@ struct XE_2D_U8x4x64_LD_N {
};
};

struct XE_2D_U8x8x64_LD_N {
struct XE_2D_Packed_U8x8x64_LD_N {
using BlockShape = Shape<_8, _64>;

template <class T>
@@ -330,7 +349,7 @@ struct XE_2D_U8x8x64_LD_N {
};
};

struct XE_2D_U8x16x64_LD_N {
struct XE_2D_Packed_U8x16x64_LD_N {
using BlockShape = Shape<_16, _64>;

template <class T>
@@ -359,7 +378,7 @@ struct XE_2D_U8x16x64_LD_N {
};
};

struct XE_2D_U8x32x64_LD_N {
struct XE_2D_Packed_U8x32x64_LD_N {
using BlockShape = Shape<_32, _64>;

template <class T>
16 changes: 15 additions & 1 deletion include/cute/arch/copy_xe_builtin.hpp
@@ -146,7 +146,10 @@ SYCL_DEVICE_BUILTIN(
cute::intel::ushort64 __builtin_IB_subgroup_block_read_flat_u8_m32k32v2(
intptr_t baseoffset, int width_minus_one, int height_minus_one,
int pitch_minus_one, cute::intel::coord_t coord));

SYCL_DEVICE_BUILTIN(
cute::intel::uchar64 __builtin_IB_subgroup_block_read_flat_u8_m32k16v2(
long baseoffset, int width_minus_one, int height_minus_one,
int pitch_minus_one, cute::intel::coord_t coord));

// 8bits VNNI transform No transpose
SYCL_DEVICE_BUILTIN(
@@ -523,6 +526,17 @@ struct XeSubgroup2DBlockLoad<1, 32, 32, 1> {
}
};

template<>
struct XeSubgroup2DBlockLoad<1, 16, 32, 2> {
template<typename T>
CUTE_HOST_DEVICE void
operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch,
cute::intel::coord_t coordinate, T* dstPointer) {
*reinterpret_cast<intel::uchar64 *>(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m32k16v2(
(intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate);
}
};

template<>
struct XeSubgroup2DBlockLoad<1, 32, 1, 2> {
template<typename T>
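
For reference, a hedged sketch (not part of this PR) of how the new XeSubgroup2DBlockLoad<1, 16, 32, 2> specialization might be called from device code when CUTE_ARCH_COPY_XE_ENABLED is defined. The helper name, the parameter names, and the cute::detail qualification are assumptions for illustration; what is taken from the diff is the functor signature and the fact that it forwards width, height, and pitch minus one to __builtin_IB_subgroup_block_read_flat_u8_m32k16v2.

    #include "cute/arch/copy_xe_builtin.hpp"

    // Illustrative helper (assumed name): load a u8 block into a 64-byte
    // per-work-item fragment using the new <1, 16, 32, 2> specialization.
    CUTE_HOST_DEVICE void load_u8_m32k16v2(const void* base,        // u8 matrix base pointer
                                           int width_bytes,         // surface width in bytes
                                           int height_rows,         // surface height in rows
                                           int pitch_bytes,         // row pitch in bytes
                                           cute::intel::coord_t coord,
                                           cute::intel::uchar64& frag) {
    #if defined(CUTE_ARCH_COPY_XE_ENABLED)
      // The functor subtracts 1 from width/height/pitch and writes the returned
      // uchar64 through the destination pointer.
      cute::detail::XeSubgroup2DBlockLoad<1, 16, 32, 2>{}(
          base, width_bytes, height_rows, pitch_bytes, coord, &frag);
    #else
      // Non-Xe targets: nothing to do in this sketch.
      (void)base; (void)width_bytes; (void)height_rows; (void)pitch_bytes; (void)coord; (void)frag;
    #endif
    }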
16 changes: 16 additions & 0 deletions include/cute/arch/copy_xe_spirv.hpp
@@ -34,6 +34,11 @@
#include "cute/config.hpp"

// TODO(Codeplay): These builtins are not available on SPIRV
SYCL_EXTERNAL extern "C"
cute::intel::uchar64 __builtin_IB_subgroup_block_read_flat_u8_m32k16v2(
long baseoffset, int width_minus_one, int height_minus_one,
int pitch_minus_one, cute::intel::coord_t coord);

SYCL_EXTERNAL extern "C"
cute::intel::uint2 __builtin_IB_subgroup_block_read_flat_transpose_u32_k2(
intptr_t baseoffset, int width_minus_one, int height_minus_one,
@@ -271,6 +276,17 @@ struct XeSubgroup2DBlockStore {
}
};

template<>
struct XeSubgroup2DBlockLoad<1, 16, 32, 2> {
template<typename T>
CUTE_HOST_DEVICE void
operator()(const void* srcBasePointer, int memoryWidth, int memoryHeight, int memoryPitch,
cute::intel::coord_t coordinate, T* dstPointer) {
*reinterpret_cast<intel::uchar64 *>(dstPointer) = __builtin_IB_subgroup_block_read_flat_u8_m32k16v2(
(intptr_t)(srcBasePointer), memoryWidth - 1, memoryHeight - 1, memoryPitch - 1, coordinate);
}
};

template<>
struct XeSubgroup2DBlockLoadTranspose<4, 2, 16, 1> {
template<typename T>