Fix the s2r (shared-to-register) loader.

KuangjuX · KuangjuX · commit 61ba350d7cee · 2025-02-03T04:25:08.000Z
diff --git a/include/cell/copy/shared_to_register.hpp b/include/cell/copy/shared_to_register.hpp
@@ -84,24 +84,13 @@ struct SharedToRegLoaderImpl<Shared, Reg_, kRowExec_, kColExec_,
         // int lane_offset = in_base_tile_(lane_row, lane_col);
         int offset = 0;
 
-        if (thread0()) {
-            printf("kRowExec: %d, kColExec: %d\n", kRowExec, kColExec);
-            printf("kSwizzledBlockRows: %d, kSwizzledBlockCols: %d\n",
-                   kSwizzledBlockRows, kSwizzledBlockCols);
-        }
-
 #pragma unroll
         for (int i = 0; i < kRowExec; ++i) {
 #pragma unroll
             for (int j = 0; j < kColExec; ++j) {
-                tile_offset += i * SharedCols * 16 + j * 16;
-                int thrd_offset =
-                    tile_offset + lane_row * SharedCols + lane_col;
+                int thrd_offset = tile_offset + i * SharedCols * 16 + j * 16 +
+                                  lane_row * SharedCols + lane_col;
                 offset = get_swizzle_offset(thrd_offset);
-                // auto base_tile_id = get_base_tile_id(tile_offset);
-                // auto swizzled_tile_id = get_swizzled_tile_id(tile_offset);
-                // auto in_swizzled_tile_id =
-                // get_in_swizzle_tile_id(tile_offset);
 
                 // if (thread0()) {
                 //     printf("i: %d, j: %d\n", i, j);
@@ -133,9 +122,8 @@ struct SharedToRegLoaderImpl<Shared, Reg_, kRowExec_, kColExec_,
     //     tl::SharedLayoutWrapper<Shared, LoadMat::kAccessInBits>::Layout;
     // BaseTileSharedLayout in_base_tile_;
 
-    using SrcLayout =
-        tl::MatrixLayout<kSwizzledBlockRows, kSwizzledBlockCols * 8,
-                         Shared::kRowStride, 64>;
+    using SrcLayout = tl::MatrixLayout<kSwizzledBlockRows, kSwizzledBlockCols,
+                                       Shared::kRowStride * 8, 64>;
     SrcLayout src_tile_;
 
     using SwizzledBaseShape = traits::SwizzleBaseTileShape<DType>;
@@ -240,8 +228,11 @@ struct RegToSharedStorerImpl<Reg_, Shared_, kRowExec_, kColExec_,
   private:
     using BaseShape = BaseTileShape<DType>;
 
+    // static constexpr int kRowStride = BaseShape::kRows * Shared::kRowStride;
+    // static constexpr int kColStride = BaseShape::kNumel;
+
     static constexpr int kRowStride = BaseShape::kRows * Shared::kRowStride;
-    static constexpr int kColStride = BaseShape::kNumel;
+    static constexpr int kColStride = BaseShape::kCols;
 };
 
 template <typename Reg_, typename Shared_, const int kRowExec_,
diff --git a/tests/cpp/cell/test_s2r_copy.cu b/tests/cpp/cell/test_s2r_copy.cu
@@ -70,11 +70,16 @@ __global__ void run_test_load(Copy& copy) {
 
     copy(s_tile, r_tile);
 
-    // #if defined(DEBUG)
+#if defined(DEBUG)
     if (thread0()) {
         r_tile.dump_value();
     }
-    // #endif
+#endif
+
+    if (threadIdx.x == 4) {
+        printf("threadIdx.x: %d\n", threadIdx.x);
+        r_tile.dump_value();
+    }
 }
 
 template <typename Shared, typename Reg, typename Loader, typename Storer>
@@ -210,62 +215,62 @@ TEST(TestShared2Reg, operand_A) {  // load mode for loading operand A in gemm
 //     cudaDeviceSynchronize();
 // }
 
-// TEST(TestReg2Shared, operand_C_half) {
-//     using Element = __half;
-
-//     using WarpLayout = tl::RowMajor<1, 1>;
-//     const int kThreads = tl::get_numel<WarpLayout> * 32;
-
-//     // using Shared = SharedTile<Element, tl::RowMajor<16, 16>>;
-//     using Shared = SharedTile<Element, tl::RowMajor<16, 64>>;
-//     // using Reg = RegTile<BaseTileRowMajor<Element>, tl::RowMajor<1, 1>>;
-//     using Reg = RegTile<BaseTileRowMajor<Element>, tl::RowMajor<1, 4>>;
-
-//     using Loader = SharedToRegLoader<Reg, WarpLayout, WarpReuse::kCont>;
-//     Loader loader;
-
-//     using Storer = RegToSharedStorer<Reg, WarpLayout>;
-//     Storer storer;
-
-//     dim3 dim_grid(1, 1, 1);
-//     dim3 dim_block(kThreads, 1, 1);
-//     int shm_size = Shared::kNumel * sizeof(Element);
-
-//     run_test_store<Shared, Reg, Loader, Storer>
-//         <<<dim_grid, dim_block, shm_size>>>(loader, storer);
-//     cudaDeviceSynchronize();
-// }
-
-TEST(TestShared2Reg, operand_A_swizzle) {
+TEST(TestReg2Shared, operand_C_half) {
     using Element = __half;
 
     using WarpLayout = tl::RowMajor<1, 1>;
     const int kThreads = tl::get_numel<WarpLayout> * 32;
 
-    // const int kRows = 64;
-    // const int kCols = 32;
-
-    const int kRows = 16;
-    const int kCols = 64;
-
-    using SharedLayout = tl::RowMajor<kRows, kCols>;
-    const bool kUseSwizzledLayout = true;
-    using Shared = SharedTile<Element, SharedLayout, kUseSwizzledLayout>;
-    // using Reg = RegTile<BaseTileRowMajor<Element>, tl::RowMajor<2, 2>>;
+    // using Shared = SharedTile<Element, tl::RowMajor<16, 16>>;
+    using Shared = SharedTile<Element, tl::RowMajor<16, 64>>;
+    // using Reg = RegTile<BaseTileRowMajor<Element>, tl::RowMajor<1, 1>>;
     using Reg = RegTile<BaseTileRowMajor<Element>, tl::RowMajor<1, 4>>;
 
-    using Copy = SharedToRegLoader<Reg, WarpLayout, WarpReuse::kRowReuseCont>;
-    Copy copy;
+    using Loader = SharedToRegLoader<Reg, WarpLayout, WarpReuse::kCont>;
+    Loader loader;
+
+    using Storer = RegToSharedStorer<Reg, WarpLayout>;
+    Storer storer;
 
     dim3 dim_grid(1, 1, 1);
     dim3 dim_block(kThreads, 1, 1);
     int shm_size = Shared::kNumel * sizeof(Element);
 
-    run_test_load<Element, Shared, Reg, Copy>
-        <<<dim_grid, dim_block, shm_size>>>(copy);
+    run_test_store<Shared, Reg, Loader, Storer>
+        <<<dim_grid, dim_block, shm_size>>>(loader, storer);
     cudaDeviceSynchronize();
 }
 
+// TEST(TestShared2Reg, operand_A_swizzle) {
+//     using Element = __half;
+
+//     using WarpLayout = tl::RowMajor<1, 1>;
+//     const int kThreads = tl::get_numel<WarpLayout> * 32;
+
+//     // const int kRows = 64;
+//     // const int kCols = 32;
+
+//     const int kRows = 16;
+//     const int kCols = 64;
+
+//     using SharedLayout = tl::RowMajor<kRows, kCols>;
+//     const bool kUseSwizzledLayout = true;
+//     using Shared = SharedTile<Element, SharedLayout, kUseSwizzledLayout>;
+//     // using Reg = RegTile<BaseTileRowMajor<Element>, tl::RowMajor<2, 2>>;
+//     using Reg = RegTile<BaseTileRowMajor<Element>, tl::RowMajor<1, 4>>;
+
+//     using Copy = SharedToRegLoader<Reg, WarpLayout,
+//     WarpReuse::kRowReuseCont>; Copy copy;
+
+//     dim3 dim_grid(1, 1, 1);
+//     dim3 dim_block(kThreads, 1, 1);
+//     int shm_size = Shared::kNumel * sizeof(Element);
+
+//     run_test_load<Element, Shared, Reg, Copy>
+//         <<<dim_grid, dim_block, shm_size>>>(copy);
+//     cudaDeviceSynchronize();
+// }
+
 // TEST(TestReg2Shared, operand_C_float) {
 //     using Element = __half;
 //     using AccType = float;