@@ -70,11 +70,16 @@ __global__ void run_test_load(Copy& copy) {
70
70
71
71
copy (s_tile, r_tile);
72
72
73
- // #if defined(DEBUG)
73
+ #if defined(DEBUG)
74
74
if (thread0 ()) {
75
75
r_tile.dump_value ();
76
76
}
77
- // #endif
77
+ #endif
78
+
79
+ if (threadIdx .x == 4 ) {
80
+ printf (" threadIdx.x: %d\n " , threadIdx .x );
81
+ r_tile.dump_value ();
82
+ }
78
83
}
79
84
80
85
template <typename Shared, typename Reg, typename Loader, typename Storer>
@@ -210,62 +215,62 @@ TEST(TestShared2Reg, operand_A) { // load mode for loading operand A in gemm
210
215
// cudaDeviceSynchronize();
211
216
// }
212
217
213
- // TEST(TestReg2Shared, operand_C_half) {
214
- // using Element = __half;
215
-
216
- // using WarpLayout = tl::RowMajor<1, 1>;
217
- // const int kThreads = tl::get_numel<WarpLayout> * 32;
218
-
219
- // // using Shared = SharedTile<Element, tl::RowMajor<16, 16>>;
220
- // using Shared = SharedTile<Element, tl::RowMajor<16, 64>>;
221
- // // using Reg = RegTile<BaseTileRowMajor<Element>, tl::RowMajor<1, 1>>;
222
- // using Reg = RegTile<BaseTileRowMajor<Element>, tl::RowMajor<1, 4>>;
223
-
224
- // using Loader = SharedToRegLoader<Reg, WarpLayout, WarpReuse::kCont>;
225
- // Loader loader;
226
-
227
- // using Storer = RegToSharedStorer<Reg, WarpLayout>;
228
- // Storer storer;
229
-
230
- // dim3 dim_grid(1, 1, 1);
231
- // dim3 dim_block(kThreads, 1, 1);
232
- // int shm_size = Shared::kNumel * sizeof(Element);
233
-
234
- // run_test_store<Shared, Reg, Loader, Storer>
235
- // <<<dim_grid, dim_block, shm_size>>>(loader, storer);
236
- // cudaDeviceSynchronize();
237
- // }
238
-
239
- TEST (TestShared2Reg, operand_A_swizzle) {
218
+ TEST (TestReg2Shared, operand_C_half) {
240
219
using Element = __half;
241
220
242
221
using WarpLayout = tl::RowMajor<1 , 1 >;
243
222
const int kThreads = tl::get_numel<WarpLayout> * 32 ;
244
223
245
- // const int kRows = 64;
246
- // const int kCols = 32;
247
-
248
- const int kRows = 16 ;
249
- const int kCols = 64 ;
250
-
251
- using SharedLayout = tl::RowMajor<kRows , kCols >;
252
- const bool kUseSwizzledLayout = true ;
253
- using Shared = SharedTile<Element, SharedLayout, kUseSwizzledLayout >;
254
- // using Reg = RegTile<BaseTileRowMajor<Element>, tl::RowMajor<2, 2>>;
224
+ // using Shared = SharedTile<Element, tl::RowMajor<16, 16>>;
225
+ using Shared = SharedTile<Element, tl::RowMajor<16 , 64 >>;
226
+ // using Reg = RegTile<BaseTileRowMajor<Element>, tl::RowMajor<1, 1>>;
255
227
using Reg = RegTile<BaseTileRowMajor<Element>, tl::RowMajor<1 , 4 >>;
256
228
257
- using Copy = SharedToRegLoader<Reg, WarpLayout, WarpReuse::kRowReuseCont >;
258
- Copy copy;
229
+ using Loader = SharedToRegLoader<Reg, WarpLayout, WarpReuse::kCont >;
230
+ Loader loader;
231
+
232
+ using Storer = RegToSharedStorer<Reg, WarpLayout>;
233
+ Storer storer;
259
234
260
235
dim3 dim_grid (1 , 1 , 1 );
261
236
dim3 dim_block (kThreads , 1 , 1 );
262
237
int shm_size = Shared::kNumel * sizeof (Element);
263
238
264
- run_test_load<Element, Shared, Reg, Copy >
265
- <<<dim_grid, dim_block, shm_size>>> (copy );
239
+ run_test_store< Shared, Reg, Loader, Storer >
240
+ <<<dim_grid, dim_block, shm_size>>> (loader, storer );
266
241
cudaDeviceSynchronize ();
267
242
}
268
243
244
+ // TEST(TestShared2Reg, operand_A_swizzle) {
245
+ // using Element = __half;
246
+
247
+ // using WarpLayout = tl::RowMajor<1, 1>;
248
+ // const int kThreads = tl::get_numel<WarpLayout> * 32;
249
+
250
+ // // const int kRows = 64;
251
+ // // const int kCols = 32;
252
+
253
+ // const int kRows = 16;
254
+ // const int kCols = 64;
255
+
256
+ // using SharedLayout = tl::RowMajor<kRows, kCols>;
257
+ // const bool kUseSwizzledLayout = true;
258
+ // using Shared = SharedTile<Element, SharedLayout, kUseSwizzledLayout>;
259
+ // // using Reg = RegTile<BaseTileRowMajor<Element>, tl::RowMajor<2, 2>>;
260
+ // using Reg = RegTile<BaseTileRowMajor<Element>, tl::RowMajor<1, 4>>;
261
+
262
+ // using Copy = SharedToRegLoader<Reg, WarpLayout, WarpReuse::kRowReuseCont>;
263
+ // Copy copy;
264
+
265
+ // dim3 dim_grid(1, 1, 1);
266
+ // dim3 dim_block(kThreads, 1, 1);
267
+ // int shm_size = Shared::kNumel * sizeof(Element);
268
+
269
+ // run_test_load<Element, Shared, Reg, Copy>
270
+ // <<<dim_grid, dim_block, shm_size>>>(copy);
271
+ // cudaDeviceSynchronize();
272
+ // }
273
+
269
274
// TEST(TestReg2Shared, operand_C_float) {
270
275
// using Element = __half;
271
276
// using AccType = float;
0 commit comments