diff --git a/examples/cute/tutorial/blackwell/04_mma_tma_2sm_sm100.cu b/examples/cute/tutorial/blackwell/04_mma_tma_2sm_sm100.cu index 2d4799f9fe..8e02a368a7 100644 --- a/examples/cute/tutorial/blackwell/04_mma_tma_2sm_sm100.cu +++ b/examples/cute/tutorial/blackwell/04_mma_tma_2sm_sm100.cu @@ -243,6 +243,9 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K) using TmemAllocator = cute::TMEM::Allocator2Sm; TmemAllocator tmem_allocator{}; + // Make sure all CTAs in the cluster are alive: 2SM-mode TMEM allocation requires that both the leader and + // peer CTAs have been launched. This explicit cluster sync guarantees that before allocating. + cute::cluster_sync(); if (elect_one_warp) { tmem_allocator.allocate(TmemAllocator::Sm100TmemCapacityColumns, &shared_storage.tmem_base_ptr); } diff --git a/examples/cute/tutorial/blackwell/05_mma_tma_epi_sm100.cu b/examples/cute/tutorial/blackwell/05_mma_tma_epi_sm100.cu index 8a058c5cf5..311012d56c 100644 --- a/examples/cute/tutorial/blackwell/05_mma_tma_epi_sm100.cu +++ b/examples/cute/tutorial/blackwell/05_mma_tma_epi_sm100.cu @@ -256,6 +256,9 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K) using TmemAllocator = cute::TMEM::Allocator2Sm; TmemAllocator tmem_allocator{}; + // Make sure all CTAs in the cluster are alive: 2SM-mode TMEM allocation requires that both the leader and + // peer CTAs have been launched. This explicit cluster sync guarantees that before allocating. + cute::cluster_sync(); if (elect_one_warp) { tmem_allocator.allocate(TmemAllocator::Sm100TmemCapacityColumns, &shared_storage.tmem_base_ptr); }