diff --git a/cub/cub/block/block_radix_rank.cuh b/cub/cub/block/block_radix_rank.cuh index 5ab2498ee96..aa026ae8e8c 100644 --- a/cub/cub/block/block_radix_rank.cuh +++ b/cub/cub/block/block_radix_rank.cuh @@ -719,6 +719,7 @@ public: // Each warp will strip-mine its section of input, one strip at a time volatile DigitCounterT* digit_counters[KEYS_PER_THREAD]; + using ::cuda::std::uint32_t; uint32_t warp_id = linear_tid >> LOG_WARP_THREADS; uint32_t lane_mask_lt = ::cuda::ptx::get_sreg_lanemask_lt(); @@ -726,7 +727,7 @@ public: for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) { // My digit - ::cuda::std::uint32_t digit = digit_extractor.Digit(keys[ITEM]); + uint32_t digit = digit_extractor.Digit(keys[ITEM]); if (IS_DESCENDING) { @@ -929,7 +930,7 @@ struct BlockRadixRankMatchEarlyCounts int warp_histograms[BLOCK_WARPS][RADIX_DIGITS][NUM_PARTS]; }; - int match_masks[MATCH_MASKS_ALLOC_SIZE][RADIX_DIGITS]; + ::cuda::std::uint32_t match_masks[MATCH_MASKS_ALLOC_SIZE][RADIX_DIGITS]; typename BlockScan::TempStorage prefix_tmp; }; @@ -975,7 +976,7 @@ struct BlockRadixRankMatchEarlyCounts } if (MATCH_ALGORITHM == WARP_MATCH_ATOMIC_OR) { - int* match_masks = &s.match_masks[warp][0]; + ::cuda::std::uint32_t* match_masks = &s.match_masks[warp][0]; _CCCL_PRAGMA_UNROLL_FULL() for (int bin = lane; bin < RADIX_DIGITS; bin += WARP_THREADS) @@ -1066,21 +1067,21 @@ struct BlockRadixRankMatchEarlyCounts UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD], detail::constant_t) { // compute key ranks - int lane_mask = 1 << lane; - int* warp_offsets = &s.warp_offsets[warp][0]; - int* match_masks = &s.match_masks[warp][0]; + ::cuda::std::uint32_t lane_mask = 1 << lane; + int* warp_offsets = &s.warp_offsets[warp][0]; + ::cuda::std::uint32_t* match_masks = &s.match_masks[warp][0]; _CCCL_PRAGMA_UNROLL_FULL() for (int u = 0; u < KEYS_PER_THREAD; ++u) { - ::cuda::std::uint32_t bin = Digit(keys[u]); - int* p_match_mask = &match_masks[bin]; + ::cuda::std::uint32_t bin = Digit(keys[u]); + ::cuda::std::uint32_t* p_match_mask = &match_masks[bin]; atomicOr(p_match_mask, lane_mask); __syncwarp(WARP_MASK); - int bin_mask = *p_match_mask; - int leader = ::cuda::std::__bit_log2(static_cast(bin_mask)); - int warp_offset = 0; - int popc = ::cuda::std::popcount(bin_mask & ::cuda::ptx::get_sreg_lanemask_le()); + ::cuda::std::uint32_t bin_mask = *p_match_mask; + int leader = ::cuda::std::__bit_log2(bin_mask); + int warp_offset = 0; + int popc = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le()); if (lane == leader) { // atomic is a bit faster @@ -1106,11 +1107,11 @@ struct BlockRadixRankMatchEarlyCounts for (int u = 0; u < KEYS_PER_THREAD; ++u) { ::cuda::std::uint32_t bin = Digit(keys[u]); - int bin_mask = + ::cuda::std::uint32_t bin_mask = detail::warp_in_block_matcher_t::match_any(bin, warp); - int leader = ::cuda::std::__bit_log2(static_cast(bin_mask)); + int leader = ::cuda::std::__bit_log2(bin_mask); int warp_offset = 0; - int popc = ::cuda::std::popcount(bin_mask & ::cuda::ptx::get_sreg_lanemask_le()); + int popc = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le()); if (lane == leader) { // atomic is a bit faster