Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 16 additions & 15 deletions cub/cub/block/block_radix_rank.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -719,14 +719,15 @@ public:
// Each warp will strip-mine its section of input, one strip at a time

volatile DigitCounterT* digit_counters[KEYS_PER_THREAD];
using ::cuda::std::uint32_t;
uint32_t warp_id = linear_tid >> LOG_WARP_THREADS;
uint32_t lane_mask_lt = ::cuda::ptx::get_sreg_lanemask_lt();

_CCCL_PRAGMA_UNROLL_FULL()
for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
{
// My digit
::cuda::std::uint32_t digit = digit_extractor.Digit(keys[ITEM]);
uint32_t digit = digit_extractor.Digit(keys[ITEM]);
Comment on lines +722 to +730
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is only one occurrence here, so please use the fully qualified name (`::cuda::std::uint32_t`) at that spot instead of adding a `using` declaration.


if (IS_DESCENDING)
{
Expand Down Expand Up @@ -929,7 +930,7 @@ struct BlockRadixRankMatchEarlyCounts
int warp_histograms[BLOCK_WARPS][RADIX_DIGITS][NUM_PARTS];
};

int match_masks[MATCH_MASKS_ALLOC_SIZE][RADIX_DIGITS];
::cuda::std::uint32_t match_masks[MATCH_MASKS_ALLOC_SIZE][RADIX_DIGITS];

typename BlockScan::TempStorage prefix_tmp;
};
Expand Down Expand Up @@ -975,7 +976,7 @@ struct BlockRadixRankMatchEarlyCounts
}
if (MATCH_ALGORITHM == WARP_MATCH_ATOMIC_OR)
{
int* match_masks = &s.match_masks[warp][0];
::cuda::std::uint32_t* match_masks = &s.match_masks[warp][0];

_CCCL_PRAGMA_UNROLL_FULL()
for (int bin = lane; bin < RADIX_DIGITS; bin += WARP_THREADS)
Expand Down Expand Up @@ -1066,21 +1067,21 @@ struct BlockRadixRankMatchEarlyCounts
UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD], detail::constant_t<WARP_MATCH_ATOMIC_OR>)
{
// compute key ranks
int lane_mask = 1 << lane;
int* warp_offsets = &s.warp_offsets[warp][0];
int* match_masks = &s.match_masks[warp][0];
::cuda::std::uint32_t lane_mask = 1 << lane;
Copy link
Contributor

@fbusato fbusato Oct 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A better alternative is

Suggested change
::cuda::std::uint32_t lane_mask = 1 << lane;
auto lane_mask = 1u << lane;

Left-shifting a signed value can be UB before C++20; shifting the unsigned literal (`1u << lane`) avoids that.

int* warp_offsets = &s.warp_offsets[warp][0];
::cuda::std::uint32_t* match_masks = &s.match_masks[warp][0];

_CCCL_PRAGMA_UNROLL_FULL()
for (int u = 0; u < KEYS_PER_THREAD; ++u)
{
::cuda::std::uint32_t bin = Digit(keys[u]);
int* p_match_mask = &match_masks[bin];
::cuda::std::uint32_t bin = Digit(keys[u]);
::cuda::std::uint32_t* p_match_mask = &match_masks[bin];
atomicOr(p_match_mask, lane_mask);
__syncwarp(WARP_MASK);
int bin_mask = *p_match_mask;
int leader = ::cuda::std::__bit_log2(static_cast<unsigned>(bin_mask));
int warp_offset = 0;
int popc = ::cuda::std::popcount(bin_mask & ::cuda::ptx::get_sreg_lanemask_le());
::cuda::std::uint32_t bin_mask = *p_match_mask;
int leader = ::cuda::std::__bit_log2(bin_mask);
int warp_offset = 0;
int popc = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to use `::cuda::std::popcount` here instead of `__popc`.

if (lane == leader)
{
// atomic is a bit faster
Expand All @@ -1106,11 +1107,11 @@ struct BlockRadixRankMatchEarlyCounts
for (int u = 0; u < KEYS_PER_THREAD; ++u)
{
::cuda::std::uint32_t bin = Digit(keys[u]);
int bin_mask =
::cuda::std::uint32_t bin_mask =
detail::warp_in_block_matcher_t<RADIX_BITS, PARTIAL_WARP_THREADS, BLOCK_WARPS - 1>::match_any(bin, warp);
int leader = ::cuda::std::__bit_log2(static_cast<unsigned>(bin_mask));
int leader = ::cuda::std::__bit_log2(bin_mask);
int warp_offset = 0;
int popc = ::cuda::std::popcount(bin_mask & ::cuda::ptx::get_sreg_lanemask_le());
int popc = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le());
if (lane == leader)
{
// atomic is a bit faster
Expand Down