diff --git a/cub/cub/block/block_radix_rank.cuh b/cub/cub/block/block_radix_rank.cuh index 0ccd5e10c33..dc9e58ef252 100644 --- a/cub/cub/block/block_radix_rank.cuh +++ b/cub/cub/block/block_radix_rank.cuh @@ -48,8 +48,12 @@ #include #include -#include -#include +#include +#include +#include +#include +#include +#include #include #include #include @@ -1070,7 +1074,7 @@ struct BlockRadixRankMatchEarlyCounts atomicOr(p_match_mask, lane_mask); __syncwarp(WARP_MASK); int bin_mask = *p_match_mask; - int leader = (WARP_THREADS - 1) - __clz(bin_mask); + int leader = ::cuda::std::__bit_log2(static_cast(bin_mask)); int warp_offset = 0; int popc = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le()); if (lane == leader) @@ -1100,7 +1104,7 @@ struct BlockRadixRankMatchEarlyCounts ::cuda::std::uint32_t bin = Digit(keys[u]); int bin_mask = detail::warp_in_block_matcher_t::match_any(bin, warp); - int leader = (WARP_THREADS - 1) - __clz(bin_mask); + int leader = ::cuda::std::__bit_log2(static_cast(bin_mask)); int warp_offset = 0; int popc = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le()); if (lane == leader) diff --git a/cub/cub/warp/specializations/warp_reduce_shfl.cuh b/cub/cub/warp/specializations/warp_reduce_shfl.cuh index 550270adbcd..cda2ca578a9 100644 --- a/cub/cub/warp/specializations/warp_reduce_shfl.cuh +++ b/cub/cub/warp/specializations/warp_reduce_shfl.cuh @@ -48,7 +48,16 @@ #include #include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include @@ -694,7 +703,7 @@ struct WarpReduceShfl _CCCL_DEVICE _CCCL_FORCEINLINE T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op) { // Get the start flags for each thread in the warp. - int warp_flags = __ballot_sync(member_mask, flag); + unsigned warp_flags = __ballot_sync(member_mask, flag); // Convert to tail-segmented if (HEAD_SEGMENTED) @@ -715,7 +724,7 @@ struct WarpReduceShfl warp_flags |= 1u << (LOGICAL_WARP_THREADS - 1); // Find the next set flag - int last_lane = __clz(__brev(warp_flags)); + int last_lane = ::cuda::std::countr_zero(warp_flags); T output = input; // Template-iterate reduction steps diff --git a/cub/cub/warp/specializations/warp_reduce_smem.cuh b/cub/cub/warp/specializations/warp_reduce_smem.cuh index b82259d7d3b..64d72921dfd 100644 --- a/cub/cub/warp/specializations/warp_reduce_smem.cuh +++ b/cub/cub/warp/specializations/warp_reduce_smem.cuh @@ -48,7 +48,9 @@ #include #include -#include +#include +#include +#include CUB_NAMESPACE_BEGIN namespace detail @@ -213,7 +215,7 @@ struct WarpReduceSmem SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op, ::cuda::std::true_type /*has_ballot*/) { // Get the start flags for each thread in the warp. - int warp_flags = __ballot_sync(member_mask, flag); + unsigned warp_flags = __ballot_sync(member_mask, flag); if (!HEAD_SEGMENTED) { @@ -230,7 +232,7 @@ struct WarpReduceSmem } // Find next flag - int next_flag = __clz(__brev(warp_flags)); + int next_flag = ::cuda::std::countr_zero(warp_flags); // Clip the next segment at the warp boundary if necessary if (LOGICAL_WARP_THREADS != 32) diff --git a/cub/cub/warp/specializations/warp_scan_shfl.cuh b/cub/cub/warp/specializations/warp_scan_shfl.cuh index d775b3a99a0..fc90e487891 100644 --- a/cub/cub/warp/specializations/warp_scan_shfl.cuh +++ b/cub/cub/warp/specializations/warp_scan_shfl.cuh @@ -48,8 +48,15 @@ #include #include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include CUB_NAMESPACE_BEGIN namespace detail @@ -552,7 +559,7 @@ struct WarpScanShfl ballot = ballot & ::cuda::ptx::get_sreg_lanemask_le(); // Find index of first set bit - int segment_first_lane = _CUDA_VSTD::max(0, 31 - __clz(ballot)); + int segment_first_lane = ::cuda::std::__bit_log2(ballot); // Iterate scan steps _CCCL_PRAGMA_UNROLL_FULL() diff --git a/libcudacxx/include/cuda/std/__bit/countl.h b/libcudacxx/include/cuda/std/__bit/countl.h index 083aed4eafe..397aeeb7782 100644 --- a/libcudacxx/include/cuda/std/__bit/countl.h +++ b/libcudacxx/include/cuda/std/__bit/countl.h @@ -100,7 +100,14 @@ template template [[nodiscard]] _CCCL_HIDE_FROM_ABI _CCCL_DEVICE int __cccl_countl_zero_impl_device(_Tp __v) noexcept { - return (sizeof(_Tp) == sizeof(uint32_t)) ? ::__clz(static_cast(__v)) : ::__clzll(static_cast(__v)); + if constexpr (sizeof(_Tp) == sizeof(uint32_t)) + { + return static_cast(::__clz(static_cast(__v))); + } + else + { + return static_cast(::__clzll(static_cast(__v))); + } } #endif // _CCCL_CUDA_COMPILATION() diff --git a/libcudacxx/include/cuda/std/__bit/countr.h b/libcudacxx/include/cuda/std/__bit/countr.h index ab4b584c33a..bcf758a141b 100644 --- a/libcudacxx/include/cuda/std/__bit/countr.h +++ b/libcudacxx/include/cuda/std/__bit/countr.h @@ -114,11 +114,11 @@ template { if constexpr (sizeof(_Tp) == sizeof(uint32_t)) { - return ::__clz(static_cast(::__brev(__v))); + return static_cast(::__clz(static_cast(::__brev(__v)))); } else { - return ::__clzll(static_cast(::__brevll(__v))); + return static_cast(::__clzll(static_cast(::__brevll(__v)))); } } #endif // _CCCL_CUDA_COMPILATION() diff --git a/libcudacxx/include/cuda/std/__bit/reference.h b/libcudacxx/include/cuda/std/__bit/reference.h index 58b06c52d06..ee8623956fb 100644 --- a/libcudacxx/include/cuda/std/__bit/reference.h +++ b/libcudacxx/include/cuda/std/__bit/reference.h @@ -275,10 +275,10 @@ _CCCL_API constexpr __bit_iterator<_Cp, false> __copy_aligned( // do first word if (__first.__ctz_ != 0) { - unsigned __clz = __bits_per_word - __first.__ctz_; - difference_type __dn = _CUDA_VSTD::min(static_cast(__clz), __n); + unsigned __clz_f = __bits_per_word - __first.__ctz_; + difference_type __dn = ::cuda::std::min(static_cast(__clz_f), __n); __n -= __dn; - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn)); + __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); __storage_type __b = *__first.__seg_ & __m; *__result.__seg_ &= ~__m; *__result.__seg_ |= __b; @@ -419,8 +419,8 @@ _CCCL_API constexpr __bit_iterator<_Cp, false> __copy_backward_aligned( { difference_type __dn = _CUDA_VSTD::min(static_cast(__last.__ctz_), __n); __n -= __dn; - unsigned __clz = __bits_per_word - __last.__ctz_; - __storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz); + unsigned __clz_f = __bits_per_word - __last.__ctz_; + __storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_f); __storage_type __b = *__last.__seg_ & __m; *__result.__seg_ &= ~__m; *__result.__seg_ |= __b; @@ -633,10 +633,10 @@ _CCCL_API inline __bit_iterator<_Cr, false> __swap_ranges_aligned( // do first word if (__first.__ctz_ != 0) { - unsigned __clz = __bits_per_word - __first.__ctz_; - difference_type __dn = _CUDA_VSTD::min(static_cast(__clz), __n); + unsigned __clz_f = __bits_per_word - __first.__ctz_; + difference_type __dn = ::cuda::std::min(static_cast(__clz_f), __n); __n -= __dn; - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn)); + __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); __storage_type __b1 = *__first.__seg_ & __m; *__first.__seg_ &= ~__m; __storage_type __b2 = *__result.__seg_ & __m; @@ -986,10 +986,10 @@ _CCCL_API constexpr bool __equal_aligned( // do first word if (__first1.__ctz_ != 0) { - unsigned __clz = __bits_per_word - __first1.__ctz_; - difference_type __dn = _CUDA_VSTD::min(static_cast(__clz), __n); + unsigned __clz_f = __bits_per_word - __first1.__ctz_; + difference_type __dn = ::cuda::std::min(static_cast(__clz_f), __n); __n -= __dn; - __storage_type __m = (~__storage_type(0) << __first1.__ctz_) & (~__storage_type(0) >> (__clz - __dn)); + __storage_type __m = (~__storage_type(0) << __first1.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); if ((*__first2.__seg_ & __m) != (*__first1.__seg_ & __m)) { return false; diff --git a/thrust/thrust/detail/integer_math.h b/thrust/thrust/detail/integer_math.h index ab8114fc220..7a0661094c7 100644 --- a/thrust/thrust/detail/integer_math.h +++ b/thrust/thrust/detail/integer_math.h @@ -27,6 +27,8 @@ #endif // no system header #include +#include +#include #include #include @@ -36,25 +38,6 @@ THRUST_NAMESPACE_BEGIN namespace detail { -template -_CCCL_HOST_DEVICE _CCCL_FORCEINLINE Integer clz(Integer x) -{ - Integer result; - - NV_IF_TARGET(NV_IS_DEVICE, - (result = ::__clz(x);), - (int num_bits = 8 * sizeof(Integer); int num_bits_minus_one = num_bits - 1; result = num_bits; - for (int i = num_bits_minus_one; i >= 0; --i) { - if ((Integer(1) << i) & x) - { - result = num_bits_minus_one - i; - break; - } - })); - - return result; -} - template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool is_power_of_2(Integer x) { @@ -85,7 +68,7 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE Integer log2(Integer x) Integer num_bits = 8 * sizeof(Integer); Integer num_bits_minus_one = num_bits - 1; - return num_bits_minus_one - clz(x); + return num_bits_minus_one - ::cuda::std::countl_zero(::cuda::std::__to_unsigned_like(x)); } template