Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions cub/cub/block/block_radix_rank.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,12 @@
#include <cub/util_ptx.cuh>
#include <cub/util_type.cuh>

#include <cuda/ptx>
#include <cuda/std/__algorithm_>
#include <cuda/__ptx/instructions/get_sreg.h>
#include <cuda/std/__algorithm/max.h>
#include <cuda/std/__bit/integral.h>
#include <cuda/std/__functional/operations.h>
#include <cuda/std/__type_traits/conditional.h>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

looks unused

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We use cuda::std::_If AFAIK

#include <cuda/std/__type_traits/is_same.h>
#include <cuda/std/cstdint>
#include <cuda/std/limits>
#include <cuda/std/span>
Expand Down Expand Up @@ -1070,7 +1074,7 @@ struct BlockRadixRankMatchEarlyCounts
atomicOr(p_match_mask, lane_mask);
__syncwarp(WARP_MASK);
int bin_mask = *p_match_mask;
int leader = (WARP_THREADS - 1) - __clz(bin_mask);
int leader = ::cuda::std::__bit_log2(static_cast<unsigned>(bin_mask));
int warp_offset = 0;
int popc = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le());
if (lane == leader)
Expand Down Expand Up @@ -1100,7 +1104,7 @@ struct BlockRadixRankMatchEarlyCounts
::cuda::std::uint32_t bin = Digit(keys[u]);
int bin_mask =
detail::warp_in_block_matcher_t<RADIX_BITS, PARTIAL_WARP_THREADS, BLOCK_WARPS - 1>::match_any(bin, warp);
int leader = (WARP_THREADS - 1) - __clz(bin_mask);
int leader = ::cuda::std::__bit_log2(static_cast<unsigned>(bin_mask));
int warp_offset = 0;
int popc = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le());
if (lane == leader)
Expand Down
15 changes: 12 additions & 3 deletions cub/cub/warp/specializations/warp_reduce_shfl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,16 @@
#include <cub/util_ptx.cuh>
#include <cub/util_type.cuh>

#include <cuda/ptx>
#include <cuda/__functional/maximum.h>
#include <cuda/__functional/minimum.h>
#include <cuda/__ptx/instructions/get_sreg.h>
#include <cuda/std/__bit/countr.h>
#include <cuda/std/__functional/operations.h>
#include <cuda/std/__type_traits/enable_if.h>
#include <cuda/std/__type_traits/integral_constant.h>
#include <cuda/std/__type_traits/is_integral.h>
#include <cuda/std/__type_traits/is_same.h>
#include <cuda/std/__type_traits/is_unsigned.h>
#include <cuda/std/cstdint>
#include <cuda/std/type_traits>

Expand Down Expand Up @@ -694,7 +703,7 @@ struct WarpReduceShfl
_CCCL_DEVICE _CCCL_FORCEINLINE T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op)
{
// Get the start flags for each thread in the warp.
int warp_flags = __ballot_sync(member_mask, flag);
unsigned warp_flags = __ballot_sync(member_mask, flag);

// Convert to tail-segmented
if (HEAD_SEGMENTED)
Expand All @@ -715,7 +724,7 @@ struct WarpReduceShfl
warp_flags |= 1u << (LOGICAL_WARP_THREADS - 1);

// Find the next set flag
int last_lane = __clz(__brev(warp_flags));
int last_lane = ::cuda::std::countr_zero(warp_flags);

T output = input;
// Template-iterate reduction steps
Expand Down
8 changes: 5 additions & 3 deletions cub/cub/warp/specializations/warp_reduce_smem.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,9 @@
#include <cub/util_ptx.cuh>
#include <cub/util_type.cuh>

#include <cuda/ptx>
#include <cuda/__ptx/instructions/get_sreg.h>
#include <cuda/std/__bit/countr.h>
#include <cuda/std/__type_traits/integral_constant.h>

CUB_NAMESPACE_BEGIN
namespace detail
Expand Down Expand Up @@ -213,7 +215,7 @@ struct WarpReduceSmem
SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op, ::cuda::std::true_type /*has_ballot*/)
{
// Get the start flags for each thread in the warp.
int warp_flags = __ballot_sync(member_mask, flag);
unsigned warp_flags = __ballot_sync(member_mask, flag);

if (!HEAD_SEGMENTED)
{
Expand All @@ -230,7 +232,7 @@ struct WarpReduceSmem
}

// Find next flag
int next_flag = __clz(__brev(warp_flags));
int next_flag = ::cuda::std::countr_zero(warp_flags);

// Clip the next segment at the warp boundary if necessary
if (LOGICAL_WARP_THREADS != 32)
Expand Down
13 changes: 10 additions & 3 deletions cub/cub/warp/specializations/warp_scan_shfl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,15 @@
#include <cub/util_ptx.cuh>
#include <cub/util_type.cuh>

#include <cuda/ptx>
#include <cuda/std/__algorithm_>
#include <cuda/__ptx/instructions/get_sreg.h>
#include <cuda/std/__algorithm/clamp.h>
#include <cuda/std/__bit/has_single_bit.h>
Comment on lines +52 to +53
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

these two headers are never used if I'm not wrong

#include <cuda/std/__bit/integral.h>
#include <cuda/std/__functional/operations.h>
#include <cuda/std/__type_traits/integral_constant.h>
#include <cuda/std/__type_traits/is_integral.h>
#include <cuda/std/__type_traits/is_unsigned.h>
#include <cuda/warp>

CUB_NAMESPACE_BEGIN
namespace detail
Expand Down Expand Up @@ -552,7 +559,7 @@ struct WarpScanShfl
ballot = ballot & ::cuda::ptx::get_sreg_lanemask_le();

// Find index of first set bit
int segment_first_lane = _CUDA_VSTD::max(0, 31 - __clz(ballot));
int segment_first_lane = ::cuda::std::__bit_log2(ballot);

// Iterate scan steps
_CCCL_PRAGMA_UNROLL_FULL()
Expand Down
9 changes: 8 additions & 1 deletion libcudacxx/include/cuda/std/__bit/countl.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,14 @@ template <typename _Tp>
template <typename _Tp>
[[nodiscard]] _CCCL_HIDE_FROM_ABI _CCCL_DEVICE int __cccl_countl_zero_impl_device(_Tp __v) noexcept
{
return (sizeof(_Tp) == sizeof(uint32_t)) ? ::__clz(static_cast<int>(__v)) : ::__clzll(static_cast<long long>(__v));
if constexpr (sizeof(_Tp) == sizeof(uint32_t))
{
return static_cast<int>(::__clz(static_cast<int>(__v)));
}
else
{
return static_cast<int>(::__clzll(static_cast<long long>(__v)));
}
}
#endif // _CCCL_CUDA_COMPILATION()

Expand Down
4 changes: 2 additions & 2 deletions libcudacxx/include/cuda/std/__bit/countr.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,11 @@ template <typename _Tp>
{
if constexpr (sizeof(_Tp) == sizeof(uint32_t))
{
return ::__clz(static_cast<int>(::__brev(__v)));
return static_cast<int>(::__clz(static_cast<int>(::__brev(__v))));
}
else
{
return ::__clzll(static_cast<long long>(::__brevll(__v)));
return static_cast<int>(::__clzll(static_cast<long long>(::__brevll(__v))));
}
}
#endif // _CCCL_CUDA_COMPILATION()
Expand Down
22 changes: 11 additions & 11 deletions libcudacxx/include/cuda/std/__bit/reference.h
Original file line number Diff line number Diff line change
Expand Up @@ -275,10 +275,10 @@ _CCCL_API constexpr __bit_iterator<_Cp, false> __copy_aligned(
// do first word
if (__first.__ctz_ != 0)
{
unsigned __clz = __bits_per_word - __first.__ctz_;
difference_type __dn = _CUDA_VSTD::min(static_cast<difference_type>(__clz), __n);
unsigned __clz_f = __bits_per_word - __first.__ctz_;
difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz_f), __n);
__n -= __dn;
__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
__storage_type __b = *__first.__seg_ & __m;
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b;
Expand Down Expand Up @@ -419,8 +419,8 @@ _CCCL_API constexpr __bit_iterator<_Cp, false> __copy_backward_aligned(
{
difference_type __dn = _CUDA_VSTD::min(static_cast<difference_type>(__last.__ctz_), __n);
__n -= __dn;
unsigned __clz = __bits_per_word - __last.__ctz_;
__storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz);
unsigned __clz_f = __bits_per_word - __last.__ctz_;
__storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_f);
__storage_type __b = *__last.__seg_ & __m;
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b;
Expand Down Expand Up @@ -633,10 +633,10 @@ _CCCL_API inline __bit_iterator<_Cr, false> __swap_ranges_aligned(
// do first word
if (__first.__ctz_ != 0)
{
unsigned __clz = __bits_per_word - __first.__ctz_;
difference_type __dn = _CUDA_VSTD::min(static_cast<difference_type>(__clz), __n);
unsigned __clz_f = __bits_per_word - __first.__ctz_;
difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz_f), __n);
__n -= __dn;
__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
__storage_type __b1 = *__first.__seg_ & __m;
*__first.__seg_ &= ~__m;
__storage_type __b2 = *__result.__seg_ & __m;
Expand Down Expand Up @@ -986,10 +986,10 @@ _CCCL_API constexpr bool __equal_aligned(
// do first word
if (__first1.__ctz_ != 0)
{
unsigned __clz = __bits_per_word - __first1.__ctz_;
difference_type __dn = _CUDA_VSTD::min(static_cast<difference_type>(__clz), __n);
unsigned __clz_f = __bits_per_word - __first1.__ctz_;
difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz_f), __n);
__n -= __dn;
__storage_type __m = (~__storage_type(0) << __first1.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
__storage_type __m = (~__storage_type(0) << __first1.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
if ((*__first2.__seg_ & __m) != (*__first1.__seg_ & __m))
{
return false;
Expand Down
23 changes: 3 additions & 20 deletions thrust/thrust/detail/integer_math.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
#endif // no system header
#include <thrust/detail/type_deduction.h>

#include <cuda/std/__bit/countl.h>
#include <cuda/std/__type_traits/make_unsigned.h>
#include <cuda/std/limits>
#include <cuda/std/type_traits>

Expand All @@ -36,25 +38,6 @@ THRUST_NAMESPACE_BEGIN
namespace detail
{

template <typename Integer>
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE Integer clz(Integer x)
{
Integer result;

NV_IF_TARGET(NV_IS_DEVICE,
(result = ::__clz(x);),
(int num_bits = 8 * sizeof(Integer); int num_bits_minus_one = num_bits - 1; result = num_bits;
for (int i = num_bits_minus_one; i >= 0; --i) {
if ((Integer(1) << i) & x)
{
result = num_bits_minus_one - i;
break;
}
}));

return result;
}

template <typename Integer>
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool is_power_of_2(Integer x)
{
Expand Down Expand Up @@ -85,7 +68,7 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE Integer log2(Integer x)
Integer num_bits = 8 * sizeof(Integer);
Integer num_bits_minus_one = num_bits - 1;

return num_bits_minus_one - clz(x);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thrust::detail::log2 will be deleted in #6188

return num_bits_minus_one - ::cuda::std::countl_zero(::cuda::std::__to_unsigned_like(x));
}

template <typename Integer>
Expand Down
Loading