diff --git a/cub/cub/block/block_radix_rank.cuh b/cub/cub/block/block_radix_rank.cuh
index 0ccd5e10c33..dc9e58ef252 100644
--- a/cub/cub/block/block_radix_rank.cuh
+++ b/cub/cub/block/block_radix_rank.cuh
@@ -48,8 +48,12 @@
 #include <cub/util_ptx.cuh>
 #include <cub/util_type.cuh>
 
-#include <cuda/ptx>
-#include <cuda/std/__algorithm_>
+#include <cuda/__ptx/instructions/get_sreg.h>
+#include <cuda/std/__algorithm/max.h>
+#include <cuda/std/__bit/integral.h>
+#include <cuda/std/__functional/operations.h>
+#include <cuda/std/__type_traits/conditional.h>
+#include <cuda/std/__type_traits/is_same.h>
 #include <cuda/std/cstdint>
 #include <cuda/std/limits>
 #include <cuda/std/span>
@@ -1070,7 +1074,7 @@ struct BlockRadixRankMatchEarlyCounts
         atomicOr(p_match_mask, lane_mask);
         __syncwarp(WARP_MASK);
         int bin_mask    = *p_match_mask;
-        int leader      = (WARP_THREADS - 1) - __clz(bin_mask);
+        int leader      = ::cuda::std::__bit_log2(static_cast<unsigned>(bin_mask));
         int warp_offset = 0;
         int popc        = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le());
         if (lane == leader)
@@ -1100,7 +1104,7 @@ struct BlockRadixRankMatchEarlyCounts
         ::cuda::std::uint32_t bin = Digit(keys[u]);
         int bin_mask =
           detail::warp_in_block_matcher_t<RADIX_BITS, PARTIAL_WARP_THREADS, BLOCK_WARPS - 1>::match_any(bin, warp);
-        int leader      = (WARP_THREADS - 1) - __clz(bin_mask);
+        int leader      = ::cuda::std::__bit_log2(static_cast<unsigned>(bin_mask));
         int warp_offset = 0;
         int popc        = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le());
         if (lane == leader)
diff --git a/cub/cub/warp/specializations/warp_reduce_shfl.cuh b/cub/cub/warp/specializations/warp_reduce_shfl.cuh
index 550270adbcd..cda2ca578a9 100644
--- a/cub/cub/warp/specializations/warp_reduce_shfl.cuh
+++ b/cub/cub/warp/specializations/warp_reduce_shfl.cuh
@@ -48,7 +48,16 @@
 #include <cub/util_ptx.cuh>
 #include <cub/util_type.cuh>
 
-#include <cuda/ptx>
+#include <cuda/__functional/maximum.h>
+#include <cuda/__functional/minimum.h>
+#include <cuda/__ptx/instructions/get_sreg.h>
+#include <cuda/std/__bit/countr.h>
+#include <cuda/std/__functional/operations.h>
+#include <cuda/std/__type_traits/enable_if.h>
+#include <cuda/std/__type_traits/integral_constant.h>
+#include <cuda/std/__type_traits/is_integral.h>
+#include <cuda/std/__type_traits/is_same.h>
+#include <cuda/std/__type_traits/is_unsigned.h>
 #include <cuda/std/cstdint>
 #include <cuda/std/type_traits>
 
@@ -694,7 +703,7 @@ struct WarpReduceShfl
   _CCCL_DEVICE _CCCL_FORCEINLINE T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op)
   {
     // Get the start flags for each thread in the warp.
-    int warp_flags = __ballot_sync(member_mask, flag);
+    unsigned warp_flags = __ballot_sync(member_mask, flag);
 
     // Convert to tail-segmented
     if (HEAD_SEGMENTED)
@@ -715,7 +724,7 @@ struct WarpReduceShfl
     warp_flags |= 1u << (LOGICAL_WARP_THREADS - 1);
 
     // Find the next set flag
-    int last_lane = __clz(__brev(warp_flags));
+    int last_lane = ::cuda::std::countr_zero(warp_flags);
 
     T output = input;
     // Template-iterate reduction steps
diff --git a/cub/cub/warp/specializations/warp_reduce_smem.cuh b/cub/cub/warp/specializations/warp_reduce_smem.cuh
index b82259d7d3b..64d72921dfd 100644
--- a/cub/cub/warp/specializations/warp_reduce_smem.cuh
+++ b/cub/cub/warp/specializations/warp_reduce_smem.cuh
@@ -48,7 +48,9 @@
 #include <cub/util_ptx.cuh>
 #include <cub/util_type.cuh>
 
-#include <cuda/ptx>
+#include <cuda/__ptx/instructions/get_sreg.h>
+#include <cuda/std/__bit/countr.h>
+#include <cuda/std/__type_traits/integral_constant.h>
 
 CUB_NAMESPACE_BEGIN
 namespace detail
@@ -213,7 +215,7 @@ struct WarpReduceSmem
   SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op, ::cuda::std::true_type /*has_ballot*/)
   {
     // Get the start flags for each thread in the warp.
-    int warp_flags = __ballot_sync(member_mask, flag);
+    unsigned warp_flags = __ballot_sync(member_mask, flag);
 
     if (!HEAD_SEGMENTED)
     {
@@ -230,7 +232,7 @@ struct WarpReduceSmem
     }
 
     // Find next flag
-    int next_flag = __clz(__brev(warp_flags));
+    int next_flag = ::cuda::std::countr_zero(warp_flags);
 
     // Clip the next segment at the warp boundary if necessary
     if (LOGICAL_WARP_THREADS != 32)
diff --git a/cub/cub/warp/specializations/warp_scan_shfl.cuh b/cub/cub/warp/specializations/warp_scan_shfl.cuh
index d775b3a99a0..fc90e487891 100644
--- a/cub/cub/warp/specializations/warp_scan_shfl.cuh
+++ b/cub/cub/warp/specializations/warp_scan_shfl.cuh
@@ -48,8 +48,15 @@
 #include <cub/util_ptx.cuh>
 #include <cub/util_type.cuh>
 
-#include <cuda/ptx>
-#include <cuda/std/__algorithm_>
+#include <cuda/__ptx/instructions/get_sreg.h>
+#include <cuda/std/__algorithm/clamp.h>
+#include <cuda/std/__bit/has_single_bit.h>
+#include <cuda/std/__bit/integral.h>
+#include <cuda/std/__functional/operations.h>
+#include <cuda/std/__type_traits/integral_constant.h>
+#include <cuda/std/__type_traits/is_integral.h>
+#include <cuda/std/__type_traits/is_unsigned.h>
+#include <cuda/warp>
 
 CUB_NAMESPACE_BEGIN
 namespace detail
@@ -552,7 +559,7 @@ struct WarpScanShfl
     ballot = ballot & ::cuda::ptx::get_sreg_lanemask_le();
 
     // Find index of first set bit
-    int segment_first_lane = _CUDA_VSTD::max(0, 31 - __clz(ballot));
+    int segment_first_lane = ::cuda::std::__bit_log2(ballot);
 
     // Iterate scan steps
     _CCCL_PRAGMA_UNROLL_FULL()
diff --git a/libcudacxx/include/cuda/std/__bit/countl.h b/libcudacxx/include/cuda/std/__bit/countl.h
index 083aed4eafe..397aeeb7782 100644
--- a/libcudacxx/include/cuda/std/__bit/countl.h
+++ b/libcudacxx/include/cuda/std/__bit/countl.h
@@ -100,7 +100,14 @@ template <typename _Tp>
 template <typename _Tp>
 [[nodiscard]] _CCCL_HIDE_FROM_ABI _CCCL_DEVICE int __cccl_countl_zero_impl_device(_Tp __v) noexcept
 {
-  return (sizeof(_Tp) == sizeof(uint32_t)) ? ::__clz(static_cast<int>(__v)) : ::__clzll(static_cast<long long>(__v));
+  if constexpr (sizeof(_Tp) == sizeof(uint32_t))
+  {
+    return static_cast<int>(::__clz(static_cast<int>(__v)));
+  }
+  else
+  {
+    return static_cast<int>(::__clzll(static_cast<long long>(__v)));
+  }
 }
 #endif // _CCCL_CUDA_COMPILATION()
 
diff --git a/libcudacxx/include/cuda/std/__bit/countr.h b/libcudacxx/include/cuda/std/__bit/countr.h
index ab4b584c33a..bcf758a141b 100644
--- a/libcudacxx/include/cuda/std/__bit/countr.h
+++ b/libcudacxx/include/cuda/std/__bit/countr.h
@@ -114,11 +114,11 @@ template <typename _Tp>
 {
   if constexpr (sizeof(_Tp) == sizeof(uint32_t))
   {
-    return ::__clz(static_cast<int>(::__brev(__v)));
+    return static_cast<int>(::__clz(static_cast<int>(::__brev(__v))));
   }
   else
   {
-    return ::__clzll(static_cast<long long>(::__brevll(__v)));
+    return static_cast<int>(::__clzll(static_cast<long long>(::__brevll(__v))));
   }
 }
 #endif // _CCCL_CUDA_COMPILATION()
diff --git a/libcudacxx/include/cuda/std/__bit/reference.h b/libcudacxx/include/cuda/std/__bit/reference.h
index 58b06c52d06..ee8623956fb 100644
--- a/libcudacxx/include/cuda/std/__bit/reference.h
+++ b/libcudacxx/include/cuda/std/__bit/reference.h
@@ -275,10 +275,10 @@ _CCCL_API constexpr __bit_iterator<_Cp, false> __copy_aligned(
     // do first word
     if (__first.__ctz_ != 0)
     {
-      unsigned __clz       = __bits_per_word - __first.__ctz_;
-      difference_type __dn = _CUDA_VSTD::min(static_cast<difference_type>(__clz), __n);
+      unsigned __clz_f     = __bits_per_word - __first.__ctz_;
+      difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz_f), __n);
       __n -= __dn;
-      __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
+      __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
       __storage_type __b = *__first.__seg_ & __m;
       *__result.__seg_ &= ~__m;
       *__result.__seg_ |= __b;
@@ -419,8 +419,8 @@ _CCCL_API constexpr __bit_iterator<_Cp, false> __copy_backward_aligned(
     {
       difference_type __dn = _CUDA_VSTD::min(static_cast<difference_type>(__last.__ctz_), __n);
       __n -= __dn;
-      unsigned __clz     = __bits_per_word - __last.__ctz_;
-      __storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz);
+      unsigned __clz_f   = __bits_per_word - __last.__ctz_;
+      __storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_f);
       __storage_type __b = *__last.__seg_ & __m;
       *__result.__seg_ &= ~__m;
       *__result.__seg_ |= __b;
@@ -633,10 +633,10 @@ _CCCL_API inline __bit_iterator<_Cr, false> __swap_ranges_aligned(
     // do first word
     if (__first.__ctz_ != 0)
     {
-      unsigned __clz       = __bits_per_word - __first.__ctz_;
-      difference_type __dn = _CUDA_VSTD::min(static_cast<difference_type>(__clz), __n);
+      unsigned __clz_f     = __bits_per_word - __first.__ctz_;
+      difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz_f), __n);
       __n -= __dn;
-      __storage_type __m  = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
+      __storage_type __m  = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
       __storage_type __b1 = *__first.__seg_ & __m;
       *__first.__seg_ &= ~__m;
       __storage_type __b2 = *__result.__seg_ & __m;
@@ -986,10 +986,10 @@ _CCCL_API constexpr bool __equal_aligned(
     // do first word
     if (__first1.__ctz_ != 0)
     {
-      unsigned __clz       = __bits_per_word - __first1.__ctz_;
-      difference_type __dn = _CUDA_VSTD::min(static_cast<difference_type>(__clz), __n);
+      unsigned __clz_f     = __bits_per_word - __first1.__ctz_;
+      difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz_f), __n);
       __n -= __dn;
-      __storage_type __m = (~__storage_type(0) << __first1.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
+      __storage_type __m = (~__storage_type(0) << __first1.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
       if ((*__first2.__seg_ & __m) != (*__first1.__seg_ & __m))
       {
         return false;
diff --git a/thrust/thrust/detail/integer_math.h b/thrust/thrust/detail/integer_math.h
index ab8114fc220..7a0661094c7 100644
--- a/thrust/thrust/detail/integer_math.h
+++ b/thrust/thrust/detail/integer_math.h
@@ -27,6 +27,8 @@
 #endif // no system header
 #include <thrust/detail/type_deduction.h>
 
+#include <cuda/std/__bit/countl.h>
+#include <cuda/std/__type_traits/make_unsigned.h>
 #include <cuda/std/limits>
 #include <cuda/std/type_traits>
 
@@ -36,25 +38,6 @@ THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
-template <typename Integer>
-_CCCL_HOST_DEVICE _CCCL_FORCEINLINE Integer clz(Integer x)
-{
-  Integer result;
-
-  NV_IF_TARGET(NV_IS_DEVICE,
-               (result = ::__clz(x);),
-               (int num_bits = 8 * sizeof(Integer); int num_bits_minus_one = num_bits - 1; result = num_bits;
-                for (int i = num_bits_minus_one; i >= 0; --i) {
-                  if ((Integer(1) << i) & x)
-                  {
-                    result = num_bits_minus_one - i;
-                    break;
-                  }
-                }));
-
-  return result;
-}
-
 template <typename Integer>
 _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool is_power_of_2(Integer x)
 {
@@ -85,7 +68,7 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE Integer log2(Integer x)
   Integer num_bits           = 8 * sizeof(Integer);
   Integer num_bits_minus_one = num_bits - 1;
 
-  return num_bits_minus_one - clz(x);
+  return num_bits_minus_one - ::cuda::std::countl_zero(::cuda::std::__to_unsigned_like(x));
 }
 
 template <typename Integer>