From 0b4ccfed0f8480eb2001460a9cd014702d9ac26c Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 20 Nov 2025 11:37:15 -0800 Subject: [PATCH 01/32] more operations --- .../cuda/experimental/__simd/declaration.h | 68 ++++ .../experimental/__simd/fixed_size_impl.h | 338 ++++++++++++++++++ .../cuda/experimental/__simd/reference.h | 239 +++++++++++++ cudax/include/cuda/experimental/__simd/simd.h | 248 +++++++++++++ .../include/cuda/experimental/__simd/traits.h | 85 +++++ .../cuda/experimental/__simd/utility.h | 100 ++++++ 6 files changed, 1078 insertions(+) create mode 100644 cudax/include/cuda/experimental/__simd/declaration.h create mode 100644 cudax/include/cuda/experimental/__simd/fixed_size_impl.h create mode 100644 cudax/include/cuda/experimental/__simd/reference.h create mode 100644 cudax/include/cuda/experimental/__simd/simd.h create mode 100644 cudax/include/cuda/experimental/__simd/traits.h create mode 100644 cudax/include/cuda/experimental/__simd/utility.h diff --git a/cudax/include/cuda/experimental/__simd/declaration.h b/cudax/include/cuda/experimental/__simd/declaration.h new file mode 100644 index 00000000000..c19d9f1dcb0 --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/declaration.h @@ -0,0 +1,68 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_EXPERIMENTAL___SIMD_DECLARATION_H +#define _CUDA_EXPERIMENTAL___SIMD_DECLARATION_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#include + +namespace cuda::experimental::datapar +{ +namespace simd_abi +{ +struct vector_abi; + +template +struct __fixed_size; + +template +using fixed_size = __fixed_size<_Np>; +} // namespace simd_abi + +template +struct __simd_storage; + +template +struct __simd_operations; + +template +struct __mask_storage; + +template +struct __mask_operations; + +template +class simd; + +template +class basic_simd; + +template +class simd_mask; + +template +class basic_simd_mask; +} // namespace cuda::experimental::datapar + +#include + +#endif // _CUDA_EXPERIMENTAL___SIMD_DECLARATION_H diff --git a/cudax/include/cuda/experimental/__simd/fixed_size_impl.h b/cudax/include/cuda/experimental/__simd/fixed_size_impl.h new file mode 100644 index 00000000000..1615f9e792d --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/fixed_size_impl.h @@ -0,0 +1,338 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_SCALAR_H +#define _CUDAX___SIMD_SCALAR_H + +#include + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace cuda::experimental::datapar +{ +namespace simd_abi +{ +template +struct __fixed_size +{ + static constexpr ::cuda::std::size_t __simd_size = _Np; +}; +} // namespace simd_abi + +template +struct __simd_storage<_Tp, simd_abi::__fixed_size<_Np>> +{ + _Tp __data; + + [[nodiscard]] _CCCL_API constexpr _Tp __get([[maybe_unused]] ::cuda::std::size_t __idx) const noexcept + { + _CCCL_ASSERT(::cuda::in_range(__idx, 0, __simd_size), "Index is out of bounds"); + return __data; + } + + _CCCL_API constexpr void __set([[maybe_unused]] ::cuda::std::size_t __idx, _Tp __v) noexcept + { + _CCCL_ASSERT(::cuda::in_range(__idx, 0, __simd_size), "Index is out of bounds"); + __data = __v; + } +}; + +template +struct __mask_storage<_Tp, simd_abi::__fixed_size<_Np>> + : __simd_storage<::cuda::std::__make_nbit_uint_t<::cuda::std::__num_bits_v<_Tp>>, simd_abi::__fixed_size<_Np>> +{}; + +// ********************************************************************************************************************* +// * SIMD Arithmetic Operations +// ********************************************************************************************************************* + +template +struct __simd_operations<_Tp, simd_abi::__fixed_size<_Np>> +{ + using _SimdStorage _CCCL_NODEBUG = __simd_storage<_Tp, simd_abi::__fixed_size<_Np>>; + using _MaskStorage _CCCL_NODEBUG = __mask_storage<_Tp, simd_abi::__fixed_size<_Np>>; + + [[nodiscard]] _CCCL_API static 
constexpr _SimdStorage __broadcast(_Tp __v) noexcept + { + _SimdStorage __result; + _CCCL_PRAGMA_NOUNROLL() + for (int __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = __v; + } + return __result; + } + + template + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __generate_init(_Generator&& __g, ::cuda::std::index_sequence<_Is...>) + { + return _SimdStorage{{__g(std::integral_constant<::cuda::std::size_t, _Is>())...}}; + } + + template + [[nodiscard]] _CCCL_API static constexpr _SimdStorage __generate(_Generator&& __g) noexcept + { + return __generate_init(::cuda::std::forward<_Generator>(__g), ::cuda::std::make_index_sequence<_Np>()); + } + + template + _CCCL_API static constexpr void __load(_SimdStorage& __s, const _Up* __mem) noexcept + { + _CCCL_PRAGMA_NOUNROLL() + for (int __i = 0; __i < _Np; __i++) + { + __s.__data[__i] = static_cast<_Tp>(__mem[__i]); + } + } + + template + _CCCL_API static constexpr void __store(const _SimdStorage& __s, _Up* __mem) noexcept + { + _CCCL_PRAGMA_NOUNROLL() + for (int __i = 0; __i < _Np; __i++) + { + __mem[__i] = static_cast<_Up>(__s.__data[__i]); + } + } + + _CCCL_API static constexpr void __increment(_SimdStorage& __s) noexcept + { + _CCCL_PRAGMA_NOUNROLL() + for (int __i = 0; __i < _Np; __i++) + { + __s.__data[__i] += 1; + } + } + + _CCCL_API static constexpr void __decrement(_SimdStorage& __s) noexcept + { + _CCCL_PRAGMA_NOUNROLL() + for (int __i = 0; __i < _Np; __i++) + { + __s.__data[__i] -= 1; + } + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage __negate(const _SimdStorage& __s) noexcept + { + return {!__s.__data}; + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage __bitwise_not(const _SimdStorage& __s) noexcept + { + return {~__s.__data}; + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage __unary_minus(const _SimdStorage& __s) noexcept + { + _SimdStorage __result; + _CCCL_PRAGMA_NOUNROLL() + for (int __i = 0; __i < _Np; __i++) + { + __result.__data[__i] = 
-__s.__data[__i]; + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __plus(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _SimdStorage __result; + _CCCL_PRAGMA_NOUNROLL() + for (int __i = 0; __i < _Np; __i++) + { + __result.__data[__i] = __lhs.__data[__i] + __rhs.__data[__i]; + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __minus(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _SimdStorage __result; + _CCCL_PRAGMA_NOUNROLL() + for (int __i = 0; __i < _Np; __i++) + { + __result.__data[__i] = __lhs.__data[__i] - __rhs.__data[__i]; + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __multiplies(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _SimdStorage __result; + _CCCL_PRAGMA_NOUNROLL() + for (int __i = 0; __i < _Np; __i++) + { + __result.__data[__i] = __lhs.__data[__i] * __rhs.__data[__i]; + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __divides(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _SimdStorage __result; + _CCCL_PRAGMA_NOUNROLL() + for (int __i = 0; __i < _Np; __i++) + { + __result.__data[__i] = __lhs.__data[__i] / __rhs.__data[__i]; + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __equal_to(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _SimdStorage __result; + _CCCL_PRAGMA_NOUNROLL() + for (int __i = 0; __i < _Np; __i++) + { + __result.__data[__i] = __lhs.__data[__i] == __rhs.__data[__i]; + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __not_equal_to(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _SimdStorage __result; + _CCCL_PRAGMA_NOUNROLL() + for (int __i = 0; __i < _Np; __i++) + { + __result.__data[__i] = __lhs.__data[__i] != __rhs.__data[__i]; + } + return __result; + } + + [[nodiscard]] 
_CCCL_API static constexpr _SimdStorage + __less(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _SimdStorage __result; + _CCCL_PRAGMA_NOUNROLL() + for (int __i = 0; __i < _Np; __i++) + { + __result.__data[__i] = __lhs.__data[__i] < __rhs.__data[__i]; + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __less_equal(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _SimdStorage __result; + _CCCL_PRAGMA_NOUNROLL() + for (int __i = 0; __i < _Np; __i++) + { + __result.__data[__i] = __lhs.__data[__i] <= __rhs.__data[__i]; + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __greater(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _SimdStorage __result; + _CCCL_PRAGMA_NOUNROLL() + for (int __i = 0; __i < _Np; __i++) + { + __result.__data[__i] = __lhs.__data[__i] > __rhs.__data[__i]; + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __greater_equal(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _SimdStorage __result; + _CCCL_PRAGMA_NOUNROLL() + for (int __i = 0; __i < _Np; __i++) + { + __result.__data[__i] = __lhs.__data[__i] >= __rhs.__data[__i]; + } + return __result; + } +}; + +// ********************************************************************************************************************* +// * SIMD Mask Operations +// ********************************************************************************************************************* + +template +[[nodiscard]] _CCCL_API constexpr auto __set_all_bits(bool __v) noexcept +{ + static_assert(::cuda::std::__cccl_is_unsigned_integer_v<_Tp>, "set_all_bits() requires unsigned integer types"); + using _Up = ::cuda::std::__make_nbit_uint_t<::cuda::std::__num_bits_v<_Tp>>; + return __v ? 
(::cuda::std::numeric_limits<_Up>::max()) : 0; +} + +template +struct __mask_operations<_Tp, simd_abi::__fixed_size<_Np>> +{ + using _MaskStorage = __mask_storage<_Tp, simd_abi::__fixed_size<_Np>>; + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage __broadcast(bool __v) noexcept + { + _MaskStorage __result; + const auto __all_bits_v = ::cuda::experimental::datapar::__set_all_bits<_Tp>(__v); + _CCCL_PRAGMA_NOUNROLL() + for (int __i = 0; __i < _Np; ++__i) + { + __result.__set(__i, __all_bits_v); + } + return __result; + } + + _CCCL_API static constexpr void __load(_MaskStorage& __s, const bool* __mem) noexcept + { + _CCCL_PRAGMA_NOUNROLL() + for (int __i = 0; __i < _Np; __i++) + { + __s.__data[__i] = ::cuda::experimental::datapar::__set_all_bits<_Tp>(__mem[__i]); + } + } + + _CCCL_API static constexpr void __store(const _MaskStorage& __s, bool* __mem) noexcept + { + _CCCL_PRAGMA_NOUNROLL() + for (int __i = 0; __i < _Np; __i++) + { + __mem[__i] = static_cast(__s.__data[__i]); + } + } +}; +} // namespace cuda::experimental::datapar + +#include + +#endif // _CUDAX___SIMD_FIXED_SIZE_IMPL_H diff --git a/cudax/include/cuda/experimental/__simd/reference.h b/cudax/include/cuda/experimental/__simd/reference.h new file mode 100644 index 00000000000..2cdbf213777 --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/reference.h @@ -0,0 +1,239 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD_EXPERIMENTAL___SIMD_REFERENCE_H +#define _CUDA_STD_EXPERIMENTAL___SIMD_REFERENCE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#if _LIBCUDACXX_EXPERIMENTAL_SIMD_ENABLED + +# include +# include +# include +# include +# include +# include +# include +# include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD + +namespace experimental +{ +inline namespace parallelism_v2 +{ + +template +class __simd_reference +{ + template + friend class simd; + template + friend class simd_mask; + + _Storage& __s_; + size_t __idx_; + + _CCCL_HIDE_FROM_ABI __simd_reference(_Storage& __s, size_t __idx) + : __s_(__s) + , __idx_(__idx) + {} + + _CCCL_HIDE_FROM_ABI _Vp __get() const noexcept + { + return __s_.__get(__idx_); + } + + _CCCL_HIDE_FROM_ABI void __set(_Vp __v) + { + if constexpr (is_same_v<_Vp, bool>) + { + __s_.__set(__idx_, experimental::__set_all_bits<_Tp>(__v)); + } + else + { + __s_.__set(__idx_, __v); + } + } + +public: + using value_type = _Vp; + + __simd_reference() = delete; + __simd_reference(const __simd_reference&) = delete; + + _CCCL_HIDE_FROM_ABI operator value_type() const noexcept + { + return __get(); + } + + template , int> = 0> + _CCCL_HIDE_FROM_ABI __simd_reference operator=(_Up&& __v) && noexcept + { + __set(static_cast(::cuda::std::forward<_Up>(__v))); + return {__s_, __idx_}; + } + + template + friend _CCCL_HIDE_FROM_ABI void swap( + __simd_reference<_Tp1, _Storage1, _Vp1>&& __a, + __simd_reference<_Tp1, _Storage1, _Vp1>&& __b) noexcept; + + template + friend _CCCL_HIDE_FROM_ABI void swap(_Vp1& __a, __simd_reference<_Tp1, _Storage1, _Vp1>&& __b) noexcept; + + template + friend _CCCL_HIDE_FROM_ABI void 
swap(__simd_reference<_Tp1, _Storage1, _Vp1>&& __a, _Vp1& __b) noexcept; + + template () += ::cuda::std::declval<_Up>())> + _CCCL_HIDE_FROM_ABI __simd_reference operator+=(_Up&& __v) && noexcept + { + __set(__get() + static_cast(::cuda::std::forward<_Up>(__v))); + return {__s_, __idx_}; + } + + template () -= ::cuda::std::declval<_Up>())> + _CCCL_HIDE_FROM_ABI __simd_reference operator-=(_Up&& __v) && noexcept + { + __set(__get() - static_cast(::cuda::std::forward<_Up>(__v))); + return {__s_, __idx_}; + } + + template () *= ::cuda::std::declval<_Up>())> + _CCCL_HIDE_FROM_ABI __simd_reference operator*=(_Up&& __v) && noexcept + { + __set(__get() * static_cast(::cuda::std::forward<_Up>(__v))); + return {__s_, __idx_}; + } + + template () /= ::cuda::std::declval<_Up>())> + _CCCL_HIDE_FROM_ABI __simd_reference operator/=(_Up&& __v) && noexcept + { + __set(__get() / static_cast(::cuda::std::forward<_Up>(__v))); + return {__s_, __idx_}; + } + + template () %= ::cuda::std::declval<_Up>())> + _CCCL_HIDE_FROM_ABI __simd_reference operator%=(_Up&& __v) && noexcept + { + __set(__get() % static_cast(::cuda::std::forward<_Up>(__v))); + return {__s_, __idx_}; + } + + template () &= ::cuda::std::declval<_Up>())> + _CCCL_HIDE_FROM_ABI __simd_reference operator&=(_Up&& __v) && noexcept + { + __set(__get() & static_cast(::cuda::std::forward<_Up>(__v))); + return {__s_, __idx_}; + } + + template () |= ::cuda::std::declval<_Up>())> + _CCCL_HIDE_FROM_ABI __simd_reference operator|=(_Up&& __v) && noexcept + { + __set(__get() | static_cast(::cuda::std::forward<_Up>(__v))); + return {__s_, __idx_}; + } + + template () ^= ::cuda::std::declval<_Up>())> + _CCCL_HIDE_FROM_ABI __simd_reference operator^=(_Up&& __v) && noexcept + { + __set(__get() ^ static_cast(::cuda::std::forward<_Up>(__v))); + return {__s_, __idx_}; + } + + template () <<= ::cuda::std::declval<_Up>())> + _CCCL_HIDE_FROM_ABI __simd_reference operator<<=(_Up&& __v) && noexcept + { + __set(__get() << 
static_cast(::cuda::std::forward<_Up>(__v))); + return {__s_, __idx_}; + } + + template () >>= ::cuda::std::declval<_Up>())> + _CCCL_HIDE_FROM_ABI __simd_reference operator>>=(_Up&& __v) && noexcept + { + __set(__get() >> static_cast(::cuda::std::forward<_Up>(__v))); + return {__s_, __idx_}; + } + + _CCCL_HIDE_FROM_ABI __simd_reference operator++() && noexcept + { + __set(__get() + 1); + return {__s_, __idx_}; + } + + _CCCL_HIDE_FROM_ABI value_type operator++(int) && noexcept + { + auto __r = __get(); + __set(__get() + 1); + return __r; + } + + _CCCL_HIDE_FROM_ABI __simd_reference operator--() && noexcept + { + __set(__get() - 1); + return {__s_, __idx_}; + } + + _CCCL_HIDE_FROM_ABI value_type operator--(int) && noexcept + { + auto __r = __get(); + __set(__get() - 1); + return __r; + } +}; + +template +_CCCL_HIDE_FROM_ABI void +swap(__simd_reference<_Tp, _Storage, _Vp>&& __a, __simd_reference<_Tp, _Storage, _Vp>&& __b) noexcept +{ + _Vp __tmp(::cuda::std::move(__a)); + ::cuda::std::move(__a) = ::cuda::std::move(__b); + ::cuda::std::move(__b) = ::cuda::std::move(__tmp); +} + +template +_CCCL_HIDE_FROM_ABI void swap(_Vp& __a, __simd_reference<_Tp, _Storage, _Vp>&& __b) noexcept +{ + _Vp __tmp(::cuda::std::move(__a)); + __a = ::cuda::std::move(__b); + ::cuda::std::move(__b) = ::cuda::std::move(__tmp); +} + +template +_CCCL_HIDE_FROM_ABI void swap(__simd_reference<_Tp, _Storage, _Vp>&& __a, _Vp& __b) noexcept +{ + _Vp __tmp(::cuda::std::move(__a)); + ::cuda::std::move(__a) = ::cuda::std::move(__b); + __b = ::cuda::std::move(__tmp); +} + +} // namespace parallelism_v2 +} // namespace experimental + +_CCCL_END_NAMESPACE_CUDA_STD + +# include + +#endif // _LIBCUDACXX_EXPERIMENTAL_SIMD_ENABLED + +#endif // _CUDA_STD_EXPERIMENTAL___SIMD_REFERENCE_H + diff --git a/cudax/include/cuda/experimental/__simd/simd.h b/cudax/include/cuda/experimental/__simd/simd.h new file mode 100644 index 00000000000..3ef90bacc24 --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/simd.h @@ 
-0,0 +1,248 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_SIMD_H +#define _CUDAX___SIMD_SIMD_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +namespace cuda::experimental::datapar +{ +template +class simd : public __simd_operations<_Tp, simd_abi::fixed_size<_Np>> +{ + using _Impl = __simd_operations<_Tp, simd_abi::fixed_size<_Np>>; + using _Storage = typename _Impl::_SimdStorage; + + _Storage __s_; + +public: + using value_type = _Tp; + using reference = __simd_reference<_Tp, _Storage, value_type>; + using mask_type = simd_mask<_Tp, _Abi>; + using abi_type = _Abi; + + _CCCL_API static constexpr ::cuda::std::size_t size() noexcept + { + return simd_size_v; + } + + _CCCL_API simd() noexcept = default; + + struct __storage_tag_t + {}; + static constexpr __storage_tag_t __storage_tag{}; + + _CCCL_API explicit operator _Storage() const + { + return __s_; + } + + _CCCL_API explicit simd(const _Storage& __s, __storage_tag_t) + : __s_(__s) + {} + + template >, int> = 0> + _CCCL_API simd(_Up&& __v) noexcept + : __s_(_Impl::__broadcast(static_cast(::cuda::std::forward<_Up>(__v)))) + {} + + template && is_same_v> + && __is_non_narrowing_convertible_v<_Up, value_type>, + int> = 0> + _CCCL_API simd(const simd<_Up, 
simd_abi::fixed_size>& __v) noexcept + { + for (::cuda::std::size_t __i = 0; __i < size(); __i++) + { + (*this)[__i] = static_cast(__v[__i]); + } + } + + template , int> = 0> + _CCCL_API explicit simd(_Generator&& __g) noexcept + : __s_(_Impl::__generate(::cuda::std::forward<_Generator>(__g))) + {} + + template && is_simd_flag_type_v<_Flags>, int> = 0> + _CCCL_API simd(const _Up* __mem, _Flags) + { + _Impl::__load(__s_, _Flags::template __apply(__mem)); + } + + template && is_simd_flag_type_v<_Flags>, int> = 0> + _CCCL_API void copy_from(const _Up* __mem, _Flags) + { + _Impl::__load(__s_, _Flags::template __apply(__mem)); + } + + template && is_simd_flag_type_v<_Flags>, int> = 0> + _CCCL_API void copy_to(_Up* __mem, _Flags) const + { + _Impl::__store(__s_, _Flags::template __apply(__mem)); + } + + _CCCL_API reference operator[](::cuda::std::size_t __i) noexcept + { + return reference(__s_, __i); + } + + _CCCL_API value_type operator[](::cuda::std::size_t __i) const noexcept + { + return __s_.__get(__i); + } + + _CCCL_API simd& operator++() noexcept + { + _Impl::__increment(__s_); + return *this; + } + + _CCCL_API simd operator++(int) noexcept + { + const simd __r = *this; + _Impl::__increment(__s_); + return __r; + } + + _CCCL_API simd& operator--() noexcept + { + _Impl::__decrement(__s_); + return *this; + } + + _CCCL_API simd operator--(int) noexcept + { + const simd __r = *this; + _Impl::__decrement(__s_); + return __r; + } + + [[nodiscard]] _CCCL_API simd operator+() const noexcept + { + return *this; + } + + [[nodiscard]] _CCCL_API simd operator-() const noexcept + { + return {_Impl::__unary_minus(__s_), __storage_tag}; + } + + _CCCL_API constexpr friend simd& operator+=(simd& __lhs, const simd& __rhs) + { + return __lhs = {__lhs + __rhs, __storage_tag}; + } + + _CCCL_API constexpr friend simd& operator-=(simd& __lhs, const simd& __rhs) + { + return __lhs = {__lhs - __rhs, __storage_tag}; + } + + _CCCL_API constexpr friend simd& operator*=(simd& __lhs, const 
simd& __rhs) + { + return __lhs = {__lhs * __rhs, __storage_tag}; + } + + _CCCL_API constexpr friend simd& operator/=(simd& __lhs, const simd& __rhs) + { + return __lhs = {__lhs / __rhs, __storage_tag}; + } + + [[nodiscard]] _CCCL_API constexpr friend simd operator+(const simd& __lhs, const simd& __rhs) + { + return {_Impl::__plus(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + [[nodiscard]] _CCCL_API constexpr friend simd operator-(const simd& __lhs, const simd& __rhs) + { + return {_Impl::__minus(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + [[nodiscard]] _CCCL_API constexpr friend simd operator*(const simd& __lhs, const simd& __rhs) + { + return {_Impl::__multiplies(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + [[nodiscard]] _CCCL_API constexpr friend simd operator/(const simd& __lhs, const simd& __rhs) + { + return {_Impl::__divides(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + [[nodiscard]] _CCCL_API constexpr friend mask_type operator==(const simd& __lhs, const simd& __rhs) + { + return {_Impl::__equal_to(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + [[nodiscard]] _CCCL_API constexpr friend mask_type operator!=(const simd& __lhs, const simd& __rhs) + { + return {_Impl::__not_equal_to(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + [[nodiscard]] _CCCL_API constexpr friend mask_type operator<(const simd& __lhs, const simd& __rhs) + { + return {_Impl::__less(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + [[nodiscard]] _CCCL_API constexpr friend mask_type operator<=(const simd& __lhs, const simd& __rhs) + { + return {_Impl::__less_equal(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + [[nodiscard]] _CCCL_API constexpr friend mask_type operator>(const simd& __lhs, const simd& __rhs) + { + return {_Impl::__greater(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + [[nodiscard]] _CCCL_API constexpr friend mask_type operator>=(const simd& __lhs, const simd& __rhs) + { + return {_Impl::__greater_equal(__lhs.__s_, __rhs.__s_), __storage_tag}; + } +}; + 
+template +inline constexpr bool is_simd_v> = true; + +template +using native_simd = simd<_Tp, simd_abi::native<_Tp>>; + +template +using fixed_size_simd = simd<_Tp, simd_abi::fixed_size<_Np>>; +} // namespace cuda::experimental::datapar + +#include + +#endif // _CUDAX___SIMD_SIMD_H diff --git a/cudax/include/cuda/experimental/__simd/traits.h b/cudax/include/cuda/experimental/__simd/traits.h new file mode 100644 index 00000000000..c36149ff369 --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/traits.h @@ -0,0 +1,85 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD_EXPERIMENTAL___SIMD_TRAITS_H +#define _CUDA_STD_EXPERIMENTAL___SIMD_TRAITS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#if _LIBCUDACXX_EXPERIMENTAL_SIMD_ENABLED + +# include +# include +# include +# include +# include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD + +namespace experimental +{ +inline namespace parallelism_v2 +{ + +template +inline constexpr bool is_abi_tag_v = false; + +template +struct is_abi_tag : bool_constant> +{}; + +template +inline constexpr bool is_simd_v = false; + +template +struct is_simd : bool_constant> +{}; + +template +inline constexpr bool is_simd_flag_type_v = false; + +template +struct is_simd_flag_type : bool_constant> +{}; + +template , bool = (__is_vectorizable_v<_Tp> && 
is_abi_tag_v<_Abi>)> +struct simd_size : integral_constant +{}; + +template +struct simd_size<_Tp, _Abi, false> +{}; + +template > +inline constexpr size_t simd_size_v = simd_size<_Tp, _Abi>::value; + +} // namespace parallelism_v2 +} // namespace experimental + +_CCCL_END_NAMESPACE_CUDA_STD + +# include + +#endif // _LIBCUDACXX_EXPERIMENTAL_SIMD_ENABLED + +#endif // _CUDA_STD_EXPERIMENTAL___SIMD_TRAITS_H + diff --git a/cudax/include/cuda/experimental/__simd/utility.h b/cudax/include/cuda/experimental/__simd/utility.h new file mode 100644 index 00000000000..d11ea878a5b --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/utility.h @@ -0,0 +1,100 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD_EXPERIMENTAL___SIMD_UTILITY_H +#define _CUDA_STD_EXPERIMENTAL___SIMD_UTILITY_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#if _LIBCUDACXX_EXPERIMENTAL_SIMD_ENABLED + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD + +namespace experimental +{ +inline namespace parallelism_v2 +{ + +template +inline constexpr bool __is_vectorizable_v = + is_arithmetic_v<_Tp> && !is_const_v<_Tp> && !is_volatile_v<_Tp> && !is_same_v<_Tp, bool>; + +template +inline constexpr bool __is_non_narrowing_convertible_v = false; + +template +inline constexpr bool + __is_non_narrowing_convertible_v<_From, _To, ::cuda::std::void_t()})>> = true; + +template +inline constexpr bool __can_broadcast_v = + (__is_vectorizable_v<_Up> && __is_non_narrowing_convertible_v<_Up, _Tp>) || + (!__is_vectorizable_v<_Up> && is_convertible_v<_Up, _Tp>) || is_same_v<_Up, int> || + (is_same_v<_Up, unsigned int> && is_unsigned_v<_Tp>); + +template +inline constexpr bool __is_well_formed = false; + +template +inline constexpr bool + __is_well_formed<_Tp, + _Generator, + _Idx, + ::cuda::std::void_t()( + ::cuda::std::integral_constant())))> = + __can_broadcast_v<_Tp, + decltype(::cuda::std::declval<_Generator>()(::cuda::std::integral_constant()))>; + +template +_CCCL_HIDE_FROM_ABI constexpr bool __can_generate(::cuda::std::index_sequence<_Idxes...>) +{ + return (true && ... 
&& __is_well_formed<_Tp, _Generator, _Idxes>); +} + +template +inline constexpr bool __can_generate_v = + experimental::__can_generate<_Tp, _Generator>(::cuda::std::make_index_sequence<_Size>()); + +} // namespace parallelism_v2 +} // namespace experimental + +_CCCL_END_NAMESPACE_CUDA_STD + +# include + +#endif // _LIBCUDACXX_EXPERIMENTAL_SIMD_ENABLED + +#endif // _CUDA_STD_EXPERIMENTAL___SIMD_UTILITY_H + From 9e46a30de433043e3e1e33d97cc8388ac8c706a9 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 20 Nov 2025 12:39:58 -0800 Subject: [PATCH 02/32] fixes --- .../cuda/experimental/__simd/declaration.h | 18 ++- .../experimental/__simd/fixed_size_impl.h | 20 +-- .../cuda/experimental/__simd/reference.h | 153 ++++++++---------- cudax/include/cuda/experimental/__simd/simd.h | 48 +++--- .../include/cuda/experimental/__simd/traits.h | 72 ++++----- .../cuda/experimental/__simd/utility.h | 113 +++++++------ 6 files changed, 193 insertions(+), 231 deletions(-) diff --git a/cudax/include/cuda/experimental/__simd/declaration.h b/cudax/include/cuda/experimental/__simd/declaration.h index c19d9f1dcb0..da2ab0df37f 100644 --- a/cudax/include/cuda/experimental/__simd/declaration.h +++ b/cudax/include/cuda/experimental/__simd/declaration.h @@ -1,14 +1,15 @@ //===----------------------------------------------------------------------===// // -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
// //===----------------------------------------------------------------------===// -#ifndef _CUDA_EXPERIMENTAL___SIMD_DECLARATION_H -#define _CUDA_EXPERIMENTAL___SIMD_DECLARATION_H +#ifndef _CUDAX___SIMD_DECLARATION_H +#define _CUDAX___SIMD_DECLARATION_H #include @@ -20,9 +21,6 @@ # pragma system_header #endif // no system header -#include -#include - #include namespace cuda::experimental::datapar @@ -36,6 +34,12 @@ struct __fixed_size; template using fixed_size = __fixed_size<_Np>; + +template +using compatible = fixed_size<1>; + +template +using native = fixed_size<1>; } // namespace simd_abi template @@ -65,4 +69,4 @@ class basic_simd_mask; #include -#endif // _CUDA_EXPERIMENTAL___SIMD_DECLARATION_H +#endif // _CUDAX___SIMD_DECLARATION_H diff --git a/cudax/include/cuda/experimental/__simd/fixed_size_impl.h b/cudax/include/cuda/experimental/__simd/fixed_size_impl.h index 1615f9e792d..58f070ea780 100644 --- a/cudax/include/cuda/experimental/__simd/fixed_size_impl.h +++ b/cudax/include/cuda/experimental/__simd/fixed_size_impl.h @@ -1,19 +1,18 @@ //===----------------------------------------------------------------------===// // -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
// //===----------------------------------------------------------------------===// -#ifndef _CUDAX___SIMD_SCALAR_H -#define _CUDAX___SIMD_SCALAR_H +#ifndef _CUDAX___SIMD_FIXED_SIZE_IMPL_H +#define _CUDAX___SIMD_FIXED_SIZE_IMPL_H #include -#include - #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) @@ -29,10 +28,9 @@ #include #include #include -#include #include -#include +#include #include @@ -288,14 +286,6 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size<_Np>> // * SIMD Mask Operations // ********************************************************************************************************************* -template -[[nodiscard]] _CCCL_API constexpr auto __set_all_bits(bool __v) noexcept -{ - static_assert(::cuda::std::__cccl_is_unsigned_integer_v<_Tp>, "set_all_bits() requires unsigned integer types"); - using _Up = ::cuda::std::__make_nbit_uint_t<::cuda::std::__num_bits_v<_Tp>>; - return __v ? (::cuda::std::numeric_limits<_Up>::max()) : 0; -} - template struct __mask_operations<_Tp, simd_abi::__fixed_size<_Np>> { diff --git a/cudax/include/cuda/experimental/__simd/reference.h b/cudax/include/cuda/experimental/__simd/reference.h index 2cdbf213777..f1b02b34530 100644 --- a/cudax/include/cuda/experimental/__simd/reference.h +++ b/cudax/include/cuda/experimental/__simd/reference.h @@ -1,15 +1,15 @@ -// -*- C++ -*- //===----------------------------------------------------------------------===// // -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
// //===----------------------------------------------------------------------===// -#ifndef _CUDA_STD_EXPERIMENTAL___SIMD_REFERENCE_H -#define _CUDA_STD_EXPERIMENTAL___SIMD_REFERENCE_H +#ifndef _CUDAX___SIMD_REFERENCE_H +#define _CUDAX___SIMD_REFERENCE_H #include @@ -21,53 +21,47 @@ # pragma system_header #endif // no system header -#include -#include +#include +#include +#include +#include +#include +#include +#include -#if _LIBCUDACXX_EXPERIMENTAL_SIMD_ENABLED +#include -# include -# include -# include -# include -# include -# include -# include -# include +#include -_CCCL_BEGIN_NAMESPACE_CUDA_STD - -namespace experimental -{ -inline namespace parallelism_v2 +namespace cuda::experimental::datapar { - -template +template class __simd_reference { - template + template friend class simd; - template + + template friend class simd_mask; _Storage& __s_; - size_t __idx_; + ::cuda::std::size_t __idx_; - _CCCL_HIDE_FROM_ABI __simd_reference(_Storage& __s, size_t __idx) - : __s_(__s) - , __idx_(__idx) + _CCCL_API __simd_reference(_Storage& __s, ::cuda::std::size_t __idx) + : __s_{__s} + , __idx_{__idx} {} - _CCCL_HIDE_FROM_ABI _Vp __get() const noexcept + [[nodiscard]] _CCCL_API constexpr _Vp __get() const noexcept { return __s_.__get(__idx_); } - _CCCL_HIDE_FROM_ABI void __set(_Vp __v) + _CCCL_API constexpr void __set(_Vp __v) noexcept { - if constexpr (is_same_v<_Vp, bool>) + if constexpr (::cuda::std::is_same_v<_Vp, bool>) { - __s_.__set(__idx_, experimental::__set_all_bits<_Tp>(__v)); + __s_.__set(__idx_, ::cuda::experimental::datapar::__set_all_bits<_Tp>(__v)); } else { @@ -81,119 +75,118 @@ class __simd_reference __simd_reference() = delete; __simd_reference(const __simd_reference&) = delete; - _CCCL_HIDE_FROM_ABI operator value_type() const noexcept + _CCCL_API constexpr operator value_type() const noexcept { return __get(); } - template , int> = 0> - _CCCL_HIDE_FROM_ABI __simd_reference operator=(_Up&& __v) && noexcept + template , int> = 0> + _CCCL_API 
__simd_reference operator=(_Up&& __v) && noexcept { __set(static_cast(::cuda::std::forward<_Up>(__v))); return {__s_, __idx_}; } - template - friend _CCCL_HIDE_FROM_ABI void swap( - __simd_reference<_Tp1, _Storage1, _Vp1>&& __a, - __simd_reference<_Tp1, _Storage1, _Vp1>&& __b) noexcept; + template + friend _CCCL_API void swap(__simd_reference<_Tp1, _Storage1, _Vp1>&& __a, + __simd_reference<_Tp1, _Storage1, _Vp1>&& __b) noexcept; - template - friend _CCCL_HIDE_FROM_ABI void swap(_Vp1& __a, __simd_reference<_Tp1, _Storage1, _Vp1>&& __b) noexcept; + template + friend _CCCL_API void swap(_Vp1& __a, __simd_reference<_Tp1, _Storage1, _Vp1>&& __b) noexcept; - template - friend _CCCL_HIDE_FROM_ABI void swap(__simd_reference<_Tp1, _Storage1, _Vp1>&& __a, _Vp1& __b) noexcept; + template + friend _CCCL_API void swap(__simd_reference<_Tp1, _Storage1, _Vp1>&& __a, _Vp1& __b) noexcept; - template () += ::cuda::std::declval<_Up>())> - _CCCL_HIDE_FROM_ABI __simd_reference operator+=(_Up&& __v) && noexcept + template () += ::cuda::std::declval<_Up>())> + _CCCL_API __simd_reference operator+=(_Up&& __v) && noexcept { __set(__get() + static_cast(::cuda::std::forward<_Up>(__v))); return {__s_, __idx_}; } - template () -= ::cuda::std::declval<_Up>())> - _CCCL_HIDE_FROM_ABI __simd_reference operator-=(_Up&& __v) && noexcept + template () -= ::cuda::std::declval<_Up>())> + _CCCL_API __simd_reference operator-=(_Up&& __v) && noexcept { __set(__get() - static_cast(::cuda::std::forward<_Up>(__v))); return {__s_, __idx_}; } - template () *= ::cuda::std::declval<_Up>())> - _CCCL_HIDE_FROM_ABI __simd_reference operator*=(_Up&& __v) && noexcept + template () *= ::cuda::std::declval<_Up>())> + _CCCL_API __simd_reference operator*=(_Up&& __v) && noexcept { __set(__get() * static_cast(::cuda::std::forward<_Up>(__v))); return {__s_, __idx_}; } - template () /= ::cuda::std::declval<_Up>())> - _CCCL_HIDE_FROM_ABI __simd_reference operator/=(_Up&& __v) && noexcept + template () /= 
::cuda::std::declval<_Up>())> + _CCCL_API __simd_reference operator/=(_Up&& __v) && noexcept { __set(__get() / static_cast(::cuda::std::forward<_Up>(__v))); return {__s_, __idx_}; } - template () %= ::cuda::std::declval<_Up>())> - _CCCL_HIDE_FROM_ABI __simd_reference operator%=(_Up&& __v) && noexcept + template () %= ::cuda::std::declval<_Up>())> + _CCCL_API __simd_reference operator%=(_Up&& __v) && noexcept { __set(__get() % static_cast(::cuda::std::forward<_Up>(__v))); return {__s_, __idx_}; } - template () &= ::cuda::std::declval<_Up>())> - _CCCL_HIDE_FROM_ABI __simd_reference operator&=(_Up&& __v) && noexcept + template () &= ::cuda::std::declval<_Up>())> + _CCCL_API __simd_reference operator&=(_Up&& __v) && noexcept { __set(__get() & static_cast(::cuda::std::forward<_Up>(__v))); return {__s_, __idx_}; } - template () |= ::cuda::std::declval<_Up>())> - _CCCL_HIDE_FROM_ABI __simd_reference operator|=(_Up&& __v) && noexcept + template () |= ::cuda::std::declval<_Up>())> + _CCCL_API __simd_reference operator|=(_Up&& __v) && noexcept { __set(__get() | static_cast(::cuda::std::forward<_Up>(__v))); return {__s_, __idx_}; } - template () ^= ::cuda::std::declval<_Up>())> - _CCCL_HIDE_FROM_ABI __simd_reference operator^=(_Up&& __v) && noexcept + template () ^= ::cuda::std::declval<_Up>())> + _CCCL_API __simd_reference operator^=(_Up&& __v) && noexcept { __set(__get() ^ static_cast(::cuda::std::forward<_Up>(__v))); return {__s_, __idx_}; } - template () <<= ::cuda::std::declval<_Up>())> - _CCCL_HIDE_FROM_ABI __simd_reference operator<<=(_Up&& __v) && noexcept + template () <<= ::cuda::std::declval<_Up>())> + _CCCL_API __simd_reference operator<<=(_Up&& __v) && noexcept { __set(__get() << static_cast(::cuda::std::forward<_Up>(__v))); return {__s_, __idx_}; } - template () >>= ::cuda::std::declval<_Up>())> - _CCCL_HIDE_FROM_ABI __simd_reference operator>>=(_Up&& __v) && noexcept + template () >>= ::cuda::std::declval<_Up>())> + _CCCL_API __simd_reference operator>>=(_Up&& 
__v) && noexcept { __set(__get() >> static_cast(::cuda::std::forward<_Up>(__v))); return {__s_, __idx_}; } - _CCCL_HIDE_FROM_ABI __simd_reference operator++() && noexcept + _CCCL_API constexpr __simd_reference operator++() && noexcept { __set(__get() + 1); return {__s_, __idx_}; } - _CCCL_HIDE_FROM_ABI value_type operator++(int) && noexcept + _CCCL_API constexpr value_type operator++(int) && noexcept { auto __r = __get(); __set(__get() + 1); return __r; } - _CCCL_HIDE_FROM_ABI __simd_reference operator--() && noexcept + _CCCL_API constexpr __simd_reference operator--() && noexcept { __set(__get() - 1); return {__s_, __idx_}; } - _CCCL_HIDE_FROM_ABI value_type operator--(int) && noexcept + _CCCL_API constexpr value_type operator--(int) && noexcept { auto __r = __get(); __set(__get() - 1); @@ -201,39 +194,31 @@ class __simd_reference } }; -template -_CCCL_HIDE_FROM_ABI void -swap(__simd_reference<_Tp, _Storage, _Vp>&& __a, __simd_reference<_Tp, _Storage, _Vp>&& __b) noexcept +template +_CCCL_API void swap(__simd_reference<_Tp, _Storage, _Vp>&& __a, __simd_reference<_Tp, _Storage, _Vp>&& __b) noexcept { _Vp __tmp(::cuda::std::move(__a)); ::cuda::std::move(__a) = ::cuda::std::move(__b); ::cuda::std::move(__b) = ::cuda::std::move(__tmp); } -template -_CCCL_HIDE_FROM_ABI void swap(_Vp& __a, __simd_reference<_Tp, _Storage, _Vp>&& __b) noexcept +template +_CCCL_API void swap(_Vp& __a, __simd_reference<_Tp, _Storage, _Vp>&& __b) noexcept { _Vp __tmp(::cuda::std::move(__a)); __a = ::cuda::std::move(__b); ::cuda::std::move(__b) = ::cuda::std::move(__tmp); } -template -_CCCL_HIDE_FROM_ABI void swap(__simd_reference<_Tp, _Storage, _Vp>&& __a, _Vp& __b) noexcept +template +_CCCL_API void swap(__simd_reference<_Tp, _Storage, _Vp>&& __a, _Vp& __b) noexcept { _Vp __tmp(::cuda::std::move(__a)); ::cuda::std::move(__a) = ::cuda::std::move(__b); __b = ::cuda::std::move(__tmp); } +} // namespace cuda::experimental::datapar -} // namespace parallelism_v2 -} // namespace experimental - 
-_CCCL_END_NAMESPACE_CUDA_STD - -# include - -#endif // _LIBCUDACXX_EXPERIMENTAL_SIMD_ENABLED - -#endif // _CUDA_STD_EXPERIMENTAL___SIMD_REFERENCE_H +#include +#endif // _CUDAX___SIMD_REFERENCE_H diff --git a/cudax/include/cuda/experimental/__simd/simd.h b/cudax/include/cuda/experimental/__simd/simd.h index 3ef90bacc24..b1fd65d249a 100644 --- a/cudax/include/cuda/experimental/__simd/simd.h +++ b/cudax/include/cuda/experimental/__simd/simd.h @@ -1,6 +1,7 @@ //===----------------------------------------------------------------------===// // // Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. @@ -20,9 +21,8 @@ # pragma system_header #endif // no system header -#include +#include #include -#include #include #include #include @@ -47,10 +47,10 @@ class simd : public __simd_operations<_Tp, simd_abi::fixed_size<_Np>> public: using value_type = _Tp; using reference = __simd_reference<_Tp, _Storage, value_type>; - using mask_type = simd_mask<_Tp, _Abi>; - using abi_type = _Abi; + using mask_type = simd_mask<_Tp, _Np>; + using abi_type = simd_abi::fixed_size<_Np>; - _CCCL_API static constexpr ::cuda::std::size_t size() noexcept + [[nodiscard]] _CCCL_API static constexpr ::cuda::std::size_t size() noexcept { return simd_size_v; } @@ -70,16 +70,18 @@ class simd : public __simd_operations<_Tp, simd_abi::fixed_size<_Np>> : __s_(__s) {} - template >, int> = 0> + template >, int> = 0> _CCCL_API simd(_Up&& __v) noexcept : __s_(_Impl::__broadcast(static_cast(::cuda::std::forward<_Up>(__v)))) {} - template && is_same_v> - && __is_non_narrowing_convertible_v<_Up, value_type>, - int> = 0> - _CCCL_API simd(const simd<_Up, simd_abi::fixed_size>& __v) noexcept + template && ::cuda::std::is_same_v> + && 
__is_non_narrowing_convertible_v<_Up, value_type>, + int> = 0> + _CCCL_API simd(const simd<_Up, size()>& __v) noexcept { for (::cuda::std::size_t __i = 0; __i < size(); __i++) { @@ -87,29 +89,29 @@ class simd : public __simd_operations<_Tp, simd_abi::fixed_size<_Np>> } } - template , int> = 0> + template , int> = 0> _CCCL_API explicit simd(_Generator&& __g) noexcept : __s_(_Impl::__generate(::cuda::std::forward<_Generator>(__g))) {} - template && is_simd_flag_type_v<_Flags>, int> = 0> _CCCL_API simd(const _Up* __mem, _Flags) { _Impl::__load(__s_, _Flags::template __apply(__mem)); } - template && is_simd_flag_type_v<_Flags>, int> = 0> _CCCL_API void copy_from(const _Up* __mem, _Flags) { _Impl::__load(__s_, _Flags::template __apply(__mem)); } - template && is_simd_flag_type_v<_Flags>, int> = 0> _CCCL_API void copy_to(_Up* __mem, _Flags) const { @@ -233,14 +235,14 @@ class simd : public __simd_operations<_Tp, simd_abi::fixed_size<_Np>> } }; -template -inline constexpr bool is_simd_v> = true; +template +inline constexpr bool is_simd_v> = true; -template +template using native_simd = simd<_Tp, simd_abi::native<_Tp>>; -template -using fixed_size_simd = simd<_Tp, simd_abi::fixed_size<_Np>>; +template +using fixed_size_simd = simd<_Tp, _Np>; } // namespace cuda::experimental::datapar #include diff --git a/cudax/include/cuda/experimental/__simd/traits.h b/cudax/include/cuda/experimental/__simd/traits.h index c36149ff369..60848c31ce4 100644 --- a/cudax/include/cuda/experimental/__simd/traits.h +++ b/cudax/include/cuda/experimental/__simd/traits.h @@ -1,15 +1,15 @@ -// -*- C++ -*- //===----------------------------------------------------------------------===// // -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// -#ifndef _CUDA_STD_EXPERIMENTAL___SIMD_TRAITS_H -#define _CUDA_STD_EXPERIMENTAL___SIMD_TRAITS_H +#ifndef _CUDAX___SIMD_TRAITS_H +#define _CUDAX___SIMD_TRAITS_H #include @@ -21,65 +21,51 @@ # pragma system_header #endif // no system header -#include -#include -#include +#include +#include -#if _LIBCUDACXX_EXPERIMENTAL_SIMD_ENABLED +#include +#include -# include -# include -# include -# include -# include +#include -_CCCL_BEGIN_NAMESPACE_CUDA_STD - -namespace experimental -{ -inline namespace parallelism_v2 +namespace cuda::experimental::datapar { - -template +template inline constexpr bool is_abi_tag_v = false; -template -struct is_abi_tag : bool_constant> +template +struct is_abi_tag : ::cuda::std::bool_constant> {}; -template +template inline constexpr bool is_simd_v = false; -template -struct is_simd : bool_constant> +template +struct is_simd : ::cuda::std::bool_constant> {}; -template +template inline constexpr bool is_simd_flag_type_v = false; -template -struct is_simd_flag_type : bool_constant> +template +struct is_simd_flag_type : ::cuda::std::bool_constant> {}; -template , bool = (__is_vectorizable_v<_Tp> && is_abi_tag_v<_Abi>)> -struct simd_size : integral_constant +template , + bool = (__is_vectorizable_v<_Tp> && is_abi_tag_v<_Abi>)> +struct simd_size : ::cuda::std::integral_constant<::cuda::std::size_t, _Abi::__simd_size> {}; -template +template struct simd_size<_Tp, _Abi, false> {}; -template > -inline constexpr size_t simd_size_v = simd_size<_Tp, _Abi>::value; - -} // namespace parallelism_v2 -} // namespace experimental - -_CCCL_END_NAMESPACE_CUDA_STD - -# include - -#endif // _LIBCUDACXX_EXPERIMENTAL_SIMD_ENABLED +template > +inline constexpr ::cuda::std::size_t simd_size_v = simd_size<_Tp, _Abi>::value; +} // namespace 
cuda::experimental::datapar -#endif // _CUDA_STD_EXPERIMENTAL___SIMD_TRAITS_H +#include +#endif // _CUDAX___SIMD_TRAITS_H diff --git a/cudax/include/cuda/experimental/__simd/utility.h b/cudax/include/cuda/experimental/__simd/utility.h index d11ea878a5b..c6d10e60d4f 100644 --- a/cudax/include/cuda/experimental/__simd/utility.h +++ b/cudax/include/cuda/experimental/__simd/utility.h @@ -1,15 +1,15 @@ -// -*- C++ -*- //===----------------------------------------------------------------------===// // -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// -#ifndef _CUDA_STD_EXPERIMENTAL___SIMD_UTILITY_H -#define _CUDA_STD_EXPERIMENTAL___SIMD_UTILITY_H +#ifndef _CUDAX___SIMD_UTILITY_H +#define _CUDAX___SIMD_UTILITY_H #include @@ -21,80 +21,75 @@ # pragma system_header #endif // no system header -#include -#include - -#if _LIBCUDACXX_EXPERIMENTAL_SIMD_ENABLED - -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include - -_CCCL_BEGIN_NAMESPACE_CUDA_STD - -namespace experimental +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cuda::experimental::datapar { -inline namespace parallelism_v2 +template +[[nodiscard]] _CCCL_API constexpr auto __set_all_bits(bool __v) noexcept { + static_assert(::cuda::std::__cccl_is_unsigned_integer_v<_Tp>, "set_all_bits() requires unsigned integer types"); + using _Up = ::cuda::std::__make_nbit_uint_t<::cuda::std::__num_bits_v<_Tp>>; + return 
__v ? (::cuda::std::numeric_limits<_Up>::max()) : 0; +} -template +template inline constexpr bool __is_vectorizable_v = - is_arithmetic_v<_Tp> && !is_const_v<_Tp> && !is_volatile_v<_Tp> && !is_same_v<_Tp, bool>; + ::cuda::std::is_arithmetic_v<_Tp> && !::cuda::std::is_const_v<_Tp> && !::cuda::std::is_volatile_v<_Tp> + && !::cuda::std::is_same_v<_Tp, bool>; template inline constexpr bool __is_non_narrowing_convertible_v = false; -template +template inline constexpr bool - __is_non_narrowing_convertible_v<_From, _To, ::cuda::std::void_t()})>> = true; + __is_non_narrowing_convertible_v<_From, _To, ::cuda::std::void_t()})>> = + true; -template +template inline constexpr bool __can_broadcast_v = - (__is_vectorizable_v<_Up> && __is_non_narrowing_convertible_v<_Up, _Tp>) || - (!__is_vectorizable_v<_Up> && is_convertible_v<_Up, _Tp>) || is_same_v<_Up, int> || - (is_same_v<_Up, unsigned int> && is_unsigned_v<_Tp>); + (__is_vectorizable_v<_Up> && __is_non_narrowing_convertible_v<_Up, _Tp>) + || (!__is_vectorizable_v<_Up> && ::cuda::std::is_convertible_v<_Up, _Tp>) || ::cuda::std::is_same_v<_Up, int> + || (::cuda::std::is_same_v<_Up, unsigned int> && ::cuda::std::is_unsigned_v<_Tp>); -template +template inline constexpr bool __is_well_formed = false; -template -inline constexpr bool - __is_well_formed<_Tp, - _Generator, - _Idx, - ::cuda::std::void_t()( - ::cuda::std::integral_constant())))> = - __can_broadcast_v<_Tp, - decltype(::cuda::std::declval<_Generator>()(::cuda::std::integral_constant()))>; - -template +template +inline constexpr bool __is_well_formed<_Tp, + _Generator, + _Idx, + ::cuda::std::void_t()( + ::cuda::std::integral_constant<::cuda::std::size_t, _Idx>()))>> = + __can_broadcast_v< + _Tp, + decltype(::cuda::std::declval<_Generator>()(::cuda::std::integral_constant<::cuda::std::size_t, _Idx>()))>; + +template _CCCL_HIDE_FROM_ABI constexpr bool __can_generate(::cuda::std::index_sequence<_Idxes...>) { return (true && ... 
&& __is_well_formed<_Tp, _Generator, _Idxes>); } -template +template inline constexpr bool __can_generate_v = - experimental::__can_generate<_Tp, _Generator>(::cuda::std::make_index_sequence<_Size>()); - -} // namespace parallelism_v2 -} // namespace experimental - -_CCCL_END_NAMESPACE_CUDA_STD - -# include - -#endif // _LIBCUDACXX_EXPERIMENTAL_SIMD_ENABLED + ::cuda::experimental::datapar::__can_generate<_Tp, _Generator>(::cuda::std::make_index_sequence<_Size>()); +} // namespace cuda::experimental::datapar -#endif // _CUDA_STD_EXPERIMENTAL___SIMD_UTILITY_H +#include +#endif // _CUDAX___SIMD_UTILITY_H From dd6599aea95c7574197acfa27bd2ea9a91a7e3ff Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 20 Nov 2025 12:56:05 -0800 Subject: [PATCH 03/32] other fixes --- .../experimental/__simd/fixed_size_impl.h | 8 +-- .../cuda/experimental/__simd/reference.h | 3 +- cudax/include/cuda/experimental/__simd/simd.h | 62 +++++++++---------- 3 files changed, 35 insertions(+), 38 deletions(-) diff --git a/cudax/include/cuda/experimental/__simd/fixed_size_impl.h b/cudax/include/cuda/experimental/__simd/fixed_size_impl.h index 58f070ea780..7cb5ee54703 100644 --- a/cudax/include/cuda/experimental/__simd/fixed_size_impl.h +++ b/cudax/include/cuda/experimental/__simd/fixed_size_impl.h @@ -48,18 +48,18 @@ struct __fixed_size template struct __simd_storage<_Tp, simd_abi::__fixed_size<_Np>> { - _Tp __data; + _Tp __data[_Np]; [[nodiscard]] _CCCL_API constexpr _Tp __get([[maybe_unused]] ::cuda::std::size_t __idx) const noexcept { _CCCL_ASSERT(::cuda::in_range(__idx, 0, __simd_size), "Index is out of bounds"); - return __data; + return __data[__idx]; } _CCCL_API constexpr void __set([[maybe_unused]] ::cuda::std::size_t __idx, _Tp __v) noexcept { _CCCL_ASSERT(::cuda::in_range(__idx, 0, __simd_size), "Index is out of bounds"); - __data = __v; + __data[__idx] = __v; } }; @@ -97,7 +97,7 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size<_Np>> } template - [[nodiscard]] _CCCL_API static 
constexpr _SimdStorage __generate(_Generator&& __g) noexcept + [[nodiscard]] _CCCL_API static constexpr _SimdStorage __generate(_Generator&& __g) { return __generate_init(::cuda::std::forward<_Generator>(__g), ::cuda::std::make_index_sequence<_Np>()); } diff --git a/cudax/include/cuda/experimental/__simd/reference.h b/cudax/include/cuda/experimental/__simd/reference.h index f1b02b34530..921cb78fc48 100644 --- a/cudax/include/cuda/experimental/__simd/reference.h +++ b/cudax/include/cuda/experimental/__simd/reference.h @@ -80,7 +80,8 @@ class __simd_reference return __get(); } - template , int> = 0> + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::is_assignable_v) _CCCL_API __simd_reference operator=(_Up&& __v) && noexcept { __set(static_cast(::cuda::std::forward<_Up>(__v))); diff --git a/cudax/include/cuda/experimental/__simd/simd.h b/cudax/include/cuda/experimental/__simd/simd.h index b1fd65d249a..1d61fd2b110 100644 --- a/cudax/include/cuda/experimental/__simd/simd.h +++ b/cudax/include/cuda/experimental/__simd/simd.h @@ -70,17 +70,15 @@ class simd : public __simd_operations<_Tp, simd_abi::fixed_size<_Np>> : __s_(__s) {} - template >, int> = 0> + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(__can_broadcast_v>) _CCCL_API simd(_Up&& __v) noexcept : __s_(_Impl::__broadcast(static_cast(::cuda::std::forward<_Up>(__v)))) {} - template && ::cuda::std::is_same_v> - && __is_non_narrowing_convertible_v<_Up, value_type>, - int> = 0> + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(!::cuda::std::is_same_v<_Up, _Tp> && ::cuda::std::is_same_v> + && __is_non_narrowing_convertible_v<_Up, value_type>) _CCCL_API simd(const simd<_Up, size()>& __v) noexcept { for (::cuda::std::size_t __i = 0; __i < size(); __i++) @@ -89,31 +87,29 @@ class simd : public __simd_operations<_Tp, simd_abi::fixed_size<_Np>> } } - template , int> = 0> + _CCCL_TEMPLATE(typename _Generator) + _CCCL_REQUIRES(__can_generate_v) _CCCL_API explicit simd(_Generator&& __g) noexcept : 
__s_(_Impl::__generate(::cuda::std::forward<_Generator>(__g))) {} - template && is_simd_flag_type_v<_Flags>, int> = 0> - _CCCL_API simd(const _Up* __mem, _Flags) + _CCCL_TEMPLATE(typename _Up, typename _Flags) + _CCCL_REQUIRES(__is_vectorizable_v<_Up>&& is_simd_flag_type_v<_Flags>) + _CCCL_API simd(const _Up* __mem, _Flags) noexcept { _Impl::__load(__s_, _Flags::template __apply(__mem)); } - template && is_simd_flag_type_v<_Flags>, int> = 0> - _CCCL_API void copy_from(const _Up* __mem, _Flags) + _CCCL_TEMPLATE(typename _Up, typename _Flags) + _CCCL_REQUIRES(__is_vectorizable_v<_Up>&& is_simd_flag_type_v<_Flags>) + _CCCL_API void copy_from(const _Up* __mem, _Flags) noexcept { _Impl::__load(__s_, _Flags::template __apply(__mem)); } - template && is_simd_flag_type_v<_Flags>, int> = 0> - _CCCL_API void copy_to(_Up* __mem, _Flags) const + _CCCL_TEMPLATE(typename _Up, typename _Flags) + _CCCL_REQUIRES(__is_vectorizable_v<_Up>&& is_simd_flag_type_v<_Flags>) + _CCCL_API void copy_to(_Up* __mem, _Flags) const noexcept { _Impl::__store(__s_, _Flags::template __apply(__mem)); } @@ -164,47 +160,47 @@ class simd : public __simd_operations<_Tp, simd_abi::fixed_size<_Np>> return {_Impl::__unary_minus(__s_), __storage_tag}; } - _CCCL_API constexpr friend simd& operator+=(simd& __lhs, const simd& __rhs) + _CCCL_API constexpr friend simd& operator+=(simd& __lhs, const simd& __rhs) noexcept { return __lhs = {__lhs + __rhs, __storage_tag}; } - _CCCL_API constexpr friend simd& operator-=(simd& __lhs, const simd& __rhs) + _CCCL_API constexpr friend simd& operator-=(simd& __lhs, const simd& __rhs) noexcept { return __lhs = {__lhs - __rhs, __storage_tag}; } - _CCCL_API constexpr friend simd& operator*=(simd& __lhs, const simd& __rhs) + _CCCL_API constexpr friend simd& operator*=(simd& __lhs, const simd& __rhs) noexcept { return __lhs = {__lhs * __rhs, __storage_tag}; } - _CCCL_API constexpr friend simd& operator/=(simd& __lhs, const simd& __rhs) + _CCCL_API constexpr friend simd& 
operator/=(simd& __lhs, const simd& __rhs) noexcept { return __lhs = {__lhs / __rhs, __storage_tag}; } - [[nodiscard]] _CCCL_API constexpr friend simd operator+(const simd& __lhs, const simd& __rhs) + [[nodiscard]] _CCCL_API constexpr friend simd operator+(const simd& __lhs, const simd& __rhs) noexcept { return {_Impl::__plus(__lhs.__s_, __rhs.__s_), __storage_tag}; } - [[nodiscard]] _CCCL_API constexpr friend simd operator-(const simd& __lhs, const simd& __rhs) + [[nodiscard]] _CCCL_API constexpr friend simd operator-(const simd& __lhs, const simd& __rhs) noexcept { return {_Impl::__minus(__lhs.__s_, __rhs.__s_), __storage_tag}; } - [[nodiscard]] _CCCL_API constexpr friend simd operator*(const simd& __lhs, const simd& __rhs) + [[nodiscard]] _CCCL_API constexpr friend simd operator*(const simd& __lhs, const simd& __rhs) noexcept { return {_Impl::__multiplies(__lhs.__s_, __rhs.__s_), __storage_tag}; } - [[nodiscard]] _CCCL_API constexpr friend simd operator/(const simd& __lhs, const simd& __rhs) + [[nodiscard]] _CCCL_API constexpr friend simd operator/(const simd& __lhs, const simd& __rhs) noexcept { return {_Impl::__divides(__lhs.__s_, __rhs.__s_), __storage_tag}; } - [[nodiscard]] _CCCL_API constexpr friend mask_type operator==(const simd& __lhs, const simd& __rhs) + [[nodiscard]] _CCCL_API constexpr friend mask_type operator==(const simd& __lhs, const simd& __rhs) noexcept { return {_Impl::__equal_to(__lhs.__s_, __rhs.__s_), __storage_tag}; } @@ -214,22 +210,22 @@ class simd : public __simd_operations<_Tp, simd_abi::fixed_size<_Np>> return {_Impl::__not_equal_to(__lhs.__s_, __rhs.__s_), __storage_tag}; } - [[nodiscard]] _CCCL_API constexpr friend mask_type operator<(const simd& __lhs, const simd& __rhs) + [[nodiscard]] _CCCL_API constexpr friend mask_type operator<(const simd& __lhs, const simd& __rhs) noexcept { return {_Impl::__less(__lhs.__s_, __rhs.__s_), __storage_tag}; } - [[nodiscard]] _CCCL_API constexpr friend mask_type operator<=(const simd& __lhs, 
const simd& __rhs) + [[nodiscard]] _CCCL_API constexpr friend mask_type operator<=(const simd& __lhs, const simd& __rhs) noexcept { return {_Impl::__less_equal(__lhs.__s_, __rhs.__s_), __storage_tag}; } - [[nodiscard]] _CCCL_API constexpr friend mask_type operator>(const simd& __lhs, const simd& __rhs) + [[nodiscard]] _CCCL_API constexpr friend mask_type operator>(const simd& __lhs, const simd& __rhs) noexcept { return {_Impl::__less(__lhs.__s_, __rhs.__s_), __storage_tag}; } - [[nodiscard]] _CCCL_API constexpr friend mask_type operator>=(const simd& __lhs, const simd& __rhs) + [[nodiscard]] _CCCL_API constexpr friend mask_type operator>=(const simd& __lhs, const simd& __rhs) noexcept { return {_Impl::__less_equal(__lhs.__s_, __rhs.__s_), __storage_tag}; } From a917c3002837f2813886becef4c9cab64b470016 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 20 Nov 2025 14:19:10 -0800 Subject: [PATCH 04/32] add simd_mask --- .../experimental/__simd/fixed_size_impl.h | 141 ++++++++++++--- cudax/include/cuda/experimental/__simd/simd.h | 113 ++++++++++-- .../cuda/experimental/__simd/simd_mask.h | 164 ++++++++++++++++++ .../include/cuda/experimental/__simd/traits.h | 3 + 4 files changed, 389 insertions(+), 32 deletions(-) create mode 100644 cudax/include/cuda/experimental/__simd/simd_mask.h diff --git a/cudax/include/cuda/experimental/__simd/fixed_size_impl.h b/cudax/include/cuda/experimental/__simd/fixed_size_impl.h index 7cb5ee54703..fa6cd45a70b 100644 --- a/cudax/include/cuda/experimental/__simd/fixed_size_impl.h +++ b/cudax/include/cuda/experimental/__simd/fixed_size_impl.h @@ -21,9 +21,10 @@ # pragma system_header #endif // no system header -#include +#include #include #include +#include #include #include #include @@ -81,7 +82,7 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size<_Np>> [[nodiscard]] _CCCL_API static constexpr _SimdStorage __broadcast(_Tp __v) noexcept { _SimdStorage __result; - _CCCL_PRAGMA_NOUNROLL() + _CCCL_PRAGMA_UNROLL_FULL() for (int __i = 0; __i 
< _Np; ++__i) { __result.__data[__i] = __v; @@ -105,7 +106,7 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size<_Np>> template _CCCL_API static constexpr void __load(_SimdStorage& __s, const _Up* __mem) noexcept { - _CCCL_PRAGMA_NOUNROLL() + _CCCL_PRAGMA_UNROLL_FULL() for (int __i = 0; __i < _Np; __i++) { __s.__data[__i] = static_cast<_Tp>(__mem[__i]); @@ -115,7 +116,7 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size<_Np>> template _CCCL_API static constexpr void __store(const _SimdStorage& __s, _Up* __mem) noexcept { - _CCCL_PRAGMA_NOUNROLL() + _CCCL_PRAGMA_UNROLL_FULL() for (int __i = 0; __i < _Np; __i++) { __mem[__i] = static_cast<_Up>(__s.__data[__i]); @@ -124,7 +125,7 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size<_Np>> _CCCL_API static constexpr void __increment(_SimdStorage& __s) noexcept { - _CCCL_PRAGMA_NOUNROLL() + _CCCL_PRAGMA_UNROLL_FULL() for (int __i = 0; __i < _Np; __i++) { __s.__data[__i] += 1; @@ -133,7 +134,7 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size<_Np>> _CCCL_API static constexpr void __decrement(_SimdStorage& __s) noexcept { - _CCCL_PRAGMA_NOUNROLL() + _CCCL_PRAGMA_UNROLL_FULL() for (int __i = 0; __i < _Np; __i++) { __s.__data[__i] -= 1; @@ -142,18 +143,30 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size<_Np>> [[nodiscard]] _CCCL_API static constexpr _MaskStorage __negate(const _SimdStorage& __s) noexcept { - return {!__s.__data}; + _MaskStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (int __i = 0; __i < _Np; __i++) + { + __result.__data[__i] = !__s.__data[__i]; + } + return __result; } [[nodiscard]] _CCCL_API static constexpr _SimdStorage __bitwise_not(const _SimdStorage& __s) noexcept { - return {~__s.__data}; + _SimdStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (int __i = 0; __i < _Np; __i++) + { + __result.__data[__i] = ~__s.__data[__i]; + } + return __result; } [[nodiscard]] _CCCL_API static constexpr _SimdStorage __unary_minus(const _SimdStorage& __s) noexcept { _SimdStorage __result; 
- _CCCL_PRAGMA_NOUNROLL() + _CCCL_PRAGMA_UNROLL_FULL() for (int __i = 0; __i < _Np; __i++) { __result.__data[__i] = -__s.__data[__i]; @@ -165,7 +178,7 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size<_Np>> __plus(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { _SimdStorage __result; - _CCCL_PRAGMA_NOUNROLL() + _CCCL_PRAGMA_UNROLL_FULL() for (int __i = 0; __i < _Np; __i++) { __result.__data[__i] = __lhs.__data[__i] + __rhs.__data[__i]; @@ -177,7 +190,7 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size<_Np>> __minus(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { _SimdStorage __result; - _CCCL_PRAGMA_NOUNROLL() + _CCCL_PRAGMA_UNROLL_FULL() for (int __i = 0; __i < _Np; __i++) { __result.__data[__i] = __lhs.__data[__i] - __rhs.__data[__i]; @@ -189,7 +202,7 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size<_Np>> __multiplies(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { _SimdStorage __result; - _CCCL_PRAGMA_NOUNROLL() + _CCCL_PRAGMA_UNROLL_FULL() for (int __i = 0; __i < _Np; __i++) { __result.__data[__i] = __lhs.__data[__i] * __rhs.__data[__i]; @@ -201,7 +214,7 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size<_Np>> __divides(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { _SimdStorage __result; - _CCCL_PRAGMA_NOUNROLL() + _CCCL_PRAGMA_UNROLL_FULL() for (int __i = 0; __i < _Np; __i++) { __result.__data[__i] = __lhs.__data[__i] / __rhs.__data[__i]; @@ -213,7 +226,7 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size<_Np>> __equal_to(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { _SimdStorage __result; - _CCCL_PRAGMA_NOUNROLL() + _CCCL_PRAGMA_UNROLL_FULL() for (int __i = 0; __i < _Np; __i++) { __result.__data[__i] = __lhs.__data[__i] == __rhs.__data[__i]; @@ -225,7 +238,7 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size<_Np>> __not_equal_to(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { _SimdStorage __result; - 
_CCCL_PRAGMA_NOUNROLL() + _CCCL_PRAGMA_UNROLL_FULL() for (int __i = 0; __i < _Np; __i++) { __result.__data[__i] = __lhs.__data[__i] != __rhs.__data[__i]; @@ -237,7 +250,7 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size<_Np>> __less(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { _SimdStorage __result; - _CCCL_PRAGMA_NOUNROLL() + _CCCL_PRAGMA_UNROLL_FULL() for (int __i = 0; __i < _Np; __i++) { __result.__data[__i] = __lhs.__data[__i] < __rhs.__data[__i]; @@ -249,7 +262,7 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size<_Np>> __less_equal(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { _SimdStorage __result; - _CCCL_PRAGMA_NOUNROLL() + _CCCL_PRAGMA_UNROLL_FULL() for (int __i = 0; __i < _Np; __i++) { __result.__data[__i] = __lhs.__data[__i] <= __rhs.__data[__i]; @@ -261,7 +274,7 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size<_Np>> __greater(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { _SimdStorage __result; - _CCCL_PRAGMA_NOUNROLL() + _CCCL_PRAGMA_UNROLL_FULL() for (int __i = 0; __i < _Np; __i++) { __result.__data[__i] = __lhs.__data[__i] > __rhs.__data[__i]; @@ -273,13 +286,97 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size<_Np>> __greater_equal(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { _SimdStorage __result; - _CCCL_PRAGMA_NOUNROLL() + _CCCL_PRAGMA_UNROLL_FULL() for (int __i = 0; __i < _Np; __i++) { __result.__data[__i] = __lhs.__data[__i] >= __rhs.__data[__i]; } return __result; } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __modulo(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _SimdStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (int __i = 0; __i < _Np; __i++) + { + __result.__data[__i] = __lhs.__data[__i] % __rhs.__data[__i]; + } + return __result; + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) 
+ [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __bitwise_and(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _SimdStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (int __i = 0; __i < _Np; __i++) + { + __result.__data[__i] = __lhs.__data[__i] & __rhs.__data[__i]; + } + return __result; + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __bitwise_or(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _SimdStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (int __i = 0; __i < _Np; __i++) + { + __result.__data[__i] = __lhs.__data[__i] | __rhs.__data[__i]; + } + return __result; + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __bitwise_xor(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _SimdStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (int __i = 0; __i < _Np; __i++) + { + __result.__data[__i] = __lhs.__data[__i] ^ __rhs.__data[__i]; + } + return __result; + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __shift_left(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _SimdStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (int __i = 0; __i < _Np; __i++) + { + __result.__data[__i] = __lhs.__data[__i] << __rhs.__data[__i]; + } + return __result; + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __shift_right(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _SimdStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (int __i = 0; __i < _Np; __i++) + { + __result.__data[__i] = __lhs.__data[__i] >> __rhs.__data[__i]; + } + return __result; + } }; // 
********************************************************************************************************************* @@ -295,7 +392,7 @@ struct __mask_operations<_Tp, simd_abi::__fixed_size<_Np>> { _MaskStorage __result; const auto __all_bits_v = ::cuda::experimental::datapar::__set_all_bits<_Tp>(__v); - _CCCL_PRAGMA_NOUNROLL() + _CCCL_PRAGMA_UNROLL_FULL() for (int __i = 0; __i < _Np; ++__i) { __result.__set(__i, __all_bits_v); @@ -305,7 +402,7 @@ struct __mask_operations<_Tp, simd_abi::__fixed_size<_Np>> _CCCL_API static constexpr void __load(_MaskStorage& __s, const bool* __mem) noexcept { - _CCCL_PRAGMA_NOUNROLL() + _CCCL_PRAGMA_UNROLL_FULL() for (int __i = 0; __i < _Np; __i++) { __s.__data[__i] = ::cuda::experimental::datapar::__set_all_bits<_Tp>(__mem[__i]); @@ -314,7 +411,7 @@ struct __mask_operations<_Tp, simd_abi::__fixed_size<_Np>> _CCCL_API static constexpr void __store(const _MaskStorage& __s, bool* __mem) noexcept { - _CCCL_PRAGMA_NOUNROLL() + _CCCL_PRAGMA_UNROLL_FULL() for (int __i = 0; __i < _Np; __i++) { __mem[__i] = static_cast(__s.__data[__i]); diff --git a/cudax/include/cuda/experimental/__simd/simd.h b/cudax/include/cuda/experimental/__simd/simd.h index 1d61fd2b110..8de74b47f39 100644 --- a/cudax/include/cuda/experimental/__simd/simd.h +++ b/cudax/include/cuda/experimental/__simd/simd.h @@ -21,13 +21,15 @@ # pragma system_header #endif // no system header +#include #include -#include +#include #include #include #include #include +#include #include #include #include @@ -162,22 +164,64 @@ class simd : public __simd_operations<_Tp, simd_abi::fixed_size<_Np>> _CCCL_API constexpr friend simd& operator+=(simd& __lhs, const simd& __rhs) noexcept { - return __lhs = {__lhs + __rhs, __storage_tag}; + return __lhs = __lhs + __rhs; } _CCCL_API constexpr friend simd& operator-=(simd& __lhs, const simd& __rhs) noexcept { - return __lhs = {__lhs - __rhs, __storage_tag}; + return __lhs = __lhs - __rhs; } _CCCL_API constexpr friend simd& operator*=(simd& __lhs, 
const simd& __rhs) noexcept { - return __lhs = {__lhs * __rhs, __storage_tag}; + return __lhs = __lhs * __rhs; } _CCCL_API constexpr friend simd& operator/=(simd& __lhs, const simd& __rhs) noexcept { - return __lhs = {__lhs / __rhs, __storage_tag}; + return __lhs = __lhs / __rhs; + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) + _CCCL_API constexpr friend simd& operator%=(simd& __lhs, const simd& __rhs) noexcept + { + return __lhs = __lhs % __rhs; + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) + _CCCL_API constexpr friend simd& operator&=(simd& __lhs, const simd& __rhs) noexcept + { + return __lhs = __lhs & __rhs; + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) + _CCCL_API constexpr friend simd& operator|=(simd& __lhs, const simd& __rhs) noexcept + { + return __lhs = __lhs | __rhs; + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) + _CCCL_API constexpr friend simd& operator^=(simd& __lhs, const simd& __rhs) noexcept + { + return __lhs = __lhs ^ __rhs; + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) + _CCCL_API constexpr friend simd& operator<<=(simd& __lhs, const simd& __rhs) noexcept + { + return __lhs = __lhs << __rhs; + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) + _CCCL_API constexpr friend simd& operator>>=(simd& __lhs, const simd& __rhs) noexcept + { + return __lhs = __lhs >> __rhs; } [[nodiscard]] _CCCL_API constexpr friend simd operator+(const simd& __lhs, const simd& __rhs) noexcept @@ -200,12 +244,61 @@ class simd : public __simd_operations<_Tp, simd_abi::fixed_size<_Np>> return {_Impl::__divides(__lhs.__s_, __rhs.__s_), __storage_tag}; } + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) + [[nodiscard]] _CCCL_API constexpr friend simd operator%(const simd& __lhs, const simd& __rhs) noexcept + { 
+ return {_Impl::__modulo(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) + [[nodiscard]] _CCCL_API constexpr friend simd operator&(const simd& __lhs, const simd& __rhs) noexcept + { + return {_Impl::__bitwise_and(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) + [[nodiscard]] _CCCL_API constexpr friend simd operator|(const simd& __lhs, const simd& __rhs) noexcept + { + return {_Impl::__bitwise_or(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) + [[nodiscard]] _CCCL_API constexpr friend simd operator^(const simd& __lhs, const simd& __rhs) noexcept + { + return {_Impl::__bitwise_xor(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) + [[nodiscard]] _CCCL_API constexpr friend simd operator<<(const simd& __lhs, const simd& __rhs) noexcept + { + return {_Impl::__shift_left(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) + [[nodiscard]] _CCCL_API constexpr friend simd operator>>(const simd& __lhs, const simd& __rhs) noexcept + { + return {_Impl::__shift_right(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) + [[nodiscard]] _CCCL_API constexpr simd operator~() const noexcept + { + return {_Impl::__bitwise_not(__s_), __storage_tag}; + } + [[nodiscard]] _CCCL_API constexpr friend mask_type operator==(const simd& __lhs, const simd& __rhs) noexcept { return {_Impl::__equal_to(__lhs.__s_, __rhs.__s_), __storage_tag}; } - [[nodiscard]] _CCCL_API constexpr friend mask_type operator!=(const simd& __lhs, const simd& __rhs) + [[nodiscard]] _CCCL_API constexpr friend mask_type operator!=(const simd& __lhs, const 
simd& __rhs) noexcept { return {_Impl::__not_equal_to(__lhs.__s_, __rhs.__s_), __storage_tag}; } @@ -222,20 +315,20 @@ class simd : public __simd_operations<_Tp, simd_abi::fixed_size<_Np>> [[nodiscard]] _CCCL_API constexpr friend mask_type operator>(const simd& __lhs, const simd& __rhs) noexcept { - return {_Impl::__less(__lhs.__s_, __rhs.__s_), __storage_tag}; + return {_Impl::__greater(__lhs.__s_, __rhs.__s_), __storage_tag}; } [[nodiscard]] _CCCL_API constexpr friend mask_type operator>=(const simd& __lhs, const simd& __rhs) noexcept { - return {_Impl::__less_equal(__lhs.__s_, __rhs.__s_), __storage_tag}; + return {_Impl::__greater_equal(__lhs.__s_, __rhs.__s_), __storage_tag}; } }; template inline constexpr bool is_simd_v> = true; -template -using native_simd = simd<_Tp, simd_abi::native<_Tp>>; +// Note: native_simd would require platform-specific ABI specializations +// For now, use fixed_size_simd directly or specialize with a known size template using fixed_size_simd = simd<_Tp, _Np>; diff --git a/cudax/include/cuda/experimental/__simd/simd_mask.h b/cudax/include/cuda/experimental/__simd/simd_mask.h new file mode 100644 index 00000000000..d18cf961ef5 --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/simd_mask.h @@ -0,0 +1,164 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_SIMD_MASK_H +#define _CUDAX___SIMD_SIMD_MASK_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include +#include +#include + +#include + +namespace cuda::experimental::datapar +{ +template +class simd_mask : public __mask_operations<_Tp, simd_abi::fixed_size<_Np>> +{ + using _Impl = __mask_operations<_Tp, simd_abi::fixed_size<_Np>>; + using _Storage = typename _Impl::_MaskStorage; + + _Storage __s_; + +public: + using value_type = bool; + using reference = __simd_reference<_Tp, _Storage, bool>; + using abi_type = simd_abi::fixed_size<_Np>; + + [[nodiscard]] _CCCL_API static constexpr ::cuda::std::size_t size() noexcept + { + return _Np; + } + + _CCCL_API simd_mask() noexcept = default; + + struct __storage_tag_t + {}; + static constexpr __storage_tag_t __storage_tag{}; + + _CCCL_API explicit operator _Storage() const noexcept + { + return __s_; + } + + _CCCL_API explicit simd_mask(const _Storage& __s, __storage_tag_t) + : __s_(__s) + {} + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::is_same_v<_Up, bool>) + _CCCL_API simd_mask(_Up __v) noexcept + : __s_(_Impl::__broadcast(__v)) + {} + + _CCCL_API reference operator[](::cuda::std::size_t __i) noexcept + { + return reference(__s_, __i); + } + + _CCCL_API value_type operator[](::cuda::std::size_t __i) const noexcept + { + return static_cast(__s_.__get(__i)); + } + + // Bitwise operations + [[nodiscard]] _CCCL_API constexpr friend simd_mask operator&(const simd_mask& __lhs, const simd_mask& __rhs) noexcept + { + return {_Impl::__bitwise_and(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + [[nodiscard]] _CCCL_API constexpr friend simd_mask 
operator|(const simd_mask& __lhs, const simd_mask& __rhs) noexcept + { + return {_Impl::__bitwise_or(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + [[nodiscard]] _CCCL_API constexpr friend simd_mask operator^(const simd_mask& __lhs, const simd_mask& __rhs) noexcept + { + return {_Impl::__bitwise_xor(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + [[nodiscard]] _CCCL_API constexpr simd_mask operator!() const noexcept + { + return {_Impl::__bitwise_not(__s_), __storage_tag}; + } + + _CCCL_API simd_mask& operator&=(const simd_mask& __rhs) noexcept + { + return *this = *this & __rhs; + } + + _CCCL_API simd_mask& operator|=(const simd_mask& __rhs) noexcept + { + return *this = *this | __rhs; + } + + _CCCL_API simd_mask& operator^=(const simd_mask& __rhs) noexcept + { + return *this = *this ^ __rhs; + } + + // Comparison operations + [[nodiscard]] _CCCL_API constexpr friend bool operator==(const simd_mask& __lhs, const simd_mask& __rhs) noexcept + { + return _Impl::__equal_to(__lhs.__s_, __rhs.__s_); + } + +#if _CCCL_STD_VER < 2020 + [[nodiscard]] _CCCL_API constexpr friend bool operator!=(const simd_mask& __lhs, const simd_mask& __rhs) noexcept + { + return !(__lhs == __rhs); + } +#endif // _CCCL_STD_VER < 2020 + + [[nodiscard]] _CCCL_API constexpr bool all() const noexcept + { + return _Impl::__all(__s_); + } + + [[nodiscard]] _CCCL_API constexpr bool any() const noexcept + { + return _Impl::__any(__s_); + } + + [[nodiscard]] _CCCL_API constexpr bool none() const noexcept + { + return !any(); + } + + [[nodiscard]] _CCCL_API constexpr int count() const noexcept + { + return _Impl::__count(__s_); + } +}; + +template +inline constexpr bool is_simd_v> = true; + +template +using fixed_size_simd_mask = simd_mask<_Tp, _Np>; +} // namespace cuda::experimental::datapar + +#include + +#endif // _CUDAX___SIMD_SIMD_MASK_H diff --git a/cudax/include/cuda/experimental/__simd/traits.h b/cudax/include/cuda/experimental/__simd/traits.h index 60848c31ce4..3da28d9831a 100644 --- 
a/cudax/include/cuda/experimental/__simd/traits.h +++ b/cudax/include/cuda/experimental/__simd/traits.h @@ -38,6 +38,9 @@ template struct is_abi_tag : ::cuda::std::bool_constant> {}; +template +inline constexpr bool is_abi_tag_v> = true; + template inline constexpr bool is_simd_v = false; From 443837a7e73ff360e97be313e09c69cbb6e8e39f Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 20 Nov 2025 14:36:42 -0800 Subject: [PATCH 05/32] remove explicit --- cudax/include/cuda/experimental/__simd/simd_mask.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cudax/include/cuda/experimental/__simd/simd_mask.h b/cudax/include/cuda/experimental/__simd/simd_mask.h index d18cf961ef5..0a0e652e4b9 100644 --- a/cudax/include/cuda/experimental/__simd/simd_mask.h +++ b/cudax/include/cuda/experimental/__simd/simd_mask.h @@ -57,12 +57,12 @@ class simd_mask : public __mask_operations<_Tp, simd_abi::fixed_size<_Np>> {}; static constexpr __storage_tag_t __storage_tag{}; - _CCCL_API explicit operator _Storage() const noexcept + _CCCL_API operator _Storage() const noexcept { return __s_; } - _CCCL_API explicit simd_mask(const _Storage& __s, __storage_tag_t) + _CCCL_API simd_mask(const _Storage& __s, __storage_tag_t) noexcept : __s_(__s) {} From f702a0161dd91296ac036235a1b20f48c038b30f Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 20 Nov 2025 16:08:52 -0800 Subject: [PATCH 06/32] follow the standard --- .../cuda/experimental/__simd/declaration.h | 12 +- .../experimental/__simd/fixed_size_impl.h | 177 +++++++++++++---- .../cuda/experimental/__simd/reference.h | 6 +- cudax/include/cuda/experimental/__simd/simd.h | 187 +++++++++++------- .../cuda/experimental/__simd/simd_mask.h | 49 +++-- .../include/cuda/experimental/__simd/traits.h | 61 +++++- .../cuda/experimental/__simd/utility.h | 12 +- 7 files changed, 357 insertions(+), 147 deletions(-) diff --git a/cudax/include/cuda/experimental/__simd/declaration.h b/cudax/include/cuda/experimental/__simd/declaration.h index 
da2ab0df37f..14c9f3e14af 100644 --- a/cudax/include/cuda/experimental/__simd/declaration.h +++ b/cudax/include/cuda/experimental/__simd/declaration.h @@ -54,17 +54,17 @@ struct __mask_storage; template struct __mask_operations; -template -class simd; - template class basic_simd; -template -class simd_mask; +template +using simd = basic_simd<_Tp, simd_abi::fixed_size<_Np>>; -template +template class basic_simd_mask; + +template +using simd_mask = basic_simd_mask<_Tp, simd_abi::fixed_size<_Np>>; } // namespace cuda::experimental::datapar #include diff --git a/cudax/include/cuda/experimental/__simd/fixed_size_impl.h b/cudax/include/cuda/experimental/__simd/fixed_size_impl.h index fa6cd45a70b..fe04e30867a 100644 --- a/cudax/include/cuda/experimental/__simd/fixed_size_impl.h +++ b/cudax/include/cuda/experimental/__simd/fixed_size_impl.h @@ -21,13 +21,13 @@ # pragma system_header #endif // no system header +#include #include #include #include #include #include #include -#include #include #include @@ -49,17 +49,20 @@ struct __fixed_size template struct __simd_storage<_Tp, simd_abi::__fixed_size<_Np>> { + using value_type = _Tp; _Tp __data[_Np]; [[nodiscard]] _CCCL_API constexpr _Tp __get([[maybe_unused]] ::cuda::std::size_t __idx) const noexcept { - _CCCL_ASSERT(::cuda::in_range(__idx, 0, __simd_size), "Index is out of bounds"); + using ::cuda::std::size_t; + _CCCL_ASSERT(::cuda::in_range(__idx, size_t{0}, size_t{_Np}), "Index is out of bounds"); return __data[__idx]; } _CCCL_API constexpr void __set([[maybe_unused]] ::cuda::std::size_t __idx, _Tp __v) noexcept { - _CCCL_ASSERT(::cuda::in_range(__idx, 0, __simd_size), "Index is out of bounds"); + using ::cuda::std::size_t; + _CCCL_ASSERT(::cuda::in_range(__idx, size_t{0}, size_t{_Np}), "Index is out of bounds"); __data[__idx] = __v; } }; @@ -67,7 +70,9 @@ struct __simd_storage<_Tp, simd_abi::__fixed_size<_Np>> template struct __mask_storage<_Tp, simd_abi::__fixed_size<_Np>> : 
__simd_storage<::cuda::std::__make_nbit_uint_t<::cuda::std::__num_bits_v<_Tp>>, simd_abi::__fixed_size<_Np>> -{}; +{ + using value_type = ::cuda::std::__make_nbit_uint_t<::cuda::std::__num_bits_v<_Tp>>; +}; // ********************************************************************************************************************* // * SIMD Arithmetic Operations @@ -94,13 +99,13 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size<_Np>> [[nodiscard]] _CCCL_API static constexpr _SimdStorage __generate_init(_Generator&& __g, ::cuda::std::index_sequence<_Is...>) { - return _SimdStorage{{__g(std::integral_constant<::cuda::std::size_t, _Is>())...}}; + return _SimdStorage{{__g(::cuda::std::integral_constant<::cuda::std::size_t, _Is>())...}}; } template [[nodiscard]] _CCCL_API static constexpr _SimdStorage __generate(_Generator&& __g) { - return __generate_init(::cuda::std::forward<_Generator>(__g), ::cuda::std::make_index_sequence<_Np>()); + return __generate_init(__g, ::cuda::std::make_index_sequence<_Np>()); } template @@ -147,7 +152,7 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size<_Np>> _CCCL_PRAGMA_UNROLL_FULL() for (int __i = 0; __i < _Np; __i++) { - __result.__data[__i] = !__s.__data[__i]; + __result.__data[__i] = ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(!__s.__data[__i]); } return __result; } @@ -222,79 +227,85 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size<_Np>> return __result; } - [[nodiscard]] _CCCL_API static constexpr _SimdStorage + [[nodiscard]] _CCCL_API static constexpr _MaskStorage __equal_to(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { - _SimdStorage __result; + _MaskStorage __result; _CCCL_PRAGMA_UNROLL_FULL() for (int __i = 0; __i < _Np; __i++) { - __result.__data[__i] = __lhs.__data[__i] == __rhs.__data[__i]; + __result.__data[__i] = + ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(__lhs.__data[__i] == __rhs.__data[__i]); } return __result; } - [[nodiscard]] _CCCL_API 
static constexpr _SimdStorage + [[nodiscard]] _CCCL_API static constexpr _MaskStorage __not_equal_to(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { - _SimdStorage __result; + _MaskStorage __result; _CCCL_PRAGMA_UNROLL_FULL() for (int __i = 0; __i < _Np; __i++) { - __result.__data[__i] = __lhs.__data[__i] != __rhs.__data[__i]; + __result.__data[__i] = + ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(__lhs.__data[__i] != __rhs.__data[__i]); } return __result; } - [[nodiscard]] _CCCL_API static constexpr _SimdStorage + [[nodiscard]] _CCCL_API static constexpr _MaskStorage __less(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { - _SimdStorage __result; + _MaskStorage __result; _CCCL_PRAGMA_UNROLL_FULL() for (int __i = 0; __i < _Np; __i++) { - __result.__data[__i] = __lhs.__data[__i] < __rhs.__data[__i]; + __result.__data[__i] = + ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(__lhs.__data[__i] < __rhs.__data[__i]); } return __result; } - [[nodiscard]] _CCCL_API static constexpr _SimdStorage + [[nodiscard]] _CCCL_API static constexpr _MaskStorage __less_equal(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { - _SimdStorage __result; + _MaskStorage __result; _CCCL_PRAGMA_UNROLL_FULL() for (int __i = 0; __i < _Np; __i++) { - __result.__data[__i] = __lhs.__data[__i] <= __rhs.__data[__i]; + __result.__data[__i] = + ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(__lhs.__data[__i] <= __rhs.__data[__i]); } return __result; } - [[nodiscard]] _CCCL_API static constexpr _SimdStorage + [[nodiscard]] _CCCL_API static constexpr _MaskStorage __greater(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { - _SimdStorage __result; + _MaskStorage __result; _CCCL_PRAGMA_UNROLL_FULL() for (int __i = 0; __i < _Np; __i++) { - __result.__data[__i] = __lhs.__data[__i] > __rhs.__data[__i]; + __result.__data[__i] = + 
::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(__lhs.__data[__i] > __rhs.__data[__i]); } return __result; } - [[nodiscard]] _CCCL_API static constexpr _SimdStorage + [[nodiscard]] _CCCL_API static constexpr _MaskStorage __greater_equal(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { - _SimdStorage __result; + _MaskStorage __result; _CCCL_PRAGMA_UNROLL_FULL() for (int __i = 0; __i < _Np; __i++) { - __result.__data[__i] = __lhs.__data[__i] >= __rhs.__data[__i]; + __result.__data[__i] = + ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(__lhs.__data[__i] >= __rhs.__data[__i]); } return __result; } - _CCCL_TEMPLATE(typename _Up) + _CCCL_TEMPLATE(typename _Up = _Tp) _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) [[nodiscard]] _CCCL_API static constexpr _SimdStorage __modulo(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept @@ -308,7 +319,7 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size<_Np>> return __result; } - _CCCL_TEMPLATE(typename _Up) + _CCCL_TEMPLATE(typename _Up = _Tp) _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) [[nodiscard]] _CCCL_API static constexpr _SimdStorage __bitwise_and(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept @@ -322,7 +333,7 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size<_Np>> return __result; } - _CCCL_TEMPLATE(typename _Up) + _CCCL_TEMPLATE(typename _Up = _Tp) _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) [[nodiscard]] _CCCL_API static constexpr _SimdStorage __bitwise_or(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept @@ -336,7 +347,7 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size<_Np>> return __result; } - _CCCL_TEMPLATE(typename _Up) + _CCCL_TEMPLATE(typename _Up = _Tp) _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) [[nodiscard]] _CCCL_API static constexpr _SimdStorage __bitwise_xor(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept @@ -350,7 +361,7 @@ struct __simd_operations<_Tp, 
simd_abi::__fixed_size<_Np>> return __result; } - _CCCL_TEMPLATE(typename _Up) + _CCCL_TEMPLATE(typename _Up = _Tp) _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) [[nodiscard]] _CCCL_API static constexpr _SimdStorage __shift_left(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept @@ -364,7 +375,7 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size<_Np>> return __result; } - _CCCL_TEMPLATE(typename _Up) + _CCCL_TEMPLATE(typename _Up = _Tp) _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) [[nodiscard]] _CCCL_API static constexpr _SimdStorage __shift_right(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept @@ -391,7 +402,7 @@ struct __mask_operations<_Tp, simd_abi::__fixed_size<_Np>> [[nodiscard]] _CCCL_API static constexpr _MaskStorage __broadcast(bool __v) noexcept { _MaskStorage __result; - const auto __all_bits_v = ::cuda::experimental::datapar::__set_all_bits<_Tp>(__v); + const auto __all_bits_v = ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(__v); _CCCL_PRAGMA_UNROLL_FULL() for (int __i = 0; __i < _Np; ++__i) { @@ -405,7 +416,7 @@ struct __mask_operations<_Tp, simd_abi::__fixed_size<_Np>> _CCCL_PRAGMA_UNROLL_FULL() for (int __i = 0; __i < _Np; __i++) { - __s.__data[__i] = ::cuda::experimental::datapar::__set_all_bits<_Tp>(__mem[__i]); + __s.__data[__i] = ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(__mem[__i]); } } @@ -417,6 +428,106 @@ struct __mask_operations<_Tp, simd_abi::__fixed_size<_Np>> __mem[__i] = static_cast(__s.__data[__i]); } } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __bitwise_and(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept + { + _MaskStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (int __i = 0; __i < _Np; __i++) + { + __result.__data[__i] = __lhs.__data[__i] & __rhs.__data[__i]; + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __bitwise_or(const _MaskStorage& __lhs, const _MaskStorage& __rhs) 
noexcept + { + _MaskStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (int __i = 0; __i < _Np; __i++) + { + __result.__data[__i] = __lhs.__data[__i] | __rhs.__data[__i]; + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __bitwise_xor(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept + { + _MaskStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (int __i = 0; __i < _Np; __i++) + { + __result.__data[__i] = __lhs.__data[__i] ^ __rhs.__data[__i]; + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage __bitwise_not(const _MaskStorage& __s) noexcept + { + _MaskStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (int __i = 0; __i < _Np; __i++) + { + __result.__data[__i] = ~__s.__data[__i]; + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr bool __equal_to(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept + { + _CCCL_PRAGMA_UNROLL_FULL() + for (int __i = 0; __i < _Np; __i++) + { + if (__lhs.__data[__i] != __rhs.__data[__i]) + { + return false; + } + } + return true; + } + + [[nodiscard]] _CCCL_API static constexpr bool __all(const _MaskStorage& __s) noexcept + { + _CCCL_PRAGMA_UNROLL_FULL() + for (int __i = 0; __i < _Np; __i++) + { + if (!__s.__data[__i]) + { + return false; + } + } + return true; + } + + [[nodiscard]] _CCCL_API static constexpr bool __any(const _MaskStorage& __s) noexcept + { + _CCCL_PRAGMA_UNROLL_FULL() + for (int __i = 0; __i < _Np; __i++) + { + if (__s.__data[__i]) + { + return true; + } + } + return false; + } + + [[nodiscard]] _CCCL_API static constexpr int __count(const _MaskStorage& __s) noexcept + { + int __cnt = 0; + _CCCL_PRAGMA_UNROLL_FULL() + for (int __i = 0; __i < _Np; __i++) + { + if (__s.__data[__i]) + { + ++__cnt; + } + } + return __cnt; + } }; } // namespace cuda::experimental::datapar diff --git a/cudax/include/cuda/experimental/__simd/reference.h b/cudax/include/cuda/experimental/__simd/reference.h index 
921cb78fc48..44bd1d40026 100644 --- a/cudax/include/cuda/experimental/__simd/reference.h +++ b/cudax/include/cuda/experimental/__simd/reference.h @@ -39,10 +39,10 @@ template class __simd_reference { template - friend class simd; + friend class basic_simd; template - friend class simd_mask; + friend class basic_simd_mask; _Storage& __s_; ::cuda::std::size_t __idx_; @@ -61,7 +61,7 @@ class __simd_reference { if constexpr (::cuda::std::is_same_v<_Vp, bool>) { - __s_.__set(__idx_, ::cuda::experimental::datapar::__set_all_bits<_Tp>(__v)); + __s_.__set(__idx_, ::cuda::experimental::datapar::__mask_bits_from_bool<_Storage>(__v)); } else { diff --git a/cudax/include/cuda/experimental/__simd/simd.h b/cudax/include/cuda/experimental/__simd/simd.h index 8de74b47f39..d9b04ae436b 100644 --- a/cudax/include/cuda/experimental/__simd/simd.h +++ b/cudax/include/cuda/experimental/__simd/simd.h @@ -26,11 +26,10 @@ #include #include #include -#include #include -#include #include +#include #include #include @@ -38,26 +37,33 @@ namespace cuda::experimental::datapar { -template -class simd : public __simd_operations<_Tp, simd_abi::fixed_size<_Np>> +template +class basic_simd : public __simd_operations<_Tp, _Abi> { - using _Impl = __simd_operations<_Tp, simd_abi::fixed_size<_Np>>; + static_assert(is_abi_tag_v<_Abi>, "basic_simd requires a valid ABI tag"); + + using _Impl = __simd_operations<_Tp, _Abi>; using _Storage = typename _Impl::_SimdStorage; _Storage __s_; + template + static constexpr bool __is_value_preserving_broadcast = + (__is_vectorizable_v<::cuda::std::remove_cvref_t<_Up>> && __is_non_narrowing_convertible_v<_Up, _Tp>) + || (!__is_vectorizable_v<::cuda::std::remove_cvref_t<_Up>> && ::cuda::std::is_convertible_v<_Up, _Tp>); + public: using value_type = _Tp; using reference = __simd_reference<_Tp, _Storage, value_type>; - using mask_type = simd_mask<_Tp, _Np>; - using abi_type = simd_abi::fixed_size<_Np>; + using abi_type = _Abi; + using mask_type = basic_simd_mask; 
[[nodiscard]] _CCCL_API static constexpr ::cuda::std::size_t size() noexcept { return simd_size_v; } - _CCCL_API simd() noexcept = default; + _CCCL_API basic_simd() noexcept = default; struct __storage_tag_t {}; @@ -68,20 +74,26 @@ class simd : public __simd_operations<_Tp, simd_abi::fixed_size<_Np>> return __s_; } - _CCCL_API explicit simd(const _Storage& __s, __storage_tag_t) + _CCCL_API explicit basic_simd(const _Storage& __s, __storage_tag_t) : __s_(__s) {} _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(__can_broadcast_v>) - _CCCL_API simd(_Up&& __v) noexcept - : __s_(_Impl::__broadcast(static_cast(::cuda::std::forward<_Up>(__v)))) + _CCCL_REQUIRES(__can_broadcast_v>&& __is_value_preserving_broadcast<_Up>) + _CCCL_API constexpr basic_simd(_Up&& __v) noexcept + : __s_(_Impl::__broadcast(static_cast(__v))) + {} + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(__can_broadcast_v> + && !__is_value_preserving_broadcast<_Up>) + _CCCL_API constexpr explicit basic_simd(_Up&& __v) noexcept + : __s_(_Impl::__broadcast(static_cast(__v))) {} _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(!::cuda::std::is_same_v<_Up, _Tp> && ::cuda::std::is_same_v> - && __is_non_narrowing_convertible_v<_Up, value_type>) - _CCCL_API simd(const simd<_Up, size()>& __v) noexcept + _CCCL_REQUIRES(!::cuda::std::is_same_v<_Up, _Tp> && __is_non_narrowing_convertible_v<_Up, value_type>) + _CCCL_API basic_simd(const basic_simd<_Up, abi_type>& __v) noexcept { for (::cuda::std::size_t __i = 0; __i < size(); __i++) { @@ -91,29 +103,29 @@ class simd : public __simd_operations<_Tp, simd_abi::fixed_size<_Np>> _CCCL_TEMPLATE(typename _Generator) _CCCL_REQUIRES(__can_generate_v) - _CCCL_API explicit simd(_Generator&& __g) noexcept - : __s_(_Impl::__generate(::cuda::std::forward<_Generator>(__g))) + _CCCL_API explicit basic_simd(_Generator&& __g) + : __s_(_Impl::__generate(__g)) {} - _CCCL_TEMPLATE(typename _Up, typename _Flags) + _CCCL_TEMPLATE(typename _Up, typename _Flags = element_aligned_tag) 
_CCCL_REQUIRES(__is_vectorizable_v<_Up>&& is_simd_flag_type_v<_Flags>) - _CCCL_API simd(const _Up* __mem, _Flags) noexcept + _CCCL_API basic_simd(const _Up* __mem, _Flags = {}) noexcept { - _Impl::__load(__s_, _Flags::template __apply(__mem)); + _Impl::__load(__s_, _Flags::template __apply(__mem)); } - _CCCL_TEMPLATE(typename _Up, typename _Flags) + _CCCL_TEMPLATE(typename _Up, typename _Flags = element_aligned_tag) _CCCL_REQUIRES(__is_vectorizable_v<_Up>&& is_simd_flag_type_v<_Flags>) - _CCCL_API void copy_from(const _Up* __mem, _Flags) noexcept + _CCCL_API void copy_from(const _Up* __mem, _Flags = {}) noexcept { - _Impl::__load(__s_, _Flags::template __apply(__mem)); + _Impl::__load(__s_, _Flags::template __apply(__mem)); } - _CCCL_TEMPLATE(typename _Up, typename _Flags) + _CCCL_TEMPLATE(typename _Up, typename _Flags = element_aligned_tag) _CCCL_REQUIRES(__is_vectorizable_v<_Up>&& is_simd_flag_type_v<_Flags>) - _CCCL_API void copy_to(_Up* __mem, _Flags) const noexcept + _CCCL_API void copy_to(_Up* __mem, _Flags = {}) const noexcept { - _Impl::__store(__s_, _Flags::template __apply(__mem)); + _Impl::__store(__s_, _Flags::template __apply(__mem)); } _CCCL_API reference operator[](::cuda::std::size_t __i) noexcept @@ -126,206 +138,229 @@ class simd : public __simd_operations<_Tp, simd_abi::fixed_size<_Np>> return __s_.__get(__i); } - _CCCL_API simd& operator++() noexcept + _CCCL_API basic_simd& operator++() noexcept { _Impl::__increment(__s_); return *this; } - _CCCL_API simd operator++(int) noexcept + _CCCL_API basic_simd operator++(int) noexcept { - const simd __r = *this; + const basic_simd __r = *this; _Impl::__increment(__s_); return __r; } - _CCCL_API simd& operator--() noexcept + _CCCL_API basic_simd& operator--() noexcept { _Impl::__decrement(__s_); return *this; } - _CCCL_API simd operator--(int) noexcept + _CCCL_API basic_simd operator--(int) noexcept { - const simd __r = *this; + const basic_simd __r = *this; _Impl::__decrement(__s_); return __r; } - 
[[nodiscard]] _CCCL_API simd operator+() const noexcept + [[nodiscard]] _CCCL_API basic_simd operator+() const noexcept { return *this; } - [[nodiscard]] _CCCL_API simd operator-() const noexcept + [[nodiscard]] _CCCL_API basic_simd operator-() const noexcept { return {_Impl::__unary_minus(__s_), __storage_tag}; } - _CCCL_API constexpr friend simd& operator+=(simd& __lhs, const simd& __rhs) noexcept + [[nodiscard]] _CCCL_API friend constexpr mask_type operator!(const basic_simd& __v) noexcept + { + return {_Impl::__negate(__v.__s_), mask_type::__storage_tag}; + } + + _CCCL_API constexpr friend basic_simd& operator+=(basic_simd& __lhs, const basic_simd& __rhs) noexcept { return __lhs = __lhs + __rhs; } - _CCCL_API constexpr friend simd& operator-=(simd& __lhs, const simd& __rhs) noexcept + _CCCL_API constexpr friend basic_simd& operator-=(basic_simd& __lhs, const basic_simd& __rhs) noexcept { return __lhs = __lhs - __rhs; } - _CCCL_API constexpr friend simd& operator*=(simd& __lhs, const simd& __rhs) noexcept + _CCCL_API constexpr friend basic_simd& operator*=(basic_simd& __lhs, const basic_simd& __rhs) noexcept { return __lhs = __lhs * __rhs; } - _CCCL_API constexpr friend simd& operator/=(simd& __lhs, const simd& __rhs) noexcept + _CCCL_API constexpr friend basic_simd& operator/=(basic_simd& __lhs, const basic_simd& __rhs) noexcept { return __lhs = __lhs / __rhs; } - _CCCL_TEMPLATE(typename _Up) + _CCCL_TEMPLATE(typename _Up = _Tp) _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) - _CCCL_API constexpr friend simd& operator%=(simd& __lhs, const simd& __rhs) noexcept + _CCCL_API constexpr friend basic_simd& operator%=(basic_simd& __lhs, const basic_simd& __rhs) noexcept { return __lhs = __lhs % __rhs; } - _CCCL_TEMPLATE(typename _Up) + _CCCL_TEMPLATE(typename _Up = _Tp) _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) - _CCCL_API constexpr friend simd& operator&=(simd& __lhs, const simd& __rhs) noexcept + _CCCL_API constexpr friend basic_simd& operator&=(basic_simd& 
__lhs, const basic_simd& __rhs) noexcept { return __lhs = __lhs & __rhs; } - _CCCL_TEMPLATE(typename _Up) + _CCCL_TEMPLATE(typename _Up = _Tp) _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) - _CCCL_API constexpr friend simd& operator|=(simd& __lhs, const simd& __rhs) noexcept + _CCCL_API constexpr friend basic_simd& operator|=(basic_simd& __lhs, const basic_simd& __rhs) noexcept { return __lhs = __lhs | __rhs; } - _CCCL_TEMPLATE(typename _Up) + _CCCL_TEMPLATE(typename _Up = _Tp) _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) - _CCCL_API constexpr friend simd& operator^=(simd& __lhs, const simd& __rhs) noexcept + _CCCL_API constexpr friend basic_simd& operator^=(basic_simd& __lhs, const basic_simd& __rhs) noexcept { return __lhs = __lhs ^ __rhs; } - _CCCL_TEMPLATE(typename _Up) + _CCCL_TEMPLATE(typename _Up = _Tp) _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) - _CCCL_API constexpr friend simd& operator<<=(simd& __lhs, const simd& __rhs) noexcept + _CCCL_API constexpr friend basic_simd& operator<<=(basic_simd& __lhs, const basic_simd& __rhs) noexcept { return __lhs = __lhs << __rhs; } - _CCCL_TEMPLATE(typename _Up) + _CCCL_TEMPLATE(typename _Up = _Tp) _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) - _CCCL_API constexpr friend simd& operator>>=(simd& __lhs, const simd& __rhs) noexcept + _CCCL_API constexpr friend basic_simd& operator>>=(basic_simd& __lhs, const basic_simd& __rhs) noexcept { return __lhs = __lhs >> __rhs; } - [[nodiscard]] _CCCL_API constexpr friend simd operator+(const simd& __lhs, const simd& __rhs) noexcept + [[nodiscard]] _CCCL_API constexpr friend basic_simd + operator+(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { return {_Impl::__plus(__lhs.__s_, __rhs.__s_), __storage_tag}; } - [[nodiscard]] _CCCL_API constexpr friend simd operator-(const simd& __lhs, const simd& __rhs) noexcept + [[nodiscard]] _CCCL_API constexpr friend basic_simd + operator-(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { return 
{_Impl::__minus(__lhs.__s_, __rhs.__s_), __storage_tag}; } - [[nodiscard]] _CCCL_API constexpr friend simd operator*(const simd& __lhs, const simd& __rhs) noexcept + [[nodiscard]] _CCCL_API constexpr friend basic_simd + operator*(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { return {_Impl::__multiplies(__lhs.__s_, __rhs.__s_), __storage_tag}; } - [[nodiscard]] _CCCL_API constexpr friend simd operator/(const simd& __lhs, const simd& __rhs) noexcept + [[nodiscard]] _CCCL_API constexpr friend basic_simd + operator/(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { return {_Impl::__divides(__lhs.__s_, __rhs.__s_), __storage_tag}; } - _CCCL_TEMPLATE(typename _Up) + _CCCL_TEMPLATE(typename _Up = _Tp) _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) - [[nodiscard]] _CCCL_API constexpr friend simd operator%(const simd& __lhs, const simd& __rhs) noexcept + [[nodiscard]] _CCCL_API constexpr friend basic_simd + operator%(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { return {_Impl::__modulo(__lhs.__s_, __rhs.__s_), __storage_tag}; } - _CCCL_TEMPLATE(typename _Up) + _CCCL_TEMPLATE(typename _Up = _Tp) _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) - [[nodiscard]] _CCCL_API constexpr friend simd operator&(const simd& __lhs, const simd& __rhs) noexcept + [[nodiscard]] _CCCL_API constexpr friend basic_simd + operator&(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { return {_Impl::__bitwise_and(__lhs.__s_, __rhs.__s_), __storage_tag}; } - _CCCL_TEMPLATE(typename _Up) + _CCCL_TEMPLATE(typename _Up = _Tp) _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) - [[nodiscard]] _CCCL_API constexpr friend simd operator|(const simd& __lhs, const simd& __rhs) noexcept + [[nodiscard]] _CCCL_API constexpr friend basic_simd + operator|(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { return {_Impl::__bitwise_or(__lhs.__s_, __rhs.__s_), __storage_tag}; } - _CCCL_TEMPLATE(typename _Up) + _CCCL_TEMPLATE(typename _Up = _Tp) 
_CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) - [[nodiscard]] _CCCL_API constexpr friend simd operator^(const simd& __lhs, const simd& __rhs) noexcept + [[nodiscard]] _CCCL_API constexpr friend basic_simd + operator^(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { return {_Impl::__bitwise_xor(__lhs.__s_, __rhs.__s_), __storage_tag}; } - _CCCL_TEMPLATE(typename _Up) + _CCCL_TEMPLATE(typename _Up = _Tp) _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) - [[nodiscard]] _CCCL_API constexpr friend simd operator<<(const simd& __lhs, const simd& __rhs) noexcept + [[nodiscard]] _CCCL_API constexpr friend basic_simd + operator<<(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { return {_Impl::__shift_left(__lhs.__s_, __rhs.__s_), __storage_tag}; } - _CCCL_TEMPLATE(typename _Up) + _CCCL_TEMPLATE(typename _Up = _Tp) _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) - [[nodiscard]] _CCCL_API constexpr friend simd operator>>(const simd& __lhs, const simd& __rhs) noexcept + [[nodiscard]] _CCCL_API constexpr friend basic_simd + operator>>(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { return {_Impl::__shift_right(__lhs.__s_, __rhs.__s_), __storage_tag}; } - _CCCL_TEMPLATE(typename _Up) + _CCCL_TEMPLATE(typename _Up = _Tp) _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) - [[nodiscard]] _CCCL_API constexpr simd operator~() const noexcept + [[nodiscard]] _CCCL_API constexpr basic_simd operator~() const noexcept { return {_Impl::__bitwise_not(__s_), __storage_tag}; } - [[nodiscard]] _CCCL_API constexpr friend mask_type operator==(const simd& __lhs, const simd& __rhs) noexcept + [[nodiscard]] _CCCL_API constexpr friend mask_type + operator==(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { return {_Impl::__equal_to(__lhs.__s_, __rhs.__s_), __storage_tag}; } - [[nodiscard]] _CCCL_API constexpr friend mask_type operator!=(const simd& __lhs, const simd& __rhs) noexcept + [[nodiscard]] _CCCL_API constexpr friend mask_type + operator!=(const 
basic_simd& __lhs, const basic_simd& __rhs) noexcept { return {_Impl::__not_equal_to(__lhs.__s_, __rhs.__s_), __storage_tag}; } - [[nodiscard]] _CCCL_API constexpr friend mask_type operator<(const simd& __lhs, const simd& __rhs) noexcept + [[nodiscard]] _CCCL_API constexpr friend mask_type operator<(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { return {_Impl::__less(__lhs.__s_, __rhs.__s_), __storage_tag}; } - [[nodiscard]] _CCCL_API constexpr friend mask_type operator<=(const simd& __lhs, const simd& __rhs) noexcept + [[nodiscard]] _CCCL_API constexpr friend mask_type + operator<=(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { return {_Impl::__less_equal(__lhs.__s_, __rhs.__s_), __storage_tag}; } - [[nodiscard]] _CCCL_API constexpr friend mask_type operator>(const simd& __lhs, const simd& __rhs) noexcept + [[nodiscard]] _CCCL_API constexpr friend mask_type operator>(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { return {_Impl::__greater(__lhs.__s_, __rhs.__s_), __storage_tag}; } - [[nodiscard]] _CCCL_API constexpr friend mask_type operator>=(const simd& __lhs, const simd& __rhs) noexcept + [[nodiscard]] _CCCL_API constexpr friend mask_type + operator>=(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { return {_Impl::__greater_equal(__lhs.__s_, __rhs.__s_), __storage_tag}; } }; -template -inline constexpr bool is_simd_v> = true; +template +class simd : public basic_simd<_Tp, simd_abi::fixed_size<_Np>> +{ +public: + using basic_simd<_Tp, simd_abi::fixed_size<_Np>>::basic_simd; +}; // Note: native_simd would require platform-specific ABI specializations // For now, use fixed_size_simd directly or specialize with a known size diff --git a/cudax/include/cuda/experimental/__simd/simd_mask.h b/cudax/include/cuda/experimental/__simd/simd_mask.h index 0a0e652e4b9..c4ac502365f 100644 --- a/cudax/include/cuda/experimental/__simd/simd_mask.h +++ b/cudax/include/cuda/experimental/__simd/simd_mask.h @@ -33,10 +33,12 @@ 
namespace cuda::experimental::datapar { -template -class simd_mask : public __mask_operations<_Tp, simd_abi::fixed_size<_Np>> +template +class basic_simd_mask : public __mask_operations<_Tp, _Abi> { - using _Impl = __mask_operations<_Tp, simd_abi::fixed_size<_Np>>; + static_assert(is_abi_tag_v<_Abi>, "basic_simd_mask requires a valid ABI tag"); + + using _Impl = __mask_operations<_Tp, _Abi>; using _Storage = typename _Impl::_MaskStorage; _Storage __s_; @@ -44,14 +46,14 @@ class simd_mask : public __mask_operations<_Tp, simd_abi::fixed_size<_Np>> public: using value_type = bool; using reference = __simd_reference<_Tp, _Storage, bool>; - using abi_type = simd_abi::fixed_size<_Np>; + using abi_type = _Abi; [[nodiscard]] _CCCL_API static constexpr ::cuda::std::size_t size() noexcept { - return _Np; + return simd_size_v<_Tp, abi_type>; } - _CCCL_API simd_mask() noexcept = default; + _CCCL_API basic_simd_mask() noexcept = default; struct __storage_tag_t {}; @@ -62,13 +64,13 @@ class simd_mask : public __mask_operations<_Tp, simd_abi::fixed_size<_Np>> return __s_; } - _CCCL_API simd_mask(const _Storage& __s, __storage_tag_t) noexcept + _CCCL_API basic_simd_mask(const _Storage& __s, __storage_tag_t) noexcept : __s_(__s) {} _CCCL_TEMPLATE(typename _Up) _CCCL_REQUIRES(::cuda::std::is_same_v<_Up, bool>) - _CCCL_API simd_mask(_Up __v) noexcept + _CCCL_API basic_simd_mask(_Up __v) noexcept : __s_(_Impl::__broadcast(__v)) {} @@ -83,49 +85,54 @@ class simd_mask : public __mask_operations<_Tp, simd_abi::fixed_size<_Np>> } // Bitwise operations - [[nodiscard]] _CCCL_API constexpr friend simd_mask operator&(const simd_mask& __lhs, const simd_mask& __rhs) noexcept + [[nodiscard]] _CCCL_API constexpr friend basic_simd_mask + operator&(const basic_simd_mask& __lhs, const basic_simd_mask& __rhs) noexcept { return {_Impl::__bitwise_and(__lhs.__s_, __rhs.__s_), __storage_tag}; } - [[nodiscard]] _CCCL_API constexpr friend simd_mask operator|(const simd_mask& __lhs, const simd_mask& __rhs) 
noexcept + [[nodiscard]] _CCCL_API constexpr friend basic_simd_mask + operator|(const basic_simd_mask& __lhs, const basic_simd_mask& __rhs) noexcept { return {_Impl::__bitwise_or(__lhs.__s_, __rhs.__s_), __storage_tag}; } - [[nodiscard]] _CCCL_API constexpr friend simd_mask operator^(const simd_mask& __lhs, const simd_mask& __rhs) noexcept + [[nodiscard]] _CCCL_API constexpr friend basic_simd_mask + operator^(const basic_simd_mask& __lhs, const basic_simd_mask& __rhs) noexcept { return {_Impl::__bitwise_xor(__lhs.__s_, __rhs.__s_), __storage_tag}; } - [[nodiscard]] _CCCL_API constexpr simd_mask operator!() const noexcept + [[nodiscard]] _CCCL_API constexpr basic_simd_mask operator!() const noexcept { return {_Impl::__bitwise_not(__s_), __storage_tag}; } - _CCCL_API simd_mask& operator&=(const simd_mask& __rhs) noexcept + _CCCL_API basic_simd_mask& operator&=(const basic_simd_mask& __rhs) noexcept { return *this = *this & __rhs; } - _CCCL_API simd_mask& operator|=(const simd_mask& __rhs) noexcept + _CCCL_API basic_simd_mask& operator|=(const basic_simd_mask& __rhs) noexcept { return *this = *this | __rhs; } - _CCCL_API simd_mask& operator^=(const simd_mask& __rhs) noexcept + _CCCL_API basic_simd_mask& operator^=(const basic_simd_mask& __rhs) noexcept { return *this = *this ^ __rhs; } // Comparison operations - [[nodiscard]] _CCCL_API constexpr friend bool operator==(const simd_mask& __lhs, const simd_mask& __rhs) noexcept + [[nodiscard]] _CCCL_API constexpr friend bool + operator==(const basic_simd_mask& __lhs, const basic_simd_mask& __rhs) noexcept { return _Impl::__equal_to(__lhs.__s_, __rhs.__s_); } #if _CCCL_STD_VER < 2020 - [[nodiscard]] _CCCL_API constexpr friend bool operator!=(const simd_mask& __lhs, const simd_mask& __rhs) noexcept + [[nodiscard]] _CCCL_API constexpr friend bool + operator!=(const basic_simd_mask& __lhs, const basic_simd_mask& __rhs) noexcept { return !(__lhs == __rhs); } @@ -152,8 +159,12 @@ class simd_mask : public __mask_operations<_Tp, 
simd_abi::fixed_size<_Np>> } }; -template -inline constexpr bool is_simd_v> = true; +template +class simd_mask : public basic_simd_mask<_Tp, simd_abi::fixed_size<_Np>> +{ +public: + using basic_simd_mask<_Tp, simd_abi::fixed_size<_Np>>::basic_simd_mask; +}; template using fixed_size_simd_mask = simd_mask<_Tp, _Np>; diff --git a/cudax/include/cuda/experimental/__simd/traits.h b/cudax/include/cuda/experimental/__simd/traits.h index 3da28d9831a..fc02d21c302 100644 --- a/cudax/include/cuda/experimental/__simd/traits.h +++ b/cudax/include/cuda/experimental/__simd/traits.h @@ -31,6 +31,42 @@ namespace cuda::experimental::datapar { +struct element_aligned_tag +{ + template + _CCCL_HIDE_FROM_ABI static constexpr _Ptr* __apply(_Ptr* __ptr) noexcept + { + return __ptr; + } +}; + +struct vector_aligned_tag +{ + template + _CCCL_HIDE_FROM_ABI static constexpr _Ptr* __apply(_Ptr* __ptr) noexcept + { + return __ptr; + } +}; + +template <::cuda::std::size_t _Alignment> +struct overaligned_tag +{ + template + _CCCL_HIDE_FROM_ABI static constexpr _Ptr* __apply(_Ptr* __ptr) noexcept + { + _CCCL_ASSERT(reinterpret_cast<::cuda::std::uintptr_t>(__ptr) % _Alignment == 0, + "Pointer does not satisfy overaligned_tag alignment requirement"); + return __ptr; + } +}; + +inline constexpr element_aligned_tag element_aligned{}; +inline constexpr vector_aligned_tag vector_aligned{}; + +template <::cuda::std::size_t _Alignment> +inline constexpr overaligned_tag<_Alignment> overaligned{}; + template inline constexpr bool is_abi_tag_v = false; @@ -55,18 +91,33 @@ template struct is_simd_flag_type : ::cuda::std::bool_constant> {}; -template , - bool = (__is_vectorizable_v<_Tp> && is_abi_tag_v<_Abi>)> +template , bool = (__is_vectorizable_v<_Tp> && is_abi_tag_v<_Abi>)> struct simd_size : ::cuda::std::integral_constant<::cuda::std::size_t, _Abi::__simd_size> {}; template struct simd_size<_Tp, _Abi, false> -{}; +{ + static constexpr ::cuda::std::size_t value = 0; +}; -template > +template > inline 
constexpr ::cuda::std::size_t simd_size_v = simd_size<_Tp, _Abi>::value; + +template +inline constexpr bool is_simd_v> = true; + +template +inline constexpr bool is_simd_v> = true; + +template <> +inline constexpr bool is_simd_flag_type_v = true; + +template <> +inline constexpr bool is_simd_flag_type_v = true; + +template <::cuda::std::size_t _Alignment> +inline constexpr bool is_simd_flag_type_v> = true; } // namespace cuda::experimental::datapar #include diff --git a/cudax/include/cuda/experimental/__simd/utility.h b/cudax/include/cuda/experimental/__simd/utility.h index c6d10e60d4f..85f2f9131e1 100644 --- a/cudax/include/cuda/experimental/__simd/utility.h +++ b/cudax/include/cuda/experimental/__simd/utility.h @@ -39,12 +39,14 @@ namespace cuda::experimental::datapar { -template -[[nodiscard]] _CCCL_API constexpr auto __set_all_bits(bool __v) noexcept +template +[[nodiscard]] _CCCL_API constexpr typename _Storage::value_type __mask_bits_from_bool(bool __v) noexcept { - static_assert(::cuda::std::__cccl_is_unsigned_integer_v<_Tp>, "set_all_bits() requires unsigned integer types"); - using _Up = ::cuda::std::__make_nbit_uint_t<::cuda::std::__num_bits_v<_Tp>>; - return __v ? (::cuda::std::numeric_limits<_Up>::max()) : 0; + using _MaskValueType = typename _Storage::value_type; + static_assert(::cuda::std::__cccl_is_unsigned_integer_v<_MaskValueType>, + "__mask_bits_from_bool requires unsigned integer storage"); + using _Up = ::cuda::std::__make_nbit_uint_t<::cuda::std::__num_bits_v<_MaskValueType>>; + return __v ? 
(::cuda::std::numeric_limits<_Up>::max()) : _MaskValueType{0}; } template From d1d7bece39f796431c6c51df3c8728b4681619c8 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 20 Nov 2025 16:46:57 -0800 Subject: [PATCH 07/32] reduce redundancy --- .../experimental/__simd/fixed_size_impl.h | 287 ++++-------------- cudax/include/cuda/experimental/__simd/simd.h | 245 ++++++++++++++- .../cuda/experimental/__simd/simd_mask.h | 21 +- 3 files changed, 312 insertions(+), 241 deletions(-) diff --git a/cudax/include/cuda/experimental/__simd/fixed_size_impl.h b/cudax/include/cuda/experimental/__simd/fixed_size_impl.h index fe04e30867a..991179fdd0d 100644 --- a/cudax/include/cuda/experimental/__simd/fixed_size_impl.h +++ b/cudax/include/cuda/experimental/__simd/fixed_size_impl.h @@ -74,6 +74,39 @@ struct __mask_storage<_Tp, simd_abi::__fixed_size<_Np>> using value_type = ::cuda::std::__make_nbit_uint_t<::cuda::std::__num_bits_v<_Tp>>; }; +// Helper macros to generate repeated fixed-size operations. +#define _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_StorageType, _Name, _Op) \ + [[nodiscard]] _CCCL_API static constexpr _StorageType _Name( \ + const _StorageType& __lhs, const _StorageType& __rhs) noexcept \ + { \ + _StorageType __result; \ + _CCCL_PRAGMA_UNROLL_FULL() \ + for (int __i = 0; __i < _Np; ++__i) \ + { \ + __result.__data[__i] = (__lhs.__data[__i] _Op __rhs.__data[__i]); \ + } \ + return __result; \ + } + +#define _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP(_Name, _Op) \ + [[nodiscard]] _CCCL_API static constexpr _MaskStorage _Name( \ + const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept \ + { \ + _MaskStorage __result; \ + _CCCL_PRAGMA_UNROLL_FULL() \ + for (int __i = 0; __i < _Np; ++__i) \ + { \ + __result.__data[__i] = \ + ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>((__lhs.__data[__i] _Op __rhs.__data[__i])); \ + } \ + return __result; \ + } + +#define _CUDAX_SIMD_FIXED_SIZE_BITWISE_OP(_StorageType, _Name, _Op) \ + _CCCL_TEMPLATE(typename _Up = 
_Tp) \ + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) \ + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_StorageType, _Name, _Op) + // ********************************************************************************************************************* // * SIMD Arithmetic Operations // ********************************************************************************************************************* @@ -81,8 +114,8 @@ struct __mask_storage<_Tp, simd_abi::__fixed_size<_Np>> template struct __simd_operations<_Tp, simd_abi::__fixed_size<_Np>> { - using _SimdStorage _CCCL_NODEBUG = __simd_storage<_Tp, simd_abi::__fixed_size<_Np>>; - using _MaskStorage _CCCL_NODEBUG = __mask_storage<_Tp, simd_abi::__fixed_size<_Np>>; + using _SimdStorage = __simd_storage<_Tp, simd_abi::__fixed_size<_Np>>; + using _MaskStorage = __mask_storage<_Tp, simd_abi::__fixed_size<_Np>>; [[nodiscard]] _CCCL_API static constexpr _SimdStorage __broadcast(_Tp __v) noexcept { @@ -179,215 +212,37 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size<_Np>> return __result; } - [[nodiscard]] _CCCL_API static constexpr _SimdStorage - __plus(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept - { - _SimdStorage __result; - _CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; __i++) - { - __result.__data[__i] = __lhs.__data[__i] + __rhs.__data[__i]; - } - return __result; - } + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_SimdStorage, __plus, +) - [[nodiscard]] _CCCL_API static constexpr _SimdStorage - __minus(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept - { - _SimdStorage __result; - _CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; __i++) - { - __result.__data[__i] = __lhs.__data[__i] - __rhs.__data[__i]; - } - return __result; - } + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_SimdStorage, __minus, -) - [[nodiscard]] _CCCL_API static constexpr _SimdStorage - __multiplies(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept - { - _SimdStorage __result; - 
_CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; __i++) - { - __result.__data[__i] = __lhs.__data[__i] * __rhs.__data[__i]; - } - return __result; - } + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_SimdStorage, __multiplies, *) - [[nodiscard]] _CCCL_API static constexpr _SimdStorage - __divides(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept - { - _SimdStorage __result; - _CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; __i++) - { - __result.__data[__i] = __lhs.__data[__i] / __rhs.__data[__i]; - } - return __result; - } + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_SimdStorage, __divides, /) - [[nodiscard]] _CCCL_API static constexpr _MaskStorage - __equal_to(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept - { - _MaskStorage __result; - _CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; __i++) - { - __result.__data[__i] = - ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(__lhs.__data[__i] == __rhs.__data[__i]); - } - return __result; - } + _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP(__equal_to, ==) - [[nodiscard]] _CCCL_API static constexpr _MaskStorage - __not_equal_to(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept - { - _MaskStorage __result; - _CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; __i++) - { - __result.__data[__i] = - ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(__lhs.__data[__i] != __rhs.__data[__i]); - } - return __result; - } + _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP(__not_equal_to, !=) - [[nodiscard]] _CCCL_API static constexpr _MaskStorage - __less(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept - { - _MaskStorage __result; - _CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; __i++) - { - __result.__data[__i] = - ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(__lhs.__data[__i] < __rhs.__data[__i]); - } - return __result; - } + _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP(__less, <) - [[nodiscard]] _CCCL_API 
static constexpr _MaskStorage - __less_equal(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept - { - _MaskStorage __result; - _CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; __i++) - { - __result.__data[__i] = - ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(__lhs.__data[__i] <= __rhs.__data[__i]); - } - return __result; - } + _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP(__less_equal, <=) - [[nodiscard]] _CCCL_API static constexpr _MaskStorage - __greater(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept - { - _MaskStorage __result; - _CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; __i++) - { - __result.__data[__i] = - ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(__lhs.__data[__i] > __rhs.__data[__i]); - } - return __result; - } + _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP(__greater, >) - [[nodiscard]] _CCCL_API static constexpr _MaskStorage - __greater_equal(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept - { - _MaskStorage __result; - _CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; __i++) - { - __result.__data[__i] = - ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(__lhs.__data[__i] >= __rhs.__data[__i]); - } - return __result; - } + _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP(__greater_equal, >=) - _CCCL_TEMPLATE(typename _Up = _Tp) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) - [[nodiscard]] _CCCL_API static constexpr _SimdStorage - __modulo(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept - { - _SimdStorage __result; - _CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; __i++) - { - __result.__data[__i] = __lhs.__data[__i] % __rhs.__data[__i]; - } - return __result; - } + _CUDAX_SIMD_FIXED_SIZE_BITWISE_OP(_SimdStorage, __modulo, %) - _CCCL_TEMPLATE(typename _Up = _Tp) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) - [[nodiscard]] _CCCL_API static constexpr _SimdStorage - __bitwise_and(const _SimdStorage& __lhs, const 
_SimdStorage& __rhs) noexcept - { - _SimdStorage __result; - _CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; __i++) - { - __result.__data[__i] = __lhs.__data[__i] & __rhs.__data[__i]; - } - return __result; - } + _CUDAX_SIMD_FIXED_SIZE_BITWISE_OP(_SimdStorage, __bitwise_and, &) - _CCCL_TEMPLATE(typename _Up = _Tp) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) - [[nodiscard]] _CCCL_API static constexpr _SimdStorage - __bitwise_or(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept - { - _SimdStorage __result; - _CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; __i++) - { - __result.__data[__i] = __lhs.__data[__i] | __rhs.__data[__i]; - } - return __result; - } + _CUDAX_SIMD_FIXED_SIZE_BITWISE_OP(_SimdStorage, __bitwise_or, |) - _CCCL_TEMPLATE(typename _Up = _Tp) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) - [[nodiscard]] _CCCL_API static constexpr _SimdStorage - __bitwise_xor(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept - { - _SimdStorage __result; - _CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; __i++) - { - __result.__data[__i] = __lhs.__data[__i] ^ __rhs.__data[__i]; - } - return __result; - } + _CUDAX_SIMD_FIXED_SIZE_BITWISE_OP(_SimdStorage, __bitwise_xor, ^) - _CCCL_TEMPLATE(typename _Up = _Tp) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) - [[nodiscard]] _CCCL_API static constexpr _SimdStorage - __shift_left(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept - { - _SimdStorage __result; - _CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; __i++) - { - __result.__data[__i] = __lhs.__data[__i] << __rhs.__data[__i]; - } - return __result; - } + _CUDAX_SIMD_FIXED_SIZE_BITWISE_OP(_SimdStorage, __shift_left, <<) - _CCCL_TEMPLATE(typename _Up = _Tp) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) - [[nodiscard]] _CCCL_API static constexpr _SimdStorage - __shift_right(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept - { - _SimdStorage __result; - 
_CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; __i++) - { - __result.__data[__i] = __lhs.__data[__i] >> __rhs.__data[__i]; - } - return __result; - } + _CUDAX_SIMD_FIXED_SIZE_BITWISE_OP(_SimdStorage, __shift_right, >>) }; // ********************************************************************************************************************* @@ -429,41 +284,11 @@ struct __mask_operations<_Tp, simd_abi::__fixed_size<_Np>> } } - [[nodiscard]] _CCCL_API static constexpr _MaskStorage - __bitwise_and(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept - { - _MaskStorage __result; - _CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; __i++) - { - __result.__data[__i] = __lhs.__data[__i] & __rhs.__data[__i]; - } - return __result; - } + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_MaskStorage, __bitwise_and, &) - [[nodiscard]] _CCCL_API static constexpr _MaskStorage - __bitwise_or(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept - { - _MaskStorage __result; - _CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; __i++) - { - __result.__data[__i] = __lhs.__data[__i] | __rhs.__data[__i]; - } - return __result; - } + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_MaskStorage, __bitwise_or, |) - [[nodiscard]] _CCCL_API static constexpr _MaskStorage - __bitwise_xor(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept - { - _MaskStorage __result; - _CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; __i++) - { - __result.__data[__i] = __lhs.__data[__i] ^ __rhs.__data[__i]; - } - return __result; - } + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_MaskStorage, __bitwise_xor, ^) [[nodiscard]] _CCCL_API static constexpr _MaskStorage __bitwise_not(const _MaskStorage& __s) noexcept { @@ -529,6 +354,10 @@ struct __mask_operations<_Tp, simd_abi::__fixed_size<_Np>> return __cnt; } }; + +#undef _CUDAX_SIMD_FIXED_SIZE_BITWISE_OP +#undef _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP +#undef _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP } // namespace 
cuda::experimental::datapar #include diff --git a/cudax/include/cuda/experimental/__simd/simd.h b/cudax/include/cuda/experimental/__simd/simd.h index d9b04ae436b..e95cef5cfe4 100644 --- a/cudax/include/cuda/experimental/__simd/simd.h +++ b/cudax/include/cuda/experimental/__simd/simd.h @@ -58,6 +58,19 @@ class basic_simd : public __simd_operations<_Tp, _Abi> using abi_type = _Abi; using mask_type = basic_simd_mask; + _CCCL_TEMPLATE(typename _Up, typename _Ap) + _CCCL_REQUIRES(::cuda::std::is_same_v::abi_type, abi_type>&& ::cuda::std:: + is_same_v::value_type, value_type>) + _CCCL_API explicit operator basic_simd_mask<_Up, _Ap>() const noexcept + { + basic_simd_mask<_Up, _Ap> __result; + for (::cuda::std::size_t __i = 0; __i < size(); ++__i) + { + __result[__i] = static_cast((*this)[__i]); + } + return __result; + } + [[nodiscard]] _CCCL_API static constexpr ::cuda::std::size_t size() noexcept { return simd_size_v; @@ -174,11 +187,6 @@ class basic_simd : public __simd_operations<_Tp, _Abi> return {_Impl::__unary_minus(__s_), __storage_tag}; } - [[nodiscard]] _CCCL_API friend constexpr mask_type operator!(const basic_simd& __v) noexcept - { - return {_Impl::__negate(__v.__s_), mask_type::__storage_tag}; - } - _CCCL_API constexpr friend basic_simd& operator+=(basic_simd& __lhs, const basic_simd& __rhs) noexcept { return __lhs = __lhs + __rhs; @@ -247,24 +255,80 @@ class basic_simd : public __simd_operations<_Tp, _Abi> return {_Impl::__plus(__lhs.__s_, __rhs.__s_), __storage_tag}; } + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(__can_broadcast_v) + [[nodiscard]] _CCCL_API constexpr friend basic_simd operator+(const basic_simd& __lhs, _Up&& __rhs) noexcept + { + return __lhs + basic_simd(static_cast(__rhs)); + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(__can_broadcast_v) + [[nodiscard]] _CCCL_API constexpr friend basic_simd operator+(_Up&& __lhs, const basic_simd& __rhs) noexcept + { + return basic_simd(static_cast(__lhs)) + __rhs; + } + [[nodiscard]] 
_CCCL_API constexpr friend basic_simd operator-(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { return {_Impl::__minus(__lhs.__s_, __rhs.__s_), __storage_tag}; } + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(__can_broadcast_v) + [[nodiscard]] _CCCL_API constexpr friend basic_simd operator-(const basic_simd& __lhs, _Up&& __rhs) noexcept + { + return __lhs - basic_simd(static_cast(__rhs)); + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(__can_broadcast_v) + [[nodiscard]] _CCCL_API constexpr friend basic_simd operator-(_Up&& __lhs, const basic_simd& __rhs) noexcept + { + return basic_simd(static_cast(__lhs)) - __rhs; + } + [[nodiscard]] _CCCL_API constexpr friend basic_simd operator*(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { return {_Impl::__multiplies(__lhs.__s_, __rhs.__s_), __storage_tag}; } + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(__can_broadcast_v) + [[nodiscard]] _CCCL_API constexpr friend basic_simd operator*(const basic_simd& __lhs, _Up&& __rhs) noexcept + { + return __lhs * basic_simd(static_cast(__rhs)); + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(__can_broadcast_v) + [[nodiscard]] _CCCL_API constexpr friend basic_simd operator*(_Up&& __lhs, const basic_simd& __rhs) noexcept + { + return basic_simd(static_cast(__lhs)) * __rhs; + } + [[nodiscard]] _CCCL_API constexpr friend basic_simd operator/(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { return {_Impl::__divides(__lhs.__s_, __rhs.__s_), __storage_tag}; } + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(__can_broadcast_v) + [[nodiscard]] _CCCL_API constexpr friend basic_simd operator/(const basic_simd& __lhs, _Up&& __rhs) noexcept + { + return __lhs / basic_simd(static_cast(__rhs)); + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(__can_broadcast_v) + [[nodiscard]] _CCCL_API constexpr friend basic_simd operator/(_Up&& __lhs, const basic_simd& __rhs) noexcept + { + return basic_simd(static_cast(__lhs)) / __rhs; + } + 
_CCCL_TEMPLATE(typename _Up = _Tp) _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) [[nodiscard]] _CCCL_API constexpr friend basic_simd @@ -273,6 +337,20 @@ class basic_simd : public __simd_operations<_Tp, _Abi> return {_Impl::__modulo(__lhs.__s_, __rhs.__s_), __storage_tag}; } + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>&& __can_broadcast_v) + [[nodiscard]] _CCCL_API constexpr friend basic_simd operator%(const basic_simd& __lhs, _Up&& __rhs) noexcept + { + return __lhs % basic_simd(static_cast(__rhs)); + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>&& __can_broadcast_v) + [[nodiscard]] _CCCL_API constexpr friend basic_simd operator%(_Up&& __lhs, const basic_simd& __rhs) noexcept + { + return basic_simd(static_cast(__lhs)) % __rhs; + } + _CCCL_TEMPLATE(typename _Up = _Tp) _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) [[nodiscard]] _CCCL_API constexpr friend basic_simd @@ -281,6 +359,20 @@ class basic_simd : public __simd_operations<_Tp, _Abi> return {_Impl::__bitwise_and(__lhs.__s_, __rhs.__s_), __storage_tag}; } + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>&& __can_broadcast_v) + [[nodiscard]] _CCCL_API constexpr friend basic_simd operator&(const basic_simd& __lhs, _Up&& __rhs) noexcept + { + return __lhs & basic_simd(static_cast(__rhs)); + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>&& __can_broadcast_v) + [[nodiscard]] _CCCL_API constexpr friend basic_simd operator&(_Up&& __lhs, const basic_simd& __rhs) noexcept + { + return basic_simd(static_cast(__lhs)) & __rhs; + } + _CCCL_TEMPLATE(typename _Up = _Tp) _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) [[nodiscard]] _CCCL_API constexpr friend basic_simd @@ -289,6 +381,20 @@ class basic_simd : public __simd_operations<_Tp, _Abi> return {_Impl::__bitwise_or(__lhs.__s_, __rhs.__s_), __storage_tag}; } + _CCCL_TEMPLATE(typename _Up) + 
_CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>&& __can_broadcast_v) + [[nodiscard]] _CCCL_API constexpr friend basic_simd operator|(const basic_simd& __lhs, _Up&& __rhs) noexcept + { + return __lhs | basic_simd(static_cast(__rhs)); + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>&& __can_broadcast_v) + [[nodiscard]] _CCCL_API constexpr friend basic_simd operator|(_Up&& __lhs, const basic_simd& __rhs) noexcept + { + return basic_simd(static_cast(__lhs)) | __rhs; + } + _CCCL_TEMPLATE(typename _Up = _Tp) _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) [[nodiscard]] _CCCL_API constexpr friend basic_simd @@ -297,6 +403,20 @@ class basic_simd : public __simd_operations<_Tp, _Abi> return {_Impl::__bitwise_xor(__lhs.__s_, __rhs.__s_), __storage_tag}; } + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>&& __can_broadcast_v) + [[nodiscard]] _CCCL_API constexpr friend basic_simd operator^(const basic_simd& __lhs, _Up&& __rhs) noexcept + { + return __lhs ^ basic_simd(static_cast(__rhs)); + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>&& __can_broadcast_v) + [[nodiscard]] _CCCL_API constexpr friend basic_simd operator^(_Up&& __lhs, const basic_simd& __rhs) noexcept + { + return basic_simd(static_cast(__lhs)) ^ __rhs; + } + _CCCL_TEMPLATE(typename _Up = _Tp) _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) [[nodiscard]] _CCCL_API constexpr friend basic_simd @@ -305,6 +425,20 @@ class basic_simd : public __simd_operations<_Tp, _Abi> return {_Impl::__shift_left(__lhs.__s_, __rhs.__s_), __storage_tag}; } + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>&& __can_broadcast_v) + [[nodiscard]] _CCCL_API constexpr friend basic_simd operator<<(const basic_simd& __lhs, _Up&& __rhs) noexcept + { + return __lhs << basic_simd(static_cast(__rhs)); + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>&& __can_broadcast_v) + 
[[nodiscard]] _CCCL_API constexpr friend basic_simd operator<<(_Up&& __lhs, const basic_simd& __rhs) noexcept + { + return basic_simd(static_cast(__lhs)) << __rhs; + } + _CCCL_TEMPLATE(typename _Up = _Tp) _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) [[nodiscard]] _CCCL_API constexpr friend basic_simd @@ -313,6 +447,20 @@ class basic_simd : public __simd_operations<_Tp, _Abi> return {_Impl::__shift_right(__lhs.__s_, __rhs.__s_), __storage_tag}; } + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>&& __can_broadcast_v) + [[nodiscard]] _CCCL_API constexpr friend basic_simd operator>>(const basic_simd& __lhs, _Up&& __rhs) noexcept + { + return __lhs >> basic_simd(static_cast(__rhs)); + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>&& __can_broadcast_v) + [[nodiscard]] _CCCL_API constexpr friend basic_simd operator>>(_Up&& __lhs, const basic_simd& __rhs) noexcept + { + return basic_simd(static_cast(__lhs)) >> __rhs; + } + _CCCL_TEMPLATE(typename _Up = _Tp) _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) [[nodiscard]] _CCCL_API constexpr basic_simd operator~() const noexcept @@ -326,33 +474,117 @@ class basic_simd : public __simd_operations<_Tp, _Abi> return {_Impl::__equal_to(__lhs.__s_, __rhs.__s_), __storage_tag}; } + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(__can_broadcast_v) + [[nodiscard]] _CCCL_API constexpr friend mask_type operator==(const basic_simd& __lhs, _Up&& __rhs) noexcept + { + return __lhs == basic_simd(static_cast(__rhs)); + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(__can_broadcast_v) + [[nodiscard]] _CCCL_API constexpr friend mask_type operator==(_Up&& __lhs, const basic_simd& __rhs) noexcept + { + return basic_simd(static_cast(__lhs)) == __rhs; + } + [[nodiscard]] _CCCL_API constexpr friend mask_type operator!=(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { return {_Impl::__not_equal_to(__lhs.__s_, __rhs.__s_), __storage_tag}; } + _CCCL_TEMPLATE(typename 
_Up) + _CCCL_REQUIRES(__can_broadcast_v) + [[nodiscard]] _CCCL_API constexpr friend mask_type operator!=(const basic_simd& __lhs, _Up&& __rhs) noexcept + { + return __lhs != basic_simd(static_cast(__rhs)); + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(__can_broadcast_v) + [[nodiscard]] _CCCL_API constexpr friend mask_type operator!=(_Up&& __lhs, const basic_simd& __rhs) noexcept + { + return basic_simd(static_cast(__lhs)) != __rhs; + } + [[nodiscard]] _CCCL_API constexpr friend mask_type operator<(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { return {_Impl::__less(__lhs.__s_, __rhs.__s_), __storage_tag}; } + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(__can_broadcast_v) + [[nodiscard]] _CCCL_API constexpr friend mask_type operator<(const basic_simd& __lhs, _Up&& __rhs) noexcept + { + return __lhs < basic_simd(static_cast(__rhs)); + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(__can_broadcast_v) + [[nodiscard]] _CCCL_API constexpr friend mask_type operator<(_Up&& __lhs, const basic_simd& __rhs) noexcept + { + return basic_simd(static_cast(__lhs)) < __rhs; + } + [[nodiscard]] _CCCL_API constexpr friend mask_type operator<=(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { return {_Impl::__less_equal(__lhs.__s_, __rhs.__s_), __storage_tag}; } + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(__can_broadcast_v) + [[nodiscard]] _CCCL_API constexpr friend mask_type operator<=(const basic_simd& __lhs, _Up&& __rhs) noexcept + { + return __lhs <= basic_simd(static_cast(__rhs)); + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(__can_broadcast_v) + [[nodiscard]] _CCCL_API constexpr friend mask_type operator<=(_Up&& __lhs, const basic_simd& __rhs) noexcept + { + return basic_simd(static_cast(__lhs)) <= __rhs; + } + [[nodiscard]] _CCCL_API constexpr friend mask_type operator>(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { return {_Impl::__greater(__lhs.__s_, __rhs.__s_), __storage_tag}; } + _CCCL_TEMPLATE(typename _Up) 
+ _CCCL_REQUIRES(__can_broadcast_v) + [[nodiscard]] _CCCL_API constexpr friend mask_type operator>(const basic_simd& __lhs, _Up&& __rhs) noexcept + { + return __lhs > basic_simd(static_cast(__rhs)); + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(__can_broadcast_v) + [[nodiscard]] _CCCL_API constexpr friend mask_type operator>(_Up&& __lhs, const basic_simd& __rhs) noexcept + { + return basic_simd(static_cast(__lhs)) > __rhs; + } + [[nodiscard]] _CCCL_API constexpr friend mask_type operator>=(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { return {_Impl::__greater_equal(__lhs.__s_, __rhs.__s_), __storage_tag}; } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(__can_broadcast_v) + [[nodiscard]] _CCCL_API constexpr friend mask_type operator>=(const basic_simd& __lhs, _Up&& __rhs) noexcept + { + return __lhs >= basic_simd(static_cast(__rhs)); + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(__can_broadcast_v) + [[nodiscard]] _CCCL_API constexpr friend mask_type operator>=(_Up&& __lhs, const basic_simd& __rhs) noexcept + { + return basic_simd(static_cast(__lhs)) >= __rhs; + } }; template @@ -362,9 +594,6 @@ class simd : public basic_simd<_Tp, simd_abi::fixed_size<_Np>> using basic_simd<_Tp, simd_abi::fixed_size<_Np>>::basic_simd; }; -// Note: native_simd would require platform-specific ABI specializations -// For now, use fixed_size_simd directly or specialize with a known size - template using fixed_size_simd = simd<_Tp, _Np>; } // namespace cuda::experimental::datapar diff --git a/cudax/include/cuda/experimental/__simd/simd_mask.h b/cudax/include/cuda/experimental/__simd/simd_mask.h index c4ac502365f..98354189b11 100644 --- a/cudax/include/cuda/experimental/__simd/simd_mask.h +++ b/cudax/include/cuda/experimental/__simd/simd_mask.h @@ -65,13 +65,13 @@ class basic_simd_mask : public __mask_operations<_Tp, _Abi> } _CCCL_API basic_simd_mask(const _Storage& __s, __storage_tag_t) noexcept - : __s_(__s) + : __s_{__s} {} _CCCL_TEMPLATE(typename _Up) 
_CCCL_REQUIRES(::cuda::std::is_same_v<_Up, bool>) _CCCL_API basic_simd_mask(_Up __v) noexcept - : __s_(_Impl::__broadcast(__v)) + : __s_{_Impl::__broadcast(__v)} {} _CCCL_API reference operator[](::cuda::std::size_t __i) noexcept @@ -108,6 +108,19 @@ class basic_simd_mask : public __mask_operations<_Tp, _Abi> return {_Impl::__bitwise_not(__s_), __storage_tag}; } + _CCCL_TEMPLATE(typename _Up, typename _Ap) + _CCCL_REQUIRES(simd_size_v<_Up, _Ap> == simd_size_v<_Tp, abi_type>) + _CCCL_API constexpr explicit(sizeof(_Up) != sizeof(value_type)) operator basic_simd<_Up, _Ap>() const noexcept + { + basic_simd<_Up, _Ap> __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (::cuda::std::size_t __i = 0; __i < size(); ++__i) + { + __result[__i] = static_cast<_Up>((*this)[__i]); + } + return __result; + } + _CCCL_API basic_simd_mask& operator&=(const basic_simd_mask& __rhs) noexcept { return *this = *this & __rhs; @@ -131,8 +144,8 @@ class basic_simd_mask : public __mask_operations<_Tp, _Abi> } #if _CCCL_STD_VER < 2020 - [[nodiscard]] _CCCL_API constexpr friend bool - operator!=(const basic_simd_mask& __lhs, const basic_simd_mask& __rhs) noexcept + [[nodiscard]] + _CCCL_API constexpr friend bool operator!=(const basic_simd_mask& __lhs, const basic_simd_mask& __rhs) noexcept { return !(__lhs == __rhs); } From 21bb217eb7838d9499174b060e05b6cca6aae4ad Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 20 Nov 2025 17:10:13 -0800 Subject: [PATCH 08/32] headers and explicit usage --- .../cuda/experimental/__simd/reference.h | 2 +- cudax/include/cuda/experimental/__simd/simd.h | 14 +++++++++++++- .../cuda/experimental/__simd/simd_mask.h | 5 +++-- cudax/include/cuda/experimental/__simd/traits.h | 1 + .../include/cuda/experimental/__simd/utility.h | 2 ++ cudax/include/cuda/experimental/simd.cuh | 17 +++++++++++++++++ 6 files changed, 37 insertions(+), 4 deletions(-) create mode 100644 cudax/include/cuda/experimental/simd.cuh diff --git a/cudax/include/cuda/experimental/__simd/reference.h 
b/cudax/include/cuda/experimental/__simd/reference.h index 44bd1d40026..90c45c23a62 100644 --- a/cudax/include/cuda/experimental/__simd/reference.h +++ b/cudax/include/cuda/experimental/__simd/reference.h @@ -21,8 +21,8 @@ # pragma system_header #endif // no system header +#include #include -#include #include #include #include diff --git a/cudax/include/cuda/experimental/__simd/simd.h b/cudax/include/cuda/experimental/__simd/simd.h index e95cef5cfe4..91522ef626e 100644 --- a/cudax/include/cuda/experimental/__simd/simd.h +++ b/cudax/include/cuda/experimental/__simd/simd.h @@ -28,6 +28,7 @@ #include #include +#include #include #include #include @@ -114,6 +115,17 @@ class basic_simd : public __simd_operations<_Tp, _Abi> } } + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(!::cuda::std::is_same_v<_Up, _Tp> && !__is_non_narrowing_convertible_v<_Up, value_type> && + ::cuda::std::is_convertible_v<_Up, value_type>) + _CCCL_API explicit basic_simd(const basic_simd<_Up, abi_type>& __v) noexcept + { + for (::cuda::std::size_t __i = 0; __i < size(); __i++) + { + (*this)[__i] = static_cast(__v[__i]); + } + } + _CCCL_TEMPLATE(typename _Generator) _CCCL_REQUIRES(__can_generate_v) _CCCL_API explicit basic_simd(_Generator&& __g) @@ -122,7 +134,7 @@ class basic_simd : public __simd_operations<_Tp, _Abi> _CCCL_TEMPLATE(typename _Up, typename _Flags = element_aligned_tag) _CCCL_REQUIRES(__is_vectorizable_v<_Up>&& is_simd_flag_type_v<_Flags>) - _CCCL_API basic_simd(const _Up* __mem, _Flags = {}) noexcept + _CCCL_API explicit basic_simd(const _Up* __mem, _Flags = {}) noexcept { _Impl::__load(__s_, _Flags::template __apply(__mem)); } diff --git a/cudax/include/cuda/experimental/__simd/simd_mask.h b/cudax/include/cuda/experimental/__simd/simd_mask.h index 98354189b11..d5076042d58 100644 --- a/cudax/include/cuda/experimental/__simd/simd_mask.h +++ b/cudax/include/cuda/experimental/__simd/simd_mask.h @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -59,7 +60,7 @@ class 
basic_simd_mask : public __mask_operations<_Tp, _Abi> {}; static constexpr __storage_tag_t __storage_tag{}; - _CCCL_API operator _Storage() const noexcept + _CCCL_API explicit operator _Storage() const noexcept { return __s_; } @@ -110,7 +111,7 @@ class basic_simd_mask : public __mask_operations<_Tp, _Abi> _CCCL_TEMPLATE(typename _Up, typename _Ap) _CCCL_REQUIRES(simd_size_v<_Up, _Ap> == simd_size_v<_Tp, abi_type>) - _CCCL_API constexpr explicit(sizeof(_Up) != sizeof(value_type)) operator basic_simd<_Up, _Ap>() const noexcept + _CCCL_API constexpr explicit operator basic_simd<_Up, _Ap>() const noexcept { basic_simd<_Up, _Ap> __result; _CCCL_PRAGMA_UNROLL_FULL() diff --git a/cudax/include/cuda/experimental/__simd/traits.h b/cudax/include/cuda/experimental/__simd/traits.h index fc02d21c302..78f880212f1 100644 --- a/cudax/include/cuda/experimental/__simd/traits.h +++ b/cudax/include/cuda/experimental/__simd/traits.h @@ -23,6 +23,7 @@ #include #include +#include #include #include diff --git a/cudax/include/cuda/experimental/__simd/utility.h b/cudax/include/cuda/experimental/__simd/utility.h index 85f2f9131e1..441889ed7df 100644 --- a/cudax/include/cuda/experimental/__simd/utility.h +++ b/cudax/include/cuda/experimental/__simd/utility.h @@ -28,12 +28,14 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include diff --git a/cudax/include/cuda/experimental/simd.cuh b/cudax/include/cuda/experimental/simd.cuh new file mode 100644 index 00000000000..b14f852610e --- /dev/null +++ b/cudax/include/cuda/experimental/simd.cuh @@ -0,0 +1,17 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef __CUDAX_SIMD___ +#define __CUDAX_SIMD___ + +#include +#include + +#endif // __CUDAX_SIMD___ From c2a9d1c025b33d5e39c75dc664525f0fde0d8403 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 20 Nov 2025 17:26:08 -0800 Subject: [PATCH 09/32] add simd_mask generator --- .../experimental/__simd/fixed_size_impl.h | 18 +++++ .../cuda/experimental/__simd/simd_mask.h | 30 ++++++++- .../include/cuda/experimental/__simd/traits.h | 66 +++++++++++++++++++ 3 files changed, 113 insertions(+), 1 deletion(-) diff --git a/cudax/include/cuda/experimental/__simd/fixed_size_impl.h b/cudax/include/cuda/experimental/__simd/fixed_size_impl.h index 991179fdd0d..d7c0a2f6b0c 100644 --- a/cudax/include/cuda/experimental/__simd/fixed_size_impl.h +++ b/cudax/include/cuda/experimental/__simd/fixed_size_impl.h @@ -266,6 +266,24 @@ struct __mask_operations<_Tp, simd_abi::__fixed_size<_Np>> return __result; } + template + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __generate_init(_Generator&& __g, ::cuda::std::index_sequence<_Is...>) noexcept + { + _MaskStorage __result; + ((__result.__set(_Is, + ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>( + static_cast(__g(::cuda::std::integral_constant<::cuda::std::size_t, _Is>()))))), + ...); + return __result; + } + + template + [[nodiscard]] _CCCL_API static constexpr _MaskStorage __generate(_Generator&& __g) noexcept + { + return __generate_init(__g, ::cuda::std::make_index_sequence<_Np>()); + } + _CCCL_API static constexpr void __load(_MaskStorage& __s, const bool* __mem) noexcept { _CCCL_PRAGMA_UNROLL_FULL() diff --git a/cudax/include/cuda/experimental/__simd/simd_mask.h b/cudax/include/cuda/experimental/__simd/simd_mask.h index d5076042d58..d3d94f3bd50 100644 --- 
a/cudax/include/cuda/experimental/__simd/simd_mask.h +++ b/cudax/include/cuda/experimental/__simd/simd_mask.h @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -71,10 +72,37 @@ class basic_simd_mask : public __mask_operations<_Tp, _Abi> _CCCL_TEMPLATE(typename _Up) _CCCL_REQUIRES(::cuda::std::is_same_v<_Up, bool>) - _CCCL_API basic_simd_mask(_Up __v) noexcept + _CCCL_API explicit basic_simd_mask(_Up __v) noexcept : __s_{_Impl::__broadcast(__v)} {} + _CCCL_TEMPLATE(typename _Generator) + _CCCL_REQUIRES(__can_generate_v>) + _CCCL_API explicit basic_simd_mask(_Generator&& __g) noexcept + : __s_(_Impl::__generate(__g)) + {} + + _CCCL_TEMPLATE(typename _Flags = element_aligned_tag) + _CCCL_REQUIRES(is_simd_flag_type_v<_Flags>) + _CCCL_API explicit basic_simd_mask(const bool* __mem, _Flags = {}) noexcept + { + _Impl::__load(__s_, _Flags::template __apply(__mem)); + } + + _CCCL_TEMPLATE(typename _Flags = element_aligned_tag) + _CCCL_REQUIRES(is_simd_flag_type_v<_Flags>) + _CCCL_API void copy_from(const bool* __mem, _Flags = {}) noexcept + { + _Impl::__load(__s_, _Flags::template __apply(__mem)); + } + + _CCCL_TEMPLATE(typename _Flags = element_aligned_tag) + _CCCL_REQUIRES(is_simd_flag_type_v<_Flags>) + _CCCL_API void copy_to(bool* __mem, _Flags = {}) const noexcept + { + _Impl::__store(__s_, _Flags::template __apply(__mem)); + } + _CCCL_API reference operator[](::cuda::std::size_t __i) noexcept { return reference(__s_, __i); diff --git a/cudax/include/cuda/experimental/__simd/traits.h b/cudax/include/cuda/experimental/__simd/traits.h index 78f880212f1..53e28317976 100644 --- a/cudax/include/cuda/experimental/__simd/traits.h +++ b/cudax/include/cuda/experimental/__simd/traits.h @@ -85,6 +85,13 @@ template struct is_simd : ::cuda::std::bool_constant> {}; +template +inline constexpr bool is_simd_mask_v = false; + +template +struct is_simd_mask : ::cuda::std::bool_constant> +{}; + template inline constexpr bool is_simd_flag_type_v = false; @@ -111,6 +118,9 
@@ inline constexpr bool is_simd_v> = true; template inline constexpr bool is_simd_v> = true; +template +inline constexpr bool is_simd_mask_v> = true; + template <> inline constexpr bool is_simd_flag_type_v = true; @@ -119,6 +129,62 @@ inline constexpr bool is_simd_flag_type_v = true; template <::cuda::std::size_t _Alignment> inline constexpr bool is_simd_flag_type_v> = true; + +// Memory alignment queries +template +struct memory_alignment; + +template +struct memory_alignment, element_aligned_tag> + : ::cuda::std::integral_constant<::cuda::std::size_t, alignof(_Tp)> +{}; + +template +struct memory_alignment, vector_aligned_tag> + : ::cuda::std::integral_constant<::cuda::std::size_t, alignof(_Tp) * simd_size_v<_Tp, _Abi>> +{}; + +template +struct memory_alignment, overaligned_tag<_Alignment>> + : ::cuda::std::integral_constant<::cuda::std::size_t, _Alignment> +{}; + +template +struct memory_alignment, element_aligned_tag> + : ::cuda::std::integral_constant<::cuda::std::size_t, alignof(bool)> +{}; + +template +struct memory_alignment, vector_aligned_tag> + : ::cuda::std::integral_constant<::cuda::std::size_t, alignof(bool) * simd_size_v<_Tp, _Abi>> +{}; + +template +struct memory_alignment, overaligned_tag<_Alignment>> + : ::cuda::std::integral_constant<::cuda::std::size_t, _Alignment> +{}; + +template +inline constexpr ::cuda::std::size_t memory_alignment_v = memory_alignment<_Tp, _Flags>::value; + +// Rebind simd element type +template +struct rebind_simd; + +template +struct rebind_simd<_Tp, basic_simd<_Up, _Abi>> +{ + using type = basic_simd<_Tp, _Abi>; +}; + +template +struct rebind_simd<_Tp, basic_simd_mask<_Up, _Abi>> +{ + using type = basic_simd_mask<_Tp, _Abi>; +}; + +template +using rebind_simd_t = typename rebind_simd<_Tp, _Simd>::type; } // namespace cuda::experimental::datapar #include From 523a7c44321096ec23d7a402c6c8e9e4d29086b5 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 20 Nov 2025 18:02:28 -0800 Subject: [PATCH 10/32] fix initialization --- 
cudax/include/cuda/experimental/__simd/simd.h | 89 ++++--- .../cuda/experimental/__simd/simd_mask.h | 11 +- .../include/cuda/experimental/__simd/traits.h | 9 +- cudax/test/CMakeLists.txt | 4 + cudax/test/simd/simd.cu | 219 ++++++++++++++++++ 5 files changed, 271 insertions(+), 61 deletions(-) create mode 100644 cudax/test/simd/simd.cu diff --git a/cudax/include/cuda/experimental/__simd/simd.h b/cudax/include/cuda/experimental/__simd/simd.h index 91522ef626e..0f7ed9437d9 100644 --- a/cudax/include/cuda/experimental/__simd/simd.h +++ b/cudax/include/cuda/experimental/__simd/simd.h @@ -60,8 +60,8 @@ class basic_simd : public __simd_operations<_Tp, _Abi> using mask_type = basic_simd_mask; _CCCL_TEMPLATE(typename _Up, typename _Ap) - _CCCL_REQUIRES(::cuda::std::is_same_v::abi_type, abi_type>&& ::cuda::std:: - is_same_v::value_type, value_type>) + _CCCL_REQUIRES((::cuda::std::is_same_v::abi_type, abi_type> + _CCCL_AND ::cuda::std::is_same_v::value_type, value_type>) ) _CCCL_API explicit operator basic_simd_mask<_Up, _Ap>() const noexcept { basic_simd_mask<_Up, _Ap> __result; @@ -77,7 +77,7 @@ class basic_simd : public __simd_operations<_Tp, _Abi> return simd_size_v; } - _CCCL_API basic_simd() noexcept = default; + _CCCL_HIDE_FROM_ABI basic_simd() noexcept = default; struct __storage_tag_t {}; @@ -99,14 +99,14 @@ class basic_simd : public __simd_operations<_Tp, _Abi> {} _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(__can_broadcast_v> - && !__is_value_preserving_broadcast<_Up>) + _CCCL_REQUIRES((__can_broadcast_v> + && !__is_value_preserving_broadcast<_Up>) ) _CCCL_API constexpr explicit basic_simd(_Up&& __v) noexcept : __s_(_Impl::__broadcast(static_cast(__v))) {} _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(!::cuda::std::is_same_v<_Up, _Tp> && __is_non_narrowing_convertible_v<_Up, value_type>) + _CCCL_REQUIRES((!::cuda::std::is_same_v<_Up, _Tp> && __is_non_narrowing_convertible_v<_Up, value_type>) ) _CCCL_API basic_simd(const basic_simd<_Up, abi_type>& __v) noexcept { 
for (::cuda::std::size_t __i = 0; __i < size(); __i++) @@ -116,8 +116,8 @@ class basic_simd : public __simd_operations<_Tp, _Abi> } _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(!::cuda::std::is_same_v<_Up, _Tp> && !__is_non_narrowing_convertible_v<_Up, value_type> && - ::cuda::std::is_convertible_v<_Up, value_type>) + _CCCL_REQUIRES((!::cuda::std::is_same_v<_Up, _Tp> && !__is_non_narrowing_convertible_v<_Up, value_type> + && ::cuda::std::is_convertible_v<_Up, value_type>) ) _CCCL_API explicit basic_simd(const basic_simd<_Up, abi_type>& __v) noexcept { for (::cuda::std::size_t __i = 0; __i < size(); __i++) @@ -133,21 +133,21 @@ class basic_simd : public __simd_operations<_Tp, _Abi> {} _CCCL_TEMPLATE(typename _Up, typename _Flags = element_aligned_tag) - _CCCL_REQUIRES(__is_vectorizable_v<_Up>&& is_simd_flag_type_v<_Flags>) + _CCCL_REQUIRES((__is_vectorizable_v<_Up> _CCCL_AND is_simd_flag_type_v<_Flags>) ) _CCCL_API explicit basic_simd(const _Up* __mem, _Flags = {}) noexcept { _Impl::__load(__s_, _Flags::template __apply(__mem)); } _CCCL_TEMPLATE(typename _Up, typename _Flags = element_aligned_tag) - _CCCL_REQUIRES(__is_vectorizable_v<_Up>&& is_simd_flag_type_v<_Flags>) + _CCCL_REQUIRES((__is_vectorizable_v<_Up> _CCCL_AND is_simd_flag_type_v<_Flags>) ) _CCCL_API void copy_from(const _Up* __mem, _Flags = {}) noexcept { _Impl::__load(__s_, _Flags::template __apply(__mem)); } _CCCL_TEMPLATE(typename _Up, typename _Flags = element_aligned_tag) - _CCCL_REQUIRES(__is_vectorizable_v<_Up>&& is_simd_flag_type_v<_Flags>) + _CCCL_REQUIRES((__is_vectorizable_v<_Up> _CCCL_AND is_simd_flag_type_v<_Flags>) ) _CCCL_API void copy_to(_Up* __mem, _Flags = {}) const noexcept { _Impl::__store(__s_, _Flags::template __apply(__mem)); @@ -196,7 +196,7 @@ class basic_simd : public __simd_operations<_Tp, _Abi> [[nodiscard]] _CCCL_API basic_simd operator-() const noexcept { - return {_Impl::__unary_minus(__s_), __storage_tag}; + return basic_simd{_Impl::__unary_minus(__s_), __storage_tag}; } 
_CCCL_API constexpr friend basic_simd& operator+=(basic_simd& __lhs, const basic_simd& __rhs) noexcept @@ -264,7 +264,7 @@ class basic_simd : public __simd_operations<_Tp, _Abi> [[nodiscard]] _CCCL_API constexpr friend basic_simd operator+(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { - return {_Impl::__plus(__lhs.__s_, __rhs.__s_), __storage_tag}; + return basic_simd{_Impl::__plus(__lhs.__s_, __rhs.__s_), __storage_tag}; } _CCCL_TEMPLATE(typename _Up) @@ -284,7 +284,7 @@ class basic_simd : public __simd_operations<_Tp, _Abi> [[nodiscard]] _CCCL_API constexpr friend basic_simd operator-(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { - return {_Impl::__minus(__lhs.__s_, __rhs.__s_), __storage_tag}; + return basic_simd{_Impl::__minus(__lhs.__s_, __rhs.__s_), __storage_tag}; } _CCCL_TEMPLATE(typename _Up) @@ -304,7 +304,7 @@ class basic_simd : public __simd_operations<_Tp, _Abi> [[nodiscard]] _CCCL_API constexpr friend basic_simd operator*(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { - return {_Impl::__multiplies(__lhs.__s_, __rhs.__s_), __storage_tag}; + return basic_simd{_Impl::__multiplies(__lhs.__s_, __rhs.__s_), __storage_tag}; } _CCCL_TEMPLATE(typename _Up) @@ -324,7 +324,7 @@ class basic_simd : public __simd_operations<_Tp, _Abi> [[nodiscard]] _CCCL_API constexpr friend basic_simd operator/(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { - return {_Impl::__divides(__lhs.__s_, __rhs.__s_), __storage_tag}; + return basic_simd{_Impl::__divides(__lhs.__s_, __rhs.__s_), __storage_tag}; } _CCCL_TEMPLATE(typename _Up) @@ -346,18 +346,18 @@ class basic_simd : public __simd_operations<_Tp, _Abi> [[nodiscard]] _CCCL_API constexpr friend basic_simd operator%(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { - return {_Impl::__modulo(__lhs.__s_, __rhs.__s_), __storage_tag}; + return basic_simd{_Impl::__modulo(__lhs.__s_, __rhs.__s_), __storage_tag}; } _CCCL_TEMPLATE(typename _Up) - 
_CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>&& __can_broadcast_v) + _CCCL_REQUIRES((::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) ) [[nodiscard]] _CCCL_API constexpr friend basic_simd operator%(const basic_simd& __lhs, _Up&& __rhs) noexcept { return __lhs % basic_simd(static_cast(__rhs)); } _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>&& __can_broadcast_v) + _CCCL_REQUIRES((::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) ) [[nodiscard]] _CCCL_API constexpr friend basic_simd operator%(_Up&& __lhs, const basic_simd& __rhs) noexcept { return basic_simd(static_cast(__lhs)) % __rhs; @@ -368,18 +368,18 @@ class basic_simd : public __simd_operations<_Tp, _Abi> [[nodiscard]] _CCCL_API constexpr friend basic_simd operator&(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { - return {_Impl::__bitwise_and(__lhs.__s_, __rhs.__s_), __storage_tag}; + return basic_simd{_Impl::__bitwise_and(__lhs.__s_, __rhs.__s_), __storage_tag}; } _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>&& __can_broadcast_v) + _CCCL_REQUIRES((::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) ) [[nodiscard]] _CCCL_API constexpr friend basic_simd operator&(const basic_simd& __lhs, _Up&& __rhs) noexcept { return __lhs & basic_simd(static_cast(__rhs)); } _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>&& __can_broadcast_v) + _CCCL_REQUIRES((::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) ) [[nodiscard]] _CCCL_API constexpr friend basic_simd operator&(_Up&& __lhs, const basic_simd& __rhs) noexcept { return basic_simd(static_cast(__lhs)) & __rhs; @@ -390,18 +390,18 @@ class basic_simd : public __simd_operations<_Tp, _Abi> [[nodiscard]] _CCCL_API constexpr friend basic_simd operator|(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { - return {_Impl::__bitwise_or(__lhs.__s_, __rhs.__s_), __storage_tag}; + return 
basic_simd{_Impl::__bitwise_or(__lhs.__s_, __rhs.__s_), __storage_tag}; } _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>&& __can_broadcast_v) + _CCCL_REQUIRES((::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) ) [[nodiscard]] _CCCL_API constexpr friend basic_simd operator|(const basic_simd& __lhs, _Up&& __rhs) noexcept { return __lhs | basic_simd(static_cast(__rhs)); } _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>&& __can_broadcast_v) + _CCCL_REQUIRES((::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) ) [[nodiscard]] _CCCL_API constexpr friend basic_simd operator|(_Up&& __lhs, const basic_simd& __rhs) noexcept { return basic_simd(static_cast(__lhs)) | __rhs; @@ -412,18 +412,18 @@ class basic_simd : public __simd_operations<_Tp, _Abi> [[nodiscard]] _CCCL_API constexpr friend basic_simd operator^(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { - return {_Impl::__bitwise_xor(__lhs.__s_, __rhs.__s_), __storage_tag}; + return basic_simd{_Impl::__bitwise_xor(__lhs.__s_, __rhs.__s_), __storage_tag}; } _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>&& __can_broadcast_v) + _CCCL_REQUIRES((::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) ) [[nodiscard]] _CCCL_API constexpr friend basic_simd operator^(const basic_simd& __lhs, _Up&& __rhs) noexcept { return __lhs ^ basic_simd(static_cast(__rhs)); } _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>&& __can_broadcast_v) + _CCCL_REQUIRES((::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) ) [[nodiscard]] _CCCL_API constexpr friend basic_simd operator^(_Up&& __lhs, const basic_simd& __rhs) noexcept { return basic_simd(static_cast(__lhs)) ^ __rhs; @@ -434,18 +434,18 @@ class basic_simd : public __simd_operations<_Tp, _Abi> [[nodiscard]] _CCCL_API constexpr friend basic_simd operator<<(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { - return 
{_Impl::__shift_left(__lhs.__s_, __rhs.__s_), __storage_tag}; + return basic_simd{_Impl::__shift_left(__lhs.__s_, __rhs.__s_), __storage_tag}; } _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>&& __can_broadcast_v) + _CCCL_REQUIRES((::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) ) [[nodiscard]] _CCCL_API constexpr friend basic_simd operator<<(const basic_simd& __lhs, _Up&& __rhs) noexcept { return __lhs << basic_simd(static_cast(__rhs)); } _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>&& __can_broadcast_v) + _CCCL_REQUIRES((::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) ) [[nodiscard]] _CCCL_API constexpr friend basic_simd operator<<(_Up&& __lhs, const basic_simd& __rhs) noexcept { return basic_simd(static_cast(__lhs)) << __rhs; @@ -456,18 +456,18 @@ class basic_simd : public __simd_operations<_Tp, _Abi> [[nodiscard]] _CCCL_API constexpr friend basic_simd operator>>(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { - return {_Impl::__shift_right(__lhs.__s_, __rhs.__s_), __storage_tag}; + return basic_simd{_Impl::__shift_right(__lhs.__s_, __rhs.__s_), __storage_tag}; } _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>&& __can_broadcast_v) + _CCCL_REQUIRES((::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) ) [[nodiscard]] _CCCL_API constexpr friend basic_simd operator>>(const basic_simd& __lhs, _Up&& __rhs) noexcept { return __lhs >> basic_simd(static_cast(__rhs)); } _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>&& __can_broadcast_v) + _CCCL_REQUIRES((::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) ) [[nodiscard]] _CCCL_API constexpr friend basic_simd operator>>(_Up&& __lhs, const basic_simd& __rhs) noexcept { return basic_simd(static_cast(__lhs)) >> __rhs; @@ -477,13 +477,13 @@ class basic_simd : public __simd_operations<_Tp, _Abi> _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) 
[[nodiscard]] _CCCL_API constexpr basic_simd operator~() const noexcept { - return {_Impl::__bitwise_not(__s_), __storage_tag}; + return basic_simd{_Impl::__bitwise_not(__s_), __storage_tag}; } [[nodiscard]] _CCCL_API constexpr friend mask_type operator==(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { - return {_Impl::__equal_to(__lhs.__s_, __rhs.__s_), __storage_tag}; + return mask_type{_Impl::__equal_to(__lhs.__s_, __rhs.__s_), mask_type::__storage_tag}; } _CCCL_TEMPLATE(typename _Up) @@ -503,7 +503,7 @@ class basic_simd : public __simd_operations<_Tp, _Abi> [[nodiscard]] _CCCL_API constexpr friend mask_type operator!=(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { - return {_Impl::__not_equal_to(__lhs.__s_, __rhs.__s_), __storage_tag}; + return basic_simd{_Impl::__not_equal_to(__lhs.__s_, __rhs.__s_), __storage_tag}; } _CCCL_TEMPLATE(typename _Up) @@ -522,7 +522,7 @@ class basic_simd : public __simd_operations<_Tp, _Abi> [[nodiscard]] _CCCL_API constexpr friend mask_type operator<(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { - return {_Impl::__less(__lhs.__s_, __rhs.__s_), __storage_tag}; + return mask_type{_Impl::__less(__lhs.__s_, __rhs.__s_), mask_type::__storage_tag}; } _CCCL_TEMPLATE(typename _Up) @@ -542,7 +542,7 @@ class basic_simd : public __simd_operations<_Tp, _Abi> [[nodiscard]] _CCCL_API constexpr friend mask_type operator<=(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { - return {_Impl::__less_equal(__lhs.__s_, __rhs.__s_), __storage_tag}; + return mask_type{_Impl::__less_equal(__lhs.__s_, __rhs.__s_), mask_type::__storage_tag}; } _CCCL_TEMPLATE(typename _Up) @@ -561,7 +561,7 @@ class basic_simd : public __simd_operations<_Tp, _Abi> [[nodiscard]] _CCCL_API constexpr friend mask_type operator>(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { - return {_Impl::__greater(__lhs.__s_, __rhs.__s_), __storage_tag}; + return mask_type{_Impl::__greater(__lhs.__s_, __rhs.__s_), 
mask_type::__storage_tag}; } _CCCL_TEMPLATE(typename _Up) @@ -581,7 +581,7 @@ class basic_simd : public __simd_operations<_Tp, _Abi> [[nodiscard]] _CCCL_API constexpr friend mask_type operator>=(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { - return {_Impl::__greater_equal(__lhs.__s_, __rhs.__s_), __storage_tag}; + return mask_type{_Impl::__greater_equal(__lhs.__s_, __rhs.__s_), mask_type::__storage_tag}; } _CCCL_TEMPLATE(typename _Up) @@ -599,13 +599,6 @@ class basic_simd : public __simd_operations<_Tp, _Abi> } }; -template -class simd : public basic_simd<_Tp, simd_abi::fixed_size<_Np>> -{ -public: - using basic_simd<_Tp, simd_abi::fixed_size<_Np>>::basic_simd; -}; - template using fixed_size_simd = simd<_Tp, _Np>; } // namespace cuda::experimental::datapar diff --git a/cudax/include/cuda/experimental/__simd/simd_mask.h b/cudax/include/cuda/experimental/__simd/simd_mask.h index d3d94f3bd50..0c3eb87f19b 100644 --- a/cudax/include/cuda/experimental/__simd/simd_mask.h +++ b/cudax/include/cuda/experimental/__simd/simd_mask.h @@ -55,7 +55,7 @@ class basic_simd_mask : public __mask_operations<_Tp, _Abi> return simd_size_v<_Tp, abi_type>; } - _CCCL_API basic_simd_mask() noexcept = default; + _CCCL_HIDE_FROM_ABI basic_simd_mask() noexcept = default; struct __storage_tag_t {}; @@ -138,7 +138,7 @@ class basic_simd_mask : public __mask_operations<_Tp, _Abi> } _CCCL_TEMPLATE(typename _Up, typename _Ap) - _CCCL_REQUIRES(simd_size_v<_Up, _Ap> == simd_size_v<_Tp, abi_type>) + _CCCL_REQUIRES((simd_size_v<_Up, _Ap> == simd_size_v<_Tp, abi_type>)) _CCCL_API constexpr explicit operator basic_simd<_Up, _Ap>() const noexcept { basic_simd<_Up, _Ap> __result; @@ -201,13 +201,6 @@ class basic_simd_mask : public __mask_operations<_Tp, _Abi> } }; -template -class simd_mask : public basic_simd_mask<_Tp, simd_abi::fixed_size<_Np>> -{ -public: - using basic_simd_mask<_Tp, simd_abi::fixed_size<_Np>>::basic_simd_mask; -}; - template using fixed_size_simd_mask = simd_mask<_Tp, 
_Np>; } // namespace cuda::experimental::datapar diff --git a/cudax/include/cuda/experimental/__simd/traits.h b/cudax/include/cuda/experimental/__simd/traits.h index 53e28317976..ae1cf71ba92 100644 --- a/cudax/include/cuda/experimental/__simd/traits.h +++ b/cudax/include/cuda/experimental/__simd/traits.h @@ -22,6 +22,7 @@ #endif // no system header #include +#include #include #include @@ -35,7 +36,7 @@ namespace cuda::experimental::datapar struct element_aligned_tag { template - _CCCL_HIDE_FROM_ABI static constexpr _Ptr* __apply(_Ptr* __ptr) noexcept + [[nodiscard]] _CCCL_API static constexpr _Ptr* __apply(_Ptr* __ptr) noexcept { return __ptr; } @@ -44,7 +45,7 @@ struct element_aligned_tag struct vector_aligned_tag { template - _CCCL_HIDE_FROM_ABI static constexpr _Ptr* __apply(_Ptr* __ptr) noexcept + [[nodiscard]] _CCCL_API static constexpr _Ptr* __apply(_Ptr* __ptr) noexcept { return __ptr; } @@ -54,9 +55,9 @@ template <::cuda::std::size_t _Alignment> struct overaligned_tag { template - _CCCL_HIDE_FROM_ABI static constexpr _Ptr* __apply(_Ptr* __ptr) noexcept + [[nodiscard]] _CCCL_API static constexpr _Ptr* __apply(_Ptr* __ptr) noexcept { - _CCCL_ASSERT(reinterpret_cast<::cuda::std::uintptr_t>(__ptr) % _Alignment == 0, + _CCCL_ASSERT(::cuda::std::is_sufficiently_aligned<_Alignment>(__ptr), "Pointer does not satisfy overaligned_tag alignment requirement"); return __ptr; } diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt index 46e0c89978d..e094e0708f7 100644 --- a/cudax/test/CMakeLists.txt +++ b/cudax/test/CMakeLists.txt @@ -150,6 +150,10 @@ foreach (cudax_target IN LISTS cudax_TARGETS) algorithm/copy.cu ) + cudax_add_catch2_test(test_target simd ${cudax_target} + simd/simd.cu + ) + if (cudax_ENABLE_CUFILE) cudax_add_catch2_test(test_target cufile_driver_attributes ${cudax_target} cufile/driver_attributes.cu diff --git a/cudax/test/simd/simd.cu b/cudax/test/simd/simd.cu new file mode 100644 index 00000000000..11b0c07709f --- /dev/null +++ 
b/cudax/test/simd/simd.cu @@ -0,0 +1,219 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include + +#include + +#include + +namespace dp = cuda::experimental::datapar; + +namespace +{ +struct identity_index_generator +{ + template + __host__ __device__ constexpr int operator()(Index idx) const + { + return static_cast(idx); + } +}; + +struct double_index_generator +{ + template + __host__ __device__ constexpr int operator()(Index idx) const + { + return static_cast(idx * 2); + } +}; + +struct alternating_mask_generator +{ + template + __host__ __device__ constexpr bool operator()(Index idx) const + { + return (idx % 2) == 0; + } +}; +} // namespace + +template +__host__ __device__ void expect_equal(const Simd& actual, const ::cuda::std::array& expected) +{ + static_assert(N == Simd::size(), "Mismatch between expected values and simd width"); + for (::cuda::std::size_t i = 0; i < N; ++i) + { + CUDAX_REQUIRE(actual[i] == expected[i]); + } +} + +C2H_CCCLRT_TEST("simd.traits", "[simd][traits]") +{ + using abi_t = dp::simd_abi::fixed_size<4>; + using simd_t = dp::simd; + using mask_t = dp::simd_mask; + using other_t = dp::simd; + using rebind_t = dp::rebind_simd_t; + + STATIC_REQUIRE(dp::is_abi_tag_v); + STATIC_REQUIRE(!dp::is_abi_tag_v); + STATIC_REQUIRE(dp::simd_size_v == 4); + STATIC_REQUIRE(dp::simd_size_v == 4); + STATIC_REQUIRE(dp::simd_size_v == 0); + + STATIC_REQUIRE(dp::is_simd_v); + STATIC_REQUIRE(!dp::is_simd_v); + STATIC_REQUIRE(dp::is_simd_mask_v); + STATIC_REQUIRE(!dp::is_simd_mask_v); + + 
STATIC_REQUIRE(dp::is_simd_flag_type_v); + STATIC_REQUIRE(dp::is_simd_flag_type_v); + STATIC_REQUIRE(dp::is_simd_flag_type_v>); + + STATIC_REQUIRE(simd_t::size() == 4); + STATIC_REQUIRE(mask_t::size() == simd_t::size()); + STATIC_REQUIRE(dp::memory_alignment_v == alignof(int)); + STATIC_REQUIRE(dp::memory_alignment_v == alignof(int) * simd_t::size()); + STATIC_REQUIRE(dp::memory_alignment_v> == 128); + STATIC_REQUIRE(dp::memory_alignment_v == alignof(bool)); + STATIC_REQUIRE(dp::memory_alignment_v == alignof(bool) * mask_t::size()); + + STATIC_REQUIRE(::cuda::std::is_same_v); +} + +C2H_CCCLRT_TEST("simd.construction_and_memory", "[simd][construction]") +{ + using simd_t = dp::simd; + + simd_t broadcast(7); + expect_equal(broadcast, ::cuda::std::array{7, 7, 7, 7}); + + simd_t generated(double_index_generator{}); + expect_equal(generated, ::cuda::std::array{0, 2, 4, 6}); + + alignas(64) int storage[simd_t::size()] = {0, 1, 2, 3}; + simd_t from_ptr(storage, dp::overaligned<64>); + expect_equal(from_ptr, ::cuda::std::array{0, 1, 2, 3}); + + int roundtrip[simd_t::size()] = {}; + generated.copy_to(roundtrip, dp::overaligned<64>); + + simd_t loaded; + loaded.copy_from(roundtrip, dp::overaligned<64>); + expect_equal(loaded, ::cuda::std::array{0, 2, 4, 6}); + + dp::simd widened(generated); + expect_equal(widened, ::cuda::std::array{0.0F, 2.0F, 4.0F, 6.0F}); + + dp::simd assigned = simd_t(identity_index_generator{}); + assigned = generated; + expect_equal(assigned, ::cuda::std::array{0, 2, 4, 6}); + + auto incremented = generated; + ++incremented; + expect_equal(incremented, ::cuda::std::array{1, 3, 5, 7}); + + auto decremented = incremented; + decremented--; + expect_equal(decremented, ::cuda::std::array{0, 2, 4, 6}); +} + +C2H_CCCLRT_TEST("simd.arithmetic_and_comparisons", "[simd][arithmetic]") +{ + using simd_t = dp::simd; + using mask_t = simd_t::mask_type; + + simd_t lhs(identity_index_generator{}); + simd_t rhs(2); + + auto sum = lhs + rhs; + expect_equal(sum, 
::cuda::std::array{2, 3, 4, 5}); + + auto difference = sum - 1; + expect_equal(difference, ::cuda::std::array{1, 2, 3, 4}); + + auto product = lhs * rhs; + expect_equal(product, ::cuda::std::array{0, 2, 4, 6}); + + auto quotient = product / rhs; + expect_equal(quotient, ::cuda::std::array{0, 1, 2, 3}); + + auto modulo = product % rhs; + expect_equal(modulo, ::cuda::std::array{0, 0, 0, 0}); + + auto bit_and = product & simd_t(3); + expect_equal(bit_and, ::cuda::std::array{0, 2, 0, 2}); + + auto bit_or = bit_and | simd_t(4); + expect_equal(bit_or, ::cuda::std::array{4, 6, 4, 6}); + + auto bit_xor = bit_and ^ simd_t(1); + expect_equal(bit_xor, ::cuda::std::array{1, 3, 1, 3}); + + auto shift_left = simd_t(1) << lhs; + expect_equal(shift_left, ::cuda::std::array{1, 2, 4, 8}); + + auto shift_right = shift_left >> simd_t(1); + expect_equal(shift_right, ::cuda::std::array{0, 1, 2, 4}); + + mask_t eq_mask = (lhs == lhs); + CUDAX_REQUIRE(eq_mask.all()); + mask_t lt_mask = (lhs < simd_t(2)); + CUDAX_REQUIRE(lt_mask.count() == 2); + mask_t ge_mask = (lhs >= simd_t(1)); + CUDAX_REQUIRE(ge_mask.any()); + CUDAX_REQUIRE(!ge_mask.none()); + + auto negated = -lhs; + expect_equal(negated, ::cuda::std::array{0, -1, -2, -3}); +} + +C2H_CCCLRT_TEST("simd.mask", "[simd][mask]") +{ + using mask_t = dp::simd_mask; + using simd_t = dp::simd; + + mask_t alternating(alternating_mask_generator{}); + expect_equal(alternating, ::cuda::std::array{true, false, true, false}); + CUDAX_REQUIRE(alternating.count() == 2); + CUDAX_REQUIRE(alternating.any()); + CUDAX_REQUIRE(!alternating.all()); + CUDAX_REQUIRE(!alternating.none()); + + mask_t inverted = !alternating; + expect_equal(inverted, ::cuda::std::array{false, true, false, true}); + + mask_t zero = alternating & inverted; + CUDAX_REQUIRE(zero.none()); + + mask_t combined = alternating | inverted; + CUDAX_REQUIRE(combined.all()); + + bool buffer[mask_t::size()] = {}; + alternating.copy_to(buffer); + mask_t loaded(buffer); + CUDAX_REQUIRE(loaded == 
alternating); + + auto vec_from_mask = static_cast(alternating); + expect_equal(vec_from_mask, ::cuda::std::array{1, 0, 1, 0}); + + mask_t mutated = alternating; + mutated[1] = true; + mutated[3] = true; + CUDAX_REQUIRE(mutated.all()); + + mask_t broadcast_true(true); + CUDAX_REQUIRE(broadcast_true.all()); +} + From 8bf87834edb71e6274d522f1394a1fdd0f5d8e0a Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 20 Nov 2025 18:03:56 -0800 Subject: [PATCH 11/32] formatting --- cudax/include/cuda/experimental/__simd/simd_mask.h | 2 +- cudax/test/simd/simd.cu | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/cudax/include/cuda/experimental/__simd/simd_mask.h b/cudax/include/cuda/experimental/__simd/simd_mask.h index 0c3eb87f19b..f909c2c656a 100644 --- a/cudax/include/cuda/experimental/__simd/simd_mask.h +++ b/cudax/include/cuda/experimental/__simd/simd_mask.h @@ -138,7 +138,7 @@ class basic_simd_mask : public __mask_operations<_Tp, _Abi> } _CCCL_TEMPLATE(typename _Up, typename _Ap) - _CCCL_REQUIRES((simd_size_v<_Up, _Ap> == simd_size_v<_Tp, abi_type>)) + _CCCL_REQUIRES((simd_size_v<_Up, _Ap> == simd_size_v<_Tp, abi_type>) ) _CCCL_API constexpr explicit operator basic_simd<_Up, _Ap>() const noexcept { basic_simd<_Up, _Ap> __result; diff --git a/cudax/test/simd/simd.cu b/cudax/test/simd/simd.cu index 11b0c07709f..0608fd60af6 100644 --- a/cudax/test/simd/simd.cu +++ b/cudax/test/simd/simd.cu @@ -216,4 +216,3 @@ C2H_CCCLRT_TEST("simd.mask", "[simd][mask]") mask_t broadcast_true(true); CUDAX_REQUIRE(broadcast_true.all()); } - From e5e67bbdef43975ec9166b800bcc3bd078267722 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 20 Nov 2025 18:16:18 -0800 Subject: [PATCH 12/32] reduce redundancy --- cudax/test/simd/simd.cu | 160 +++++++++++++++++++++++++++++++++------- 1 file changed, 132 insertions(+), 28 deletions(-) diff --git a/cudax/test/simd/simd.cu b/cudax/test/simd/simd.cu index 0608fd60af6..ecb48155fa8 100644 --- a/cudax/test/simd/simd.cu +++ 
b/cudax/test/simd/simd.cu @@ -11,6 +11,7 @@ #include #include #include +#include #include @@ -58,6 +59,9 @@ __host__ __device__ void expect_equal(const Simd& actual, const ::cuda::std::arr } } +template +using simd_array_t = ::cuda::std::array; + C2H_CCCLRT_TEST("simd.traits", "[simd][traits]") { using abi_t = dp::simd_abi::fixed_size<4>; @@ -94,105 +98,163 @@ C2H_CCCLRT_TEST("simd.traits", "[simd][traits]") C2H_CCCLRT_TEST("simd.construction_and_memory", "[simd][construction]") { - using simd_t = dp::simd; + using simd_t = dp::simd; + using mask_t = simd_t::mask_type; + using array_t = simd_array_t; simd_t broadcast(7); - expect_equal(broadcast, ::cuda::std::array{7, 7, 7, 7}); + expect_equal(broadcast, array_t{7, 7, 7, 7}); simd_t generated(double_index_generator{}); - expect_equal(generated, ::cuda::std::array{0, 2, 4, 6}); + expect_equal(generated, array_t{0, 2, 4, 6}); alignas(64) int storage[simd_t::size()] = {0, 1, 2, 3}; simd_t from_ptr(storage, dp::overaligned<64>); - expect_equal(from_ptr, ::cuda::std::array{0, 1, 2, 3}); + expect_equal(from_ptr, array_t{0, 1, 2, 3}); int roundtrip[simd_t::size()] = {}; generated.copy_to(roundtrip, dp::overaligned<64>); simd_t loaded; loaded.copy_from(roundtrip, dp::overaligned<64>); - expect_equal(loaded, ::cuda::std::array{0, 2, 4, 6}); + expect_equal(loaded, array_t{0, 2, 4, 6}); dp::simd widened(generated); expect_equal(widened, ::cuda::std::array{0.0F, 2.0F, 4.0F, 6.0F}); + mask_t from_simd = static_cast(generated); + expect_equal(from_simd, ::cuda::std::array{false, true, true, true}); + dp::simd assigned = simd_t(identity_index_generator{}); assigned = generated; - expect_equal(assigned, ::cuda::std::array{0, 2, 4, 6}); + expect_equal(assigned, array_t{0, 2, 4, 6}); auto incremented = generated; ++incremented; - expect_equal(incremented, ::cuda::std::array{1, 3, 5, 7}); + expect_equal(incremented, array_t{1, 3, 5, 7}); auto decremented = incremented; decremented--; - expect_equal(decremented, ::cuda::std::array{0, 
2, 4, 6}); + expect_equal(decremented, array_t{0, 2, 4, 6}); } C2H_CCCLRT_TEST("simd.arithmetic_and_comparisons", "[simd][arithmetic]") { - using simd_t = dp::simd; - using mask_t = simd_t::mask_type; + using simd_t = dp::simd; + using mask_t = simd_t::mask_type; + using array_t = simd_array_t; simd_t lhs(identity_index_generator{}); simd_t rhs(2); auto sum = lhs + rhs; - expect_equal(sum, ::cuda::std::array{2, 3, 4, 5}); + expect_equal(sum, array_t{2, 3, 4, 5}); auto difference = sum - 1; - expect_equal(difference, ::cuda::std::array{1, 2, 3, 4}); + expect_equal(difference, array_t{1, 2, 3, 4}); + + auto vec_plus_scalar = lhs + 5; + expect_equal(vec_plus_scalar, array_t{5, 6, 7, 8}); + + auto scalar_plus_vec = 5 + lhs; + expect_equal(scalar_plus_vec, array_t{5, 6, 7, 8}); + + auto scalar_minus_vec = 5 - lhs; + expect_equal(scalar_minus_vec, array_t{5, 4, 3, 2}); auto product = lhs * rhs; - expect_equal(product, ::cuda::std::array{0, 2, 4, 6}); + expect_equal(product, array_t{0, 2, 4, 6}); auto quotient = product / rhs; - expect_equal(quotient, ::cuda::std::array{0, 1, 2, 3}); + expect_equal(quotient, array_t{0, 1, 2, 3}); auto modulo = product % rhs; - expect_equal(modulo, ::cuda::std::array{0, 0, 0, 0}); + expect_equal(modulo, array_t{0, 0, 0, 0}); auto bit_and = product & simd_t(3); - expect_equal(bit_and, ::cuda::std::array{0, 2, 0, 2}); + expect_equal(bit_and, array_t{0, 2, 0, 2}); auto bit_or = bit_and | simd_t(4); - expect_equal(bit_or, ::cuda::std::array{4, 6, 4, 6}); + expect_equal(bit_or, array_t{4, 6, 4, 6}); auto bit_xor = bit_and ^ simd_t(1); - expect_equal(bit_xor, ::cuda::std::array{1, 3, 1, 3}); + expect_equal(bit_xor, array_t{1, 3, 1, 3}); + + auto vec_or_scalar = lhs | 1; + expect_equal(vec_or_scalar, array_t{1, 1, 3, 3}); + + auto scalar_or_vec = 1 | lhs; + expect_equal(scalar_or_vec, array_t{1, 1, 3, 3}); auto shift_left = simd_t(1) << lhs; - expect_equal(shift_left, ::cuda::std::array{1, 2, 4, 8}); + expect_equal(shift_left, array_t{1, 2, 4, 
8}); auto shift_right = shift_left >> simd_t(1); - expect_equal(shift_right, ::cuda::std::array{0, 1, 2, 4}); + expect_equal(shift_right, array_t{0, 1, 2, 4}); + + auto vector_shift_scalar = lhs << 1; + expect_equal(vector_shift_scalar, array_t{0, 2, 4, 6}); + + auto scalar_shift_vector = 1 << lhs; + expect_equal(scalar_shift_vector, array_t{1, 2, 4, 8}); + + auto compound = lhs; + compound += rhs; + compound -= rhs; + expect_equal(compound, array_t{0, 1, 2, 3}); + + auto bitwise_compound = simd_t(3); + bitwise_compound &= rhs; + bitwise_compound |= simd_t(4); + bitwise_compound ^= simd_t(1); + expect_equal(bitwise_compound, array_t{7, 7, 7, 7}); + + auto shift_compound = simd_t(1); + shift_compound <<= rhs; + shift_compound >>= rhs; + expect_equal(shift_compound, array_t{1, 1, 1, 1}); mask_t eq_mask = (lhs == lhs); CUDAX_REQUIRE(eq_mask.all()); - mask_t lt_mask = (lhs < simd_t(2)); + mask_t lt_mask = (lhs < 2); CUDAX_REQUIRE(lt_mask.count() == 2); - mask_t ge_mask = (lhs >= simd_t(1)); + + mask_t scalar_first_lt = (2 < lhs); + CUDAX_REQUIRE(scalar_first_lt.count() == 2); + + mask_t scalar_eq_rhs = (lhs == 1); + CUDAX_REQUIRE(scalar_eq_rhs.count() == 1); + + mask_t scalar_eq_lhs = (1 == lhs); + CUDAX_REQUIRE(scalar_eq_lhs.count() == 1); + + mask_t ge_mask = (lhs >= 1); CUDAX_REQUIRE(ge_mask.any()); CUDAX_REQUIRE(!ge_mask.none()); auto negated = -lhs; - expect_equal(negated, ::cuda::std::array{0, -1, -2, -3}); + expect_equal(negated, array_t{0, -1, -2, -3}); + + auto bitwise_not = ~lhs; + expect_equal(bitwise_not, array_t{-1, -2, -3, -4}); } C2H_CCCLRT_TEST("simd.mask", "[simd][mask]") { - using mask_t = dp::simd_mask; - using simd_t = dp::simd; + using mask_t = dp::simd_mask; + using simd_t = dp::simd; + using mask_array_t = ::cuda::std::array; + using simd_array_typed = simd_array_t; mask_t alternating(alternating_mask_generator{}); - expect_equal(alternating, ::cuda::std::array{true, false, true, false}); + expect_equal(alternating, mask_array_t{true, false, 
true, false}); CUDAX_REQUIRE(alternating.count() == 2); CUDAX_REQUIRE(alternating.any()); CUDAX_REQUIRE(!alternating.all()); CUDAX_REQUIRE(!alternating.none()); mask_t inverted = !alternating; - expect_equal(inverted, ::cuda::std::array{false, true, false, true}); + expect_equal(inverted, mask_array_t{false, true, false, true}); mask_t zero = alternating & inverted; CUDAX_REQUIRE(zero.none()); @@ -205,14 +267,56 @@ C2H_CCCLRT_TEST("simd.mask", "[simd][mask]") mask_t loaded(buffer); CUDAX_REQUIRE(loaded == alternating); + mask_t copied(false); + copied.copy_from(buffer); + CUDAX_REQUIRE(copied == alternating); + + alignas(64) bool aligned_buffer[mask_t::size()] = {true, true, false, false}; + mask_t from_aligned(false); + from_aligned.copy_from(aligned_buffer, dp::overaligned<64>); + bool aligned_roundtrip[mask_t::size()] = {}; + from_aligned.copy_to(aligned_roundtrip, dp::overaligned<64>); + mask_t roundtrip_check(aligned_roundtrip); + CUDAX_REQUIRE(roundtrip_check == from_aligned); + auto vec_from_mask = static_cast(alternating); - expect_equal(vec_from_mask, ::cuda::std::array{1, 0, 1, 0}); + expect_equal(vec_from_mask, simd_array_typed{1, 0, 1, 0}); mask_t mutated = alternating; mutated[1] = true; mutated[3] = true; CUDAX_REQUIRE(mutated.all()); + mask_t xor_mask = alternating ^ inverted; + CUDAX_REQUIRE(xor_mask.all()); + + mask_t assigned = alternating; + assigned ^= inverted; + CUDAX_REQUIRE(assigned.all()); + + assigned &= combined; + CUDAX_REQUIRE(assigned.all()); + mask_t broadcast_true(true); CUDAX_REQUIRE(broadcast_true.all()); } + +C2H_CCCLRT_TEST("simd.reference", "[simd][reference]") +{ + using simd_t = dp::simd; + using array_t = simd_array_t; + + simd_t values(identity_index_generator{}); + values[2] += 5; + expect_equal(values, array_t{0, 1, 7, 3}); + + using ::cuda::std::swap; + + swap(values[0], values[3]); + int scalar = 42; + swap(values[1], scalar); + swap(scalar, values[2]); + + expect_equal(values, array_t{3, 42, 1, 0}); + 
CUDAX_REQUIRE(scalar == 7); +} From 4ebcb5df0bbefb84159e2c1dba0426cc731aa5b2 Mon Sep 17 00:00:00 2001 From: fbusato Date: Fri, 21 Nov 2025 09:28:28 -0800 Subject: [PATCH 13/32] working unit test --- cudax/include/cuda/experimental/__simd/simd.h | 6 +++--- cudax/test/simd/simd.cu | 16 ++++++++-------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/cudax/include/cuda/experimental/__simd/simd.h b/cudax/include/cuda/experimental/__simd/simd.h index 0f7ed9437d9..c83ba9cdb75 100644 --- a/cudax/include/cuda/experimental/__simd/simd.h +++ b/cudax/include/cuda/experimental/__simd/simd.h @@ -89,20 +89,20 @@ class basic_simd : public __simd_operations<_Tp, _Abi> } _CCCL_API explicit basic_simd(const _Storage& __s, __storage_tag_t) - : __s_(__s) + : __s_{__s} {} _CCCL_TEMPLATE(typename _Up) _CCCL_REQUIRES(__can_broadcast_v>&& __is_value_preserving_broadcast<_Up>) _CCCL_API constexpr basic_simd(_Up&& __v) noexcept - : __s_(_Impl::__broadcast(static_cast(__v))) + : __s_{_Impl::__broadcast(static_cast(__v))} {} _CCCL_TEMPLATE(typename _Up) _CCCL_REQUIRES((__can_broadcast_v> && !__is_value_preserving_broadcast<_Up>) ) _CCCL_API constexpr explicit basic_simd(_Up&& __v) noexcept - : __s_(_Impl::__broadcast(static_cast(__v))) + : __s_{_Impl::__broadcast(static_cast(__v))} {} _CCCL_TEMPLATE(typename _Up) diff --git a/cudax/test/simd/simd.cu b/cudax/test/simd/simd.cu index ecb48155fa8..78785b2b179 100644 --- a/cudax/test/simd/simd.cu +++ b/cudax/test/simd/simd.cu @@ -49,17 +49,17 @@ struct alternating_mask_generator }; } // namespace -template +template __host__ __device__ void expect_equal(const Simd& actual, const ::cuda::std::array& expected) { static_assert(N == Simd::size(), "Mismatch between expected values and simd width"); - for (::cuda::std::size_t i = 0; i < N; ++i) + for (size_t i = 0; i < N; ++i) { CUDAX_REQUIRE(actual[i] == expected[i]); } } -template +template using simd_array_t = ::cuda::std::array; C2H_CCCLRT_TEST("simd.traits", "[simd][traits]") @@ 
-112,7 +112,7 @@ C2H_CCCLRT_TEST("simd.construction_and_memory", "[simd][construction]") simd_t from_ptr(storage, dp::overaligned<64>); expect_equal(from_ptr, array_t{0, 1, 2, 3}); - int roundtrip[simd_t::size()] = {}; + alignas(64) int roundtrip[simd_t::size()] = {}; generated.copy_to(roundtrip, dp::overaligned<64>); simd_t loaded; @@ -122,8 +122,8 @@ C2H_CCCLRT_TEST("simd.construction_and_memory", "[simd][construction]") dp::simd widened(generated); expect_equal(widened, ::cuda::std::array{0.0F, 2.0F, 4.0F, 6.0F}); - mask_t from_simd = static_cast(generated); - expect_equal(from_simd, ::cuda::std::array{false, true, true, true}); + // mask_t from_simd = static_cast(generated); + // expect_equal(from_simd, ::cuda::std::array{false, true, true, true}); dp::simd assigned = simd_t(identity_index_generator{}); assigned = generated; @@ -219,7 +219,7 @@ C2H_CCCLRT_TEST("simd.arithmetic_and_comparisons", "[simd][arithmetic]") mask_t lt_mask = (lhs < 2); CUDAX_REQUIRE(lt_mask.count() == 2); - mask_t scalar_first_lt = (2 < lhs); + mask_t scalar_first_lt = (2 <= lhs); CUDAX_REQUIRE(scalar_first_lt.count() == 2); mask_t scalar_eq_rhs = (lhs == 1); @@ -274,7 +274,7 @@ C2H_CCCLRT_TEST("simd.mask", "[simd][mask]") alignas(64) bool aligned_buffer[mask_t::size()] = {true, true, false, false}; mask_t from_aligned(false); from_aligned.copy_from(aligned_buffer, dp::overaligned<64>); - bool aligned_roundtrip[mask_t::size()] = {}; + alignas(64) bool aligned_roundtrip[mask_t::size()] = {}; from_aligned.copy_to(aligned_roundtrip, dp::overaligned<64>); mask_t roundtrip_check(aligned_roundtrip); CUDAX_REQUIRE(roundtrip_check == from_aligned); From 489d3f392d040bdec3ad1360d2ac4d938d36e2ed Mon Sep 17 00:00:00 2001 From: fbusato Date: Fri, 21 Nov 2025 10:01:15 -0800 Subject: [PATCH 14/32] fix semantic --- cudax/include/cuda/experimental/__simd/simd.h | 5 ++--- cudax/include/cuda/experimental/__simd/traits.h | 6 +++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git 
a/cudax/include/cuda/experimental/__simd/simd.h b/cudax/include/cuda/experimental/__simd/simd.h index c83ba9cdb75..b9368d4d9c4 100644 --- a/cudax/include/cuda/experimental/__simd/simd.h +++ b/cudax/include/cuda/experimental/__simd/simd.h @@ -60,8 +60,7 @@ class basic_simd : public __simd_operations<_Tp, _Abi> using mask_type = basic_simd_mask; _CCCL_TEMPLATE(typename _Up, typename _Ap) - _CCCL_REQUIRES((::cuda::std::is_same_v::abi_type, abi_type> - _CCCL_AND ::cuda::std::is_same_v::value_type, value_type>) ) + _CCCL_REQUIRES((::cuda::std::is_same_v<_Up, value_type>) _CCCL_AND ::cuda::std::is_same_v<_Ap, abi_type>) _CCCL_API explicit operator basic_simd_mask<_Up, _Ap>() const noexcept { basic_simd_mask<_Up, _Ap> __result; @@ -503,7 +502,7 @@ class basic_simd : public __simd_operations<_Tp, _Abi> [[nodiscard]] _CCCL_API constexpr friend mask_type operator!=(const basic_simd& __lhs, const basic_simd& __rhs) noexcept { - return basic_simd{_Impl::__not_equal_to(__lhs.__s_, __rhs.__s_), __storage_tag}; + return mask_type{_Impl::__not_equal_to(__lhs.__s_, __rhs.__s_), __storage_tag}; } _CCCL_TEMPLATE(typename _Up) diff --git a/cudax/include/cuda/experimental/__simd/traits.h b/cudax/include/cuda/experimental/__simd/traits.h index ae1cf71ba92..db86ea79cb5 100644 --- a/cudax/include/cuda/experimental/__simd/traits.h +++ b/cudax/include/cuda/experimental/__simd/traits.h @@ -113,11 +113,11 @@ struct simd_size<_Tp, _Abi, false> template > inline constexpr ::cuda::std::size_t simd_size_v = simd_size<_Tp, _Abi>::value; -template -inline constexpr bool is_simd_v> = true; +template +inline constexpr ::cuda::std::size_t simd_size_v<_Tp, void> = _Tp::size(); template -inline constexpr bool is_simd_v> = true; +inline constexpr bool is_simd_v> = true; template inline constexpr bool is_simd_mask_v> = true; From b66bf7ea1c21afa8804041cd3203be9f8d16d5ea Mon Sep 17 00:00:00 2001 From: fbusato Date: Fri, 21 Nov 2025 10:41:22 -0800 Subject: [PATCH 15/32] add simd::scalar --- 
.../cuda/experimental/__simd/declaration.h | 4 +- .../experimental/__simd/fixed_size_impl.h | 5 +- .../cuda/experimental/__simd/scalar_impl.h | 337 ++++++++++++++++++ cudax/include/cuda/experimental/__simd/simd.h | 1 + .../cuda/experimental/__simd/simd_mask.h | 3 +- .../include/cuda/experimental/__simd/traits.h | 3 + cudax/test/simd/simd.cu | 33 +- 7 files changed, 366 insertions(+), 20 deletions(-) create mode 100644 cudax/include/cuda/experimental/__simd/scalar_impl.h diff --git a/cudax/include/cuda/experimental/__simd/declaration.h b/cudax/include/cuda/experimental/__simd/declaration.h index 14c9f3e14af..f13c4e9ac03 100644 --- a/cudax/include/cuda/experimental/__simd/declaration.h +++ b/cudax/include/cuda/experimental/__simd/declaration.h @@ -27,7 +27,9 @@ namespace cuda::experimental::datapar { namespace simd_abi { -struct vector_abi; +struct __scalar; + +using scalar = __scalar; template struct __fixed_size; diff --git a/cudax/include/cuda/experimental/__simd/fixed_size_impl.h b/cudax/include/cuda/experimental/__simd/fixed_size_impl.h index d7c0a2f6b0c..836ecd22fac 100644 --- a/cudax/include/cuda/experimental/__simd/fixed_size_impl.h +++ b/cudax/include/cuda/experimental/__simd/fixed_size_impl.h @@ -268,7 +268,7 @@ struct __mask_operations<_Tp, simd_abi::__fixed_size<_Np>> template [[nodiscard]] _CCCL_API static constexpr _MaskStorage - __generate_init(_Generator&& __g, ::cuda::std::index_sequence<_Is...>) noexcept + __generate_init(_Generator&& __g, ::cuda::std::index_sequence<_Is...>) { _MaskStorage __result; ((__result.__set(_Is, @@ -279,7 +279,7 @@ struct __mask_operations<_Tp, simd_abi::__fixed_size<_Np>> } template - [[nodiscard]] _CCCL_API static constexpr _MaskStorage __generate(_Generator&& __g) noexcept + [[nodiscard]] _CCCL_API static constexpr _MaskStorage __generate(_Generator&& __g) { return __generate_init(__g, ::cuda::std::make_index_sequence<_Np>()); } @@ -358,6 +358,7 @@ struct __mask_operations<_Tp, simd_abi::__fixed_size<_Np>> return false; 
} + // P1928R15 requires simd-size-type (ptrdiff_t) return type [[nodiscard]] _CCCL_API static constexpr int __count(const _MaskStorage& __s) noexcept { int __cnt = 0; diff --git a/cudax/include/cuda/experimental/__simd/scalar_impl.h b/cudax/include/cuda/experimental/__simd/scalar_impl.h new file mode 100644 index 00000000000..55813ae1073 --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/scalar_impl.h @@ -0,0 +1,337 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_SCALAR_IMPL_H +#define _CUDAX___SIMD_SCALAR_IMPL_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace cuda::experimental::datapar +{ +namespace simd_abi +{ +struct __scalar +{ + static constexpr ::cuda::std::size_t __simd_size = 1; +}; +} // namespace simd_abi + +template +struct __simd_storage<_Tp, simd_abi::__scalar> +{ + using value_type = _Tp; + _Tp __data; + + [[nodiscard]] _CCCL_API constexpr _Tp __get([[maybe_unused]] ::cuda::std::size_t __idx) const noexcept + { + _CCCL_ASSERT(__idx == 0, "Index is out of bounds"); + return __data; + } + + _CCCL_API constexpr void __set([[maybe_unused]] ::cuda::std::size_t __idx, _Tp __v) noexcept + { + _CCCL_ASSERT(__idx == 0, "Index is out of bounds"); 
+ __data = __v; + } +}; + +template +struct __mask_storage<_Tp, simd_abi::__scalar> + : __simd_storage<::cuda::std::__make_nbit_uint_t<::cuda::std::__num_bits_v<_Tp>>, simd_abi::__scalar> +{ + using value_type = ::cuda::std::__make_nbit_uint_t<::cuda::std::__num_bits_v<_Tp>>; +}; + +// ********************************************************************************************************************* +// * SIMD Arithmetic Operations +// ********************************************************************************************************************* + +template +struct __simd_operations<_Tp, simd_abi::__scalar> +{ + using _SimdStorage = __simd_storage<_Tp, simd_abi::__scalar>; + using _MaskStorage = __mask_storage<_Tp, simd_abi::__scalar>; + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage __broadcast(_Tp __v) noexcept + { + return _SimdStorage{__v}; + } + + template + [[nodiscard]] _CCCL_API static constexpr _SimdStorage __generate(_Generator&& __g) + { + return _SimdStorage{__g(::cuda::std::integral_constant<::cuda::std::size_t, 0>())}; + } + + template + _CCCL_API static constexpr void __load(_SimdStorage& __s, const _Up* __mem) noexcept + { + __s.__data = static_cast<_Tp>(__mem[0]); + } + + template + _CCCL_API static constexpr void __store(const _SimdStorage& __s, _Up* __mem) noexcept + { + __mem[0] = static_cast<_Up>(__s.__data); + } + + _CCCL_API static constexpr void __increment(_SimdStorage& __s) noexcept + { + __s.__data += 1; + } + + _CCCL_API static constexpr void __decrement(_SimdStorage& __s) noexcept + { + __s.__data -= 1; + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage __negate(const _SimdStorage& __s) noexcept + { + _MaskStorage __result; + __result.__data = ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(!__s.__data); + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage __bitwise_not(const _SimdStorage& __s) noexcept + { + return _SimdStorage{static_cast<_Tp>(~__s.__data)}; + 
} + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage __unary_minus(const _SimdStorage& __s) noexcept + { + return _SimdStorage{static_cast<_Tp>(-__s.__data)}; + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __plus(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + return _SimdStorage{static_cast<_Tp>(__lhs.__data + __rhs.__data)}; + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __minus(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + return _SimdStorage{static_cast<_Tp>(__lhs.__data - __rhs.__data)}; + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __multiplies(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + return _SimdStorage{static_cast<_Tp>(__lhs.__data * __rhs.__data)}; + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __divides(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + return _SimdStorage{static_cast<_Tp>(__lhs.__data / __rhs.__data)}; + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __equal_to(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _MaskStorage __result; + __result.__data = ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(__lhs.__data == __rhs.__data); + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __not_equal_to(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _MaskStorage __result; + __result.__data = ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(__lhs.__data != __rhs.__data); + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __less(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _MaskStorage __result; + __result.__data = ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(__lhs.__data < __rhs.__data); + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + 
__less_equal(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _MaskStorage __result; + __result.__data = ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(__lhs.__data <= __rhs.__data); + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __greater(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _MaskStorage __result; + __result.__data = ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(__lhs.__data > __rhs.__data); + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __greater_equal(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + _MaskStorage __result; + __result.__data = ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(__lhs.__data >= __rhs.__data); + return __result; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __modulo(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + return _SimdStorage{static_cast<_Tp>(__lhs.__data % __rhs.__data)}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __bitwise_and(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + return _SimdStorage{static_cast<_Tp>(__lhs.__data & __rhs.__data)}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __bitwise_or(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + return _SimdStorage{static_cast<_Tp>(__lhs.__data | __rhs.__data)}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __bitwise_xor(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + return 
_SimdStorage{static_cast<_Tp>(__lhs.__data ^ __rhs.__data)}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __shift_left(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + return _SimdStorage{static_cast<_Tp>(__lhs.__data << __rhs.__data)}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __shift_right(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { + return _SimdStorage{static_cast<_Tp>(__lhs.__data >> __rhs.__data)}; + } +}; + +// ********************************************************************************************************************* +// * SIMD Mask Operations +// ********************************************************************************************************************* + +template +struct __mask_operations<_Tp, simd_abi::__scalar> +{ + using _MaskStorage = __mask_storage<_Tp, simd_abi::__scalar>; + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage __broadcast(bool __v) noexcept + { + _MaskStorage __result; + __result.__data = ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(__v); + return __result; + } + + template + [[nodiscard]] _CCCL_API static constexpr _MaskStorage __generate(_Generator&& __g) + { + _MaskStorage __result; + __result.__data = ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>( + static_cast(__g(::cuda::std::integral_constant<::cuda::std::size_t, 0>()))); + return __result; + } + + _CCCL_API static constexpr void __load(_MaskStorage& __s, const bool* __mem) noexcept + { + __s.__data = ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(__mem[0]); + } + + _CCCL_API static constexpr void __store(const _MaskStorage& __s, bool* __mem) noexcept + { + __mem[0] = static_cast(__s.__data); + } + + [[nodiscard]] _CCCL_API static 
constexpr _MaskStorage + __bitwise_and(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept + { + return _MaskStorage{__lhs.__data & __rhs.__data}; + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __bitwise_or(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept + { + return _MaskStorage{__lhs.__data | __rhs.__data}; + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __bitwise_xor(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept + { + return _MaskStorage{__lhs.__data ^ __rhs.__data}; + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage __bitwise_not(const _MaskStorage& __s) noexcept + { + return _MaskStorage{~__s.__data}; + } + + [[nodiscard]] _CCCL_API static constexpr bool __equal_to(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept + { + return __lhs.__data == __rhs.__data; + } + + [[nodiscard]] _CCCL_API static constexpr bool __all(const _MaskStorage& __s) noexcept + { + return static_cast(__s.__data); + } + + [[nodiscard]] _CCCL_API static constexpr bool __any(const _MaskStorage& __s) noexcept + { + return static_cast(__s.__data); + } + + [[nodiscard]] _CCCL_API static constexpr int __count(const _MaskStorage& __s) noexcept + { + return static_cast(__s.__data) ? 
1 : 0; + } +}; +} // namespace cuda::experimental::datapar + +#include + +#endif // _CUDAX___SIMD_SCALAR_IMPL_H diff --git a/cudax/include/cuda/experimental/__simd/simd.h b/cudax/include/cuda/experimental/__simd/simd.h index b9368d4d9c4..28823e4e5e4 100644 --- a/cudax/include/cuda/experimental/__simd/simd.h +++ b/cudax/include/cuda/experimental/__simd/simd.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include diff --git a/cudax/include/cuda/experimental/__simd/simd_mask.h b/cudax/include/cuda/experimental/__simd/simd_mask.h index f909c2c656a..20d672ceb2e 100644 --- a/cudax/include/cuda/experimental/__simd/simd_mask.h +++ b/cudax/include/cuda/experimental/__simd/simd_mask.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -143,7 +144,7 @@ class basic_simd_mask : public __mask_operations<_Tp, _Abi> { basic_simd<_Up, _Ap> __result; _CCCL_PRAGMA_UNROLL_FULL() - for (::cuda::std::size_t __i = 0; __i < size(); ++__i) + for (::cuda::std::size_t __i = 0; __i < simd_size_v<_Up, _Ap>; ++__i) { __result[__i] = static_cast<_Up>((*this)[__i]); } diff --git a/cudax/include/cuda/experimental/__simd/traits.h b/cudax/include/cuda/experimental/__simd/traits.h index db86ea79cb5..fed67785c3d 100644 --- a/cudax/include/cuda/experimental/__simd/traits.h +++ b/cudax/include/cuda/experimental/__simd/traits.h @@ -76,6 +76,9 @@ template struct is_abi_tag : ::cuda::std::bool_constant> {}; +template <> +inline constexpr bool is_abi_tag_v = true; + template inline constexpr bool is_abi_tag_v> = true; diff --git a/cudax/test/simd/simd.cu b/cudax/test/simd/simd.cu index 78785b2b179..9938431e329 100644 --- a/cudax/test/simd/simd.cu +++ b/cudax/test/simd/simd.cu @@ -21,7 +21,7 @@ namespace dp = cuda::experimental::datapar; namespace { -struct identity_index_generator +struct linear_index_gen { template __host__ __device__ constexpr int operator()(Index idx) const @@ -30,7 +30,7 @@ struct identity_index_generator } }; -struct 
double_index_generator +struct linear_index_x2_gen { template __host__ __device__ constexpr int operator()(Index idx) const @@ -39,7 +39,7 @@ struct double_index_generator } }; -struct alternating_mask_generator +struct alternating_mask_gen { template __host__ __device__ constexpr bool operator()(Index idx) const @@ -98,21 +98,22 @@ C2H_CCCLRT_TEST("simd.traits", "[simd][traits]") C2H_CCCLRT_TEST("simd.construction_and_memory", "[simd][construction]") { - using simd_t = dp::simd; - using mask_t = simd_t::mask_type; - using array_t = simd_array_t; + constexpr auto size = 4; + using simd_t = dp::simd; + using mask_t = simd_t::mask_type; + using array_t = simd_array_t; simd_t broadcast(7); expect_equal(broadcast, array_t{7, 7, 7, 7}); - simd_t generated(double_index_generator{}); + simd_t generated(linear_index_x2_gen{}); expect_equal(generated, array_t{0, 2, 4, 6}); - alignas(64) int storage[simd_t::size()] = {0, 1, 2, 3}; + alignas(64) int storage[size] = {0, 1, 2, 3}; simd_t from_ptr(storage, dp::overaligned<64>); expect_equal(from_ptr, array_t{0, 1, 2, 3}); - alignas(64) int roundtrip[simd_t::size()] = {}; + alignas(64) int roundtrip[size] = {}; generated.copy_to(roundtrip, dp::overaligned<64>); simd_t loaded; @@ -120,12 +121,12 @@ C2H_CCCLRT_TEST("simd.construction_and_memory", "[simd][construction]") expect_equal(loaded, array_t{0, 2, 4, 6}); dp::simd widened(generated); - expect_equal(widened, ::cuda::std::array{0.0F, 2.0F, 4.0F, 6.0F}); + expect_equal(widened, ::cuda::std::array{0.0f, 2.0f, 4.0f, 6.0f}); - // mask_t from_simd = static_cast(generated); - // expect_equal(from_simd, ::cuda::std::array{false, true, true, true}); + mask_t from_simd = static_cast(generated); + expect_equal(from_simd, ::cuda::std::array{false, true, true, true}); - dp::simd assigned = simd_t(identity_index_generator{}); + dp::simd assigned = simd_t(linear_index_gen{}); assigned = generated; expect_equal(assigned, array_t{0, 2, 4, 6}); @@ -144,7 +145,7 @@ 
C2H_CCCLRT_TEST("simd.arithmetic_and_comparisons", "[simd][arithmetic]") using mask_t = simd_t::mask_type; using array_t = simd_array_t; - simd_t lhs(identity_index_generator{}); + simd_t lhs(linear_index_gen{}); simd_t rhs(2); auto sum = lhs + rhs; @@ -246,7 +247,7 @@ C2H_CCCLRT_TEST("simd.mask", "[simd][mask]") using mask_array_t = ::cuda::std::array; using simd_array_typed = simd_array_t; - mask_t alternating(alternating_mask_generator{}); + mask_t alternating(alternating_mask_gen{}); expect_equal(alternating, mask_array_t{true, false, true, false}); CUDAX_REQUIRE(alternating.count() == 2); CUDAX_REQUIRE(alternating.any()); @@ -306,7 +307,7 @@ C2H_CCCLRT_TEST("simd.reference", "[simd][reference]") using simd_t = dp::simd; using array_t = simd_array_t; - simd_t values(identity_index_generator{}); + simd_t values(linear_index_gen{}); values[2] += 5; expect_equal(values, array_t{0, 1, 7, 3}); From fd238724886f2f71ac9db8af3b3f6213a12008dd Mon Sep 17 00:00:00 2001 From: fbusato Date: Fri, 21 Nov 2025 12:30:26 -0800 Subject: [PATCH 16/32] use bool as mask_storage --- .../experimental/__simd/fixed_size_impl.h | 59 +++++++++---------- .../cuda/experimental/__simd/scalar_impl.h | 56 +++++------------- .../cuda/experimental/__simd/utility.h | 14 ----- 3 files changed, 43 insertions(+), 86 deletions(-) diff --git a/cudax/include/cuda/experimental/__simd/fixed_size_impl.h b/cudax/include/cuda/experimental/__simd/fixed_size_impl.h index 836ecd22fac..4864f9fc593 100644 --- a/cudax/include/cuda/experimental/__simd/fixed_size_impl.h +++ b/cudax/include/cuda/experimental/__simd/fixed_size_impl.h @@ -26,12 +26,9 @@ #include #include #include -#include -#include #include #include -#include #include @@ -67,11 +64,11 @@ struct __simd_storage<_Tp, simd_abi::__fixed_size<_Np>> } }; +// use a single bit for the mask storage could be not efficient in CUDA template -struct __mask_storage<_Tp, simd_abi::__fixed_size<_Np>> - : 
__simd_storage<::cuda::std::__make_nbit_uint_t<::cuda::std::__num_bits_v<_Tp>>, simd_abi::__fixed_size<_Np>> +struct __mask_storage<_Tp, simd_abi::__fixed_size<_Np>> : public __simd_storage> { - using value_type = ::cuda::std::__make_nbit_uint_t<::cuda::std::__num_bits_v<_Tp>>; + using value_type = bool; }; // Helper macros to generate repeated fixed-size operations. @@ -88,18 +85,17 @@ struct __mask_storage<_Tp, simd_abi::__fixed_size<_Np>> return __result; \ } -#define _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP(_Name, _Op) \ - [[nodiscard]] _CCCL_API static constexpr _MaskStorage _Name( \ - const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept \ - { \ - _MaskStorage __result; \ - _CCCL_PRAGMA_UNROLL_FULL() \ - for (int __i = 0; __i < _Np; ++__i) \ - { \ - __result.__data[__i] = \ - ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>((__lhs.__data[__i] _Op __rhs.__data[__i])); \ - } \ - return __result; \ +#define _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP(_Name, _Op) \ + [[nodiscard]] _CCCL_API static constexpr _MaskStorage _Name( \ + const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept \ + { \ + _MaskStorage __result; \ + _CCCL_PRAGMA_UNROLL_FULL() \ + for (int __i = 0; __i < _Np; ++__i) \ + { \ + __result.__data[__i] = (__lhs.__data[__i] _Op __rhs.__data[__i]); \ + } \ + return __result; \ } #define _CUDAX_SIMD_FIXED_SIZE_BITWISE_OP(_StorageType, _Name, _Op) \ @@ -185,7 +181,7 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size<_Np>> _CCCL_PRAGMA_UNROLL_FULL() for (int __i = 0; __i < _Np; __i++) { - __result.__data[__i] = ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(!__s.__data[__i]); + __result.__data[__i] = !__s.__data[__i]; } return __result; } @@ -257,11 +253,10 @@ struct __mask_operations<_Tp, simd_abi::__fixed_size<_Np>> [[nodiscard]] _CCCL_API static constexpr _MaskStorage __broadcast(bool __v) noexcept { _MaskStorage __result; - const auto __all_bits_v = 
::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(__v); _CCCL_PRAGMA_UNROLL_FULL() for (int __i = 0; __i < _Np; ++__i) { - __result.__set(__i, __all_bits_v); + __result.__set(__i, __v); } return __result; } @@ -271,10 +266,7 @@ struct __mask_operations<_Tp, simd_abi::__fixed_size<_Np>> __generate_init(_Generator&& __g, ::cuda::std::index_sequence<_Is...>) { _MaskStorage __result; - ((__result.__set(_Is, - ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>( - static_cast(__g(::cuda::std::integral_constant<::cuda::std::size_t, _Is>()))))), - ...); + ((__result.__set(_Is, static_cast(__g(::cuda::std::integral_constant<::cuda::std::size_t, _Is>())))), ...); return __result; } @@ -289,7 +281,7 @@ struct __mask_operations<_Tp, simd_abi::__fixed_size<_Np>> _CCCL_PRAGMA_UNROLL_FULL() for (int __i = 0; __i < _Np; __i++) { - __s.__data[__i] = ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(__mem[__i]); + __s.__data[__i] = __mem[__i]; } } @@ -302,11 +294,12 @@ struct __mask_operations<_Tp, simd_abi::__fixed_size<_Np>> } } - _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_MaskStorage, __bitwise_and, &) + // TODO: optimize with uint32 SWAR + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_MaskStorage, __bitwise_and, &&) - _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_MaskStorage, __bitwise_or, |) + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_MaskStorage, __bitwise_or, ||) - _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_MaskStorage, __bitwise_xor, ^) + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_MaskStorage, __bitwise_xor, !=) [[nodiscard]] _CCCL_API static constexpr _MaskStorage __bitwise_not(const _MaskStorage& __s) noexcept { @@ -314,7 +307,7 @@ struct __mask_operations<_Tp, simd_abi::__fixed_size<_Np>> _CCCL_PRAGMA_UNROLL_FULL() for (int __i = 0; __i < _Np; __i++) { - __result.__data[__i] = ~__s.__data[__i]; + __result.__data[__i] = !__s.__data[__i]; } return __result; } @@ -359,7 +352,7 @@ struct __mask_operations<_Tp, 
simd_abi::__fixed_size<_Np>> } // P1928R15 requires simd-size-type (ptrdiff_t) return type - [[nodiscard]] _CCCL_API static constexpr int __count(const _MaskStorage& __s) noexcept + [[nodiscard]] _CCCL_API static constexpr ::cuda::std::ptrdiff_t __count(const _MaskStorage& __s) noexcept { int __cnt = 0; _CCCL_PRAGMA_UNROLL_FULL() @@ -370,7 +363,9 @@ struct __mask_operations<_Tp, simd_abi::__fixed_size<_Np>> ++__cnt; } } - return __cnt; + const auto __ret = static_cast<::cuda::std::ptrdiff_t>(__cnt); + _CCCL_ASSUME(__ret >= 0 && __ret <= _Np); + return __ret; } }; diff --git a/cudax/include/cuda/experimental/__simd/scalar_impl.h b/cudax/include/cuda/experimental/__simd/scalar_impl.h index 55813ae1073..d1f152e9aa7 100644 --- a/cudax/include/cuda/experimental/__simd/scalar_impl.h +++ b/cudax/include/cuda/experimental/__simd/scalar_impl.h @@ -21,17 +21,13 @@ # pragma system_header #endif // no system header -#include #include #include #include #include -#include -#include #include #include -#include #include @@ -65,10 +61,9 @@ struct __simd_storage<_Tp, simd_abi::__scalar> }; template -struct __mask_storage<_Tp, simd_abi::__scalar> - : __simd_storage<::cuda::std::__make_nbit_uint_t<::cuda::std::__num_bits_v<_Tp>>, simd_abi::__scalar> +struct __mask_storage<_Tp, simd_abi::__scalar> : __simd_storage { - using value_type = ::cuda::std::__make_nbit_uint_t<::cuda::std::__num_bits_v<_Tp>>; + using value_type = bool; }; // ********************************************************************************************************************* @@ -116,9 +111,7 @@ struct __simd_operations<_Tp, simd_abi::__scalar> [[nodiscard]] _CCCL_API static constexpr _MaskStorage __negate(const _SimdStorage& __s) noexcept { - _MaskStorage __result; - __result.__data = ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(!__s.__data); - return __result; + return _MaskStorage{!__s.__data}; } [[nodiscard]] _CCCL_API static constexpr _SimdStorage __bitwise_not(const _SimdStorage& __s) 
noexcept @@ -158,49 +151,37 @@ struct __simd_operations<_Tp, simd_abi::__scalar> [[nodiscard]] _CCCL_API static constexpr _MaskStorage __equal_to(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { - _MaskStorage __result; - __result.__data = ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(__lhs.__data == __rhs.__data); - return __result; + return _MaskStorage{__lhs.__data == __rhs.__data}; } [[nodiscard]] _CCCL_API static constexpr _MaskStorage __not_equal_to(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { - _MaskStorage __result; - __result.__data = ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(__lhs.__data != __rhs.__data); - return __result; + return _MaskStorage{__lhs.__data != __rhs.__data}; } [[nodiscard]] _CCCL_API static constexpr _MaskStorage __less(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { - _MaskStorage __result; - __result.__data = ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(__lhs.__data < __rhs.__data); - return __result; + return _MaskStorage{__lhs.__data < __rhs.__data}; } [[nodiscard]] _CCCL_API static constexpr _MaskStorage __less_equal(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { - _MaskStorage __result; - __result.__data = ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(__lhs.__data <= __rhs.__data); - return __result; + return _MaskStorage{__lhs.__data <= __rhs.__data}; } [[nodiscard]] _CCCL_API static constexpr _MaskStorage __greater(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { - _MaskStorage __result; - __result.__data = ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(__lhs.__data > __rhs.__data); - return __result; + return _MaskStorage{__lhs.__data > __rhs.__data}; } [[nodiscard]] _CCCL_API static constexpr _MaskStorage __greater_equal(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { - _MaskStorage __result; - __result.__data = 
::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(__lhs.__data >= __rhs.__data); - return __result; + return _MaskStorage{__lhs.__data >= __rhs.__data}; } _CCCL_TEMPLATE(typename _Up = _Tp) @@ -263,18 +244,13 @@ struct __mask_operations<_Tp, simd_abi::__scalar> [[nodiscard]] _CCCL_API static constexpr _MaskStorage __broadcast(bool __v) noexcept { - _MaskStorage __result; - __result.__data = ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(__v); - return __result; + return _MaskStorage{__v}; } template - [[nodiscard]] _CCCL_API static constexpr _MaskStorage __generate(_Generator&& __g) + [[nodiscard]] _CCCL_API static constexpr _MaskStorage __generate(_Generator&& __g) { - _MaskStorage __result; - __result.__data = ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>( - static_cast(__g(::cuda::std::integral_constant<::cuda::std::size_t, 0>()))); - return __result; + return _MaskStorage{static_cast(__g(::cuda::std::integral_constant<::cuda::std::size_t, 0>()))}; } _CCCL_API static constexpr void __load(_MaskStorage& __s, const bool* __mem) noexcept @@ -290,24 +266,24 @@ struct __mask_operations<_Tp, simd_abi::__scalar> [[nodiscard]] _CCCL_API static constexpr _MaskStorage __bitwise_and(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept { - return _MaskStorage{__lhs.__data & __rhs.__data}; + return _MaskStorage{__lhs.__data && __rhs.__data}; } [[nodiscard]] _CCCL_API static constexpr _MaskStorage __bitwise_or(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept { - return _MaskStorage{__lhs.__data | __rhs.__data}; + return _MaskStorage{__lhs.__data || __rhs.__data}; } [[nodiscard]] _CCCL_API static constexpr _MaskStorage __bitwise_xor(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept { - return _MaskStorage{__lhs.__data ^ __rhs.__data}; + return _MaskStorage{__lhs.__data != __rhs.__data}; } [[nodiscard]] _CCCL_API static constexpr _MaskStorage __bitwise_not(const _MaskStorage& __s) 
noexcept { - return _MaskStorage{~__s.__data}; + return _MaskStorage{!__s.__data}; } [[nodiscard]] _CCCL_API static constexpr bool __equal_to(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept diff --git a/cudax/include/cuda/experimental/__simd/utility.h b/cudax/include/cuda/experimental/__simd/utility.h index 441889ed7df..ff86058738f 100644 --- a/cudax/include/cuda/experimental/__simd/utility.h +++ b/cudax/include/cuda/experimental/__simd/utility.h @@ -22,17 +22,13 @@ #endif // no system header #include -#include #include #include #include #include #include #include -#include #include -#include -#include #include #include #include @@ -41,16 +37,6 @@ namespace cuda::experimental::datapar { -template -[[nodiscard]] _CCCL_API constexpr typename _Storage::value_type __mask_bits_from_bool(bool __v) noexcept -{ - using _MaskValueType = typename _Storage::value_type; - static_assert(::cuda::std::__cccl_is_unsigned_integer_v<_MaskValueType>, - "__mask_bits_from_bool requires unsigned integer storage"); - using _Up = ::cuda::std::__make_nbit_uint_t<::cuda::std::__num_bits_v<_MaskValueType>>; - return __v ? 
(::cuda::std::numeric_limits<_Up>::max()) : _MaskValueType{0}; -} - template inline constexpr bool __is_vectorizable_v = ::cuda::std::is_arithmetic_v<_Tp> && !::cuda::std::is_const_v<_Tp> && !::cuda::std::is_volatile_v<_Tp> From 3b3a50e3cc534cff2e60c68d591f0d318ffb140b Mon Sep 17 00:00:00 2001 From: fbusato Date: Fri, 21 Nov 2025 12:37:47 -0800 Subject: [PATCH 17/32] simplify __simd_reference --- .../cuda/experimental/__simd/reference.h | 37 ++++++++----------- .../cuda/experimental/__simd/scalar_impl.h | 2 +- cudax/include/cuda/experimental/__simd/simd.h | 2 +- .../cuda/experimental/__simd/simd_mask.h | 2 +- 4 files changed, 18 insertions(+), 25 deletions(-) diff --git a/cudax/include/cuda/experimental/__simd/reference.h b/cudax/include/cuda/experimental/__simd/reference.h index 90c45c23a62..573f4426955 100644 --- a/cudax/include/cuda/experimental/__simd/reference.h +++ b/cudax/include/cuda/experimental/__simd/reference.h @@ -35,7 +35,7 @@ namespace cuda::experimental::datapar { -template +template class __simd_reference { template @@ -59,14 +59,7 @@ class __simd_reference _CCCL_API constexpr void __set(_Vp __v) noexcept { - if constexpr (::cuda::std::is_same_v<_Vp, bool>) - { - __s_.__set(__idx_, ::cuda::experimental::datapar::__mask_bits_from_bool<_Storage>(__v)); - } - else - { - __s_.__set(__idx_, __v); - } + __s_.__set(__idx_, __v); } public: @@ -88,15 +81,15 @@ class __simd_reference return {__s_, __idx_}; } - template - friend _CCCL_API void swap(__simd_reference<_Tp1, _Storage1, _Vp1>&& __a, - __simd_reference<_Tp1, _Storage1, _Vp1>&& __b) noexcept; + template + friend _CCCL_API void swap(__simd_reference<_Storage1, _Vp1>&& __a, + __simd_reference<_Storage1, _Vp1>&& __b) noexcept; - template - friend _CCCL_API void swap(_Vp1& __a, __simd_reference<_Tp1, _Storage1, _Vp1>&& __b) noexcept; + template + friend _CCCL_API void swap(_Vp1& __a, __simd_reference<_Storage1, _Vp1>&& __b) noexcept; - template - friend _CCCL_API void swap(__simd_reference<_Tp1, 
_Storage1, _Vp1>&& __a, _Vp1& __b) noexcept; + template + friend _CCCL_API void swap(__simd_reference<_Storage1, _Vp1>&& __a, _Vp1& __b) noexcept; template () += ::cuda::std::declval<_Up>())> _CCCL_API __simd_reference operator+=(_Up&& __v) && noexcept @@ -195,24 +188,24 @@ class __simd_reference } }; -template -_CCCL_API void swap(__simd_reference<_Tp, _Storage, _Vp>&& __a, __simd_reference<_Tp, _Storage, _Vp>&& __b) noexcept +template +_CCCL_API void swap(__simd_reference<_Storage, _Vp>&& __a, __simd_reference<_Storage, _Vp>&& __b) noexcept { _Vp __tmp(::cuda::std::move(__a)); ::cuda::std::move(__a) = ::cuda::std::move(__b); ::cuda::std::move(__b) = ::cuda::std::move(__tmp); } -template -_CCCL_API void swap(_Vp& __a, __simd_reference<_Tp, _Storage, _Vp>&& __b) noexcept +template +_CCCL_API void swap(_Vp& __a, __simd_reference<_Storage, _Vp>&& __b) noexcept { _Vp __tmp(::cuda::std::move(__a)); __a = ::cuda::std::move(__b); ::cuda::std::move(__b) = ::cuda::std::move(__tmp); } -template -_CCCL_API void swap(__simd_reference<_Tp, _Storage, _Vp>&& __a, _Vp& __b) noexcept +template +_CCCL_API void swap(__simd_reference<_Storage, _Vp>&& __a, _Vp& __b) noexcept { _Vp __tmp(::cuda::std::move(__a)); ::cuda::std::move(__a) = ::cuda::std::move(__b); diff --git a/cudax/include/cuda/experimental/__simd/scalar_impl.h b/cudax/include/cuda/experimental/__simd/scalar_impl.h index d1f152e9aa7..ab5c5e611c8 100644 --- a/cudax/include/cuda/experimental/__simd/scalar_impl.h +++ b/cudax/include/cuda/experimental/__simd/scalar_impl.h @@ -255,7 +255,7 @@ struct __mask_operations<_Tp, simd_abi::__scalar> _CCCL_API static constexpr void __load(_MaskStorage& __s, const bool* __mem) noexcept { - __s.__data = ::cuda::experimental::datapar::__mask_bits_from_bool<_MaskStorage>(__mem[0]); + __s.__data = __mem[0]; } _CCCL_API static constexpr void __store(const _MaskStorage& __s, bool* __mem) noexcept diff --git a/cudax/include/cuda/experimental/__simd/simd.h 
b/cudax/include/cuda/experimental/__simd/simd.h index 28823e4e5e4..0f3ce4bdd86 100644 --- a/cudax/include/cuda/experimental/__simd/simd.h +++ b/cudax/include/cuda/experimental/__simd/simd.h @@ -56,7 +56,7 @@ class basic_simd : public __simd_operations<_Tp, _Abi> public: using value_type = _Tp; - using reference = __simd_reference<_Tp, _Storage, value_type>; + using reference = __simd_reference<_Storage, value_type>; using abi_type = _Abi; using mask_type = basic_simd_mask; diff --git a/cudax/include/cuda/experimental/__simd/simd_mask.h b/cudax/include/cuda/experimental/__simd/simd_mask.h index 20d672ceb2e..8e224d0d951 100644 --- a/cudax/include/cuda/experimental/__simd/simd_mask.h +++ b/cudax/include/cuda/experimental/__simd/simd_mask.h @@ -48,7 +48,7 @@ class basic_simd_mask : public __mask_operations<_Tp, _Abi> public: using value_type = bool; - using reference = __simd_reference<_Tp, _Storage, bool>; + using reference = __simd_reference<_Storage, bool>; using abi_type = _Abi; [[nodiscard]] _CCCL_API static constexpr ::cuda::std::size_t size() noexcept From 5458cf7baabf5e4d854605009f9e5e42d8f04b87 Mon Sep 17 00:00:00 2001 From: fbusato Date: Fri, 21 Nov 2025 16:13:36 -0800 Subject: [PATCH 18/32] header cleanup --- cudax/include/cuda/experimental/__simd/reference.h | 3 --- cudax/include/cuda/experimental/__simd/scalar_impl.h | 1 - cudax/include/cuda/experimental/__simd/traits.h | 1 - cudax/include/cuda/experimental/__simd/utility.h | 8 ++++---- 4 files changed, 4 insertions(+), 9 deletions(-) diff --git a/cudax/include/cuda/experimental/__simd/reference.h b/cudax/include/cuda/experimental/__simd/reference.h index 573f4426955..ab6104b54b7 100644 --- a/cudax/include/cuda/experimental/__simd/reference.h +++ b/cudax/include/cuda/experimental/__simd/reference.h @@ -24,13 +24,10 @@ #include #include #include -#include #include #include #include -#include - #include namespace cuda::experimental::datapar diff --git a/cudax/include/cuda/experimental/__simd/scalar_impl.h 
b/cudax/include/cuda/experimental/__simd/scalar_impl.h index ab5c5e611c8..a1ebc2bebd1 100644 --- a/cudax/include/cuda/experimental/__simd/scalar_impl.h +++ b/cudax/include/cuda/experimental/__simd/scalar_impl.h @@ -25,7 +25,6 @@ #include #include #include -#include #include diff --git a/cudax/include/cuda/experimental/__simd/traits.h b/cudax/include/cuda/experimental/__simd/traits.h index fed67785c3d..70e89285528 100644 --- a/cudax/include/cuda/experimental/__simd/traits.h +++ b/cudax/include/cuda/experimental/__simd/traits.h @@ -24,7 +24,6 @@ #include #include #include -#include #include #include diff --git a/cudax/include/cuda/experimental/__simd/utility.h b/cudax/include/cuda/experimental/__simd/utility.h index ff86058738f..67a0940c1d5 100644 --- a/cudax/include/cuda/experimental/__simd/utility.h +++ b/cudax/include/cuda/experimental/__simd/utility.h @@ -56,10 +56,10 @@ inline constexpr bool __can_broadcast_v = || (!__is_vectorizable_v<_Up> && ::cuda::std::is_convertible_v<_Up, _Tp>) || ::cuda::std::is_same_v<_Up, int> || (::cuda::std::is_same_v<_Up, unsigned int> && ::cuda::std::is_unsigned_v<_Tp>); -template +template inline constexpr bool __is_well_formed = false; -template +template inline constexpr bool __is_well_formed<_Tp, _Generator, _Idx, @@ -69,13 +69,13 @@ inline constexpr bool __is_well_formed<_Tp, _Tp, decltype(::cuda::std::declval<_Generator>()(::cuda::std::integral_constant<::cuda::std::size_t, _Idx>()))>; -template +template _CCCL_HIDE_FROM_ABI constexpr bool __can_generate(::cuda::std::index_sequence<_Idxes...>) { return (true && ... 
&& __is_well_formed<_Tp, _Generator, _Idxes>); } -template +template inline constexpr bool __can_generate_v = ::cuda::experimental::datapar::__can_generate<_Tp, _Generator>(::cuda::std::make_index_sequence<_Size>()); } // namespace cuda::experimental::datapar From f672929dc848200b025d1ba3582a602a381dbf5b Mon Sep 17 00:00:00 2001 From: fbusato Date: Mon, 24 Nov 2025 15:00:14 -0800 Subject: [PATCH 19/32] fix c++17 --- cudax/include/cuda/experimental/__simd/simd.h | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/cudax/include/cuda/experimental/__simd/simd.h b/cudax/include/cuda/experimental/__simd/simd.h index 0f3ce4bdd86..ee7d0fc6190 100644 --- a/cudax/include/cuda/experimental/__simd/simd.h +++ b/cudax/include/cuda/experimental/__simd/simd.h @@ -61,7 +61,7 @@ class basic_simd : public __simd_operations<_Tp, _Abi> using mask_type = basic_simd_mask; _CCCL_TEMPLATE(typename _Up, typename _Ap) - _CCCL_REQUIRES((::cuda::std::is_same_v<_Up, value_type>) _CCCL_AND ::cuda::std::is_same_v<_Ap, abi_type>) + _CCCL_REQUIRES(::cuda::std::is_same_v<_Up, value_type> _CCCL_AND ::cuda::std::is_same_v<_Ap, abi_type>) _CCCL_API explicit operator basic_simd_mask<_Up, _Ap>() const noexcept { basic_simd_mask<_Up, _Ap> __result; @@ -133,21 +133,21 @@ class basic_simd : public __simd_operations<_Tp, _Abi> {} _CCCL_TEMPLATE(typename _Up, typename _Flags = element_aligned_tag) - _CCCL_REQUIRES((__is_vectorizable_v<_Up> _CCCL_AND is_simd_flag_type_v<_Flags>) ) + _CCCL_REQUIRES(__is_vectorizable_v<_Up> _CCCL_AND is_simd_flag_type_v<_Flags>) _CCCL_API explicit basic_simd(const _Up* __mem, _Flags = {}) noexcept { _Impl::__load(__s_, _Flags::template __apply(__mem)); } _CCCL_TEMPLATE(typename _Up, typename _Flags = element_aligned_tag) - _CCCL_REQUIRES((__is_vectorizable_v<_Up> _CCCL_AND is_simd_flag_type_v<_Flags>) ) + _CCCL_REQUIRES(__is_vectorizable_v<_Up> _CCCL_AND is_simd_flag_type_v<_Flags>) _CCCL_API void copy_from(const _Up* __mem, _Flags = {}) 
noexcept { _Impl::__load(__s_, _Flags::template __apply(__mem)); } _CCCL_TEMPLATE(typename _Up, typename _Flags = element_aligned_tag) - _CCCL_REQUIRES((__is_vectorizable_v<_Up> _CCCL_AND is_simd_flag_type_v<_Flags>) ) + _CCCL_REQUIRES(__is_vectorizable_v<_Up> _CCCL_AND is_simd_flag_type_v<_Flags>) _CCCL_API void copy_to(_Up* __mem, _Flags = {}) const noexcept { _Impl::__store(__s_, _Flags::template __apply(__mem)); @@ -350,14 +350,14 @@ class basic_simd : public __simd_operations<_Tp, _Abi> } _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES((::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) ) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) [[nodiscard]] _CCCL_API constexpr friend basic_simd operator%(const basic_simd& __lhs, _Up&& __rhs) noexcept { return __lhs % basic_simd(static_cast(__rhs)); } _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES((::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) ) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) [[nodiscard]] _CCCL_API constexpr friend basic_simd operator%(_Up&& __lhs, const basic_simd& __rhs) noexcept { return basic_simd(static_cast(__lhs)) % __rhs; @@ -372,14 +372,14 @@ class basic_simd : public __simd_operations<_Tp, _Abi> } _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES((::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) ) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) [[nodiscard]] _CCCL_API constexpr friend basic_simd operator&(const basic_simd& __lhs, _Up&& __rhs) noexcept { return __lhs & basic_simd(static_cast(__rhs)); } _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES((::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) ) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) [[nodiscard]] _CCCL_API constexpr friend basic_simd operator&(_Up&& __lhs, const basic_simd& __rhs) noexcept { return basic_simd(static_cast(__lhs)) & __rhs; @@ -394,14 +394,14 @@ class 
basic_simd : public __simd_operations<_Tp, _Abi> } _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES((::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) ) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) [[nodiscard]] _CCCL_API constexpr friend basic_simd operator|(const basic_simd& __lhs, _Up&& __rhs) noexcept { return __lhs | basic_simd(static_cast(__rhs)); } _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES((::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) ) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) [[nodiscard]] _CCCL_API constexpr friend basic_simd operator|(_Up&& __lhs, const basic_simd& __rhs) noexcept { return basic_simd(static_cast(__lhs)) | __rhs; @@ -416,14 +416,14 @@ class basic_simd : public __simd_operations<_Tp, _Abi> } _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES((::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) ) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) [[nodiscard]] _CCCL_API constexpr friend basic_simd operator^(const basic_simd& __lhs, _Up&& __rhs) noexcept { return __lhs ^ basic_simd(static_cast(__rhs)); } _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES((::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) ) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) [[nodiscard]] _CCCL_API constexpr friend basic_simd operator^(_Up&& __lhs, const basic_simd& __rhs) noexcept { return basic_simd(static_cast(__lhs)) ^ __rhs; @@ -438,14 +438,14 @@ class basic_simd : public __simd_operations<_Tp, _Abi> } _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES((::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) ) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) [[nodiscard]] _CCCL_API constexpr friend basic_simd operator<<(const basic_simd& __lhs, _Up&& __rhs) noexcept { return __lhs << basic_simd(static_cast(__rhs)); } _CCCL_TEMPLATE(typename _Up) - 
_CCCL_REQUIRES((::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) ) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) [[nodiscard]] _CCCL_API constexpr friend basic_simd operator<<(_Up&& __lhs, const basic_simd& __rhs) noexcept { return basic_simd(static_cast(__lhs)) << __rhs; @@ -460,14 +460,14 @@ class basic_simd : public __simd_operations<_Tp, _Abi> } _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES((::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) ) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) [[nodiscard]] _CCCL_API constexpr friend basic_simd operator>>(const basic_simd& __lhs, _Up&& __rhs) noexcept { return __lhs >> basic_simd(static_cast(__rhs)); } _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES((::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) ) + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) [[nodiscard]] _CCCL_API constexpr friend basic_simd operator>>(_Up&& __lhs, const basic_simd& __rhs) noexcept { return basic_simd(static_cast(__lhs)) >> __rhs; From 2b5d84ecab1c64994e6011d6a512c162405822f2 Mon Sep 17 00:00:00 2001 From: fbusato Date: Mon, 24 Nov 2025 15:02:08 -0800 Subject: [PATCH 20/32] fix MSVC warning --- cudax/include/cuda/experimental/__simd/scalar_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cudax/include/cuda/experimental/__simd/scalar_impl.h b/cudax/include/cuda/experimental/__simd/scalar_impl.h index a1ebc2bebd1..02a2196faa4 100644 --- a/cudax/include/cuda/experimental/__simd/scalar_impl.h +++ b/cudax/include/cuda/experimental/__simd/scalar_impl.h @@ -300,7 +300,7 @@ struct __mask_operations<_Tp, simd_abi::__scalar> return static_cast(__s.__data); } - [[nodiscard]] _CCCL_API static constexpr int __count(const _MaskStorage& __s) noexcept + [[nodiscard]] _CCCL_API static constexpr ::cuda::std::ptrdiff_t __count(const _MaskStorage& __s) noexcept { return static_cast(__s.__data) ? 
1 : 0; } From e7d976c75840ad18532c87fd30e443438349a0f0 Mon Sep 17 00:00:00 2001 From: fbusato Date: Mon, 24 Nov 2025 15:06:54 -0800 Subject: [PATCH 21/32] formatting --- cudax/include/cuda/experimental/__simd/reference.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cudax/include/cuda/experimental/__simd/reference.h b/cudax/include/cuda/experimental/__simd/reference.h index ab6104b54b7..94f8a7491c5 100644 --- a/cudax/include/cuda/experimental/__simd/reference.h +++ b/cudax/include/cuda/experimental/__simd/reference.h @@ -79,8 +79,7 @@ class __simd_reference } template - friend _CCCL_API void swap(__simd_reference<_Storage1, _Vp1>&& __a, - __simd_reference<_Storage1, _Vp1>&& __b) noexcept; + friend _CCCL_API void swap(__simd_reference<_Storage1, _Vp1>&& __a, __simd_reference<_Storage1, _Vp1>&& __b) noexcept; template friend _CCCL_API void swap(_Vp1& __a, __simd_reference<_Storage1, _Vp1>&& __b) noexcept; From 4c2d8b8b5981bee59ee9576b8dd8444846a304a3 Mon Sep 17 00:00:00 2001 From: fbusato Date: Mon, 24 Nov 2025 15:32:43 -0800 Subject: [PATCH 22/32] fix macro names --- .../experimental/__simd/fixed_size_impl.h | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/cudax/include/cuda/experimental/__simd/fixed_size_impl.h b/cudax/include/cuda/experimental/__simd/fixed_size_impl.h index 4864f9fc593..a2eadb2d0d2 100644 --- a/cudax/include/cuda/experimental/__simd/fixed_size_impl.h +++ b/cudax/include/cuda/experimental/__simd/fixed_size_impl.h @@ -72,36 +72,36 @@ struct __mask_storage<_Tp, simd_abi::__fixed_size<_Np>> : public __simd_storage< }; // Helper macros to generate repeated fixed-size operations. 
-#define _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_StorageType, _Name, _Op) \ - [[nodiscard]] _CCCL_API static constexpr _StorageType _Name( \ - const _StorageType& __lhs, const _StorageType& __rhs) noexcept \ - { \ - _StorageType __result; \ - _CCCL_PRAGMA_UNROLL_FULL() \ - for (int __i = 0; __i < _Np; ++__i) \ - { \ - __result.__data[__i] = (__lhs.__data[__i] _Op __rhs.__data[__i]); \ - } \ - return __result; \ +#define _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_STORAGE_TYPE, _NAME, _OP) \ + [[nodiscard]] _CCCL_API static constexpr _STORAGE_TYPE _NAME( \ + const _STORAGE_TYPE& __lhs, const _STORAGE_TYPE& __rhs) noexcept \ + { \ + _STORAGE_TYPE __result; \ + _CCCL_PRAGMA_UNROLL_FULL() \ + for (int __i = 0; __i < _Np; ++__i) \ + { \ + __result.__data[__i] = (__lhs.__data[__i] _OP __rhs.__data[__i]); \ + } \ + return __result; \ } -#define _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP(_Name, _Op) \ - [[nodiscard]] _CCCL_API static constexpr _MaskStorage _Name( \ +#define _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP(_NAME, _OP) \ + [[nodiscard]] _CCCL_API static constexpr _MaskStorage _NAME( \ const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept \ { \ _MaskStorage __result; \ _CCCL_PRAGMA_UNROLL_FULL() \ for (int __i = 0; __i < _Np; ++__i) \ { \ - __result.__data[__i] = (__lhs.__data[__i] _Op __rhs.__data[__i]); \ + __result.__data[__i] = (__lhs.__data[__i] _OP __rhs.__data[__i]); \ } \ return __result; \ } -#define _CUDAX_SIMD_FIXED_SIZE_BITWISE_OP(_StorageType, _Name, _Op) \ - _CCCL_TEMPLATE(typename _Up = _Tp) \ - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) \ - _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_StorageType, _Name, _Op) +#define _CUDAX_SIMD_FIXED_SIZE_BITWISE_OP(_STORAGE_TYPE, _NAME, _OP) \ + _CCCL_TEMPLATE(typename _Up = _Tp) \ + _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) \ + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_STORAGE_TYPE, _NAME, _OP) // 
********************************************************************************************************************* // * SIMD Arithmetic Operations From 9e4ba7c4deadca14e6b9cfc98de2ae4918785567 Mon Sep 17 00:00:00 2001 From: fbusato Date: Mon, 24 Nov 2025 15:34:05 -0800 Subject: [PATCH 23/32] fix count() signature --- cudax/include/cuda/experimental/__simd/simd_mask.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cudax/include/cuda/experimental/__simd/simd_mask.h b/cudax/include/cuda/experimental/__simd/simd_mask.h index 8e224d0d951..5309f4b4b98 100644 --- a/cudax/include/cuda/experimental/__simd/simd_mask.h +++ b/cudax/include/cuda/experimental/__simd/simd_mask.h @@ -196,7 +196,7 @@ class basic_simd_mask : public __mask_operations<_Tp, _Abi> return !any(); } - [[nodiscard]] _CCCL_API constexpr int count() const noexcept + [[nodiscard]] _CCCL_API constexpr ::cuda::std::ptrdiff_t count() const noexcept { return _Impl::__count(__s_); } From 6d6be224be9ebbea040268deb06ae644da6ed877 Mon Sep 17 00:00:00 2001 From: fbusato Date: Tue, 23 Dec 2025 16:50:53 -0800 Subject: [PATCH 24/32] draft refactor --- .../__simd/{simd_mask.h => basic_mask.h} | 82 ++- .../cuda/experimental/__simd/basic_vec.h | 457 +++++++++++++ .../cuda/experimental/__simd/concepts.h | 110 ++++ .../cuda/experimental/__simd/declaration.h | 44 +- .../experimental/__simd/fixed_size_impl.h | 20 +- .../cuda/experimental/__simd/reference.h | 8 +- .../cuda/experimental/__simd/scalar_impl.h | 18 +- cudax/include/cuda/experimental/__simd/simd.h | 608 ------------------ .../include/cuda/experimental/__simd/traits.h | 66 +- cudax/test/CMakeLists.txt | 6 +- cudax/test/simd/simd.cu | 6 +- 11 files changed, 717 insertions(+), 708 deletions(-) rename cudax/include/cuda/experimental/__simd/{simd_mask.h => basic_mask.h} (60%) create mode 100644 cudax/include/cuda/experimental/__simd/basic_vec.h create mode 100644 cudax/include/cuda/experimental/__simd/concepts.h delete mode 100644 
cudax/include/cuda/experimental/__simd/simd.h diff --git a/cudax/include/cuda/experimental/__simd/simd_mask.h b/cudax/include/cuda/experimental/__simd/basic_mask.h similarity index 60% rename from cudax/include/cuda/experimental/__simd/simd_mask.h rename to cudax/include/cuda/experimental/__simd/basic_mask.h index 5309f4b4b98..8b19408c5c5 100644 --- a/cudax/include/cuda/experimental/__simd/simd_mask.h +++ b/cudax/include/cuda/experimental/__simd/basic_mask.h @@ -36,12 +36,13 @@ namespace cuda::experimental::datapar { -template -class basic_simd_mask : public __mask_operations<_Tp, _Abi> +// P1928R15: basic_mask is the primary SIMD mask type with Bytes as first template parameter +template <::cuda::std::size_t _Bytes, typename _Abi> +class basic_mask : public __mask_operations<_Bytes, _Abi> { - static_assert(is_abi_tag_v<_Abi>, "basic_simd_mask requires a valid ABI tag"); + static_assert(is_abi_tag_v<_Abi>, "basic_mask requires a valid ABI tag"); - using _Impl = __mask_operations<_Tp, _Abi>; + using _Impl = __mask_operations<_Bytes, _Abi>; using _Storage = typename _Impl::_MaskStorage; _Storage __s_; @@ -51,12 +52,15 @@ class basic_simd_mask : public __mask_operations<_Tp, _Abi> using reference = __simd_reference<_Storage, bool>; using abi_type = _Abi; + // P1928R15: Bytes represents the size of the corresponding element type + static constexpr ::cuda::std::size_t bytes = _Bytes; + [[nodiscard]] _CCCL_API static constexpr ::cuda::std::size_t size() noexcept { - return simd_size_v<_Tp, abi_type>; + return _Abi::__simd_size; } - _CCCL_HIDE_FROM_ABI basic_simd_mask() noexcept = default; + _CCCL_HIDE_FROM_ABI basic_mask() noexcept = default; struct __storage_tag_t {}; @@ -67,41 +71,41 @@ class basic_simd_mask : public __mask_operations<_Tp, _Abi> return __s_; } - _CCCL_API basic_simd_mask(const _Storage& __s, __storage_tag_t) noexcept + _CCCL_API basic_mask(const _Storage& __s, __storage_tag_t) noexcept : __s_{__s} {} _CCCL_TEMPLATE(typename _Up) 
_CCCL_REQUIRES(::cuda::std::is_same_v<_Up, bool>) - _CCCL_API explicit basic_simd_mask(_Up __v) noexcept + _CCCL_API explicit basic_mask(_Up __v) noexcept : __s_{_Impl::__broadcast(__v)} {} _CCCL_TEMPLATE(typename _Generator) - _CCCL_REQUIRES(__can_generate_v>) - _CCCL_API explicit basic_simd_mask(_Generator&& __g) noexcept + _CCCL_REQUIRES(__can_generate_v) + _CCCL_API explicit basic_mask(_Generator&& __g) noexcept : __s_(_Impl::__generate(__g)) {} _CCCL_TEMPLATE(typename _Flags = element_aligned_tag) _CCCL_REQUIRES(is_simd_flag_type_v<_Flags>) - _CCCL_API explicit basic_simd_mask(const bool* __mem, _Flags = {}) noexcept + _CCCL_API explicit basic_mask(const bool* __mem, _Flags = {}) noexcept { - _Impl::__load(__s_, _Flags::template __apply(__mem)); + _Impl::__load(__s_, _Flags::template __apply(__mem)); } _CCCL_TEMPLATE(typename _Flags = element_aligned_tag) _CCCL_REQUIRES(is_simd_flag_type_v<_Flags>) _CCCL_API void copy_from(const bool* __mem, _Flags = {}) noexcept { - _Impl::__load(__s_, _Flags::template __apply(__mem)); + _Impl::__load(__s_, _Flags::template __apply(__mem)); } _CCCL_TEMPLATE(typename _Flags = element_aligned_tag) _CCCL_REQUIRES(is_simd_flag_type_v<_Flags>) _CCCL_API void copy_to(bool* __mem, _Flags = {}) const noexcept { - _Impl::__store(__s_, _Flags::template __apply(__mem)); + _Impl::__store(__s_, _Flags::template __apply(__mem)); } _CCCL_API reference operator[](::cuda::std::size_t __i) noexcept @@ -115,67 +119,78 @@ class basic_simd_mask : public __mask_operations<_Tp, _Abi> } // Bitwise operations - [[nodiscard]] _CCCL_API constexpr friend basic_simd_mask - operator&(const basic_simd_mask& __lhs, const basic_simd_mask& __rhs) noexcept + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator&(const basic_mask& __lhs, const basic_mask& __rhs) noexcept { return {_Impl::__bitwise_and(__lhs.__s_, __rhs.__s_), __storage_tag}; } - [[nodiscard]] _CCCL_API constexpr friend basic_simd_mask - operator|(const basic_simd_mask& __lhs, const 
basic_simd_mask& __rhs) noexcept + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator|(const basic_mask& __lhs, const basic_mask& __rhs) noexcept { return {_Impl::__bitwise_or(__lhs.__s_, __rhs.__s_), __storage_tag}; } - [[nodiscard]] _CCCL_API constexpr friend basic_simd_mask - operator^(const basic_simd_mask& __lhs, const basic_simd_mask& __rhs) noexcept + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator^(const basic_mask& __lhs, const basic_mask& __rhs) noexcept { return {_Impl::__bitwise_xor(__lhs.__s_, __rhs.__s_), __storage_tag}; } - [[nodiscard]] _CCCL_API constexpr basic_simd_mask operator!() const noexcept + [[nodiscard]] _CCCL_API constexpr basic_mask operator!() const noexcept { return {_Impl::__bitwise_not(__s_), __storage_tag}; } + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator&&(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return {_Impl::__bitwise_and(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator||(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return {_Impl::__bitwise_or(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + // Conversion to basic_vec _CCCL_TEMPLATE(typename _Up, typename _Ap) - _CCCL_REQUIRES((simd_size_v<_Up, _Ap> == simd_size_v<_Tp, abi_type>) ) - _CCCL_API constexpr explicit operator basic_simd<_Up, _Ap>() const noexcept + _CCCL_REQUIRES((sizeof(_Up) == _Bytes && _Ap::__simd_size == _Abi::__simd_size)) + _CCCL_API constexpr explicit operator basic_vec<_Up, _Ap>() const noexcept { - basic_simd<_Up, _Ap> __result; + basic_vec<_Up, _Ap> __result; _CCCL_PRAGMA_UNROLL_FULL() - for (::cuda::std::size_t __i = 0; __i < simd_size_v<_Up, _Ap>; ++__i) + for (::cuda::std::size_t __i = 0; __i < size(); ++__i) { __result[__i] = static_cast<_Up>((*this)[__i]); } return __result; } - _CCCL_API basic_simd_mask& operator&=(const basic_simd_mask& __rhs) noexcept + _CCCL_API basic_mask& operator&=(const 
basic_mask& __rhs) noexcept { return *this = *this & __rhs; } - _CCCL_API basic_simd_mask& operator|=(const basic_simd_mask& __rhs) noexcept + _CCCL_API basic_mask& operator|=(const basic_mask& __rhs) noexcept { return *this = *this | __rhs; } - _CCCL_API basic_simd_mask& operator^=(const basic_simd_mask& __rhs) noexcept + _CCCL_API basic_mask& operator^=(const basic_mask& __rhs) noexcept { return *this = *this ^ __rhs; } // Comparison operations - [[nodiscard]] _CCCL_API constexpr friend bool - operator==(const basic_simd_mask& __lhs, const basic_simd_mask& __rhs) noexcept + [[nodiscard]] _CCCL_API friend constexpr bool operator==(const basic_mask& __lhs, const basic_mask& __rhs) noexcept { return _Impl::__equal_to(__lhs.__s_, __rhs.__s_); } #if _CCCL_STD_VER < 2020 - [[nodiscard]] - _CCCL_API constexpr friend bool operator!=(const basic_simd_mask& __lhs, const basic_simd_mask& __rhs) noexcept + [[nodiscard]] _CCCL_API friend constexpr bool operator!=(const basic_mask& __lhs, const basic_mask& __rhs) noexcept { return !(__lhs == __rhs); } @@ -201,9 +216,6 @@ class basic_simd_mask : public __mask_operations<_Tp, _Abi> return _Impl::__count(__s_); } }; - -template -using fixed_size_simd_mask = simd_mask<_Tp, _Np>; } // namespace cuda::experimental::datapar #include diff --git a/cudax/include/cuda/experimental/__simd/basic_vec.h b/cudax/include/cuda/experimental/__simd/basic_vec.h new file mode 100644 index 00000000000..ec555f9e248 --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/basic_vec.h @@ -0,0 +1,457 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_SIMD_H +#define _CUDAX___SIMD_SIMD_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cuda::experimental::datapar +{ +// P1928R15: basic_vec is the primary SIMD vector type (renamed from basic_simd) +template +class basic_vec : public __simd_operations<_Tp, _Abi> +{ + static_assert(is_abi_tag_v<_Abi>, "basic_vec requires a valid ABI tag"); + + using _Impl = __simd_operations<_Tp, _Abi>; + using _Storage = typename _Impl::_SimdStorage; + + _Storage __s_; + + template + static constexpr bool __is_value_preserving_broadcast = + (__is_vectorizable_v<::cuda::std::remove_cvref_t<_Up>> && __is_non_narrowing_convertible_v<_Up, _Tp>) + || (!__is_vectorizable_v<::cuda::std::remove_cvref_t<_Up>> && ::cuda::std::is_convertible_v<_Up, _Tp>); + + struct __storage_tag_t + {}; + static constexpr __storage_tag_t __storage_tag{}; + +public: + using value_type = _Tp; + using reference = __simd_reference<_Storage, value_type>; + using mask_type = basic_mask; + using abi_type = _Abi; + + // TODO: add iterators + // using iterator = simd-iterator; + // using const_iterator = simd-iterator; + + // constexpr iterator begin() noexcept { return {*this, 0}; } + // constexpr const_iterator begin() const noexcept { return {*this, 0}; } + // constexpr const_iterator cbegin() const noexcept { return {*this, 0}; } + // constexpr default_sentinel_t end() const noexcept { return {}; } + // constexpr default_sentinel_t cend() const noexcept { return {}; } + + static constexpr 
::cuda::std::integral_constant<__simd_size_type, __simd_size_v> size{}; + + _CCCL_HIDE_FROM_ABI basic_vec() noexcept = default; + + // [simd.ctor], basic_vec constructors + // TODO: fix constraints + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES( + (__can_broadcast_v>) _CCCL_AND(__is_value_preserving_broadcast<_Up>)) + _CCCL_API constexpr basic_vec(_Up&& __v) noexcept + : __s_{_Impl::__broadcast(static_cast(__v))} + {} + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES((__can_broadcast_v>) // + _CCCL_AND(!__is_value_preserving_broadcast<_Up>)) + _CCCL_API constexpr explicit basic_vec(_Up&& __v) noexcept + : __s_{_Impl::__broadcast(static_cast(__v))} + {} + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES((!::cuda::std::is_same_v<_Up, _Tp>) _CCCL_AND(__is_non_narrowing_convertible_v<_Up, value_type>)) + _CCCL_API constexpr explicit basic_vec(const basic_vec<_Up, abi_type>& __v) noexcept + { + for (__simd_size_type __i = 0; __i < size; __i++) + { + (*this)[__i] = static_cast(__v[__i]); + } + } + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES((!::cuda::std::is_same_v<_Up, _Tp>) _CCCL_AND(!__is_non_narrowing_convertible_v<_Up, value_type>) + _CCCL_AND(::cuda::std::is_convertible_v<_Up, value_type>)) + _CCCL_API constexpr explicit basic_vec(const basic_vec<_Up, abi_type>& __v) noexcept + { + for (__simd_size_type __i = 0; __i < size; __i++) + { + (*this)[__i] = static_cast(__v[__i]); + } + } + + _CCCL_TEMPLATE(typename _Generator) + _CCCL_REQUIRES(__can_generate_v) + _CCCL_API constexpr explicit basic_vec(_Generator&& __g) + : __s_(_Impl::__generate(__g)) + {} + + // TODO: add constructors + // template + // constexpr basic_vec(R&& range, flags = {}); + + // template + // constexpr basic_vec(R&& range, const mask_type& mask, flags = {}); + + // constexpr basic_vec(const real - type & reals, const real - type& imags = {}) noexcept; + + // [simd.subscr], basic_vec subscript operators + _CCCL_API value_type operator[](__simd_size_type __i) const noexcept + { + return 
__s_.__get(__i); + } + + // TODO: add operator[] + // template + // constexpr resize_t operator[](const I& indices) const; + + // TODO: [simd.complex.access], basic_vec complex accessors + // constexpr real-type real() const noexcept; + // constexpr real-type imag() const noexcept; + // constexpr void real(const real-type& v) noexcept; + // constexpr void imag(const real-type& v) noexcept; + + // [simd.unary], basic_vec unary operators + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_pre_increment<_Up>) + _CCCL_API basic_vec& operator++() noexcept + { + _Impl::__increment(__s_); + return *this; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_post_increment<_Up>) + _CCCL_API basic_vec operator++(int) noexcept + { + const basic_vec __r = *this; + _Impl::__increment(__s_); + return __r; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_pre_decrement<_Up>) + _CCCL_API basic_vec& operator--() noexcept + { + _Impl::__decrement(__s_); + return *this; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_post_decrement<_Up>) + _CCCL_API basic_vec operator--(int) noexcept + { + const basic_vec __r = *this; + _Impl::__decrement(__s_); + return __r; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_negate<_Up>) + [[nodiscard]] _CCCL_API mask_type operator!() const noexcept + { + return mask_type{_Impl::__negate(__s_), mask_type::__storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_bitwise_not<_Up>) + [[nodiscard]] _CCCL_API constexpr basic_vec operator~() const noexcept + { + return basic_vec{_Impl::__bitwise_not(__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_plus<_Up>) + [[nodiscard]] _CCCL_API basic_vec operator+() const noexcept + { + return *this; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_unary_minus<_Up>) + [[nodiscard]] _CCCL_API basic_vec operator-() const noexcept + { + return 
basic_vec{_Impl::__unary_minus(__s_), __storage_tag}; + } + + // [simd.binary], basic_vec binary operators + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_plus<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator+(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__plus(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_minus<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator-(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__minus(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_multiplies<_Up>) + [[nodiscard]] + _CCCL_API friend constexpr basic_vec operator*(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__multiplies(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_divides<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator/(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__divides(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_modulo<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator%(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__modulo(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_bitwise_and<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator&(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__bitwise_and(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_bitwise_or<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator|(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { 
+ return basic_vec{_Impl::__bitwise_or(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_bitwise_xor<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator^(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__bitwise_xor(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_shift_left<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator<<(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__shift_left(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_shift_right<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator>>(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__shift_right(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_shift_left_size<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator<<(const basic_vec& __lhs, __simd_size_type __n) noexcept + { + return basic_vec{_Impl::__shift_left(__lhs.__s_, basic_vec{__n}.__s_), __storage_tag}; // broadcast the count, then hand its storage to the impl like the vec-vec overload + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_shift_right_size<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator>>(const basic_vec& __lhs, __simd_size_type __n) noexcept + { + return basic_vec{_Impl::__shift_right(__lhs.__s_, basic_vec{__n}.__s_), __storage_tag}; // broadcast the count, then hand its storage to the impl like the vec-vec overload + } + + // [simd.cassign], basic_vec compound assignment + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_plus<_Up>) + _CCCL_API friend constexpr basic_vec& operator+=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs + __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_minus<_Up>) + _CCCL_API friend constexpr basic_vec& operator-=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return
__lhs = __lhs - __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_multiplies<_Up>) + _CCCL_API friend constexpr basic_vec& operator*=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs * __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_divides<_Up>) + _CCCL_API friend constexpr basic_vec& operator/=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs / __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_modulo<_Up>) + _CCCL_API friend constexpr basic_vec& operator%=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs % __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_bitwise_and<_Up>) + _CCCL_API friend constexpr basic_vec& operator&=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs & __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_bitwise_or<_Up>) + _CCCL_API friend constexpr basic_vec& operator|=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs | __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_bitwise_xor<_Up>) + _CCCL_API friend constexpr basic_vec& operator^=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs ^ __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_shift_left<_Up>) + _CCCL_API friend constexpr basic_vec& operator<<=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs << __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_shift_right<_Up>) + _CCCL_API friend constexpr basic_vec& operator>>=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs >> __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_shift_left_size<_Up>) + _CCCL_API friend constexpr basic_vec& operator<<=(basic_vec& __lhs, __simd_size_type __n) noexcept + { + return __lhs 
= __lhs << __n; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_shift_right_size<_Up>) + _CCCL_API friend constexpr basic_vec& operator>>=(basic_vec& __lhs, __simd_size_type __n) noexcept + { + return __lhs = __lhs >> __n; + } + + // [simd.comparison], basic_vec compare operators + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_equal_to<_Up>) + [[nodiscard]] _CCCL_API friend constexpr mask_type operator==(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return mask_type{_Impl::__equal_to(__lhs.__s_, __rhs.__s_), mask_type::__storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_not_equal_to<_Up>) + [[nodiscard]] _CCCL_API friend constexpr mask_type operator!=(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return mask_type{_Impl::__not_equal_to(__lhs.__s_, __rhs.__s_), mask_type::__storage_tag}; // mask_type's tag: the unqualified name is basic_vec's own tag type + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_greater_equal<_Up>) + [[nodiscard]] _CCCL_API friend constexpr mask_type operator>=(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return mask_type{_Impl::__greater_equal(__lhs.__s_, __rhs.__s_), mask_type::__storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_less_equal<_Up>) + [[nodiscard]] _CCCL_API friend constexpr mask_type operator<=(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return mask_type{_Impl::__less_equal(__lhs.__s_, __rhs.__s_), mask_type::__storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_greater<_Up>) + [[nodiscard]] _CCCL_API friend constexpr mask_type operator>(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return mask_type{_Impl::__greater(__lhs.__s_, __rhs.__s_), mask_type::__storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_less<_Up>) + [[nodiscard]] _CCCL_API friend constexpr mask_type operator<(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return
mask_type{_Impl::__less(__lhs.__s_, __rhs.__s_), mask_type::__storage_tag}; + } + + // _CCCL_TEMPLATE(typename _Up, typename _Flags = element_aligned_tag) + // _CCCL_REQUIRES(__is_vectorizable_v<_Up> _CCCL_AND is_simd_flag_type_v<_Flags>) + // _CCCL_API void copy_from(const _Up* __mem, _Flags = {}) noexcept + // { + // _Impl::__load(__s_, _Flags::template __apply(__mem)); + // } + // + // _CCCL_TEMPLATE(typename _Up, typename _Flags = element_aligned_tag) + // _CCCL_REQUIRES(__is_vectorizable_v<_Up> _CCCL_AND is_simd_flag_type_v<_Flags>) + // _CCCL_API void copy_to(_Up* __mem, _Flags = {}) const noexcept + // { + // _Impl::__store(__s_, _Flags::template __apply(__mem)); + // } +}; + +// TODO: deduction guides +// template +// basic_vec(R&& r, Ts...) -> ...; + +// template +// basic_vec(basic_mask) -> ...; +} // namespace cuda::experimental::datapar + +#include + +#endif // _CUDAX___SIMD_SIMD_H diff --git a/cudax/include/cuda/experimental/__simd/concepts.h b/cudax/include/cuda/experimental/__simd/concepts.h new file mode 100644 index 00000000000..d72e83ac6da --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/concepts.h @@ -0,0 +1,110 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_CONCEPTS_H +#define _CUDAX___SIMD_CONCEPTS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include + +#include + +namespace cuda::experimental::datapar +{ +template +_CCCL_CONCEPT __has_pre_increment = _CCCL_REQUIRES_EXPR((_Tp), _Tp& __t)((++__t)); + +template +_CCCL_CONCEPT __has_post_increment = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t++)); + +template +_CCCL_CONCEPT __has_pre_decrement = _CCCL_REQUIRES_EXPR((_Tp), _Tp& __t)((--__t)); + +template +_CCCL_CONCEPT __has_post_decrement = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t--)); + +template +_CCCL_CONCEPT __has_negate = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((!__t)); + +template +_CCCL_CONCEPT __has_bitwise_not = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((~__t)); + +template +_CCCL_CONCEPT __has_plus = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((+__t)); + +template +_CCCL_CONCEPT __has_unary_minus = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((-__t)); + +template +_CCCL_CONCEPT __has_minus = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t - __t)); + +template +_CCCL_CONCEPT __has_multiplies = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t * __t)); + +template +_CCCL_CONCEPT __has_divides = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t / __t)); + +template +_CCCL_CONCEPT __has_modulo = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t % __t)); + +template +_CCCL_CONCEPT __has_bitwise_and = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t & __t)); + +template +_CCCL_CONCEPT __has_bitwise_or = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t | __t)); + +template +_CCCL_CONCEPT __has_bitwise_xor = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t ^ __t)); + +template +_CCCL_CONCEPT __has_shift_left = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t << 
__t)); + +template +_CCCL_CONCEPT __has_shift_right = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t >> __t)); + +template +_CCCL_CONCEPT __has_shift_left_size = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t << __simd_size_type{})); + +template +_CCCL_CONCEPT __has_shift_right_size = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t >> __simd_size_type{})); + +template +_CCCL_CONCEPT __has_equal_to = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t == __t)); + +template +_CCCL_CONCEPT __has_not_equal_to = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t != __t)); + +template +_CCCL_CONCEPT __has_greater_equal = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t >= __t)); + +template +_CCCL_CONCEPT __has_less_equal = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t <= __t)); + +template +_CCCL_CONCEPT __has_greater = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t > __t)); + +template +_CCCL_CONCEPT __has_less = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t < __t)); +} // namespace cuda::experimental::datapar + +#include + +#endif // _CUDAX___SIMD_CONCEPTS_H diff --git a/cudax/include/cuda/experimental/__simd/declaration.h b/cudax/include/cuda/experimental/__simd/declaration.h index f13c4e9ac03..fc6da9fbd5f 100644 --- a/cudax/include/cuda/experimental/__simd/declaration.h +++ b/cudax/include/cuda/experimental/__simd/declaration.h @@ -21,10 +21,19 @@ # pragma system_header #endif // no system header +#include +#include + #include namespace cuda::experimental::datapar { +// exposition-only helpers +using __simd_size_type = ::cuda::std::ptrdiff_t; + +template <::cuda::std::size_t _Bytes> +using __integer_from_bytes = ::cuda::std::__make_nbit_int_t<_Bytes * 8, false>; + namespace simd_abi { struct __scalar; @@ -42,31 +51,46 @@ using compatible = fixed_size<1>; template using native = fixed_size<1>; + +template +using deduce = fixed_size<_Np>; } // namespace simd_abi +// exposition-only helpers +template +inline constexpr __simd_size_type __simd_size_v = 0; + +template +inline constexpr __simd_size_type __simd_size_v<_Tp, 
simd_abi::fixed_size<_Np>> = _Np; + +template +inline constexpr __simd_size_type __simd_size_v<_Tp, simd_abi::native<_Tp>> = 1; + template struct __simd_storage; template struct __simd_operations; -template +template <::cuda::std::size_t _Bytes, typename _Abi> struct __mask_storage; -template +template <::cuda::std::size_t _Bytes, typename _Abi> struct __mask_operations; -template -class basic_simd; +// P1928R15: basic_vec is the primary SIMD vector type +template > +class basic_vec; -template -using simd = basic_simd<_Tp, simd_abi::fixed_size<_Np>>; +// P1928R15: basic_mask is the primary SIMD mask type with Bytes as first template parameter +template <::cuda::std::size_t _Bytes, typename _Abi = simd_abi::native<__integer_from_bytes<_Bytes>>> +class basic_mask; -template -class basic_simd_mask; +template >> +using vec = basic_vec<_Tp, simd_abi::deduce<_Tp, _Np>>; -template -using simd_mask = basic_simd_mask<_Tp, simd_abi::fixed_size<_Np>>; +template >> +using mask = basic_mask>; } // namespace cuda::experimental::datapar #include diff --git a/cudax/include/cuda/experimental/__simd/fixed_size_impl.h b/cudax/include/cuda/experimental/__simd/fixed_size_impl.h index a2eadb2d0d2..1b6a7f194b9 100644 --- a/cudax/include/cuda/experimental/__simd/fixed_size_impl.h +++ b/cudax/include/cuda/experimental/__simd/fixed_size_impl.h @@ -64,11 +64,13 @@ struct __simd_storage<_Tp, simd_abi::__fixed_size<_Np>> } }; -// use a single bit for the mask storage could be not efficient in CUDA -template -struct __mask_storage<_Tp, simd_abi::__fixed_size<_Np>> : public __simd_storage> +// P1928R15: Mask storage is now indexed by Bytes (element size) rather than type +// Using a single bit for the mask storage could be not efficient in CUDA +template <::cuda::std::size_t _Bytes, int _Np> +struct __mask_storage<_Bytes, simd_abi::__fixed_size<_Np>> : public __simd_storage> { - using value_type = bool; + using value_type = bool; + static constexpr ::cuda::std::size_t __element_bytes = _Bytes; 
}; // Helper macros to generate repeated fixed-size operations. @@ -111,7 +113,7 @@ template struct __simd_operations<_Tp, simd_abi::__fixed_size<_Np>> { using _SimdStorage = __simd_storage<_Tp, simd_abi::__fixed_size<_Np>>; - using _MaskStorage = __mask_storage<_Tp, simd_abi::__fixed_size<_Np>>; + using _MaskStorage = __mask_storage>; [[nodiscard]] _CCCL_API static constexpr _SimdStorage __broadcast(_Tp __v) noexcept { @@ -242,13 +244,13 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size<_Np>> }; // ********************************************************************************************************************* -// * SIMD Mask Operations +// * SIMD Mask Operations (P1928R15: indexed by Bytes instead of type) // ********************************************************************************************************************* -template -struct __mask_operations<_Tp, simd_abi::__fixed_size<_Np>> +template <::cuda::std::size_t _Bytes, int _Np> +struct __mask_operations<_Bytes, simd_abi::__fixed_size<_Np>> { - using _MaskStorage = __mask_storage<_Tp, simd_abi::__fixed_size<_Np>>; + using _MaskStorage = __mask_storage<_Bytes, simd_abi::__fixed_size<_Np>>; [[nodiscard]] _CCCL_API static constexpr _MaskStorage __broadcast(bool __v) noexcept { diff --git a/cudax/include/cuda/experimental/__simd/reference.h b/cudax/include/cuda/experimental/__simd/reference.h index 94f8a7491c5..95050b01163 100644 --- a/cudax/include/cuda/experimental/__simd/reference.h +++ b/cudax/include/cuda/experimental/__simd/reference.h @@ -35,11 +35,13 @@ namespace cuda::experimental::datapar template class __simd_reference { + // P1928R15: basic_vec is the primary SIMD vector type template - friend class basic_simd; + friend class basic_vec; - template - friend class basic_simd_mask; + // P1928R15: basic_mask is the primary SIMD mask type (indexed by Bytes) + template <::cuda::std::size_t, typename> + friend class basic_mask; _Storage& __s_; ::cuda::std::size_t __idx_; diff --git 
a/cudax/include/cuda/experimental/__simd/scalar_impl.h b/cudax/include/cuda/experimental/__simd/scalar_impl.h index 02a2196faa4..01863f6fb14 100644 --- a/cudax/include/cuda/experimental/__simd/scalar_impl.h +++ b/cudax/include/cuda/experimental/__simd/scalar_impl.h @@ -59,10 +59,12 @@ struct __simd_storage<_Tp, simd_abi::__scalar> } }; -template -struct __mask_storage<_Tp, simd_abi::__scalar> : __simd_storage +// P1928R15: Mask storage is now indexed by Bytes (element size) rather than type +template <::cuda::std::size_t _Bytes> +struct __mask_storage<_Bytes, simd_abi::__scalar> : __simd_storage { - using value_type = bool; + using value_type = bool; + static constexpr ::cuda::std::size_t __element_bytes = _Bytes; }; // ********************************************************************************************************************* @@ -73,7 +75,7 @@ template struct __simd_operations<_Tp, simd_abi::__scalar> { using _SimdStorage = __simd_storage<_Tp, simd_abi::__scalar>; - using _MaskStorage = __mask_storage<_Tp, simd_abi::__scalar>; + using _MaskStorage = __mask_storage; [[nodiscard]] _CCCL_API static constexpr _SimdStorage __broadcast(_Tp __v) noexcept { @@ -233,13 +235,13 @@ struct __simd_operations<_Tp, simd_abi::__scalar> }; // ********************************************************************************************************************* -// * SIMD Mask Operations +// * SIMD Mask Operations (P1928R15: indexed by Bytes instead of type) // ********************************************************************************************************************* -template -struct __mask_operations<_Tp, simd_abi::__scalar> +template <::cuda::std::size_t _Bytes> +struct __mask_operations<_Bytes, simd_abi::__scalar> { - using _MaskStorage = __mask_storage<_Tp, simd_abi::__scalar>; + using _MaskStorage = __mask_storage<_Bytes, simd_abi::__scalar>; [[nodiscard]] _CCCL_API static constexpr _MaskStorage __broadcast(bool __v) noexcept { diff --git 
a/cudax/include/cuda/experimental/__simd/simd.h b/cudax/include/cuda/experimental/__simd/simd.h deleted file mode 100644 index ee7d0fc6190..00000000000 --- a/cudax/include/cuda/experimental/__simd/simd.h +++ /dev/null @@ -1,608 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of CUDA Experimental in CUDA C++ Core Libraries, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#ifndef _CUDAX___SIMD_SIMD_H -#define _CUDAX___SIMD_SIMD_H - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include - -namespace cuda::experimental::datapar -{ -template -class basic_simd : public __simd_operations<_Tp, _Abi> -{ - static_assert(is_abi_tag_v<_Abi>, "basic_simd requires a valid ABI tag"); - - using _Impl = __simd_operations<_Tp, _Abi>; - using _Storage = typename _Impl::_SimdStorage; - - _Storage __s_; - - template - static constexpr bool __is_value_preserving_broadcast = - (__is_vectorizable_v<::cuda::std::remove_cvref_t<_Up>> && __is_non_narrowing_convertible_v<_Up, _Tp>) - || (!__is_vectorizable_v<::cuda::std::remove_cvref_t<_Up>> && ::cuda::std::is_convertible_v<_Up, _Tp>); - -public: - using value_type = _Tp; - using reference = __simd_reference<_Storage, value_type>; - using abi_type = _Abi; - using mask_type = basic_simd_mask; - - _CCCL_TEMPLATE(typename _Up, typename _Ap) - 
_CCCL_REQUIRES(::cuda::std::is_same_v<_Up, value_type> _CCCL_AND ::cuda::std::is_same_v<_Ap, abi_type>) - _CCCL_API explicit operator basic_simd_mask<_Up, _Ap>() const noexcept - { - basic_simd_mask<_Up, _Ap> __result; - for (::cuda::std::size_t __i = 0; __i < size(); ++__i) - { - __result[__i] = static_cast((*this)[__i]); - } - return __result; - } - - [[nodiscard]] _CCCL_API static constexpr ::cuda::std::size_t size() noexcept - { - return simd_size_v; - } - - _CCCL_HIDE_FROM_ABI basic_simd() noexcept = default; - - struct __storage_tag_t - {}; - static constexpr __storage_tag_t __storage_tag{}; - - _CCCL_API explicit operator _Storage() const - { - return __s_; - } - - _CCCL_API explicit basic_simd(const _Storage& __s, __storage_tag_t) - : __s_{__s} - {} - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(__can_broadcast_v>&& __is_value_preserving_broadcast<_Up>) - _CCCL_API constexpr basic_simd(_Up&& __v) noexcept - : __s_{_Impl::__broadcast(static_cast(__v))} - {} - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES((__can_broadcast_v> - && !__is_value_preserving_broadcast<_Up>) ) - _CCCL_API constexpr explicit basic_simd(_Up&& __v) noexcept - : __s_{_Impl::__broadcast(static_cast(__v))} - {} - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES((!::cuda::std::is_same_v<_Up, _Tp> && __is_non_narrowing_convertible_v<_Up, value_type>) ) - _CCCL_API basic_simd(const basic_simd<_Up, abi_type>& __v) noexcept - { - for (::cuda::std::size_t __i = 0; __i < size(); __i++) - { - (*this)[__i] = static_cast(__v[__i]); - } - } - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES((!::cuda::std::is_same_v<_Up, _Tp> && !__is_non_narrowing_convertible_v<_Up, value_type> - && ::cuda::std::is_convertible_v<_Up, value_type>) ) - _CCCL_API explicit basic_simd(const basic_simd<_Up, abi_type>& __v) noexcept - { - for (::cuda::std::size_t __i = 0; __i < size(); __i++) - { - (*this)[__i] = static_cast(__v[__i]); - } - } - - _CCCL_TEMPLATE(typename _Generator) - _CCCL_REQUIRES(__can_generate_v) - 
_CCCL_API explicit basic_simd(_Generator&& __g) - : __s_(_Impl::__generate(__g)) - {} - - _CCCL_TEMPLATE(typename _Up, typename _Flags = element_aligned_tag) - _CCCL_REQUIRES(__is_vectorizable_v<_Up> _CCCL_AND is_simd_flag_type_v<_Flags>) - _CCCL_API explicit basic_simd(const _Up* __mem, _Flags = {}) noexcept - { - _Impl::__load(__s_, _Flags::template __apply(__mem)); - } - - _CCCL_TEMPLATE(typename _Up, typename _Flags = element_aligned_tag) - _CCCL_REQUIRES(__is_vectorizable_v<_Up> _CCCL_AND is_simd_flag_type_v<_Flags>) - _CCCL_API void copy_from(const _Up* __mem, _Flags = {}) noexcept - { - _Impl::__load(__s_, _Flags::template __apply(__mem)); - } - - _CCCL_TEMPLATE(typename _Up, typename _Flags = element_aligned_tag) - _CCCL_REQUIRES(__is_vectorizable_v<_Up> _CCCL_AND is_simd_flag_type_v<_Flags>) - _CCCL_API void copy_to(_Up* __mem, _Flags = {}) const noexcept - { - _Impl::__store(__s_, _Flags::template __apply(__mem)); - } - - _CCCL_API reference operator[](::cuda::std::size_t __i) noexcept - { - return reference(__s_, __i); - } - - _CCCL_API value_type operator[](::cuda::std::size_t __i) const noexcept - { - return __s_.__get(__i); - } - - _CCCL_API basic_simd& operator++() noexcept - { - _Impl::__increment(__s_); - return *this; - } - - _CCCL_API basic_simd operator++(int) noexcept - { - const basic_simd __r = *this; - _Impl::__increment(__s_); - return __r; - } - - _CCCL_API basic_simd& operator--() noexcept - { - _Impl::__decrement(__s_); - return *this; - } - - _CCCL_API basic_simd operator--(int) noexcept - { - const basic_simd __r = *this; - _Impl::__decrement(__s_); - return __r; - } - - [[nodiscard]] _CCCL_API basic_simd operator+() const noexcept - { - return *this; - } - - [[nodiscard]] _CCCL_API basic_simd operator-() const noexcept - { - return basic_simd{_Impl::__unary_minus(__s_), __storage_tag}; - } - - _CCCL_API constexpr friend basic_simd& operator+=(basic_simd& __lhs, const basic_simd& __rhs) noexcept - { - return __lhs = __lhs + __rhs; - } 
- - _CCCL_API constexpr friend basic_simd& operator-=(basic_simd& __lhs, const basic_simd& __rhs) noexcept - { - return __lhs = __lhs - __rhs; - } - - _CCCL_API constexpr friend basic_simd& operator*=(basic_simd& __lhs, const basic_simd& __rhs) noexcept - { - return __lhs = __lhs * __rhs; - } - - _CCCL_API constexpr friend basic_simd& operator/=(basic_simd& __lhs, const basic_simd& __rhs) noexcept - { - return __lhs = __lhs / __rhs; - } - - _CCCL_TEMPLATE(typename _Up = _Tp) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) - _CCCL_API constexpr friend basic_simd& operator%=(basic_simd& __lhs, const basic_simd& __rhs) noexcept - { - return __lhs = __lhs % __rhs; - } - - _CCCL_TEMPLATE(typename _Up = _Tp) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) - _CCCL_API constexpr friend basic_simd& operator&=(basic_simd& __lhs, const basic_simd& __rhs) noexcept - { - return __lhs = __lhs & __rhs; - } - - _CCCL_TEMPLATE(typename _Up = _Tp) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) - _CCCL_API constexpr friend basic_simd& operator|=(basic_simd& __lhs, const basic_simd& __rhs) noexcept - { - return __lhs = __lhs | __rhs; - } - - _CCCL_TEMPLATE(typename _Up = _Tp) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) - _CCCL_API constexpr friend basic_simd& operator^=(basic_simd& __lhs, const basic_simd& __rhs) noexcept - { - return __lhs = __lhs ^ __rhs; - } - - _CCCL_TEMPLATE(typename _Up = _Tp) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) - _CCCL_API constexpr friend basic_simd& operator<<=(basic_simd& __lhs, const basic_simd& __rhs) noexcept - { - return __lhs = __lhs << __rhs; - } - - _CCCL_TEMPLATE(typename _Up = _Tp) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) - _CCCL_API constexpr friend basic_simd& operator>>=(basic_simd& __lhs, const basic_simd& __rhs) noexcept - { - return __lhs = __lhs >> __rhs; - } - - [[nodiscard]] _CCCL_API constexpr friend basic_simd - operator+(const basic_simd& __lhs, const basic_simd& __rhs) noexcept - { - return 
basic_simd{_Impl::__plus(__lhs.__s_, __rhs.__s_), __storage_tag}; - } - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(__can_broadcast_v) - [[nodiscard]] _CCCL_API constexpr friend basic_simd operator+(const basic_simd& __lhs, _Up&& __rhs) noexcept - { - return __lhs + basic_simd(static_cast(__rhs)); - } - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(__can_broadcast_v) - [[nodiscard]] _CCCL_API constexpr friend basic_simd operator+(_Up&& __lhs, const basic_simd& __rhs) noexcept - { - return basic_simd(static_cast(__lhs)) + __rhs; - } - - [[nodiscard]] _CCCL_API constexpr friend basic_simd - operator-(const basic_simd& __lhs, const basic_simd& __rhs) noexcept - { - return basic_simd{_Impl::__minus(__lhs.__s_, __rhs.__s_), __storage_tag}; - } - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(__can_broadcast_v) - [[nodiscard]] _CCCL_API constexpr friend basic_simd operator-(const basic_simd& __lhs, _Up&& __rhs) noexcept - { - return __lhs - basic_simd(static_cast(__rhs)); - } - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(__can_broadcast_v) - [[nodiscard]] _CCCL_API constexpr friend basic_simd operator-(_Up&& __lhs, const basic_simd& __rhs) noexcept - { - return basic_simd(static_cast(__lhs)) - __rhs; - } - - [[nodiscard]] _CCCL_API constexpr friend basic_simd - operator*(const basic_simd& __lhs, const basic_simd& __rhs) noexcept - { - return basic_simd{_Impl::__multiplies(__lhs.__s_, __rhs.__s_), __storage_tag}; - } - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(__can_broadcast_v) - [[nodiscard]] _CCCL_API constexpr friend basic_simd operator*(const basic_simd& __lhs, _Up&& __rhs) noexcept - { - return __lhs * basic_simd(static_cast(__rhs)); - } - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(__can_broadcast_v) - [[nodiscard]] _CCCL_API constexpr friend basic_simd operator*(_Up&& __lhs, const basic_simd& __rhs) noexcept - { - return basic_simd(static_cast(__lhs)) * __rhs; - } - - [[nodiscard]] _CCCL_API constexpr friend basic_simd - operator/(const 
basic_simd& __lhs, const basic_simd& __rhs) noexcept - { - return basic_simd{_Impl::__divides(__lhs.__s_, __rhs.__s_), __storage_tag}; - } - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(__can_broadcast_v) - [[nodiscard]] _CCCL_API constexpr friend basic_simd operator/(const basic_simd& __lhs, _Up&& __rhs) noexcept - { - return __lhs / basic_simd(static_cast(__rhs)); - } - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(__can_broadcast_v) - [[nodiscard]] _CCCL_API constexpr friend basic_simd operator/(_Up&& __lhs, const basic_simd& __rhs) noexcept - { - return basic_simd(static_cast(__lhs)) / __rhs; - } - - _CCCL_TEMPLATE(typename _Up = _Tp) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) - [[nodiscard]] _CCCL_API constexpr friend basic_simd - operator%(const basic_simd& __lhs, const basic_simd& __rhs) noexcept - { - return basic_simd{_Impl::__modulo(__lhs.__s_, __rhs.__s_), __storage_tag}; - } - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) - [[nodiscard]] _CCCL_API constexpr friend basic_simd operator%(const basic_simd& __lhs, _Up&& __rhs) noexcept - { - return __lhs % basic_simd(static_cast(__rhs)); - } - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) - [[nodiscard]] _CCCL_API constexpr friend basic_simd operator%(_Up&& __lhs, const basic_simd& __rhs) noexcept - { - return basic_simd(static_cast(__lhs)) % __rhs; - } - - _CCCL_TEMPLATE(typename _Up = _Tp) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) - [[nodiscard]] _CCCL_API constexpr friend basic_simd - operator&(const basic_simd& __lhs, const basic_simd& __rhs) noexcept - { - return basic_simd{_Impl::__bitwise_and(__lhs.__s_, __rhs.__s_), __storage_tag}; - } - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) - [[nodiscard]] _CCCL_API constexpr friend basic_simd operator&(const basic_simd& __lhs, _Up&& __rhs) noexcept - { 
- return __lhs & basic_simd(static_cast(__rhs)); - } - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) - [[nodiscard]] _CCCL_API constexpr friend basic_simd operator&(_Up&& __lhs, const basic_simd& __rhs) noexcept - { - return basic_simd(static_cast(__lhs)) & __rhs; - } - - _CCCL_TEMPLATE(typename _Up = _Tp) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) - [[nodiscard]] _CCCL_API constexpr friend basic_simd - operator|(const basic_simd& __lhs, const basic_simd& __rhs) noexcept - { - return basic_simd{_Impl::__bitwise_or(__lhs.__s_, __rhs.__s_), __storage_tag}; - } - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) - [[nodiscard]] _CCCL_API constexpr friend basic_simd operator|(const basic_simd& __lhs, _Up&& __rhs) noexcept - { - return __lhs | basic_simd(static_cast(__rhs)); - } - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) - [[nodiscard]] _CCCL_API constexpr friend basic_simd operator|(_Up&& __lhs, const basic_simd& __rhs) noexcept - { - return basic_simd(static_cast(__lhs)) | __rhs; - } - - _CCCL_TEMPLATE(typename _Up = _Tp) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) - [[nodiscard]] _CCCL_API constexpr friend basic_simd - operator^(const basic_simd& __lhs, const basic_simd& __rhs) noexcept - { - return basic_simd{_Impl::__bitwise_xor(__lhs.__s_, __rhs.__s_), __storage_tag}; - } - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) - [[nodiscard]] _CCCL_API constexpr friend basic_simd operator^(const basic_simd& __lhs, _Up&& __rhs) noexcept - { - return __lhs ^ basic_simd(static_cast(__rhs)); - } - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) - [[nodiscard]] _CCCL_API constexpr friend basic_simd operator^(_Up&& __lhs, const basic_simd& __rhs) noexcept - 
{ - return basic_simd(static_cast(__lhs)) ^ __rhs; - } - - _CCCL_TEMPLATE(typename _Up = _Tp) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) - [[nodiscard]] _CCCL_API constexpr friend basic_simd - operator<<(const basic_simd& __lhs, const basic_simd& __rhs) noexcept - { - return basic_simd{_Impl::__shift_left(__lhs.__s_, __rhs.__s_), __storage_tag}; - } - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) - [[nodiscard]] _CCCL_API constexpr friend basic_simd operator<<(const basic_simd& __lhs, _Up&& __rhs) noexcept - { - return __lhs << basic_simd(static_cast(__rhs)); - } - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) - [[nodiscard]] _CCCL_API constexpr friend basic_simd operator<<(_Up&& __lhs, const basic_simd& __rhs) noexcept - { - return basic_simd(static_cast(__lhs)) << __rhs; - } - - _CCCL_TEMPLATE(typename _Up = _Tp) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) - [[nodiscard]] _CCCL_API constexpr friend basic_simd - operator>>(const basic_simd& __lhs, const basic_simd& __rhs) noexcept - { - return basic_simd{_Impl::__shift_right(__lhs.__s_, __rhs.__s_), __storage_tag}; - } - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) - [[nodiscard]] _CCCL_API constexpr friend basic_simd operator>>(const basic_simd& __lhs, _Up&& __rhs) noexcept - { - return __lhs >> basic_simd(static_cast(__rhs)); - } - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp> _CCCL_AND __can_broadcast_v) - [[nodiscard]] _CCCL_API constexpr friend basic_simd operator>>(_Up&& __lhs, const basic_simd& __rhs) noexcept - { - return basic_simd(static_cast(__lhs)) >> __rhs; - } - - _CCCL_TEMPLATE(typename _Up = _Tp) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>) - [[nodiscard]] _CCCL_API constexpr basic_simd operator~() const noexcept - { - return 
basic_simd{_Impl::__bitwise_not(__s_), __storage_tag}; - } - - [[nodiscard]] _CCCL_API constexpr friend mask_type - operator==(const basic_simd& __lhs, const basic_simd& __rhs) noexcept - { - return mask_type{_Impl::__equal_to(__lhs.__s_, __rhs.__s_), mask_type::__storage_tag}; - } - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(__can_broadcast_v) - [[nodiscard]] _CCCL_API constexpr friend mask_type operator==(const basic_simd& __lhs, _Up&& __rhs) noexcept - { - return __lhs == basic_simd(static_cast(__rhs)); - } - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(__can_broadcast_v) - [[nodiscard]] _CCCL_API constexpr friend mask_type operator==(_Up&& __lhs, const basic_simd& __rhs) noexcept - { - return basic_simd(static_cast(__lhs)) == __rhs; - } - - [[nodiscard]] _CCCL_API constexpr friend mask_type - operator!=(const basic_simd& __lhs, const basic_simd& __rhs) noexcept - { - return mask_type{_Impl::__not_equal_to(__lhs.__s_, __rhs.__s_), __storage_tag}; - } - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(__can_broadcast_v) - [[nodiscard]] _CCCL_API constexpr friend mask_type operator!=(const basic_simd& __lhs, _Up&& __rhs) noexcept - { - return __lhs != basic_simd(static_cast(__rhs)); - } - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(__can_broadcast_v) - [[nodiscard]] _CCCL_API constexpr friend mask_type operator!=(_Up&& __lhs, const basic_simd& __rhs) noexcept - { - return basic_simd(static_cast(__lhs)) != __rhs; - } - - [[nodiscard]] _CCCL_API constexpr friend mask_type operator<(const basic_simd& __lhs, const basic_simd& __rhs) noexcept - { - return mask_type{_Impl::__less(__lhs.__s_, __rhs.__s_), mask_type::__storage_tag}; - } - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(__can_broadcast_v) - [[nodiscard]] _CCCL_API constexpr friend mask_type operator<(const basic_simd& __lhs, _Up&& __rhs) noexcept - { - return __lhs < basic_simd(static_cast(__rhs)); - } - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(__can_broadcast_v) - [[nodiscard]] _CCCL_API 
constexpr friend mask_type operator<(_Up&& __lhs, const basic_simd& __rhs) noexcept - { - return basic_simd(static_cast(__lhs)) < __rhs; - } - - [[nodiscard]] _CCCL_API constexpr friend mask_type - operator<=(const basic_simd& __lhs, const basic_simd& __rhs) noexcept - { - return mask_type{_Impl::__less_equal(__lhs.__s_, __rhs.__s_), mask_type::__storage_tag}; - } - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(__can_broadcast_v) - [[nodiscard]] _CCCL_API constexpr friend mask_type operator<=(const basic_simd& __lhs, _Up&& __rhs) noexcept - { - return __lhs <= basic_simd(static_cast(__rhs)); - } - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(__can_broadcast_v) - [[nodiscard]] _CCCL_API constexpr friend mask_type operator<=(_Up&& __lhs, const basic_simd& __rhs) noexcept - { - return basic_simd(static_cast(__lhs)) <= __rhs; - } - - [[nodiscard]] _CCCL_API constexpr friend mask_type operator>(const basic_simd& __lhs, const basic_simd& __rhs) noexcept - { - return mask_type{_Impl::__greater(__lhs.__s_, __rhs.__s_), mask_type::__storage_tag}; - } - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(__can_broadcast_v) - [[nodiscard]] _CCCL_API constexpr friend mask_type operator>(const basic_simd& __lhs, _Up&& __rhs) noexcept - { - return __lhs > basic_simd(static_cast(__rhs)); - } - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(__can_broadcast_v) - [[nodiscard]] _CCCL_API constexpr friend mask_type operator>(_Up&& __lhs, const basic_simd& __rhs) noexcept - { - return basic_simd(static_cast(__lhs)) > __rhs; - } - - [[nodiscard]] _CCCL_API constexpr friend mask_type - operator>=(const basic_simd& __lhs, const basic_simd& __rhs) noexcept - { - return mask_type{_Impl::__greater_equal(__lhs.__s_, __rhs.__s_), mask_type::__storage_tag}; - } - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(__can_broadcast_v) - [[nodiscard]] _CCCL_API constexpr friend mask_type operator>=(const basic_simd& __lhs, _Up&& __rhs) noexcept - { - return __lhs >= basic_simd(static_cast(__rhs)); - 
} - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(__can_broadcast_v) - [[nodiscard]] _CCCL_API constexpr friend mask_type operator>=(_Up&& __lhs, const basic_simd& __rhs) noexcept - { - return basic_simd(static_cast(__lhs)) >= __rhs; - } -}; - -template -using fixed_size_simd = simd<_Tp, _Np>; -} // namespace cuda::experimental::datapar - -#include - -#endif // _CUDAX___SIMD_SIMD_H diff --git a/cudax/include/cuda/experimental/__simd/traits.h b/cudax/include/cuda/experimental/__simd/traits.h index 70e89285528..cdfae75ab5b 100644 --- a/cudax/include/cuda/experimental/__simd/traits.h +++ b/cudax/include/cuda/experimental/__simd/traits.h @@ -82,17 +82,17 @@ template inline constexpr bool is_abi_tag_v> = true; template -inline constexpr bool is_simd_v = false; +inline constexpr bool is_vec_v = false; template -struct is_simd : ::cuda::std::bool_constant> +struct is_vec : ::cuda::std::bool_constant> {}; template -inline constexpr bool is_simd_mask_v = false; +inline constexpr bool is_mask_v = false; template -struct is_simd_mask : ::cuda::std::bool_constant> +struct is_mask : ::cuda::std::bool_constant> {}; template @@ -112,17 +112,12 @@ struct simd_size<_Tp, _Abi, false> static constexpr ::cuda::std::size_t value = 0; }; -template > -inline constexpr ::cuda::std::size_t simd_size_v = simd_size<_Tp, _Abi>::value; - -template -inline constexpr ::cuda::std::size_t simd_size_v<_Tp, void> = _Tp::size(); template -inline constexpr bool is_simd_v> = true; +inline constexpr bool is_vec_v> = true; -template -inline constexpr bool is_simd_mask_v> = true; +template <::cuda::std::size_t _Bytes, typename _Abi> +inline constexpr bool is_mask_v> = true; template <> inline constexpr bool is_simd_flag_type_v = true; @@ -137,33 +132,35 @@ inline constexpr bool is_simd_flag_type_v> = true; template struct memory_alignment; +// P1928R15: basic_vec memory alignment template -struct memory_alignment, element_aligned_tag> +struct memory_alignment, element_aligned_tag> : 
::cuda::std::integral_constant<::cuda::std::size_t, alignof(_Tp)> {}; template -struct memory_alignment, vector_aligned_tag> - : ::cuda::std::integral_constant<::cuda::std::size_t, alignof(_Tp) * simd_size_v<_Tp, _Abi>> +struct memory_alignment, vector_aligned_tag> + : ::cuda::std::integral_constant<::cuda::std::size_t, alignof(_Tp) * __simd_size_v<_Tp, _Abi>> {}; template -struct memory_alignment, overaligned_tag<_Alignment>> +struct memory_alignment, overaligned_tag<_Alignment>> : ::cuda::std::integral_constant<::cuda::std::size_t, _Alignment> {}; -template -struct memory_alignment, element_aligned_tag> +// P1928R15: basic_mask memory alignment (indexed by Bytes) +template <::cuda::std::size_t _Bytes, typename _Abi> +struct memory_alignment, element_aligned_tag> : ::cuda::std::integral_constant<::cuda::std::size_t, alignof(bool)> {}; -template -struct memory_alignment, vector_aligned_tag> - : ::cuda::std::integral_constant<::cuda::std::size_t, alignof(bool) * simd_size_v<_Tp, _Abi>> +template <::cuda::std::size_t _Bytes, typename _Abi> +struct memory_alignment, vector_aligned_tag> + : ::cuda::std::integral_constant<::cuda::std::size_t, alignof(bool) * _Abi::__simd_size> {}; -template -struct memory_alignment, overaligned_tag<_Alignment>> +template <::cuda::std::size_t _Bytes, typename _Abi, ::cuda::std::size_t _Alignment> +struct memory_alignment, overaligned_tag<_Alignment>> : ::cuda::std::integral_constant<::cuda::std::size_t, _Alignment> {}; @@ -174,20 +171,33 @@ inline constexpr ::cuda::std::size_t memory_alignment_v = memory_alignment<_Tp, template struct rebind_simd; +// P1928R15: rebind for basic_vec template -struct rebind_simd<_Tp, basic_simd<_Up, _Abi>> +struct rebind_simd<_Tp, basic_vec<_Up, _Abi>> { - using type = basic_simd<_Tp, _Abi>; + using type = basic_vec<_Tp, _Abi>; }; -template -struct rebind_simd<_Tp, basic_simd_mask<_Up, _Abi>> +// P1928R15: rebind for basic_mask (creates mask with sizeof(_Tp) bytes) +template +struct rebind_simd<_Tp, 
basic_mask<_Bytes, _Abi>> { - using type = basic_simd_mask<_Tp, _Abi>; + using type = basic_mask; }; template using rebind_simd_t = typename rebind_simd<_Tp, _Simd>::type; + +// P1928R15: mask_element_size trait - get the Bytes value from a mask +template +struct mask_element_size; + +template <::cuda::std::size_t _Bytes, typename _Abi> +struct mask_element_size> : ::cuda::std::integral_constant<::cuda::std::size_t, _Bytes> +{}; + +template +inline constexpr ::cuda::std::size_t mask_element_size_v = mask_element_size<_Tp>::value; } // namespace cuda::experimental::datapar #include diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt index ec62b2c2ad1..380c1c68642 100644 --- a/cudax/test/CMakeLists.txt +++ b/cudax/test/CMakeLists.txt @@ -106,12 +106,10 @@ cudax_add_catch2_test(test_target algorithm algorithm/copy.cu ) - cudax_add_catch2_test(test_target simd ${cudax_target} +cudax_add_catch2_test(test_target simd simd/simd.cu - ) +) - if (cudax_ENABLE_CUFILE) - cudax_add_catch2_test(test_target cufile_driver_attributes ${cudax_target} if (cudax_ENABLE_CUFILE) cudax_add_catch2_test(test_target cufile.driver_attributes cufile/driver_attributes.cu diff --git a/cudax/test/simd/simd.cu b/cudax/test/simd/simd.cu index 9938431e329..7d4d5573211 100644 --- a/cudax/test/simd/simd.cu +++ b/cudax/test/simd/simd.cu @@ -72,9 +72,9 @@ C2H_CCCLRT_TEST("simd.traits", "[simd][traits]") STATIC_REQUIRE(dp::is_abi_tag_v); STATIC_REQUIRE(!dp::is_abi_tag_v); - STATIC_REQUIRE(dp::simd_size_v == 4); - STATIC_REQUIRE(dp::simd_size_v == 4); - STATIC_REQUIRE(dp::simd_size_v == 0); + STATIC_REQUIRE(dp::__simd_size_v == 4); + STATIC_REQUIRE(dp::__simd_size_v == 4); + STATIC_REQUIRE(dp::__simd_size_v == 0); STATIC_REQUIRE(dp::is_simd_v); STATIC_REQUIRE(!dp::is_simd_v); From 47bdf7be4949d5cf965d37995cc8aa237aa65fd4 Mon Sep 17 00:00:00 2001 From: fbusato Date: Tue, 6 Jan 2026 11:10:13 -0800 Subject: [PATCH 25/32] add ctor constrains --- .../cuda/experimental/__simd/basic_vec.h | 18 
+-- .../cuda/experimental/__simd/concepts.h | 150 +++++++++++++++--- 2 files changed, 134 insertions(+), 34 deletions(-) diff --git a/cudax/include/cuda/experimental/__simd/basic_vec.h b/cudax/include/cuda/experimental/__simd/basic_vec.h index ec555f9e248..8ba1881b42b 100644 --- a/cudax/include/cuda/experimental/__simd/basic_vec.h +++ b/cudax/include/cuda/experimental/__simd/basic_vec.h @@ -85,21 +85,21 @@ class basic_vec : public __simd_operations<_Tp, _Abi> _CCCL_TEMPLATE(typename _Up) _CCCL_REQUIRES( - (__can_broadcast_v>) _CCCL_AND(__is_value_preserving_broadcast<_Up>)) - _CCCL_API constexpr basic_vec(_Up&& __v) noexcept + (__explicitly_convertible_to<_Up, value_type>) _CCCL_AND(__is_simd_ctor_explicit_from_value<_Up, value_type>)) + _CCCL_API constexpr explicit basic_vec(_Up&& __v) noexcept : __s_{_Impl::__broadcast(static_cast(__v))} {} _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES((__can_broadcast_v>) // - _CCCL_AND(!__is_value_preserving_broadcast<_Up>)) - _CCCL_API constexpr explicit basic_vec(_Up&& __v) noexcept + _CCCL_REQUIRES( + (__explicitly_convertible_to<_Up, value_type>) _CCCL_AND(!__is_simd_ctor_explicit_from_value<_Up, value_type>)) + _CCCL_API constexpr basic_vec(_Up&& __v) noexcept : __s_{_Impl::__broadcast(static_cast(__v))} {} - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES((!::cuda::std::is_same_v<_Up, _Tp>) _CCCL_AND(__is_non_narrowing_convertible_v<_Up, value_type>)) - _CCCL_API constexpr explicit basic_vec(const basic_vec<_Up, abi_type>& __v) noexcept + _CCCL_TEMPLATE(typename _Up, typename _UAbi) + _CCCL_REQUIRES((__simd_size_v<_Up, _UAbi> == size()) _CCCL_AND(__explicitly_convertible_to<_Up, value_type>)) + _CCCL_API constexpr explicit basic_vec(const basic_vec<_Up, _UAbi>& __v) noexcept { for (__simd_size_type __i = 0; __i < size; __i++) { @@ -124,7 +124,7 @@ class basic_vec : public __simd_operations<_Tp, _Abi> : __s_(_Impl::__generate(__g)) {} - // TODO: add constructors + // TODO: add range constructors // template // constexpr 
basic_vec(R&& range, flags = {}); diff --git a/cudax/include/cuda/experimental/__simd/concepts.h b/cudax/include/cuda/experimental/__simd/concepts.h index d72e83ac6da..101785ec017 100644 --- a/cudax/include/cuda/experimental/__simd/concepts.h +++ b/cudax/include/cuda/experimental/__simd/concepts.h @@ -21,7 +21,11 @@ # pragma system_header #endif // no system header +#include #include +#include +#include +#include #include @@ -29,79 +33,175 @@ namespace cuda::experimental::datapar { -template +template +bool constexpr __is_value_preserving_broadcast_impl() +{ + // TODO + return true; +} + +template +_CCCL_CONCEPT __is_value_preserving_convertible = __is_value_preserving_broadcast_impl<_From, _To>(); + +template +_CCCL_CONCEPT __explicitly_convertible_to = + _CCCL_REQUIRES_EXPR((_To, _From))(requires(static_cast<_To>(::cuda::std::declval<_From>()))); + +template +_CCCL_CONCEPT __constexpr_wrapper_like = + ::cuda::std::convertible_to<_Tp, decltype(_Tp::value)> + && ::cuda::std::equality_comparable_with<_Tp, decltype(_Tp::value)> && (_Tp() == _Tp::value) + && (static_cast(_Tp()) == _Tp::value); + +template +_CCCL_CONCEPT __is_simd_ctor_explicit_from_value = + ::cuda::std::convertible_to<_Tp, _ValueType> + && ((!::cuda::std::is_arithmetic_v<_Tp> && !__constexpr_wrapper_like<_Tp>) + || (::cuda::std::is_arithmetic_v<_Tp> && __is_value_preserving_convertible<_Tp, _ValueType>) + || (__constexpr_wrapper_like<_Tp> && ::cuda::std::is_arithmetic_v<::cuda::std::remove_cvref_t<_Tp>> + && __is_value_preserving_convertible<_Tp, _ValueType>); + + +template +inline constexpr int __integer_conversion_rank = 0; + +template <> +inline constexpr int __integer_conversion_rank = 1; +template <> +inline constexpr int __integer_conversion_rank = 1; +template <> +inline constexpr int __integer_conversion_rank = 1; +template <> +inline constexpr int __integer_conversion_rank = 2; +template <> +inline constexpr int __integer_conversion_rank = 2; +template <> +inline constexpr int 
__integer_conversion_rank = 3; +template <> +inline constexpr int __integer_conversion_rank = 3; +template <> +inline constexpr int __integer_conversion_rank = 4; +template <> +inline constexpr int __integer_conversion_rank = 4; +template <> +inline constexpr int __integer_conversion_rank = 5; +template <> +inline constexpr int __integer_conversion_rank = 5; +#if defined(_CCCL_HAS_INT128) +template <> +inline constexpr int __integer_conversion_rank<__int128_t> = 6; +template <> +inline constexpr int __integer_conversion_rank<__uint128_t> = 6; +#endif // defined(_CCCL_HAS_INT128) + +template +inline constexpr int __fp_conversion_rank = 0; + +#if _CCCL_HAS_NVFP16() +template <> +inline constexpr int __fp_conversion_rank<__half> = 1; +#endif // _CCCL_HAS_NVFP16() +#if _CCCL_HAS_NVBF16() +template <> +inline constexpr int __fp_conversion_rank<__nv_bfloat16> = 1; +#endif // _CCCL_HAS_NVBF16() +template <> +inline constexpr int __fp_conversion_rank = 2; +template <> +inline constexpr int __fp_conversion_rank = 3; +#if _CCCL_HAS_LONG_DOUBLE() +template <> +inline constexpr int __fp_conversion_rank = 4; +#endif // _CCCL_HAS_LONG_DOUBLE() +#if _CCCL_HAS_FLOAT128() +template <> +inline constexpr int __fp_conversion_rank<__float128> = 5; +#endif // _CCCL_HAS_FLOAT128() + + +template +_CCCL_CONCEPT __is_simd_ctor_explicit_from_vec = + !__is_value_preserving_convertible<_Tp, _Up> || +(::cuda::std::is_integral_v<_Tp> && ::cuda::std::is_integral_v<_Up> && +__integer_conversion_rank<_Tp> > __integer_conversion_rank<_Up>) || +(::cuda::is_floating_point_v<_Tp> && ::cuda::is_floating_point_v<_Up> && +(__fp_conversion_rank<_Tp> > __fp_conversion_rank<_Up>)); + + + +template _CCCL_CONCEPT __has_pre_increment = _CCCL_REQUIRES_EXPR((_Tp), _Tp& __t)((++__t)); -template +template _CCCL_CONCEPT __has_post_increment = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t++)); -template +template _CCCL_CONCEPT __has_pre_decrement = _CCCL_REQUIRES_EXPR((_Tp), _Tp& __t)((--__t)); -template +template 
_CCCL_CONCEPT __has_post_decrement = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t--)); -template +template _CCCL_CONCEPT __has_negate = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((!__t)); -template +template _CCCL_CONCEPT __has_bitwise_not = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((~__t)); -template +template _CCCL_CONCEPT __has_plus = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((+__t)); -template +template _CCCL_CONCEPT __has_unary_minus = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((-__t)); -template +template _CCCL_CONCEPT __has_minus = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t - __t)); -template +template _CCCL_CONCEPT __has_multiplies = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t * __t)); -template +template _CCCL_CONCEPT __has_divides = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t / __t)); -template +template _CCCL_CONCEPT __has_modulo = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t % __t)); -template +template _CCCL_CONCEPT __has_bitwise_and = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t & __t)); -template +template _CCCL_CONCEPT __has_bitwise_or = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t | __t)); -template +template _CCCL_CONCEPT __has_bitwise_xor = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t ^ __t)); -template +template _CCCL_CONCEPT __has_shift_left = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t << __t)); -template +template _CCCL_CONCEPT __has_shift_right = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t >> __t)); -template +template _CCCL_CONCEPT __has_shift_left_size = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t << __simd_size_type{})); -template +template _CCCL_CONCEPT __has_shift_right_size = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t >> __simd_size_type{})); -template +template _CCCL_CONCEPT __has_equal_to = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t == __t)); -template +template _CCCL_CONCEPT __has_not_equal_to = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t != __t)); -template +template _CCCL_CONCEPT __has_greater_equal = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t >= __t)); -template +template _CCCL_CONCEPT 
__has_less_equal = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t <= __t)); -template +template _CCCL_CONCEPT __has_greater = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t > __t)); -template +template _CCCL_CONCEPT __has_less = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t < __t)); } // namespace cuda::experimental::datapar From 06020ebaf9bfe9e03d8203beeb2425705af954b5 Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 25 Mar 2026 16:13:26 -0700 Subject: [PATCH 26/32] refactor to match std::simd --- cudax/include/cuda/experimental/__simd/abi.h | 51 +++ .../cuda/experimental/__simd/basic_mask.h | 405 +++++++++++++----- .../cuda/experimental/__simd/basic_vec.h | 8 +- .../cuda/experimental/__simd/concepts.h | 4 +- .../cuda/experimental/__simd/declaration.h | 74 +--- .../cuda/experimental/__simd/exposition.h | 56 +++ .../experimental/__simd/fixed_size_impl.h | 4 +- .../cuda/experimental/__simd/reference.h | 4 +- .../cuda/experimental/__simd/scalar_impl.h | 10 +- .../include/cuda/experimental/__simd/traits.h | 8 +- .../cuda/experimental/__simd/type_traits.h | 95 ++++ .../cuda/experimental/__simd/utility.h | 62 +-- cudax/include/cuda/experimental/simd.cuh | 4 +- cudax/test/CMakeLists.txt | 4 + cudax/test/simd/simd.cu | 132 +++--- 15 files changed, 665 insertions(+), 256 deletions(-) create mode 100644 cudax/include/cuda/experimental/__simd/abi.h create mode 100644 cudax/include/cuda/experimental/__simd/exposition.h create mode 100644 cudax/include/cuda/experimental/__simd/type_traits.h diff --git a/cudax/include/cuda/experimental/__simd/abi.h b/cudax/include/cuda/experimental/__simd/abi.h new file mode 100644 index 00000000000..bc669952ae8 --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/abi.h @@ -0,0 +1,51 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_ABI_H +#define _CUDAX___SIMD_ABI_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include + +namespace cuda::experimental::simd +{ +using __simd_size_type = ::cuda::std::ptrdiff_t; + +// [simd.expos.abi], simd ABI tags +namespace simd_abi +{ +template <__simd_size_type _Np> +struct __fixed_size; + +template <__simd_size_type _Np> +using fixed_size = __fixed_size<_Np>; + +template +using native = fixed_size<1>; + +template +using __deduce_abi_t = fixed_size<_Np>; +} // namespace simd_abi +} // namespace cuda::experimental::simd + +#include + +#endif // _CUDAX___SIMD_ABI_H diff --git a/cudax/include/cuda/experimental/__simd/basic_mask.h b/cudax/include/cuda/experimental/__simd/basic_mask.h index 8b19408c5c5..0294b25cbf0 100644 --- a/cudax/include/cuda/experimental/__simd/basic_mask.h +++ b/cudax/include/cuda/experimental/__simd/basic_mask.h @@ -4,12 +4,12 @@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
// //===----------------------------------------------------------------------===// -#ifndef _CUDAX___SIMD_SIMD_MASK_H -#define _CUDAX___SIMD_SIMD_MASK_H +#ifndef _CUDAX___SIMD_BASIC_MASK_H +#define _CUDAX___SIMD_BASIC_MASK_H #include @@ -21,203 +21,414 @@ # pragma system_header #endif // no system header +#include +#include #include +#include #include -#include +#include +#include +#include +#include +#include +#include #include -#include -#include -#include -#include #include #include -namespace cuda::experimental::datapar +namespace cuda::experimental::simd { -// P1928R15: basic_mask is the primary SIMD mask type with Bytes as first template parameter +// [simd.mask.class], class template basic_mask template <::cuda::std::size_t _Bytes, typename _Abi> class basic_mask : public __mask_operations<_Bytes, _Abi> { - static_assert(is_abi_tag_v<_Abi>, "basic_mask requires a valid ABI tag"); + static_assert(_Bytes >= 0, "basic_mask requires a positive number of bytes"); + static_assert(__is_abi_tag_v<_Abi>, "basic_mask requires a valid ABI tag"); using _Impl = __mask_operations<_Bytes, _Abi>; using _Storage = typename _Impl::_MaskStorage; _Storage __s_; + struct __storage_tag_t + {}; + static constexpr __storage_tag_t __storage_tag{}; + + _CCCL_API constexpr basic_mask(_Storage __v, __storage_tag_t) noexcept + : __s_{__v} + {} + public: using value_type = bool; - using reference = __simd_reference<_Storage, bool>; using abi_type = _Abi; - // P1928R15: Bytes represents the size of the corresponding element type - static constexpr ::cuda::std::size_t bytes = _Bytes; + // TODO(fbusato): add simd-iterator + // using iterator = simd-iterator; + // using const_iterator = simd-iterator; - [[nodiscard]] _CCCL_API static constexpr ::cuda::std::size_t size() noexcept - { - return _Abi::__simd_size; - } + // constexpr iterator begin() noexcept { return {*this, 0}; } + // constexpr const_iterator begin() const noexcept { return {*this, 0}; } + // constexpr const_iterator cbegin() 
const noexcept { return {*this, 0}; } + // constexpr default_sentinel_t end() const noexcept { return {}; } + // constexpr default_sentinel_t cend() const noexcept { return {}; } - _CCCL_HIDE_FROM_ABI basic_mask() noexcept = default; + static constexpr ::cuda::std::integral_constant<__simd_size_type, __simd_size_v<__integer_from<_Bytes>, _Abi>> size{}; - struct __storage_tag_t - {}; - static constexpr __storage_tag_t __storage_tag{}; + static constexpr auto __usize = ::cuda::std::size_t{size}; - _CCCL_API explicit operator _Storage() const noexcept - { - return __s_; - } + _CCCL_HIDE_FROM_ABI basic_mask() noexcept = default; - _CCCL_API basic_mask(const _Storage& __s, __storage_tag_t) noexcept - : __s_{__s} - {} + // [simd.mask.ctor], basic_mask constructors _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(::cuda::std::is_same_v<_Up, bool>) - _CCCL_API explicit basic_mask(_Up __v) noexcept + _CCCL_REQUIRES(::cuda::std::same_as<_Up, value_type>) + _CCCL_API constexpr explicit basic_mask(_Up __v) noexcept : __s_{_Impl::__broadcast(__v)} {} + _CCCL_TEMPLATE(::cuda::std::size_t _UBytes, typename _UAbi) + _CCCL_REQUIRES((__simd_size_v<__integer_from<_UBytes>, _UAbi> == size())) + _CCCL_API constexpr explicit basic_mask(const basic_mask<_UBytes, _UAbi>& __x) noexcept + { + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < size; ++__i) + { + __s_.__set(__i, __x[__i]); + } + } + _CCCL_TEMPLATE(typename _Generator) - _CCCL_REQUIRES(__can_generate_v) - _CCCL_API explicit basic_mask(_Generator&& __g) noexcept - : __s_(_Impl::__generate(__g)) + _CCCL_REQUIRES(__can_generate_v) + _CCCL_API constexpr explicit basic_mask(_Generator&& __g) + : __s_{_Impl::__generate(__g)} {} - _CCCL_TEMPLATE(typename _Flags = element_aligned_tag) - _CCCL_REQUIRES(is_simd_flag_type_v<_Flags>) - _CCCL_API explicit basic_mask(const bool* __mem, _Flags = {}) noexcept + _CCCL_TEMPLATE(typename _Tp) + _CCCL_REQUIRES(::cuda::std::same_as<_Tp, ::cuda::std::bitset<__usize>>) + _CCCL_API 
constexpr basic_mask(const _Tp& __b) noexcept + : __s_{_Impl::__broadcast(false)} { - _Impl::__load(__s_, _Flags::template __apply(__mem)); + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < size; ++__i) + { + __s_.__set(__i, static_cast(__b[__i])); + } } - _CCCL_TEMPLATE(typename _Flags = element_aligned_tag) - _CCCL_REQUIRES(is_simd_flag_type_v<_Flags>) - _CCCL_API void copy_from(const bool* __mem, _Flags = {}) noexcept + _CCCL_TEMPLATE(typename _Tp) + _CCCL_REQUIRES((::cuda::std::__cccl_is_unsigned_integer_v<_Tp> && !::cuda::std::same_as<_Tp, value_type>) ) + _CCCL_API constexpr explicit basic_mask(_Tp __val) noexcept + : __s_{_Impl::__broadcast(false)} { - _Impl::__load(__s_, _Flags::template __apply(__mem)); + constexpr auto __num_bits = __simd_size_type{::cuda::std::__num_bits_v<_Tp>}; + constexpr auto __size_as_int = size(); + constexpr auto __m = __size_as_int < __num_bits ? __size_as_int : __num_bits; + using __uint8_array_t = ::cuda::std::array<::cuda::std::uint8_t, sizeof(_Tp)>; + const auto __val1 = ::cuda::std::bit_cast<__uint8_array_t>(__val); + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < __m; ++__i) + { + const auto __byte = __val1[__i / CHAR_BIT]; + __s_.__set(__i, static_cast((__byte >> (__i % CHAR_BIT)) & _Tp{1})); + } } - _CCCL_TEMPLATE(typename _Flags = element_aligned_tag) - _CCCL_REQUIRES(is_simd_flag_type_v<_Flags>) - _CCCL_API void copy_to(bool* __mem, _Flags = {}) const noexcept + // [simd.mask.subscr], basic_mask subscript operators + + [[nodiscard]] _CCCL_API constexpr value_type operator[](__simd_size_type __i) const noexcept { - _Impl::__store(__s_, _Flags::template __apply(__mem)); + _CCCL_ASSERT(::cuda::in_range(__i, __simd_size_type{0}, __simd_size_type{size}), "Index is out of bounds"); + return static_cast(__s_.__get(__i)); } - _CCCL_API reference operator[](::cuda::std::size_t __i) noexcept + // TODO(fbusato): subscript with integral indices, requires permute() + // template + // constexpr 
resize_t operator[](const I& indices) const; + + // [simd.mask.unary], basic_mask unary operators + + [[nodiscard]] _CCCL_API constexpr basic_mask operator!() const noexcept { - return reference(__s_, __i); + return {_Impl::__bitwise_not(__s_), __storage_tag}; } - _CCCL_API value_type operator[](::cuda::std::size_t __i) const noexcept + _CCCL_TEMPLATE(::cuda::std::size_t _B = _Bytes) + _CCCL_REQUIRES(__has_integer_from_v<_B>) + [[nodiscard]] _CCCL_API constexpr basic_vec<__integer_from<_B>, _Abi> operator+() const noexcept { - return static_cast(__s_.__get(__i)); + return static_cast, _Abi>>(*this); } - // Bitwise operations - [[nodiscard]] _CCCL_API friend constexpr basic_mask - operator&(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + _CCCL_TEMPLATE(::cuda::std::size_t _B = _Bytes) + _CCCL_REQUIRES((!__has_integer_from_v<_B>) ) + _CCCL_API void operator+() const noexcept = delete; + + _CCCL_TEMPLATE(::cuda::std::size_t _B = _Bytes) + _CCCL_REQUIRES(__has_integer_from_v<_B>) + [[nodiscard]] _CCCL_API constexpr basic_vec<__integer_from<_B>, _Abi> operator-() const noexcept { - return {_Impl::__bitwise_and(__lhs.__s_, __rhs.__s_), __storage_tag}; + return -static_cast, _Abi>>(*this); } - [[nodiscard]] _CCCL_API friend constexpr basic_mask - operator|(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + _CCCL_TEMPLATE(::cuda::std::size_t _B = _Bytes) + _CCCL_REQUIRES((!__has_integer_from_v<_B>) ) + _CCCL_API void operator-() const noexcept = delete; + + _CCCL_TEMPLATE(::cuda::std::size_t _B = _Bytes) + _CCCL_REQUIRES(__has_integer_from_v<_B>) + [[nodiscard]] _CCCL_API constexpr basic_vec<__integer_from<_B>, _Abi> operator~() const noexcept { - return {_Impl::__bitwise_or(__lhs.__s_, __rhs.__s_), __storage_tag}; + return ~static_cast, _Abi>>(*this); } - [[nodiscard]] _CCCL_API friend constexpr basic_mask - operator^(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + _CCCL_TEMPLATE(::cuda::std::size_t _B = _Bytes) + 
_CCCL_REQUIRES((!__has_integer_from_v<_B>) ) + _CCCL_API void operator~() const noexcept = delete; + + // [simd.mask.conv], basic_mask conversions + + _CCCL_TEMPLATE(typename _Up, typename _Ap) + _CCCL_REQUIRES((sizeof(_Up) != _Bytes && __simd_size_v<_Up, _Ap> == size())) + _CCCL_API constexpr explicit operator basic_vec<_Up, _Ap>() const noexcept { - return {_Impl::__bitwise_xor(__lhs.__s_, __rhs.__s_), __storage_tag}; + basic_vec<_Up, _Ap> __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < size; ++__i) + { + __result[__i] = static_cast<_Up>((*this)[__i]); + } + return __result; } - [[nodiscard]] _CCCL_API constexpr basic_mask operator!() const noexcept + _CCCL_TEMPLATE(typename _Up, typename _Ap) + _CCCL_REQUIRES((sizeof(_Up) == _Bytes && __simd_size_v<_Up, _Ap> == size())) + _CCCL_API constexpr operator basic_vec<_Up, _Ap>() const noexcept { - return {_Impl::__bitwise_not(__s_), __storage_tag}; + basic_vec<_Up, _Ap> __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < size; ++__i) + { + __result[__i] = static_cast<_Up>((*this)[__i]); + } + return __result; + } + + [[nodiscard]] _CCCL_API constexpr ::cuda::std::bitset<__usize> to_bitset() const noexcept + { + ::cuda::std::bitset<__usize> __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < size; ++__i) + { + __result.set(__i, (*this)[__i]); + } + return __result; } + [[nodiscard]] _CCCL_API constexpr unsigned long long to_ullong() const + { + constexpr __simd_size_type __nbits = ::cuda::std::__num_bits_v; + if constexpr (size > __nbits) + { + for (auto __i = __nbits; __i < size; ++__i) + { + _CCCL_ASSERT(!(*this)[__i], "Bit above unsigned long long width is set"); + } + } + return to_bitset().to_ullong(); + } + + // [simd.mask.binary], basic_mask binary operators + [[nodiscard]] _CCCL_API friend constexpr basic_mask operator&&(const basic_mask& __lhs, const basic_mask& __rhs) noexcept { - return {_Impl::__bitwise_and(__lhs.__s_, 
__rhs.__s_), __storage_tag}; + return {_Impl::__logic_and(__lhs.__s_, __rhs.__s_), __storage_tag}; } [[nodiscard]] _CCCL_API friend constexpr basic_mask operator||(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return {_Impl::__logic_or(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator&(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return {_Impl::__bitwise_and(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator|(const basic_mask& __lhs, const basic_mask& __rhs) noexcept { return {_Impl::__bitwise_or(__lhs.__s_, __rhs.__s_), __storage_tag}; } - // Conversion to basic_vec - _CCCL_TEMPLATE(typename _Up, typename _Ap) - _CCCL_REQUIRES((sizeof(_Up) == _Bytes && _Ap::__simd_size == _Abi::__simd_size)) - _CCCL_API constexpr explicit operator basic_vec<_Up, _Ap>() const noexcept + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator^(const basic_mask& __lhs, const basic_mask& __rhs) noexcept { - basic_vec<_Up, _Ap> __result; - _CCCL_PRAGMA_UNROLL_FULL() - for (::cuda::std::size_t __i = 0; __i < size(); ++__i) - { - __result[__i] = static_cast<_Up>((*this)[__i]); - } - return __result; + return {_Impl::__bitwise_xor(__lhs.__s_, __rhs.__s_), __storage_tag}; } - _CCCL_API basic_mask& operator&=(const basic_mask& __rhs) noexcept + // [simd.mask.cassign], basic_mask compound assignment + + _CCCL_API friend constexpr basic_mask& operator&=(basic_mask& __lhs, const basic_mask& __rhs) noexcept { - return *this = *this & __rhs; + return __lhs = __lhs & __rhs; } - _CCCL_API basic_mask& operator|=(const basic_mask& __rhs) noexcept + _CCCL_API friend constexpr basic_mask& operator|=(basic_mask& __lhs, const basic_mask& __rhs) noexcept { - return *this = *this | __rhs; + return __lhs = __lhs | __rhs; } - _CCCL_API basic_mask& operator^=(const basic_mask& __rhs) noexcept + _CCCL_API friend constexpr basic_mask& 
operator^=(basic_mask& __lhs, const basic_mask& __rhs) noexcept { - return *this = *this ^ __rhs; + return __lhs = __lhs ^ __rhs; } - // Comparison operations - [[nodiscard]] _CCCL_API friend constexpr bool operator==(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + // [simd.mask.comparison], basic_mask comparisons (element-wise) + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator==(const basic_mask& __lhs, const basic_mask& __rhs) noexcept { - return _Impl::__equal_to(__lhs.__s_, __rhs.__s_); + return !(__lhs ^ __rhs); } -#if _CCCL_STD_VER < 2020 - [[nodiscard]] _CCCL_API friend constexpr bool operator!=(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator!=(const basic_mask& __lhs, const basic_mask& __rhs) noexcept { - return !(__lhs == __rhs); + return __lhs ^ __rhs; } -#endif // _CCCL_STD_VER < 2020 - [[nodiscard]] _CCCL_API constexpr bool all() const noexcept + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator>=(const basic_mask& __lhs, const basic_mask& __rhs) noexcept { - return _Impl::__all(__s_); + return __lhs || !__rhs; } - [[nodiscard]] _CCCL_API constexpr bool any() const noexcept + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator<=(const basic_mask& __lhs, const basic_mask& __rhs) noexcept { - return _Impl::__any(__s_); + return !__lhs || __rhs; } - [[nodiscard]] _CCCL_API constexpr bool none() const noexcept + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator>(const basic_mask& __lhs, const basic_mask& __rhs) noexcept { - return !any(); + return __lhs && !__rhs; } - [[nodiscard]] _CCCL_API constexpr ::cuda::std::ptrdiff_t count() const noexcept + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator<(const basic_mask& __lhs, const basic_mask& __rhs) noexcept { - return _Impl::__count(__s_); + return !__lhs && __rhs; } + + // TODO(fbusato): [simd.mask.cond], basic_mask exposition only conditional operators 
+ // friend constexpr basic_mask __simd_select_impl( + // const basic_mask&, const basic_mask&, const basic_mask&) noexcept; + // friend constexpr basic_mask __simd_select_impl( + // const basic_mask&, same_as auto, same_as auto) noexcept; + // template + // friend constexpr vec __simd_select_impl( + // const basic_mask&, const T0&, const T1&) noexcept; }; -} // namespace cuda::experimental::datapar + +// [simd.mask.reductions], reductions + +template <::cuda::std::size_t _Bytes, typename _Abi> +[[nodiscard]] _CCCL_API constexpr bool all_of(const basic_mask<_Bytes, _Abi>& __k) noexcept +{ + using __mask_storage_t = typename __mask_operations<_Bytes, _Abi>::_MaskStorage; + return __mask_operations<_Bytes, _Abi>::__all(static_cast<__mask_storage_t>(__k)); +} + +template <::cuda::std::size_t _Bytes, typename _Abi> +[[nodiscard]] _CCCL_API constexpr bool any_of(const basic_mask<_Bytes, _Abi>& __k) noexcept +{ + using __mask_storage_t = typename __mask_operations<_Bytes, _Abi>::_MaskStorage; + return __mask_operations<_Bytes, _Abi>::__any(static_cast<__mask_storage_t>(__k)); +} + +template <::cuda::std::size_t _Bytes, typename _Abi> +[[nodiscard]] _CCCL_API constexpr bool none_of(const basic_mask<_Bytes, _Abi>& __k) noexcept +{ + return !::cuda::experimental::simd::any_of(__k); +} + +template <::cuda::std::size_t _Bytes, typename _Abi> +[[nodiscard]] _CCCL_API constexpr __simd_size_type reduce_count(const basic_mask<_Bytes, _Abi>& __k) noexcept +{ + using __mask_storage_t = typename __mask_operations<_Bytes, _Abi>::_MaskStorage; + return __mask_operations<_Bytes, _Abi>::__count(static_cast<__mask_storage_t>(__k)); +} + +template <::cuda::std::size_t _Bytes, typename _Abi> +[[nodiscard]] _CCCL_API constexpr __simd_size_type reduce_min_index(const basic_mask<_Bytes, _Abi>& __k) noexcept +{ + _CCCL_ASSERT(any_of(__k), "No bits are set"); + using __mask_storage_t = typename __mask_operations<_Bytes, _Abi>::_MaskStorage; + return __mask_operations<_Bytes, 
_Abi>::__min_index(static_cast<__mask_storage_t>(__k)); +} + +template <::cuda::std::size_t _Bytes, typename _Abi> +[[nodiscard]] _CCCL_API constexpr __simd_size_type reduce_max_index(const basic_mask<_Bytes, _Abi>& __k) noexcept +{ + _CCCL_ASSERT(any_of(__k), "No bits are set"); + using __mask_storage_t = typename __mask_operations<_Bytes, _Abi>::_MaskStorage; + return __mask_operations<_Bytes, _Abi>::__max_index(static_cast<__mask_storage_t>(__k)); +} + +// Scalar bool overloads + +_CCCL_TEMPLATE(typename _Tp) +_CCCL_REQUIRES(::cuda::std::same_as<_Tp, bool>) +[[nodiscard]] _CCCL_API constexpr bool all_of(_Tp __x) noexcept +{ + return __x; +} + +_CCCL_TEMPLATE(typename _Tp) +_CCCL_REQUIRES(::cuda::std::same_as<_Tp, bool>) +[[nodiscard]] _CCCL_API constexpr bool any_of(_Tp __x) noexcept +{ + return __x; +} + +_CCCL_TEMPLATE(typename _Tp) +_CCCL_REQUIRES(::cuda::std::same_as<_Tp, bool>) +[[nodiscard]] _CCCL_API constexpr bool none_of(_Tp __x) noexcept +{ + return !__x; +} + +_CCCL_TEMPLATE(typename _Tp) +_CCCL_REQUIRES(::cuda::std::same_as<_Tp, bool>) +[[nodiscard]] _CCCL_API constexpr __simd_size_type reduce_count(_Tp __x) noexcept +{ + return __x; +} + +_CCCL_TEMPLATE(typename _Tp) +_CCCL_REQUIRES(::cuda::std::same_as<_Tp, bool>) +[[nodiscard]] _CCCL_API constexpr __simd_size_type reduce_min_index(_Tp __x) noexcept +{ + _CCCL_ASSERT(__x, "No bits are set"); + return 0; +} + +_CCCL_TEMPLATE(typename _Tp) +_CCCL_REQUIRES(::cuda::std::same_as<_Tp, bool>) + +[[nodiscard]] _CCCL_API constexpr __simd_size_type reduce_max_index(_Tp __x) noexcept +{ + _CCCL_ASSERT(__x, "No bits are set"); + return 0; +} +} // namespace cuda::experimental::simd #include -#endif // _CUDAX___SIMD_SIMD_MASK_H +#endif // _CUDAX___SIMD_BASIC_MASK_H diff --git a/cudax/include/cuda/experimental/__simd/basic_vec.h b/cudax/include/cuda/experimental/__simd/basic_vec.h index 8ba1881b42b..187c0bb37be 100644 --- a/cudax/include/cuda/experimental/__simd/basic_vec.h +++ 
b/cudax/include/cuda/experimental/__simd/basic_vec.h @@ -4,7 +4,7 @@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// @@ -27,18 +27,18 @@ #include #include +#include #include #include #include #include #include -#include #include #include #include -namespace cuda::experimental::datapar +namespace cuda::experimental::simd { // P1928R15: basic_vec is the primary SIMD vector type (renamed from basic_simd) template @@ -450,7 +450,7 @@ class basic_vec : public __simd_operations<_Tp, _Abi> // template // basic_vec(basic_mask) -> ...; -} // namespace cuda::experimental::datapar +} // namespace cuda::experimental::simd #include diff --git a/cudax/include/cuda/experimental/__simd/concepts.h b/cudax/include/cuda/experimental/__simd/concepts.h index 101785ec017..b0b3fca2c06 100644 --- a/cudax/include/cuda/experimental/__simd/concepts.h +++ b/cudax/include/cuda/experimental/__simd/concepts.h @@ -31,7 +31,7 @@ #include -namespace cuda::experimental::datapar +namespace cuda::experimental::simd { template bool constexpr __is_value_preserving_broadcast_impl() @@ -203,7 +203,7 @@ _CCCL_CONCEPT __has_greater = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t > __t)); template _CCCL_CONCEPT __has_less = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t < __t)); -} // namespace cuda::experimental::datapar +} // namespace cuda::experimental::simd #include diff --git a/cudax/include/cuda/experimental/__simd/declaration.h b/cudax/include/cuda/experimental/__simd/declaration.h index fc6da9fbd5f..36699d13cd5 100644 --- a/cudax/include/cuda/experimental/__simd/declaration.h +++ b/cudax/include/cuda/experimental/__simd/declaration.h @@ -4,7 +4,7 
@@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// @@ -22,76 +22,38 @@ #endif // no system header #include -#include -#include - -namespace cuda::experimental::datapar -{ -// exposition-only helpers -using __simd_size_type = ::cuda::std::ptrdiff_t; +#include +#include -template <::cuda::std::size_t _Bytes> -using __integer_from_bytes = ::cuda::std::__make_nbit_int_t<_Bytes * 8, false>; +#include -namespace simd_abi +namespace cuda::experimental::simd { -struct __scalar; - -using scalar = __scalar; - -template -struct __fixed_size; - -template -using fixed_size = __fixed_size<_Np>; - -template -using compatible = fixed_size<1>; - -template -using native = fixed_size<1>; - -template -using deduce = fixed_size<_Np>; -} // namespace simd_abi - -// exposition-only helpers -template -inline constexpr __simd_size_type __simd_size_v = 0; - -template -inline constexpr __simd_size_type __simd_size_v<_Tp, simd_abi::fixed_size<_Np>> = _Np; - -template -inline constexpr __simd_size_type __simd_size_v<_Tp, simd_abi::native<_Tp>> = 1; - -template -struct __simd_storage; - -template -struct __simd_operations; - -template <::cuda::std::size_t _Bytes, typename _Abi> -struct __mask_storage; - +// template +// struct __simd_storage; +// +// template +// struct __simd_operations; +// +// template <::cuda::std::size_t _Bytes, typename _Abi> +// struct __mask_storage; +// template <::cuda::std::size_t _Bytes, typename _Abi> struct __mask_operations; -// P1928R15: basic_vec is the primary SIMD vector type template > class basic_vec; -// P1928R15: basic_mask is the primary SIMD mask type with Bytes as first template parameter 
-template <::cuda::std::size_t _Bytes, typename _Abi = simd_abi::native<__integer_from_bytes<_Bytes>>> +template <::cuda::std::size_t _Bytes, typename _Abi = simd_abi::native<__integer_from<_Bytes>>> class basic_mask; template >> -using vec = basic_vec<_Tp, simd_abi::deduce<_Tp, _Np>>; +using vec = basic_vec<_Tp, simd_abi::__deduce_abi_t<_Tp, _Np>>; template >> -using mask = basic_mask>; -} // namespace cuda::experimental::datapar +using mask = basic_mask>; +} // namespace cuda::experimental::simd #include diff --git a/cudax/include/cuda/experimental/__simd/exposition.h b/cudax/include/cuda/experimental/__simd/exposition.h new file mode 100644 index 00000000000..10e2ec5ac5f --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/exposition.h @@ -0,0 +1,56 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_EXPOSITION_H +#define _CUDAX___SIMD_EXPOSITION_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace cuda::experimental::simd +{ +// [simd.expos], exposition-only helpers + +template <::cuda::std::size_t _Bytes> +using __integer_from = ::cuda::std::__make_nbit_int_t<_Bytes * 8, true>; + +template +constexpr bool __is_vectorizable_v = + ::cuda::std::is_arithmetic_v<_Tp> && !::cuda::std::is_const_v<_Tp> && !::cuda::std::is_volatile_v<_Tp> + && !::cuda::std::is_same_v<_Tp, bool>; + +template +constexpr __simd_size_type __simd_size_v = 0; + +template +constexpr __simd_size_type __simd_size_v<_Tp, simd_abi::fixed_size<_Np>> = _Np; +} // namespace cuda::experimental::simd + +#include + +#endif // _CUDAX___SIMD_EXPOSITION_H diff --git a/cudax/include/cuda/experimental/__simd/fixed_size_impl.h b/cudax/include/cuda/experimental/__simd/fixed_size_impl.h index 1b6a7f194b9..27b57531d81 100644 --- a/cudax/include/cuda/experimental/__simd/fixed_size_impl.h +++ b/cudax/include/cuda/experimental/__simd/fixed_size_impl.h @@ -32,7 +32,7 @@ #include -namespace cuda::experimental::datapar +namespace cuda::experimental::simd { namespace simd_abi { @@ -374,7 +374,7 @@ struct __mask_operations<_Bytes, simd_abi::__fixed_size<_Np>> #undef _CUDAX_SIMD_FIXED_SIZE_BITWISE_OP #undef _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP #undef _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP -} // namespace cuda::experimental::datapar +} // namespace cuda::experimental::simd #include diff --git a/cudax/include/cuda/experimental/__simd/reference.h 
b/cudax/include/cuda/experimental/__simd/reference.h index 95050b01163..6d1f2646fe7 100644 --- a/cudax/include/cuda/experimental/__simd/reference.h +++ b/cudax/include/cuda/experimental/__simd/reference.h @@ -30,7 +30,7 @@ #include -namespace cuda::experimental::datapar +namespace cuda::experimental::simd { template class __simd_reference @@ -209,7 +209,7 @@ _CCCL_API void swap(__simd_reference<_Storage, _Vp>&& __a, _Vp& __b) noexcept ::cuda::std::move(__a) = ::cuda::std::move(__b); __b = ::cuda::std::move(__tmp); } -} // namespace cuda::experimental::datapar +} // namespace cuda::experimental::simd #include diff --git a/cudax/include/cuda/experimental/__simd/scalar_impl.h b/cudax/include/cuda/experimental/__simd/scalar_impl.h index 01863f6fb14..bbcfbadf317 100644 --- a/cudax/include/cuda/experimental/__simd/scalar_impl.h +++ b/cudax/include/cuda/experimental/__simd/scalar_impl.h @@ -27,10 +27,11 @@ #include #include +#include #include -namespace cuda::experimental::datapar +namespace cuda::experimental::simd { namespace simd_abi { @@ -38,8 +39,13 @@ struct __scalar { static constexpr ::cuda::std::size_t __simd_size = 1; }; + +using scalar = __scalar; } // namespace simd_abi +template <> +inline constexpr bool is_abi_tag_v = true; + template struct __simd_storage<_Tp, simd_abi::__scalar> { @@ -307,7 +313,7 @@ struct __mask_operations<_Bytes, simd_abi::__scalar> return static_cast(__s.__data) ? 
1 : 0; } }; -} // namespace cuda::experimental::datapar +} // namespace cuda::experimental::simd #include diff --git a/cudax/include/cuda/experimental/__simd/traits.h b/cudax/include/cuda/experimental/__simd/traits.h index cdfae75ab5b..2576d60dd48 100644 --- a/cudax/include/cuda/experimental/__simd/traits.h +++ b/cudax/include/cuda/experimental/__simd/traits.h @@ -26,11 +26,10 @@ #include #include -#include #include -namespace cuda::experimental::datapar +namespace cuda::experimental::simd { struct element_aligned_tag { @@ -75,9 +74,6 @@ template struct is_abi_tag : ::cuda::std::bool_constant> {}; -template <> -inline constexpr bool is_abi_tag_v = true; - template inline constexpr bool is_abi_tag_v> = true; @@ -198,7 +194,7 @@ struct mask_element_size> : ::cuda::std::integral_const template inline constexpr ::cuda::std::size_t mask_element_size_v = mask_element_size<_Tp>::value; -} // namespace cuda::experimental::datapar +} // namespace cuda::experimental::simd #include diff --git a/cudax/include/cuda/experimental/__simd/type_traits.h b/cudax/include/cuda/experimental/__simd/type_traits.h new file mode 100644 index 00000000000..3b1765c2697 --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/type_traits.h @@ -0,0 +1,95 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_TYPE_TRAITS_H +#define _CUDAX___SIMD_TYPE_TRAITS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#include +#include +#include + +#include + +namespace cuda::experimental::simd +{ +// [simd.traits], alignment +template +struct alignment; + +template +struct alignment, _Up> + : ::cuda::std::integral_constant<::cuda::std::size_t, alignof(_Up) * __simd_size_v<_Tp, _Abi>> +{ + static_assert(__is_vectorizable_v<_Up>, "U must be a vectorizable type"); +}; + +template +constexpr ::cuda::std::size_t alignment_v = alignment<_Tp, _Up>::value; + +// [simd.traits], rebind +template +struct rebind; + +template +struct rebind<_Tp, basic_vec<_Up, _Abi>> +{ + static_assert(__is_vectorizable_v<_Tp>, "T must be a vectorizable type"); + using type = basic_vec<_Tp, simd_abi::__deduce_abi_t<_Tp, __simd_size_v<_Up, _Abi>>>; +}; + +template +struct rebind<_Tp, basic_mask<_Bytes, _Abi>> +{ + static_assert(__is_vectorizable_v<_Tp>, "T must be a vectorizable type"); + using __integer_t = __integer_from; + using __integer_bytes_t = __integer_from<_Bytes>; + + using type = basic_mask>>; +}; + +template +using rebind_t = typename rebind<_Tp, _Vp>::type; + +// [simd.traits], resize +template <__simd_size_type _Np, typename _Vp> +struct resize; + +template <__simd_size_type _Np, typename _Tp, typename _Abi> +struct resize<_Np, basic_vec<_Tp, _Abi>> +{ + using type = basic_vec<_Tp, simd_abi::__deduce_abi_t<_Tp, _Np>>; +}; + +template <__simd_size_type _Np, ::cuda::std::size_t _Bytes, typename _Abi> +struct resize<_Np, basic_mask<_Bytes, _Abi>> +{ + using type = basic_mask<_Bytes, simd_abi::__deduce_abi_t<__integer_from<_Bytes>, _Np>>; +}; + 
+template <__simd_size_type _Np, typename _Vp> +using resize_t = typename resize<_Np, _Vp>::type; +} // namespace cuda::experimental::simd + +#include + +#endif // _CUDAX___SIMD_TYPE_TRAITS_H diff --git a/cudax/include/cuda/experimental/__simd/utility.h b/cudax/include/cuda/experimental/__simd/utility.h index 67a0940c1d5..37d20517218 100644 --- a/cudax/include/cuda/experimental/__simd/utility.h +++ b/cudax/include/cuda/experimental/__simd/utility.h @@ -23,24 +23,32 @@ #include #include -#include -#include #include #include #include -#include #include #include #include +#include + #include -namespace cuda::experimental::datapar +namespace cuda::experimental::simd { +template <::cuda::std::size_t _Bytes> +constexpr bool __has_integer_from_v = + (_Bytes == 1 || _Bytes == 2 || _Bytes == 4 || _Bytes == 8 +#if _CCCL_HAS_INT128() + || _Bytes == 16 +#endif // _CCCL_HAS_INT128() + ); + template -inline constexpr bool __is_vectorizable_v = - ::cuda::std::is_arithmetic_v<_Tp> && !::cuda::std::is_const_v<_Tp> && !::cuda::std::is_volatile_v<_Tp> - && !::cuda::std::is_same_v<_Tp, bool>; +constexpr bool __is_abi_tag_v = false; + +template +constexpr bool __is_abi_tag_v> = true; template inline constexpr bool __is_non_narrowing_convertible_v = false; @@ -50,35 +58,33 @@ inline constexpr bool __is_non_narrowing_convertible_v<_From, _To, ::cuda::std::void_t()})>> = true; -template -inline constexpr bool __can_broadcast_v = - (__is_vectorizable_v<_Up> && __is_non_narrowing_convertible_v<_Up, _Tp>) - || (!__is_vectorizable_v<_Up> && ::cuda::std::is_convertible_v<_Up, _Tp>) || ::cuda::std::is_same_v<_Up, int> - || (::cuda::std::is_same_v<_Up, unsigned int> && ::cuda::std::is_unsigned_v<_Tp>); +// template +// inline constexpr bool __can_broadcast_v = +// (__is_vectorizable_v<_Up> && __is_non_narrowing_convertible_v<_Up, _Tp>) +// || (!__is_vectorizable_v<_Up> && ::cuda::std::is_convertible_v<_Up, _Tp>) || ::cuda::std::is_same_v<_Up, int> +// || (::cuda::std::is_same_v<_Up, unsigned 
int> && ::cuda::std::is_unsigned_v<_Tp>); template -inline constexpr bool __is_well_formed = false; +constexpr bool __is_well_formed = false; template -inline constexpr bool __is_well_formed<_Tp, - _Generator, - _Idx, - ::cuda::std::void_t()( - ::cuda::std::integral_constant<::cuda::std::size_t, _Idx>()))>> = - __can_broadcast_v< - _Tp, - decltype(::cuda::std::declval<_Generator>()(::cuda::std::integral_constant<::cuda::std::size_t, _Idx>()))>; - -template -_CCCL_HIDE_FROM_ABI constexpr bool __can_generate(::cuda::std::index_sequence<_Idxes...>) +constexpr bool __is_well_formed<_Tp, + _Generator, + _Idx, + ::cuda::std::void_t()( + ::cuda::std::integral_constant<__simd_size_type, _Idx>()))>> = ::cuda::std:: + is_same_v<_Tp, decltype(::cuda::std::declval<_Generator>()(::cuda::std::integral_constant<__simd_size_type, _Idx>()))>; + +template +_CCCL_API constexpr bool __can_generate(::cuda::std::integer_sequence<__simd_size_type, _Indices...>) { - return (true && ... && __is_well_formed<_Tp, _Generator, _Idxes>); + return (true && ... 
&& __is_well_formed<_Tp, _Generator, _Indices>); } template -inline constexpr bool __can_generate_v = - ::cuda::experimental::datapar::__can_generate<_Tp, _Generator>(::cuda::std::make_index_sequence<_Size>()); -} // namespace cuda::experimental::datapar +constexpr bool __can_generate_v = ::cuda::experimental::simd::__can_generate<_Tp, _Generator>( + ::cuda::std::make_integer_sequence<__simd_size_type, _Size>()); +} // namespace cuda::experimental::simd #include diff --git a/cudax/include/cuda/experimental/simd.cuh b/cudax/include/cuda/experimental/simd.cuh index b14f852610e..aecb5a96ae1 100644 --- a/cudax/include/cuda/experimental/simd.cuh +++ b/cudax/include/cuda/experimental/simd.cuh @@ -11,7 +11,7 @@ #ifndef __CUDAX_SIMD___ #define __CUDAX_SIMD___ -#include -#include +#include +#include #endif // __CUDAX_SIMD___ diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt index 9ae7be85039..c4eaeefbe77 100644 --- a/cudax/test/CMakeLists.txt +++ b/cudax/test/CMakeLists.txt @@ -119,6 +119,10 @@ cudax_add_catch2_test(test_target hierarchy_groups hierarchy/group.cu ) +cudax_add_catch2_test(test_target simd + simd/simd.cu +) + if (cudax_ENABLE_CUFILE) cudax_add_catch2_test(test_target cufile.driver_attributes cufile/driver_attributes.cu diff --git a/cudax/test/simd/simd.cu b/cudax/test/simd/simd.cu index 7d4d5573211..7314a14184b 100644 --- a/cudax/test/simd/simd.cu +++ b/cudax/test/simd/simd.cu @@ -17,7 +17,7 @@ #include -namespace dp = cuda::experimental::datapar; +namespace dp = cuda::experimental::simd; namespace { @@ -65,9 +65,9 @@ using simd_array_t = ::cuda::std::array C2H_CCCLRT_TEST("simd.traits", "[simd][traits]") { using abi_t = dp::simd_abi::fixed_size<4>; - using simd_t = dp::simd; - using mask_t = dp::simd_mask; - using other_t = dp::simd; + using simd_t = dp::vec; + using mask_t = dp::mask; + using other_t = dp::vec; using rebind_t = dp::rebind_simd_t; STATIC_REQUIRE(dp::is_abi_tag_v); @@ -76,10 +76,10 @@ C2H_CCCLRT_TEST("simd.traits", 
"[simd][traits]") STATIC_REQUIRE(dp::__simd_size_v == 4); STATIC_REQUIRE(dp::__simd_size_v == 0); - STATIC_REQUIRE(dp::is_simd_v); - STATIC_REQUIRE(!dp::is_simd_v); - STATIC_REQUIRE(dp::is_simd_mask_v); - STATIC_REQUIRE(!dp::is_simd_mask_v); + STATIC_REQUIRE(dp::is_vec_v); + STATIC_REQUIRE(!dp::is_vec_v); + STATIC_REQUIRE(dp::is_mask_v); + STATIC_REQUIRE(!dp::is_mask_v); STATIC_REQUIRE(dp::is_simd_flag_type_v); STATIC_REQUIRE(dp::is_simd_flag_type_v); @@ -99,7 +99,7 @@ C2H_CCCLRT_TEST("simd.traits", "[simd][traits]") C2H_CCCLRT_TEST("simd.construction_and_memory", "[simd][construction]") { constexpr auto size = 4; - using simd_t = dp::simd; + using simd_t = dp::vec; using mask_t = simd_t::mask_type; using array_t = simd_array_t; @@ -120,13 +120,13 @@ C2H_CCCLRT_TEST("simd.construction_and_memory", "[simd][construction]") loaded.copy_from(roundtrip, dp::overaligned<64>); expect_equal(loaded, array_t{0, 2, 4, 6}); - dp::simd widened(generated); + dp::vec widened(generated); expect_equal(widened, ::cuda::std::array{0.0f, 2.0f, 4.0f, 6.0f}); - mask_t from_simd = static_cast(generated); + mask_t from_simd = (generated != simd_t(0)); expect_equal(from_simd, ::cuda::std::array{false, true, true, true}); - dp::simd assigned = simd_t(linear_index_gen{}); + dp::vec assigned = simd_t(linear_index_gen{}); assigned = generated; expect_equal(assigned, array_t{0, 2, 4, 6}); @@ -141,7 +141,7 @@ C2H_CCCLRT_TEST("simd.construction_and_memory", "[simd][construction]") C2H_CCCLRT_TEST("simd.arithmetic_and_comparisons", "[simd][arithmetic]") { - using simd_t = dp::simd; + using simd_t = dp::vec; using mask_t = simd_t::mask_type; using array_t = simd_array_t; @@ -216,22 +216,22 @@ C2H_CCCLRT_TEST("simd.arithmetic_and_comparisons", "[simd][arithmetic]") expect_equal(shift_compound, array_t{1, 1, 1, 1}); mask_t eq_mask = (lhs == lhs); - CUDAX_REQUIRE(eq_mask.all()); + CUDAX_REQUIRE(dp::all_of(eq_mask)); mask_t lt_mask = (lhs < 2); - CUDAX_REQUIRE(lt_mask.count() == 2); + 
CUDAX_REQUIRE(dp::reduce_count(lt_mask) == 2); mask_t scalar_first_lt = (2 <= lhs); - CUDAX_REQUIRE(scalar_first_lt.count() == 2); + CUDAX_REQUIRE(dp::reduce_count(scalar_first_lt) == 2); mask_t scalar_eq_rhs = (lhs == 1); - CUDAX_REQUIRE(scalar_eq_rhs.count() == 1); + CUDAX_REQUIRE(dp::reduce_count(scalar_eq_rhs) == 1); mask_t scalar_eq_lhs = (1 == lhs); - CUDAX_REQUIRE(scalar_eq_lhs.count() == 1); + CUDAX_REQUIRE(dp::reduce_count(scalar_eq_lhs) == 1); mask_t ge_mask = (lhs >= 1); - CUDAX_REQUIRE(ge_mask.any()); - CUDAX_REQUIRE(!ge_mask.none()); + CUDAX_REQUIRE(dp::any_of(ge_mask)); + CUDAX_REQUIRE(!dp::none_of(ge_mask)); auto negated = -lhs; expect_equal(negated, array_t{0, -1, -2, -3}); @@ -242,69 +242,91 @@ C2H_CCCLRT_TEST("simd.arithmetic_and_comparisons", "[simd][arithmetic]") C2H_CCCLRT_TEST("simd.mask", "[simd][mask]") { - using mask_t = dp::simd_mask; - using simd_t = dp::simd; + using mask_t = dp::mask; + using simd_t = dp::vec; using mask_array_t = ::cuda::std::array; using simd_array_typed = simd_array_t; mask_t alternating(alternating_mask_gen{}); expect_equal(alternating, mask_array_t{true, false, true, false}); - CUDAX_REQUIRE(alternating.count() == 2); - CUDAX_REQUIRE(alternating.any()); - CUDAX_REQUIRE(!alternating.all()); - CUDAX_REQUIRE(!alternating.none()); + CUDAX_REQUIRE(dp::reduce_count(alternating) == 2); + CUDAX_REQUIRE(dp::any_of(alternating)); + CUDAX_REQUIRE(!dp::all_of(alternating)); + CUDAX_REQUIRE(!dp::none_of(alternating)); mask_t inverted = !alternating; expect_equal(inverted, mask_array_t{false, true, false, true}); mask_t zero = alternating & inverted; - CUDAX_REQUIRE(zero.none()); + CUDAX_REQUIRE(dp::none_of(zero)); mask_t combined = alternating | inverted; - CUDAX_REQUIRE(combined.all()); - - bool buffer[mask_t::size()] = {}; - alternating.copy_to(buffer); - mask_t loaded(buffer); - CUDAX_REQUIRE(loaded == alternating); - - mask_t copied(false); - copied.copy_from(buffer); - CUDAX_REQUIRE(copied == alternating); - - alignas(64) 
bool aligned_buffer[mask_t::size()] = {true, true, false, false}; - mask_t from_aligned(false); - from_aligned.copy_from(aligned_buffer, dp::overaligned<64>); - alignas(64) bool aligned_roundtrip[mask_t::size()] = {}; - from_aligned.copy_to(aligned_roundtrip, dp::overaligned<64>); - mask_t roundtrip_check(aligned_roundtrip); - CUDAX_REQUIRE(roundtrip_check == from_aligned); + CUDAX_REQUIRE(dp::all_of(combined)); auto vec_from_mask = static_cast(alternating); expect_equal(vec_from_mask, simd_array_typed{1, 0, 1, 0}); - mask_t mutated = alternating; - mutated[1] = true; - mutated[3] = true; - CUDAX_REQUIRE(mutated.all()); - mask_t xor_mask = alternating ^ inverted; - CUDAX_REQUIRE(xor_mask.all()); + CUDAX_REQUIRE(dp::all_of(xor_mask)); mask_t assigned = alternating; assigned ^= inverted; - CUDAX_REQUIRE(assigned.all()); + CUDAX_REQUIRE(dp::all_of(assigned)); assigned &= combined; - CUDAX_REQUIRE(assigned.all()); + CUDAX_REQUIRE(dp::all_of(assigned)); + + mask_t or_test(false); + or_test |= alternating; + CUDAX_REQUIRE(dp::all_of(or_test == alternating)); mask_t broadcast_true(true); - CUDAX_REQUIRE(broadcast_true.all()); + CUDAX_REQUIRE(dp::all_of(broadcast_true)); + + mask_t broadcast_false(false); + CUDAX_REQUIRE(dp::none_of(broadcast_false)); + + // Element-wise comparison operators + mask_t eq_result = (alternating == alternating); + CUDAX_REQUIRE(dp::all_of(eq_result)); + + mask_t ne_result = (alternating != inverted); + CUDAX_REQUIRE(dp::all_of(ne_result)); + + mask_t a(true); + mask_t b(false); + CUDAX_REQUIRE(dp::all_of(a >= b)); + CUDAX_REQUIRE(dp::all_of(b <= a)); + CUDAX_REQUIRE(dp::all_of(a > b)); + CUDAX_REQUIRE(dp::all_of(b < a)); + CUDAX_REQUIRE(dp::all_of(a >= a)); + CUDAX_REQUIRE(dp::all_of(a <= a)); + CUDAX_REQUIRE(dp::none_of(a > a)); + CUDAX_REQUIRE(dp::none_of(a < a)); + + // Unsigned integer constructor + mask_t from_bits(static_cast(0b1010)); + expect_equal(from_bits, mask_array_t{false, true, false, true}); + + // Unary operators returning 
basic_vec + mask_t pos_input(true); + auto plus_result = +pos_input; + CUDAX_REQUIRE(dp::all_of(plus_result == simd_t(1))); + + auto minus_result = -pos_input; + CUDAX_REQUIRE(dp::all_of(minus_result == simd_t(-1))); + + // Logical operators + mask_t logical_and = alternating && inverted; + CUDAX_REQUIRE(dp::none_of(logical_and)); + + mask_t logical_or = alternating || inverted; + CUDAX_REQUIRE(dp::all_of(logical_or)); } C2H_CCCLRT_TEST("simd.reference", "[simd][reference]") { - using simd_t = dp::simd; + using simd_t = dp::vec; using array_t = simd_array_t; simd_t values(linear_index_gen{}); From 5e019b64de07a121dd6376cc12dfe943c56ad47b Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 25 Mar 2026 16:54:42 -0700 Subject: [PATCH 27/32] implemented fixed_size_simple_mask --- cudax/include/cuda/experimental/__simd/abi.h | 8 +- .../cuda/experimental/__simd/basic_mask.h | 1 + .../cuda/experimental/__simd/declaration.h | 20 +- .../cuda/experimental/__simd/exposition.h | 2 +- .../cuda/experimental/__simd/scalar_impl.h | 320 ------------------ .../{ => specializations}/fixed_size_impl.h | 155 +-------- .../specializations/fixed_size_simple_mask.h | 232 +++++++++++++ .../__simd/{ => to_remove}/concepts.h | 0 .../__simd/{ => to_remove}/reference.h | 0 .../__simd/{ => to_remove}/traits.h | 4 +- .../cuda/experimental/__simd/utility.h | 28 +- cudax/test/simd/simd.cu | 2 +- 12 files changed, 272 insertions(+), 500 deletions(-) delete mode 100644 cudax/include/cuda/experimental/__simd/scalar_impl.h rename cudax/include/cuda/experimental/__simd/{ => specializations}/fixed_size_impl.h (64%) create mode 100644 cudax/include/cuda/experimental/__simd/specializations/fixed_size_simple_mask.h rename cudax/include/cuda/experimental/__simd/{ => to_remove}/concepts.h (100%) rename cudax/include/cuda/experimental/__simd/{ => to_remove}/reference.h (100%) rename cudax/include/cuda/experimental/__simd/{ => to_remove}/traits.h (96%) diff --git a/cudax/include/cuda/experimental/__simd/abi.h 
b/cudax/include/cuda/experimental/__simd/abi.h index bc669952ae8..c2af5a2c180 100644 --- a/cudax/include/cuda/experimental/__simd/abi.h +++ b/cudax/include/cuda/experimental/__simd/abi.h @@ -33,16 +33,16 @@ using __simd_size_type = ::cuda::std::ptrdiff_t; namespace simd_abi { template <__simd_size_type _Np> -struct __fixed_size; +struct __fixed_size_simple; template <__simd_size_type _Np> -using fixed_size = __fixed_size<_Np>; +using fixed_size_simple = __fixed_size_simple<_Np>; // implementation-defined ABI template -using native = fixed_size<1>; +using native = fixed_size_simple<1>; // implementation-defined ABI template -using __deduce_abi_t = fixed_size<_Np>; +using __deduce_abi_t = fixed_size_simple<_Np>; } // namespace simd_abi } // namespace cuda::experimental::simd diff --git a/cudax/include/cuda/experimental/__simd/basic_mask.h b/cudax/include/cuda/experimental/__simd/basic_mask.h index 0294b25cbf0..98caeb8181c 100644 --- a/cudax/include/cuda/experimental/__simd/basic_mask.h +++ b/cudax/include/cuda/experimental/__simd/basic_mask.h @@ -34,6 +34,7 @@ #include #include +#include #include #include diff --git a/cudax/include/cuda/experimental/__simd/declaration.h b/cudax/include/cuda/experimental/__simd/declaration.h index 36699d13cd5..70ec87d2e5d 100644 --- a/cudax/include/cuda/experimental/__simd/declaration.h +++ b/cudax/include/cuda/experimental/__simd/declaration.h @@ -30,17 +30,7 @@ namespace cuda::experimental::simd { -// template -// struct __simd_storage; -// -// template -// struct __simd_operations; -// -// template <::cuda::std::size_t _Bytes, typename _Abi> -// struct __mask_storage; -// -template <::cuda::std::size_t _Bytes, typename _Abi> -struct __mask_operations; + template > class basic_vec; @@ -53,6 +43,14 @@ using vec = basic_vec<_Tp, simd_abi::__deduce_abi_t<_Tp, _Np>>; template >> using mask = basic_mask>; + +// specializations + +template <::cuda::std::size_t _Bytes, typename _Abi> +struct __mask_storage; + +template <::cuda::std::size_t 
_Bytes, typename _Abi> +struct __mask_operations; } // namespace cuda::experimental::simd #include diff --git a/cudax/include/cuda/experimental/__simd/exposition.h b/cudax/include/cuda/experimental/__simd/exposition.h index 10e2ec5ac5f..816b4d54f4c 100644 --- a/cudax/include/cuda/experimental/__simd/exposition.h +++ b/cudax/include/cuda/experimental/__simd/exposition.h @@ -48,7 +48,7 @@ template constexpr __simd_size_type __simd_size_v = 0; template -constexpr __simd_size_type __simd_size_v<_Tp, simd_abi::fixed_size<_Np>> = _Np; +constexpr __simd_size_type __simd_size_v<_Tp, simd_abi::fixed_size_simple<_Np>> = _Np; } // namespace cuda::experimental::simd #include diff --git a/cudax/include/cuda/experimental/__simd/scalar_impl.h b/cudax/include/cuda/experimental/__simd/scalar_impl.h deleted file mode 100644 index bbcfbadf317..00000000000 --- a/cudax/include/cuda/experimental/__simd/scalar_impl.h +++ /dev/null @@ -1,320 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of CUDA Experimental in CUDA C++ Core Libraries, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
-// -//===----------------------------------------------------------------------===// - -#ifndef _CUDAX___SIMD_SCALAR_IMPL_H -#define _CUDAX___SIMD_SCALAR_IMPL_H - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#include -#include -#include -#include - -#include -#include - -#include - -namespace cuda::experimental::simd -{ -namespace simd_abi -{ -struct __scalar -{ - static constexpr ::cuda::std::size_t __simd_size = 1; -}; - -using scalar = __scalar; -} // namespace simd_abi - -template <> -inline constexpr bool is_abi_tag_v = true; - -template -struct __simd_storage<_Tp, simd_abi::__scalar> -{ - using value_type = _Tp; - _Tp __data; - - [[nodiscard]] _CCCL_API constexpr _Tp __get([[maybe_unused]] ::cuda::std::size_t __idx) const noexcept - { - _CCCL_ASSERT(__idx == 0, "Index is out of bounds"); - return __data; - } - - _CCCL_API constexpr void __set([[maybe_unused]] ::cuda::std::size_t __idx, _Tp __v) noexcept - { - _CCCL_ASSERT(__idx == 0, "Index is out of bounds"); - __data = __v; - } -}; - -// P1928R15: Mask storage is now indexed by Bytes (element size) rather than type -template <::cuda::std::size_t _Bytes> -struct __mask_storage<_Bytes, simd_abi::__scalar> : __simd_storage -{ - using value_type = bool; - static constexpr ::cuda::std::size_t __element_bytes = _Bytes; -}; - -// ********************************************************************************************************************* -// * SIMD Arithmetic Operations -// ********************************************************************************************************************* - -template -struct __simd_operations<_Tp, simd_abi::__scalar> -{ - using _SimdStorage = __simd_storage<_Tp, simd_abi::__scalar>; - using _MaskStorage = __mask_storage; - - [[nodiscard]] 
_CCCL_API static constexpr _SimdStorage __broadcast(_Tp __v) noexcept - { - return _SimdStorage{__v}; - } - - template - [[nodiscard]] _CCCL_API static constexpr _SimdStorage __generate(_Generator&& __g) - { - return _SimdStorage{__g(::cuda::std::integral_constant<::cuda::std::size_t, 0>())}; - } - - template - _CCCL_API static constexpr void __load(_SimdStorage& __s, const _Up* __mem) noexcept - { - __s.__data = static_cast<_Tp>(__mem[0]); - } - - template - _CCCL_API static constexpr void __store(const _SimdStorage& __s, _Up* __mem) noexcept - { - __mem[0] = static_cast<_Up>(__s.__data); - } - - _CCCL_API static constexpr void __increment(_SimdStorage& __s) noexcept - { - __s.__data += 1; - } - - _CCCL_API static constexpr void __decrement(_SimdStorage& __s) noexcept - { - __s.__data -= 1; - } - - [[nodiscard]] _CCCL_API static constexpr _MaskStorage __negate(const _SimdStorage& __s) noexcept - { - return _MaskStorage{!__s.__data}; - } - - [[nodiscard]] _CCCL_API static constexpr _SimdStorage __bitwise_not(const _SimdStorage& __s) noexcept - { - return _SimdStorage{static_cast<_Tp>(~__s.__data)}; - } - - [[nodiscard]] _CCCL_API static constexpr _SimdStorage __unary_minus(const _SimdStorage& __s) noexcept - { - return _SimdStorage{static_cast<_Tp>(-__s.__data)}; - } - - [[nodiscard]] _CCCL_API static constexpr _SimdStorage - __plus(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept - { - return _SimdStorage{static_cast<_Tp>(__lhs.__data + __rhs.__data)}; - } - - [[nodiscard]] _CCCL_API static constexpr _SimdStorage - __minus(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept - { - return _SimdStorage{static_cast<_Tp>(__lhs.__data - __rhs.__data)}; - } - - [[nodiscard]] _CCCL_API static constexpr _SimdStorage - __multiplies(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept - { - return _SimdStorage{static_cast<_Tp>(__lhs.__data * __rhs.__data)}; - } - - [[nodiscard]] _CCCL_API static constexpr _SimdStorage - __divides(const 
_SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept - { - return _SimdStorage{static_cast<_Tp>(__lhs.__data / __rhs.__data)}; - } - - [[nodiscard]] _CCCL_API static constexpr _MaskStorage - __equal_to(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept - { - return _MaskStorage{__lhs.__data == __rhs.__data}; - } - - [[nodiscard]] _CCCL_API static constexpr _MaskStorage - __not_equal_to(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept - { - return _MaskStorage{__lhs.__data != __rhs.__data}; - } - - [[nodiscard]] _CCCL_API static constexpr _MaskStorage - __less(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept - { - return _MaskStorage{__lhs.__data < __rhs.__data}; - } - - [[nodiscard]] _CCCL_API static constexpr _MaskStorage - __less_equal(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept - { - return _MaskStorage{__lhs.__data <= __rhs.__data}; - } - - [[nodiscard]] _CCCL_API static constexpr _MaskStorage - __greater(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept - { - return _MaskStorage{__lhs.__data > __rhs.__data}; - } - - [[nodiscard]] _CCCL_API static constexpr _MaskStorage - __greater_equal(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept - { - return _MaskStorage{__lhs.__data >= __rhs.__data}; - } - - _CCCL_TEMPLATE(typename _Up = _Tp) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) - [[nodiscard]] _CCCL_API static constexpr _SimdStorage - __modulo(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept - { - return _SimdStorage{static_cast<_Tp>(__lhs.__data % __rhs.__data)}; - } - - _CCCL_TEMPLATE(typename _Up = _Tp) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) - [[nodiscard]] _CCCL_API static constexpr _SimdStorage - __bitwise_and(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept - { - return _SimdStorage{static_cast<_Tp>(__lhs.__data & __rhs.__data)}; - } - - _CCCL_TEMPLATE(typename _Up = _Tp) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) - 
[[nodiscard]] _CCCL_API static constexpr _SimdStorage - __bitwise_or(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept - { - return _SimdStorage{static_cast<_Tp>(__lhs.__data | __rhs.__data)}; - } - - _CCCL_TEMPLATE(typename _Up = _Tp) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) - [[nodiscard]] _CCCL_API static constexpr _SimdStorage - __bitwise_xor(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept - { - return _SimdStorage{static_cast<_Tp>(__lhs.__data ^ __rhs.__data)}; - } - - _CCCL_TEMPLATE(typename _Up = _Tp) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) - [[nodiscard]] _CCCL_API static constexpr _SimdStorage - __shift_left(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept - { - return _SimdStorage{static_cast<_Tp>(__lhs.__data << __rhs.__data)}; - } - - _CCCL_TEMPLATE(typename _Up = _Tp) - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) - [[nodiscard]] _CCCL_API static constexpr _SimdStorage - __shift_right(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept - { - return _SimdStorage{static_cast<_Tp>(__lhs.__data >> __rhs.__data)}; - } -}; - -// ********************************************************************************************************************* -// * SIMD Mask Operations (P1928R15: indexed by Bytes instead of type) -// ********************************************************************************************************************* - -template <::cuda::std::size_t _Bytes> -struct __mask_operations<_Bytes, simd_abi::__scalar> -{ - using _MaskStorage = __mask_storage<_Bytes, simd_abi::__scalar>; - - [[nodiscard]] _CCCL_API static constexpr _MaskStorage __broadcast(bool __v) noexcept - { - return _MaskStorage{__v}; - } - - template - [[nodiscard]] _CCCL_API static constexpr _MaskStorage __generate(_Generator&& __g) - { - return _MaskStorage{static_cast(__g(::cuda::std::integral_constant<::cuda::std::size_t, 0>()))}; - } - - _CCCL_API static constexpr void __load(_MaskStorage& __s, 
const bool* __mem) noexcept - { - __s.__data = __mem[0]; - } - - _CCCL_API static constexpr void __store(const _MaskStorage& __s, bool* __mem) noexcept - { - __mem[0] = static_cast(__s.__data); - } - - [[nodiscard]] _CCCL_API static constexpr _MaskStorage - __bitwise_and(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept - { - return _MaskStorage{__lhs.__data && __rhs.__data}; - } - - [[nodiscard]] _CCCL_API static constexpr _MaskStorage - __bitwise_or(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept - { - return _MaskStorage{__lhs.__data || __rhs.__data}; - } - - [[nodiscard]] _CCCL_API static constexpr _MaskStorage - __bitwise_xor(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept - { - return _MaskStorage{__lhs.__data != __rhs.__data}; - } - - [[nodiscard]] _CCCL_API static constexpr _MaskStorage __bitwise_not(const _MaskStorage& __s) noexcept - { - return _MaskStorage{!__s.__data}; - } - - [[nodiscard]] _CCCL_API static constexpr bool __equal_to(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept - { - return __lhs.__data == __rhs.__data; - } - - [[nodiscard]] _CCCL_API static constexpr bool __all(const _MaskStorage& __s) noexcept - { - return static_cast(__s.__data); - } - - [[nodiscard]] _CCCL_API static constexpr bool __any(const _MaskStorage& __s) noexcept - { - return static_cast(__s.__data); - } - - [[nodiscard]] _CCCL_API static constexpr ::cuda::std::ptrdiff_t __count(const _MaskStorage& __s) noexcept - { - return static_cast(__s.__data) ? 
1 : 0; - } -}; -} // namespace cuda::experimental::simd - -#include - -#endif // _CUDAX___SIMD_SCALAR_IMPL_H diff --git a/cudax/include/cuda/experimental/__simd/fixed_size_impl.h b/cudax/include/cuda/experimental/__simd/specializations/fixed_size_impl.h similarity index 64% rename from cudax/include/cuda/experimental/__simd/fixed_size_impl.h rename to cudax/include/cuda/experimental/__simd/specializations/fixed_size_impl.h index 27b57531d81..1a2f20831a2 100644 --- a/cudax/include/cuda/experimental/__simd/fixed_size_impl.h +++ b/cudax/include/cuda/experimental/__simd/specializations/fixed_size_impl.h @@ -36,15 +36,15 @@ namespace cuda::experimental::simd { namespace simd_abi { -template -struct __fixed_size +template <__simd_size_type _Np> +struct __fixed_size_simple { - static constexpr ::cuda::std::size_t __simd_size = _Np; + static constexpr __simd_size_type __simd_size = _Np; }; } // namespace simd_abi -template -struct __simd_storage<_Tp, simd_abi::__fixed_size<_Np>> +template +struct __simd_storage<_Tp, simd_abi::__fixed_size_simple<_Np>> { using value_type = _Tp; _Tp __data[_Np]; @@ -64,15 +64,6 @@ struct __simd_storage<_Tp, simd_abi::__fixed_size<_Np>> } }; -// P1928R15: Mask storage is now indexed by Bytes (element size) rather than type -// Using a single bit for the mask storage could be not efficient in CUDA -template <::cuda::std::size_t _Bytes, int _Np> -struct __mask_storage<_Bytes, simd_abi::__fixed_size<_Np>> : public __simd_storage> -{ - using value_type = bool; - static constexpr ::cuda::std::size_t __element_bytes = _Bytes; -}; - // Helper macros to generate repeated fixed-size operations. 
#define _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_STORAGE_TYPE, _NAME, _OP) \ [[nodiscard]] _CCCL_API static constexpr _STORAGE_TYPE _NAME( \ @@ -109,11 +100,11 @@ struct __mask_storage<_Bytes, simd_abi::__fixed_size<_Np>> : public __simd_stora // * SIMD Arithmetic Operations // ********************************************************************************************************************* -template -struct __simd_operations<_Tp, simd_abi::__fixed_size<_Np>> +template +struct __simd_operations<_Tp, simd_abi::__fixed_size_simple<_Np>> { - using _SimdStorage = __simd_storage<_Tp, simd_abi::__fixed_size<_Np>>; - using _MaskStorage = __mask_storage>; + using _SimdStorage = __simd_storage<_Tp, simd_abi::__fixed_size_simple<_Np>>; + using _MaskStorage = __mask_storage>; [[nodiscard]] _CCCL_API static constexpr _SimdStorage __broadcast(_Tp __v) noexcept { @@ -243,134 +234,6 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size<_Np>> _CUDAX_SIMD_FIXED_SIZE_BITWISE_OP(_SimdStorage, __shift_right, >>) }; -// ********************************************************************************************************************* -// * SIMD Mask Operations (P1928R15: indexed by Bytes instead of type) -// ********************************************************************************************************************* - -template <::cuda::std::size_t _Bytes, int _Np> -struct __mask_operations<_Bytes, simd_abi::__fixed_size<_Np>> -{ - using _MaskStorage = __mask_storage<_Bytes, simd_abi::__fixed_size<_Np>>; - - [[nodiscard]] _CCCL_API static constexpr _MaskStorage __broadcast(bool __v) noexcept - { - _MaskStorage __result; - _CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; ++__i) - { - __result.__set(__i, __v); - } - return __result; - } - - template - [[nodiscard]] _CCCL_API static constexpr _MaskStorage - __generate_init(_Generator&& __g, ::cuda::std::index_sequence<_Is...>) - { - _MaskStorage __result; - ((__result.__set(_Is, 
static_cast(__g(::cuda::std::integral_constant<::cuda::std::size_t, _Is>())))), ...); - return __result; - } - - template - [[nodiscard]] _CCCL_API static constexpr _MaskStorage __generate(_Generator&& __g) - { - return __generate_init(__g, ::cuda::std::make_index_sequence<_Np>()); - } - - _CCCL_API static constexpr void __load(_MaskStorage& __s, const bool* __mem) noexcept - { - _CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; __i++) - { - __s.__data[__i] = __mem[__i]; - } - } - - _CCCL_API static constexpr void __store(const _MaskStorage& __s, bool* __mem) noexcept - { - _CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; __i++) - { - __mem[__i] = static_cast(__s.__data[__i]); - } - } - - // TODO: optimize with uint32 SWAR - _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_MaskStorage, __bitwise_and, &&) - - _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_MaskStorage, __bitwise_or, ||) - - _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_MaskStorage, __bitwise_xor, !=) - - [[nodiscard]] _CCCL_API static constexpr _MaskStorage __bitwise_not(const _MaskStorage& __s) noexcept - { - _MaskStorage __result; - _CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; __i++) - { - __result.__data[__i] = !__s.__data[__i]; - } - return __result; - } - - [[nodiscard]] _CCCL_API static constexpr bool __equal_to(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept - { - _CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; __i++) - { - if (__lhs.__data[__i] != __rhs.__data[__i]) - { - return false; - } - } - return true; - } - - [[nodiscard]] _CCCL_API static constexpr bool __all(const _MaskStorage& __s) noexcept - { - _CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; __i++) - { - if (!__s.__data[__i]) - { - return false; - } - } - return true; - } - - [[nodiscard]] _CCCL_API static constexpr bool __any(const _MaskStorage& __s) noexcept - { - _CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; __i++) - { - if (__s.__data[__i]) - { - return true; - } - } 
- return false; - } - - // P1928R15 requires simd-size-type (ptrdiff_t) return type - [[nodiscard]] _CCCL_API static constexpr ::cuda::std::ptrdiff_t __count(const _MaskStorage& __s) noexcept - { - int __cnt = 0; - _CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; __i++) - { - if (__s.__data[__i]) - { - ++__cnt; - } - } - const auto __ret = static_cast<::cuda::std::ptrdiff_t>(__cnt); - _CCCL_ASSUME(__ret >= 0 && __ret <= _Np); - return __ret; - } -}; - #undef _CUDAX_SIMD_FIXED_SIZE_BITWISE_OP #undef _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP #undef _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP diff --git a/cudax/include/cuda/experimental/__simd/specializations/fixed_size_simple_mask.h b/cudax/include/cuda/experimental/__simd/specializations/fixed_size_simple_mask.h new file mode 100644 index 00000000000..50434015934 --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/specializations/fixed_size_simple_mask.h @@ -0,0 +1,232 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDAX___SIMD_SPECIALIZATIONS_FIXED_SIZE_SIMPLE_MASK_H
+#define _CUDAX___SIMD_SPECIALIZATIONS_FIXED_SIZE_SIMPLE_MASK_H
+
+// NOTE(review): the <...> targets of every #include in this header were lost when the patch was
+// extracted. They are reconstructed here from the names the header uses (cuda::in_range,
+// cuda::std::size_t, integral_constant, integer_sequence, the __mask_storage/__mask_operations
+// declarations) - confirm each path against the repository before applying.
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__utility/in_range.h>
+#include <cuda/std/__cstddef/types.h>
+#include <cuda/std/__type_traits/integral_constant.h>
+#include <cuda/std/__utility/integer_sequence.h>
+
+#include <cuda/experimental/__simd/declaration.h>
+
+#include <cuda/std/__cccl/prologue.h>
+
+namespace cuda::experimental::simd
+{
+// Bool-per-element mask storage for the fixed_size_simple ABI.
+//
+// Every lane is stored as one `bool` in `__data`. `_Bytes` (the element size the mask is keyed on)
+// does not influence the layout; it is only exposed through `__element_bytes`.
+template <::cuda::std::size_t _Bytes, __simd_size_type _Np>
+struct __mask_storage<_Bytes, simd_abi::__fixed_size_simple<_Np>>
+{
+  using value_type                                       = bool;
+  static constexpr ::cuda::std::size_t __element_bytes = _Bytes;
+
+  bool __data[_Np];
+
+  // Returns the value of lane `__idx`. Valid indices are [0, _Np).
+  [[nodiscard]] _CCCL_API constexpr bool __get(__simd_size_type __idx) const noexcept
+  {
+    // The upper bound given to cuda::in_range is inclusive, so the last valid index (_Np - 1) is
+    // passed; passing _Np would let a one-past-the-end index through the check.
+    _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np - 1), "Index is out of bounds");
+    return __data[__idx];
+  }
+
+  // Sets lane `__idx` to `__v`. Valid indices are [0, _Np).
+  _CCCL_API constexpr void __set(__simd_size_type __idx, bool __v) noexcept
+  {
+    _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np - 1), "Index is out of bounds");
+    __data[__idx] = __v;
+  }
+};
+
+// Mask operations for the fixed_size_simple ABI with bool-per-element storage.
+//
+// All functions are static and operate lane by lane on `__mask_storage::__data`. Result objects
+// are value-initialised (`{}`) before being filled: a default-initialised local would leave the
+// lanes indeterminate and, before C++20, would keep these functions out of constant expressions.
+template <::cuda::std::size_t _Bytes, __simd_size_type _Np>
+struct __mask_operations<_Bytes, simd_abi::__fixed_size_simple<_Np>>
+{
+  using _MaskStorage = __mask_storage<_Bytes, simd_abi::__fixed_size_simple<_Np>>;
+
+  // Returns a mask with every lane set to `__v`.
+  [[nodiscard]] _CCCL_API static constexpr _MaskStorage __broadcast(bool __v) noexcept
+  {
+    _MaskStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = __v;
+    }
+    return __result;
+  }
+
+  // Helper for __generate: lane `_Is` receives `__g(integral_constant<__simd_size_type, _Is>{})`
+  // converted to bool, for every index in the pack.
+  template <typename _Generator, __simd_size_type... _Is>
+  [[nodiscard]] _CCCL_API static constexpr _MaskStorage
+  __generate_init(_Generator&& __g, ::cuda::std::integer_sequence<__simd_size_type, _Is...>)
+  {
+    _MaskStorage __result{};
+    ((__result.__data[_Is] = static_cast<bool>(__g(::cuda::std::integral_constant<__simd_size_type, _Is>()))), ...);
+    return __result;
+  }
+
+  // Builds a mask by invoking the generator once per lane index 0 .. _Np - 1.
+  template <typename _Generator>
+  [[nodiscard]] _CCCL_API static constexpr _MaskStorage __generate(_Generator&& __g)
+  {
+    return __generate_init(__g, ::cuda::std::make_integer_sequence<__simd_size_type, _Np>());
+  }
+
+  // Logical operators (for operator&& and operator||)
+
+  [[nodiscard]] _CCCL_API static constexpr _MaskStorage
+  __logic_and(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept
+  {
+    _MaskStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = __lhs.__data[__i] && __rhs.__data[__i];
+    }
+    return __result;
+  }
+
+  [[nodiscard]] _CCCL_API static constexpr _MaskStorage
+  __logic_or(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept
+  {
+    _MaskStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = __lhs.__data[__i] || __rhs.__data[__i];
+    }
+    return __result;
+  }
+
+  // Bitwise operators (for operator&, operator|, operator^)
+  // With one bool per lane these compute the same lane values as the logical forms above.
+
+  [[nodiscard]] _CCCL_API static constexpr _MaskStorage
+  __bitwise_and(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept
+  {
+    _MaskStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = __lhs.__data[__i] && __rhs.__data[__i];
+    }
+    return __result;
+  }
+
+  [[nodiscard]] _CCCL_API static constexpr _MaskStorage
+  __bitwise_or(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept
+  {
+    _MaskStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = __lhs.__data[__i] || __rhs.__data[__i];
+    }
+    return __result;
+  }
+
+  // Exclusive or: a lane is set when the two input lanes differ.
+  [[nodiscard]] _CCCL_API static constexpr _MaskStorage
+  __bitwise_xor(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept
+  {
+    _MaskStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = __lhs.__data[__i] != __rhs.__data[__i];
+    }
+    return __result;
+  }
+
+  // Returns the mask with every lane inverted.
+  [[nodiscard]] _CCCL_API static constexpr _MaskStorage __bitwise_not(const _MaskStorage& __s) noexcept
+  {
+    _MaskStorage __result{};
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __result.__data[__i] = !__s.__data[__i];
+    }
+    return __result;
+  }
+
+  // Reductions
+
+  // True when every lane is set.
+  [[nodiscard]] _CCCL_API static constexpr bool __all(const _MaskStorage& __s) noexcept
+  {
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      if (!__s.__data[__i])
+      {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // True when at least one lane is set.
+  [[nodiscard]] _CCCL_API static constexpr bool __any(const _MaskStorage& __s) noexcept
+  {
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      if (__s.__data[__i])
+      {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  // Number of set lanes, in [0, _Np].
+  [[nodiscard]] _CCCL_API static constexpr __simd_size_type __count(const _MaskStorage& __s) noexcept
+  {
+    // Named __num_set rather than __count so the local does not shadow this function's own name.
+    __simd_size_type __num_set = 0;
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      __num_set += static_cast<__simd_size_type>(__s.__data[__i]);
+    }
+    return __num_set;
+  }
+
+  // Index of the lowest set lane.
+  // Precondition: at least one lane is set; otherwise control reaches _CCCL_UNREACHABLE().
+  [[nodiscard]] _CCCL_API static constexpr __simd_size_type __min_index(const _MaskStorage& __s) noexcept
+  {
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = 0; __i < _Np; ++__i)
+    {
+      if (__s.__data[__i])
+      {
+        return __i;
+      }
+    }
+    _CCCL_UNREACHABLE();
+  }
+
+  // Index of the highest set lane.
+  // Precondition: at least one lane is set; otherwise control reaches _CCCL_UNREACHABLE().
+  [[nodiscard]] _CCCL_API static constexpr __simd_size_type __max_index(const _MaskStorage& __s) noexcept
+  {
+    // __simd_size_type is signed (ptrdiff_t), so the descending loop terminates at -1.
+    _CCCL_PRAGMA_UNROLL_FULL()
+    for (__simd_size_type __i = _Np - 1; __i >= 0; --__i)
+    {
+      if (__s.__data[__i])
+      {
+        return __i;
+      }
+    }
+    _CCCL_UNREACHABLE();
+  }
+};
+} // namespace cuda::experimental::simd
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif //
_CUDAX___SIMD_SPECIALIZATIONS_FIXED_SIZE_SIMPLE_MASK_H diff --git a/cudax/include/cuda/experimental/__simd/concepts.h b/cudax/include/cuda/experimental/__simd/to_remove/concepts.h similarity index 100% rename from cudax/include/cuda/experimental/__simd/concepts.h rename to cudax/include/cuda/experimental/__simd/to_remove/concepts.h diff --git a/cudax/include/cuda/experimental/__simd/reference.h b/cudax/include/cuda/experimental/__simd/to_remove/reference.h similarity index 100% rename from cudax/include/cuda/experimental/__simd/reference.h rename to cudax/include/cuda/experimental/__simd/to_remove/reference.h diff --git a/cudax/include/cuda/experimental/__simd/traits.h b/cudax/include/cuda/experimental/__simd/to_remove/traits.h similarity index 96% rename from cudax/include/cuda/experimental/__simd/traits.h rename to cudax/include/cuda/experimental/__simd/to_remove/traits.h index 2576d60dd48..9690822cc73 100644 --- a/cudax/include/cuda/experimental/__simd/traits.h +++ b/cudax/include/cuda/experimental/__simd/to_remove/traits.h @@ -75,7 +75,7 @@ struct is_abi_tag : ::cuda::std::bool_constant> {}; template -inline constexpr bool is_abi_tag_v> = true; +inline constexpr bool is_abi_tag_v> = true; template inline constexpr bool is_vec_v = false; @@ -98,7 +98,7 @@ template struct is_simd_flag_type : ::cuda::std::bool_constant> {}; -template , bool = (__is_vectorizable_v<_Tp> && is_abi_tag_v<_Abi>)> +template , bool = (__is_vectorizable_v<_Tp> && is_abi_tag_v<_Abi>)> struct simd_size : ::cuda::std::integral_constant<::cuda::std::size_t, _Abi::__simd_size> {}; diff --git a/cudax/include/cuda/experimental/__simd/utility.h b/cudax/include/cuda/experimental/__simd/utility.h index 37d20517218..98ae8f994ba 100644 --- a/cudax/include/cuda/experimental/__simd/utility.h +++ b/cudax/include/cuda/experimental/__simd/utility.h @@ -36,6 +36,18 @@ namespace cuda::experimental::simd { + + +template +inline constexpr bool __is_non_narrowing_convertible_v = false; + +template +inline 
constexpr bool + __is_non_narrowing_convertible_v<_From, _To, ::cuda::std::void_t()})>> = + true; + + + template <::cuda::std::size_t _Bytes> constexpr bool __has_integer_from_v = (_Bytes == 1 || _Bytes == 2 || _Bytes == 4 || _Bytes == 8 @@ -48,21 +60,7 @@ template constexpr bool __is_abi_tag_v = false; template -constexpr bool __is_abi_tag_v> = true; - -template -inline constexpr bool __is_non_narrowing_convertible_v = false; - -template -inline constexpr bool - __is_non_narrowing_convertible_v<_From, _To, ::cuda::std::void_t()})>> = - true; - -// template -// inline constexpr bool __can_broadcast_v = -// (__is_vectorizable_v<_Up> && __is_non_narrowing_convertible_v<_Up, _Tp>) -// || (!__is_vectorizable_v<_Up> && ::cuda::std::is_convertible_v<_Up, _Tp>) || ::cuda::std::is_same_v<_Up, int> -// || (::cuda::std::is_same_v<_Up, unsigned int> && ::cuda::std::is_unsigned_v<_Tp>); +constexpr bool __is_abi_tag_v> = true; template constexpr bool __is_well_formed = false; diff --git a/cudax/test/simd/simd.cu b/cudax/test/simd/simd.cu index 7314a14184b..342ef4e5fd3 100644 --- a/cudax/test/simd/simd.cu +++ b/cudax/test/simd/simd.cu @@ -64,7 +64,7 @@ using simd_array_t = ::cuda::std::array C2H_CCCLRT_TEST("simd.traits", "[simd][traits]") { - using abi_t = dp::simd_abi::fixed_size<4>; + using abi_t = dp::simd_abi::fixed_size_simple<4>; using simd_t = dp::vec; using mask_t = dp::mask; using other_t = dp::vec; From 4b324db299279b2a32fb97ed33222fb81b92aaba Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 25 Mar 2026 16:59:45 -0700 Subject: [PATCH 28/32] use fixed_size_simple ABI --- cudax/include/cuda/experimental/__simd/abi.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cudax/include/cuda/experimental/__simd/abi.h b/cudax/include/cuda/experimental/__simd/abi.h index c2af5a2c180..9ab1b812c52 100644 --- a/cudax/include/cuda/experimental/__simd/abi.h +++ b/cudax/include/cuda/experimental/__simd/abi.h @@ -33,7 +33,7 @@ using __simd_size_type = 
::cuda::std::ptrdiff_t; namespace simd_abi { template <__simd_size_type _Np> -struct __fixed_size_simple; +struct __fixed_size_simple; // internal ABI tag template <__simd_size_type _Np> using fixed_size_simple = __fixed_size_simple<_Np>; // implementation-defined ABI @@ -42,7 +42,7 @@ template using native = fixed_size_simple<1>; // implementation-defined ABI template -using __deduce_abi_t = fixed_size_simple<_Np>; +using __deduce_abi_t = fixed_size_simple<_Np>; // exposition-only } // namespace simd_abi } // namespace cuda::experimental::simd From 71d8840d05a97d9c82b5db1574c16e3986dadfc1 Mon Sep 17 00:00:00 2001 From: fbusato Date: Mon, 30 Mar 2026 16:19:09 -0700 Subject: [PATCH 29/32] implemented basic_vec --- cudax/include/cuda/experimental/__simd/abi.h | 2 +- .../cuda/experimental/__simd/basic_mask.h | 92 +++---- .../cuda/experimental/__simd/basic_vec.h | 253 ++++++++++++------ .../cuda/experimental/__simd/concepts.h | 237 ++++++++++++++++ .../cuda/experimental/__simd/declaration.h | 8 +- cudax/include/cuda/experimental/__simd/flag.h | 105 ++++++++ ...ed_size_impl.h => fixed_size_simple_vec.h} | 96 +++---- .../experimental/__simd/to_remove/concepts.h | 210 --------------- .../cuda/experimental/__simd/utility.h | 35 ++- 9 files changed, 612 insertions(+), 426 deletions(-) create mode 100644 cudax/include/cuda/experimental/__simd/concepts.h create mode 100644 cudax/include/cuda/experimental/__simd/flag.h rename cudax/include/cuda/experimental/__simd/specializations/{fixed_size_impl.h => fixed_size_simple_vec.h} (64%) delete mode 100644 cudax/include/cuda/experimental/__simd/to_remove/concepts.h diff --git a/cudax/include/cuda/experimental/__simd/abi.h b/cudax/include/cuda/experimental/__simd/abi.h index 9ab1b812c52..687c89c9db6 100644 --- a/cudax/include/cuda/experimental/__simd/abi.h +++ b/cudax/include/cuda/experimental/__simd/abi.h @@ -4,7 +4,7 @@ // under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// diff --git a/cudax/include/cuda/experimental/__simd/basic_mask.h b/cudax/include/cuda/experimental/__simd/basic_mask.h index 98caeb8181c..6619a7138cf 100644 --- a/cudax/include/cuda/experimental/__simd/basic_mask.h +++ b/cudax/include/cuda/experimental/__simd/basic_mask.h @@ -45,9 +45,11 @@ namespace cuda::experimental::simd template <::cuda::std::size_t _Bytes, typename _Abi> class basic_mask : public __mask_operations<_Bytes, _Abi> { - static_assert(_Bytes >= 0, "basic_mask requires a positive number of bytes"); static_assert(__is_abi_tag_v<_Abi>, "basic_mask requires a valid ABI tag"); + template + friend class basic_vec; + using _Impl = __mask_operations<_Bytes, _Abi>; using _Storage = typename _Impl::_MaskStorage; @@ -198,7 +200,7 @@ class basic_mask : public __mask_operations<_Bytes, _Abi> _CCCL_PRAGMA_UNROLL_FULL() for (__simd_size_type __i = 0; __i < size; ++__i) { - __result[__i] = static_cast<_Up>((*this)[__i]); + __result.__s_.__set(__i, static_cast<_Up>((*this)[__i])); } return __result; } @@ -211,7 +213,7 @@ class basic_mask : public __mask_operations<_Bytes, _Abi> _CCCL_PRAGMA_UNROLL_FULL() for (__simd_size_type __i = 0; __i < size; ++__i) { - __result[__i] = static_cast<_Up>((*this)[__i]); + __result.__s_.__set(__i, static_cast<_Up>((*this)[__i])); } return __result; } @@ -327,6 +329,40 @@ class basic_mask : public __mask_operations<_Bytes, _Abi> return !__lhs && __rhs; } + // [simd.mask.reductions], reductions + + [[nodiscard]] _CCCL_API friend constexpr bool all_of(const basic_mask& __k) noexcept + { + return _Impl::__all(__k.__s_); + } + + [[nodiscard]] _CCCL_API friend constexpr bool 
any_of(const basic_mask& __k) noexcept + { + return _Impl::__any(__k.__s_); + } + + [[nodiscard]] _CCCL_API friend constexpr bool none_of(const basic_mask& __k) noexcept + { + return !any_of(__k); + } + + [[nodiscard]] _CCCL_API friend constexpr __simd_size_type reduce_count(const basic_mask& __k) noexcept + { + return _Impl::__count(__k.__s_); + } + + [[nodiscard]] _CCCL_API friend constexpr __simd_size_type reduce_min_index(const basic_mask& __k) + { + _CCCL_ASSERT(any_of(__k), "No bits are set"); + return _Impl::__min_index(__k.__s_); + } + + [[nodiscard]] _CCCL_API friend constexpr __simd_size_type reduce_max_index(const basic_mask& __k) + { + _CCCL_ASSERT(any_of(__k), "No bits are set"); + return _Impl::__max_index(__k.__s_); + } + // TODO(fbusato): [simd.mask.cond], basic_mask exposition only conditional operators // friend constexpr basic_mask __simd_select_impl( // const basic_mask&, const basic_mask&, const basic_mask&) noexcept; @@ -337,51 +373,6 @@ class basic_mask : public __mask_operations<_Bytes, _Abi> // const basic_mask&, const T0&, const T1&) noexcept; }; -// [simd.mask.reductions], reductions - -template <::cuda::std::size_t _Bytes, typename _Abi> -[[nodiscard]] _CCCL_API constexpr bool all_of(const basic_mask<_Bytes, _Abi>& __k) noexcept -{ - using __mask_storage_t = typename __mask_operations<_Bytes, _Abi>::_MaskStorage; - return __mask_operations<_Bytes, _Abi>::__all(static_cast<__mask_storage_t>(__k)); -} - -template <::cuda::std::size_t _Bytes, typename _Abi> -[[nodiscard]] _CCCL_API constexpr bool any_of(const basic_mask<_Bytes, _Abi>& __k) noexcept -{ - using __mask_storage_t = typename __mask_operations<_Bytes, _Abi>::_MaskStorage; - return __mask_operations<_Bytes, _Abi>::__any(static_cast<__mask_storage_t>(__k)); -} - -template <::cuda::std::size_t _Bytes, typename _Abi> -[[nodiscard]] _CCCL_API constexpr bool none_of(const basic_mask<_Bytes, _Abi>& __k) noexcept -{ - return !::cuda::experimental::simd::any_of(__k); -} - -template 
<::cuda::std::size_t _Bytes, typename _Abi> -[[nodiscard]] _CCCL_API constexpr __simd_size_type reduce_count(const basic_mask<_Bytes, _Abi>& __k) noexcept -{ - using __mask_storage_t = typename __mask_operations<_Bytes, _Abi>::_MaskStorage; - return __mask_operations<_Bytes, _Abi>::__count(static_cast<__mask_storage_t>(__k)); -} - -template <::cuda::std::size_t _Bytes, typename _Abi> -[[nodiscard]] _CCCL_API constexpr __simd_size_type reduce_min_index(const basic_mask<_Bytes, _Abi>& __k) noexcept -{ - _CCCL_ASSERT(any_of(__k), "No bits are set"); - using __mask_storage_t = typename __mask_operations<_Bytes, _Abi>::_MaskStorage; - return __mask_operations<_Bytes, _Abi>::__min_index(static_cast<__mask_storage_t>(__k)); -} - -template <::cuda::std::size_t _Bytes, typename _Abi> -[[nodiscard]] _CCCL_API constexpr __simd_size_type reduce_max_index(const basic_mask<_Bytes, _Abi>& __k) noexcept -{ - _CCCL_ASSERT(any_of(__k), "No bits are set"); - using __mask_storage_t = typename __mask_operations<_Bytes, _Abi>::_MaskStorage; - return __mask_operations<_Bytes, _Abi>::__max_index(static_cast<__mask_storage_t>(__k)); -} - // Scalar bool overloads _CCCL_TEMPLATE(typename _Tp) @@ -414,7 +405,7 @@ _CCCL_REQUIRES(::cuda::std::same_as<_Tp, bool>) _CCCL_TEMPLATE(typename _Tp) _CCCL_REQUIRES(::cuda::std::same_as<_Tp, bool>) -[[nodiscard]] _CCCL_API constexpr __simd_size_type reduce_min_index(_Tp __x) noexcept +[[nodiscard]] _CCCL_API constexpr __simd_size_type reduce_min_index(_Tp __x) { _CCCL_ASSERT(__x, "No bits are set"); return 0; @@ -422,8 +413,7 @@ _CCCL_REQUIRES(::cuda::std::same_as<_Tp, bool>) _CCCL_TEMPLATE(typename _Tp) _CCCL_REQUIRES(::cuda::std::same_as<_Tp, bool>) - -[[nodiscard]] _CCCL_API constexpr __simd_size_type reduce_max_index(_Tp __x) noexcept +[[nodiscard]] _CCCL_API constexpr __simd_size_type reduce_max_index(_Tp __x) { _CCCL_ASSERT(__x, "No bits are set"); return 0; diff --git a/cudax/include/cuda/experimental/__simd/basic_vec.h 
b/cudax/include/cuda/experimental/__simd/basic_vec.h index 187c0bb37be..dca39bcc332 100644 --- a/cudax/include/cuda/experimental/__simd/basic_vec.h +++ b/cudax/include/cuda/experimental/__simd/basic_vec.h @@ -8,8 +8,8 @@ // //===----------------------------------------------------------------------===// -#ifndef _CUDAX___SIMD_SIMD_H -#define _CUDAX___SIMD_SIMD_H +#ifndef _CUDAX___SIMD_BASIC_VEC_H +#define _CUDAX___SIMD_BASIC_VEC_H #include @@ -21,53 +21,59 @@ # pragma system_header #endif // no system header +#include +#include #include -#include -#include -#include +#include +#include +#include +#include +#include #include +#include #include #include #include -#include -#include -#include -#include +#include +#include +#include #include #include namespace cuda::experimental::simd { -// P1928R15: basic_vec is the primary SIMD vector type (renamed from basic_simd) +// [simd.class], class template basic_vec template class basic_vec : public __simd_operations<_Tp, _Abi> { - static_assert(is_abi_tag_v<_Abi>, "basic_vec requires a valid ABI tag"); + static_assert(__is_vectorizable_v<_Tp>, "basic_vec requires a vectorizable type"); + static_assert(__is_abi_tag_v<_Abi>, "basic_vec requires a valid ABI tag"); + + template <::cuda::std::size_t, typename> + friend class basic_mask; using _Impl = __simd_operations<_Tp, _Abi>; using _Storage = typename _Impl::_SimdStorage; _Storage __s_; - template - static constexpr bool __is_value_preserving_broadcast = - (__is_vectorizable_v<::cuda::std::remove_cvref_t<_Up>> && __is_non_narrowing_convertible_v<_Up, _Tp>) - || (!__is_vectorizable_v<::cuda::std::remove_cvref_t<_Up>> && ::cuda::std::is_convertible_v<_Up, _Tp>); - struct __storage_tag_t {}; static constexpr __storage_tag_t __storage_tag{}; + _CCCL_API constexpr basic_vec(_Storage __s, __storage_tag_t) noexcept + : __s_{__s} + {} + public: using value_type = _Tp; - using reference = __simd_reference<_Storage, value_type>; using mask_type = basic_mask; using abi_type = _Abi; - // 
TODO: add iterators - // using iterator = simd-iterator; + // TODO(fbusato): add simd-iterator + // using iterator = simd-iterator; // using const_iterator = simd-iterator; // constexpr iterator begin() noexcept { return {*this, 0}; } @@ -81,78 +87,151 @@ class basic_vec : public __simd_operations<_Tp, _Abi> _CCCL_HIDE_FROM_ABI basic_vec() noexcept = default; // [simd.ctor], basic_vec constructors - // TODO: fix constraints + // [simd.ctor] value broadcast constructor (explicit overload) _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES( - (__explicitly_convertible_to<_Up, value_type>) _CCCL_AND(__is_simd_ctor_explicit_from_value<_Up, value_type>)) + _CCCL_REQUIRES((__explicitly_convertible_to) _CCCL_AND(!__is_value_ctor_implicit<_Up, value_type>)) _CCCL_API constexpr explicit basic_vec(_Up&& __v) noexcept : __s_{_Impl::__broadcast(static_cast(__v))} {} + // [simd.ctor] value broadcast constructor (implicit overload) _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES( - (__explicitly_convertible_to<_Up, value_type>) _CCCL_AND(!__is_simd_ctor_explicit_from_value<_Up, value_type>)) + _CCCL_REQUIRES((__explicitly_convertible_to) _CCCL_AND(__is_value_ctor_implicit<_Up, value_type>)) _CCCL_API constexpr basic_vec(_Up&& __v) noexcept : __s_{_Impl::__broadcast(static_cast(__v))} {} + // [simd.ctor] converting constructor from basic_vec (explicit overload) _CCCL_TEMPLATE(typename _Up, typename _UAbi) - _CCCL_REQUIRES((__simd_size_v<_Up, _UAbi> == size()) _CCCL_AND(__explicitly_convertible_to<_Up, value_type>)) + _CCCL_REQUIRES((__simd_size_v<_Up, _UAbi> == size()) _CCCL_AND(__explicitly_convertible_to) + _CCCL_AND(__is_vec_ctor_explicit<_Up, value_type>)) _CCCL_API constexpr explicit basic_vec(const basic_vec<_Up, _UAbi>& __v) noexcept { - for (__simd_size_type __i = 0; __i < size; __i++) + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < size; ++__i) { - (*this)[__i] = static_cast(__v[__i]); + __s_.__set(__i, static_cast(__v[__i])); } } - _CCCL_TEMPLATE(typename 
_Up) - _CCCL_REQUIRES((!::cuda::std::is_same_v<_Up, _Tp>) _CCCL_AND(!__is_non_narrowing_convertible_v<_Up, value_type>) - _CCCL_AND(::cuda::std::is_convertible_v<_Up, value_type>)) - _CCCL_API constexpr explicit basic_vec(const basic_vec<_Up, abi_type>& __v) noexcept + // [simd.ctor] converting constructor from basic_vec (implicit overload) + _CCCL_TEMPLATE(typename _Up, typename _UAbi) + _CCCL_REQUIRES((__simd_size_v<_Up, _UAbi> == size()) _CCCL_AND(__explicitly_convertible_to) + _CCCL_AND(!__is_vec_ctor_explicit<_Up, value_type>)) + _CCCL_API constexpr basic_vec(const basic_vec<_Up, _UAbi>& __v) noexcept { - for (__simd_size_type __i = 0; __i < size; __i++) + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < size; ++__i) { - (*this)[__i] = static_cast(__v[__i]); + __s_.__set(__i, static_cast(__v[__i])); } } + // [simd.ctor] generator constructor _CCCL_TEMPLATE(typename _Generator) _CCCL_REQUIRES(__can_generate_v) _CCCL_API constexpr explicit basic_vec(_Generator&& __g) - : __s_(_Impl::__generate(__g)) + : __s_{_Impl::__generate(__g)} {} - // TODO: add range constructors - // template - // constexpr basic_vec(R&& range, flags = {}); + // [simd.ctor] range constructor + + template + static constexpr bool __range_static_size_matches_v = false; + + template + static constexpr bool __range_static_size_matches_v< + _Range, + _Size, + ::cuda::std::void_t>)>> = + (__simd_size_type{::cuda::std::tuple_size_v<::cuda::std::remove_cvref_t<_Range>>} == _Size); + + template + static constexpr bool __is_compatible_range_v = + ::cuda::std::ranges::contiguous_range<_Range> && ::cuda::std::ranges::sized_range<_Range> + && __range_static_size_matches_v<_Range, size()> && __is_vectorizable_v<::cuda::std::ranges::range_value_t<_Range>> + && __explicitly_convertible_to>; + + template + _CCCL_API constexpr static void + __assert_alignment([[maybe_unused]] const ::cuda::std::ranges::range_value_t<_Range>* __data) noexcept + { + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + if 
constexpr (__has_aligned_flag_v<_Flags...>) + { + _CCCL_ASSERT(::cuda::is_aligned(__data, alignment_v>), + "flag_aligned requires data to be aligned to alignment_v>"); + } + else if constexpr (__has_overaligned_flag_v<_Flags...>) + { + _CCCL_ASSERT(::cuda::is_aligned(__data, __overaligned_alignment_v<_Flags...>), + "flag_overaligned requires data to be aligned to N"); + } + } + } + + // [simd.ctor] range constructor + _CCCL_TEMPLATE(typename _Range, typename... _Flags) + _CCCL_REQUIRES(__is_compatible_range_v<_Range>) + _CCCL_API constexpr basic_vec(_Range&& __range, flags<_Flags...> = {}) + { + static_assert(__has_convert_flag_v<_Flags...> + || __is_value_preserving_v<::cuda::std::ranges::range_value_t<_Range>, value_type>, + "Conversion from range_value_t to value_type is not value-preserving; use flag_convert"); + const auto __data = ::cuda::std::ranges::data(__range); + __assert_alignment<_Range, _Flags...>(__data); + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < size; ++__i) + { + __s_.__set(__i, static_cast(__data[__i])); + } + } - // template - // constexpr basic_vec(R&& range, const mask_type& mask, flags = {}); + // [simd.ctor] masked range constructor + _CCCL_TEMPLATE(typename _Range, typename... _Flags) + _CCCL_REQUIRES(__is_compatible_range_v<_Range>) + _CCCL_API constexpr basic_vec(_Range&& __range, const mask_type& __mask, flags<_Flags...> = {}) + { + static_assert(__has_convert_flag_v<_Flags...> + || __is_value_preserving_v<::cuda::std::ranges::range_value_t<_Range>, value_type>, + "Conversion from range_value_t to value_type is not value-preserving; use flag_convert"); + const auto __data = ::cuda::std::ranges::data(__range); + __assert_alignment<_Range, _Flags...>(__data); + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < size; ++__i) + { + __s_.__set(__i, __mask[__i] ? 
static_cast(__data[__i]) : value_type()); + } + } - // constexpr basic_vec(const real - type & reals, const real - type& imags = {}) noexcept; + // TODO(fbusato): add complex constructor + // constexpr basic_vec(const real-type& __reals, const real-type& __imags = {}) noexcept; // [simd.subscr], basic_vec subscript operators - _CCCL_API value_type operator[](__simd_size_type __i) const noexcept + + [[nodiscard]] _CCCL_API constexpr value_type operator[](__simd_size_type __i) const { + _CCCL_ASSERT(::cuda::in_range(__i, __simd_size_type{0}, __simd_size_type{size}), "Index is out of bounds"); return __s_.__get(__i); } - // TODO: add operator[] - // template - // constexpr resize_t operator[](const I& indices) const; + // TODO(fbusato): subscript with integral indices, requires permute() + // template + // constexpr resize_t<_Idx::size(), basic_vec> operator[](const _Idx& __indices) const; - // TODO: [simd.complex.access], basic_vec complex accessors + // TODO(fbusato): [simd.complex.access], basic_vec complex accessors // constexpr real-type real() const noexcept; // constexpr real-type imag() const noexcept; - // constexpr void real(const real-type& v) noexcept; - // constexpr void imag(const real-type& v) noexcept; + // constexpr void real(const real-type& __v) noexcept; + // constexpr void imag(const real-type& __v) noexcept; // [simd.unary], basic_vec unary operators + _CCCL_TEMPLATE(typename _Up = _Tp) _CCCL_REQUIRES(__has_pre_increment<_Up>) - _CCCL_API basic_vec& operator++() noexcept + _CCCL_API constexpr basic_vec& operator++() noexcept { _Impl::__increment(__s_); return *this; @@ -160,7 +239,7 @@ class basic_vec : public __simd_operations<_Tp, _Abi> _CCCL_TEMPLATE(typename _Up = _Tp) _CCCL_REQUIRES(__has_post_increment<_Up>) - _CCCL_API basic_vec operator++(int) noexcept + [[nodiscard]] _CCCL_API constexpr basic_vec operator++(int) noexcept { const basic_vec __r = *this; _Impl::__increment(__s_); @@ -169,7 +248,7 @@ class basic_vec : public 
__simd_operations<_Tp, _Abi> _CCCL_TEMPLATE(typename _Up = _Tp) _CCCL_REQUIRES(__has_pre_decrement<_Up>) - _CCCL_API basic_vec& operator--() noexcept + _CCCL_API constexpr basic_vec& operator--() noexcept { _Impl::__decrement(__s_); return *this; @@ -177,7 +256,7 @@ class basic_vec : public __simd_operations<_Tp, _Abi> _CCCL_TEMPLATE(typename _Up = _Tp) _CCCL_REQUIRES(__has_post_decrement<_Up>) - _CCCL_API basic_vec operator--(int) noexcept + [[nodiscard]] _CCCL_API constexpr basic_vec operator--(int) noexcept { const basic_vec __r = *this; _Impl::__decrement(__s_); @@ -186,7 +265,7 @@ class basic_vec : public __simd_operations<_Tp, _Abi> _CCCL_TEMPLATE(typename _Up = _Tp) _CCCL_REQUIRES(__has_negate<_Up>) - [[nodiscard]] _CCCL_API mask_type operator!() const noexcept + [[nodiscard]] _CCCL_API constexpr mask_type operator!() const noexcept { return mask_type{_Impl::__negate(__s_), mask_type::__storage_tag}; } @@ -199,15 +278,15 @@ class basic_vec : public __simd_operations<_Tp, _Abi> } _CCCL_TEMPLATE(typename _Up = _Tp) - _CCCL_REQUIRES(__has_plus<_Up>) - [[nodiscard]] _CCCL_API basic_vec operator+() const noexcept + _CCCL_REQUIRES(__has_unary_plus<_Up>) + [[nodiscard]] _CCCL_API constexpr basic_vec operator+() const noexcept { return *this; } _CCCL_TEMPLATE(typename _Up = _Tp) _CCCL_REQUIRES(__has_unary_minus<_Up>) - [[nodiscard]] _CCCL_API basic_vec operator-() const noexcept + [[nodiscard]] _CCCL_API constexpr basic_vec operator-() const noexcept { return basic_vec{_Impl::__unary_minus(__s_), __storage_tag}; } @@ -215,14 +294,14 @@ class basic_vec : public __simd_operations<_Tp, _Abi> // [simd.binary], basic_vec binary operators _CCCL_TEMPLATE(typename _Up = _Tp) - _CCCL_REQUIRES(__has_plus<_Up>) + _CCCL_REQUIRES(__has_binary_plus<_Up>) [[nodiscard]] _CCCL_API friend constexpr basic_vec operator+(const basic_vec& __lhs, const basic_vec& __rhs) noexcept { return basic_vec{_Impl::__plus(__lhs.__s_, __rhs.__s_), __storage_tag}; } _CCCL_TEMPLATE(typename _Up = _Tp) 
- _CCCL_REQUIRES(__has_minus<_Up>) + _CCCL_REQUIRES(__has_binary_minus<_Up>) [[nodiscard]] _CCCL_API friend constexpr basic_vec operator-(const basic_vec& __lhs, const basic_vec& __rhs) noexcept { return basic_vec{_Impl::__minus(__lhs.__s_, __rhs.__s_), __storage_tag}; @@ -289,27 +368,27 @@ class basic_vec : public __simd_operations<_Tp, _Abi> _CCCL_REQUIRES(__has_shift_left_size<_Up>) [[nodiscard]] _CCCL_API friend constexpr basic_vec operator<<(const basic_vec& __lhs, __simd_size_type __n) noexcept { - return basic_vec{_Impl::__shift_left(__lhs.__s_, basic_vec{__n}), __storage_tag}; + return __lhs << basic_vec{__n}; } _CCCL_TEMPLATE(typename _Up = _Tp) _CCCL_REQUIRES(__has_shift_right_size<_Up>) [[nodiscard]] _CCCL_API friend constexpr basic_vec operator>>(const basic_vec& __lhs, __simd_size_type __n) noexcept { - return basic_vec{_Impl::__shift_right(__lhs.__s_, basic_vec{__n}), __storage_tag}; + return __lhs >> basic_vec{__n}; } // [simd.cassign], basic_vec compound assignment _CCCL_TEMPLATE(typename _Up = _Tp) - _CCCL_REQUIRES(__has_plus<_Up>) + _CCCL_REQUIRES(__has_binary_plus<_Up>) _CCCL_API friend constexpr basic_vec& operator+=(basic_vec& __lhs, const basic_vec& __rhs) noexcept { return __lhs = __lhs + __rhs; } _CCCL_TEMPLATE(typename _Up = _Tp) - _CCCL_REQUIRES(__has_minus<_Up>) + _CCCL_REQUIRES(__has_binary_minus<_Up>) _CCCL_API friend constexpr basic_vec& operator-=(basic_vec& __lhs, const basic_vec& __rhs) noexcept { return __lhs = __lhs - __rhs; @@ -398,7 +477,7 @@ class basic_vec : public __simd_operations<_Tp, _Abi> _CCCL_REQUIRES(__has_not_equal_to<_Up>) [[nodiscard]] _CCCL_API friend constexpr mask_type operator!=(const basic_vec& __lhs, const basic_vec& __rhs) noexcept { - return mask_type{_Impl::__not_equal_to(__lhs.__s_, __rhs.__s_), __storage_tag}; + return mask_type{_Impl::__not_equal_to(__lhs.__s_, __rhs.__s_), mask_type::__storage_tag}; } _CCCL_TEMPLATE(typename _Up = _Tp) @@ -429,29 +508,39 @@ class basic_vec : public 
__simd_operations<_Tp, _Abi> return mask_type{_Impl::__less(__lhs.__s_, __rhs.__s_), mask_type::__storage_tag}; } - // _CCCL_TEMPLATE(typename _Up, typename _Flags = element_aligned_tag) - // _CCCL_REQUIRES(__is_vectorizable_v<_Up> _CCCL_AND is_simd_flag_type_v<_Flags>) - // _CCCL_API void copy_from(const _Up* __mem, _Flags = {}) noexcept - // { - // _Impl::__load(__s_, _Flags::template __apply(__mem)); - // } - // - // _CCCL_TEMPLATE(typename _Up, typename _Flags = element_aligned_tag) - // _CCCL_REQUIRES(__is_vectorizable_v<_Up> _CCCL_AND is_simd_flag_type_v<_Flags>) - // _CCCL_API void copy_to(_Up* __mem, _Flags = {}) const noexcept - // { - // _Impl::__store(__s_, _Flags::template __apply(__mem)); - // } + // TODO(fbusato): [simd.cond], basic_vec exposition-only conditional operators + // friend constexpr basic_vec __simd_select_impl( + // const mask_type&, const basic_vec&, const basic_vec&) noexcept; }; -// TODO: deduction guides -// template -// basic_vec(R&& r, Ts...) -> ...; - -// template -// basic_vec(basic_mask) -> ...; +// Proxy for the requirement "ranges::size(r) is a constant expression", checked through tuple_size_v +template +_CCCL_CONCEPT __has_static_size = _CCCL_REQUIRES_EXPR((_Range))((__simd_size_type{::cuda::std::tuple_size_v<_Range>})); + +// [simd.ctor] deduction guide from contiguous sized range +// Deduces vec<ranges::range_value_t<R>, static_cast<simd-size-type>(ranges::size(r))> +// * it is not possible to use the alias "vec" for the deduction guide +// * "vec" is defined as basic_vec<_Tp, simd_abi::__deduce_abi_t<_Tp, _Np>> +// * where _Np is __simd_size_v<_Tp, tuple_size_v<_Range>> +_CCCL_TEMPLATE(typename _Range, typename... _Ts) +_CCCL_REQUIRES(::cuda::std::ranges::contiguous_range<_Range> _CCCL_AND ::cuda::std::ranges::sized_range<_Range> + _CCCL_AND __has_static_size<::cuda::std::remove_cvref_t<_Range>>) +basic_vec(_Range&&, _Ts...) 
-> basic_vec< + ::cuda::std::ranges::range_value_t<_Range>, + simd_abi::__deduce_abi_t<::cuda::std::ranges::range_value_t<_Range>, + __simd_size_type{::cuda::std::tuple_size_v<::cuda::std::remove_cvref_t<_Range>>}>>; + +// [simd.ctor] deduction guide from basic_mask +// The guide yields the same type as decltype(+k): +// * k has type basic_mask<_Bytes, _Abi> +// * +k calls basic_mask::operator+() +// * the return type is basic_vec<__integer_from<_Bytes>, _Abi> +// Hence the deduced type is basic_vec<__integer_from<_Bytes>, _Abi> +_CCCL_TEMPLATE(::cuda::std::size_t _Bytes, typename _Abi) +_CCCL_REQUIRES(__has_unary_plus>) +basic_vec(basic_mask<_Bytes, _Abi>) -> basic_vec<__integer_from<_Bytes>, _Abi>; } // namespace cuda::experimental::simd #include -#endif // _CUDAX___SIMD_SIMD_H +#endif // _CUDAX___SIMD_BASIC_VEC_H diff --git a/cudax/include/cuda/experimental/__simd/concepts.h b/cudax/include/cuda/experimental/__simd/concepts.h new file mode 100644 index 00000000000..b36316dc827 --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/concepts.h @@ -0,0 +1,237 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_CONCEPTS_H +#define _CUDAX___SIMD_CONCEPTS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace cuda::experimental::simd +{ +// [simd.expos], explicitly-convertible-to concept + +template +_CCCL_CONCEPT __explicitly_convertible_to = + _CCCL_REQUIRES_EXPR((_To, _From))(requires(static_cast<_To>(::cuda::std::declval<_From>()))); + +// [simd.expos], constexpr-wrapper-like concept + +template +_CCCL_CONCEPT __constexpr_wrapper_like = + ::cuda::std::convertible_to<_Tp, decltype(_Tp::value)> + && ::cuda::std::equality_comparable_with<_Tp, decltype(_Tp::value)> + && ::cuda::std::bool_constant<(_Tp() == _Tp::value)>::value + && ::cuda::std::bool_constant<(static_cast(_Tp()) == _Tp::value)>::value; + +// (C++ draft) The conversion from an arithmetic type U to a vectorizable type T is value-preserving if all possible +// values of U can be represented with type T. 
+template +constexpr bool __is_value_preserving_v = + (::cuda::std::is_integral_v<_From> && ::cuda::std::is_integral_v<_To> + && ::cuda::__is_integer_representable_v<_From, _To>) + || (::cuda::is_floating_point_v<_From> && ::cuda::is_floating_point_v<_To> + && ::cuda::std::__fp_is_implicit_conversion_v<_From, _To>) + || (::cuda::std::is_integral_v<_From> && ::cuda::is_floating_point_v<_To> + && ::cuda::std::numeric_limits<_From>::digits <= ::cuda::std::numeric_limits<_To>::digits); + +// [simd.ctor] implicit value constructor +template > +_CCCL_CONCEPT __is_value_ctor_implicit = + ::cuda::std::convertible_to<_Up, _ValueType> + && ((!::cuda::std::is_arithmetic_v<_From> && !__constexpr_wrapper_like<_From>) + || (::cuda::std::is_arithmetic_v<_From> && __is_value_preserving_v<_From, _ValueType>) + || (__constexpr_wrapper_like<_From> + && ::cuda::std::is_arithmetic_v<::cuda::std::remove_cvref_t> + && __is_value_preserving_v<_From, _ValueType>) ); + +// [conv.rank], integer conversion rank for [simd.ctor] p7 + +template +inline constexpr int __integer_conversion_rank = 0; + +template <> +inline constexpr int __integer_conversion_rank = 1; +template <> +inline constexpr int __integer_conversion_rank = 1; +template <> +inline constexpr int __integer_conversion_rank = 1; +template <> +inline constexpr int __integer_conversion_rank = 2; +template <> +inline constexpr int __integer_conversion_rank = 2; +template <> +inline constexpr int __integer_conversion_rank = 3; +template <> +inline constexpr int __integer_conversion_rank = 3; +template <> +inline constexpr int __integer_conversion_rank = 4; +template <> +inline constexpr int __integer_conversion_rank = 4; +template <> +inline constexpr int __integer_conversion_rank = 5; +template <> +inline constexpr int __integer_conversion_rank = 5; +#if _CCCL_HAS_INT128() +template <> +inline constexpr int __integer_conversion_rank<__int128_t> = 6; +template <> +inline constexpr int __integer_conversion_rank<__uint128_t> = 6; +#endif 
// _CCCL_HAS_INT128() + +// [conv.rank], floating-point conversion rank for [simd.ctor] p7 + +template +inline constexpr int __fp_conversion_rank = 0; + +#if _CCCL_HAS_NVFP16() +template <> +inline constexpr int __fp_conversion_rank<__half> = 1; +#endif // _CCCL_HAS_NVFP16() +#if _CCCL_HAS_NVBF16() +template <> +inline constexpr int __fp_conversion_rank<__nv_bfloat16> = 1; +#endif // _CCCL_HAS_NVBF16() +template <> +inline constexpr int __fp_conversion_rank = 2; +template <> +inline constexpr int __fp_conversion_rank = 3; +#if _CCCL_HAS_LONG_DOUBLE() +template <> +inline constexpr int __fp_conversion_rank = 4; +#endif // _CCCL_HAS_LONG_DOUBLE() +#if _CCCL_HAS_FLOAT128() +template <> +inline constexpr int __fp_conversion_rank<__float128> = 5; +#endif // _CCCL_HAS_FLOAT128() + +// [simd.ctor] p7: explicit(see below) for basic_vec(const basic_vec&) +// explicit evaluates to true if either: +// - conversion from U to value_type is not value-preserving, or +// - both U and value_type are integral and integer_conversion_rank(U) > rank(value_type), or +// - both U and value_type are floating-point and fp_conversion_rank(U) > rank(value_type) +template +constexpr bool __is_vec_ctor_explicit = + !__is_value_preserving_v<_Up, _ValueType> + || (::cuda::std::is_integral_v<_Up> && ::cuda::std::is_integral_v<_ValueType> + && __integer_conversion_rank<_Up> > __integer_conversion_rank<_ValueType>) + || (::cuda::is_floating_point_v<_Up> && ::cuda::is_floating_point_v<_ValueType> + && __fp_conversion_rank<_Up> > __fp_conversion_rank<_ValueType>); + +// [simd.unary], operator constraints + +template +_CCCL_CONCEPT __has_pre_increment = _CCCL_REQUIRES_EXPR((_Tp), _Tp& __t)((++__t)); + +template +_CCCL_CONCEPT __has_post_increment = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t++)); + +template +_CCCL_CONCEPT __has_pre_decrement = _CCCL_REQUIRES_EXPR((_Tp), _Tp& __t)((--__t)); + +template +_CCCL_CONCEPT __has_post_decrement = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t--)); + +template 
+_CCCL_CONCEPT __has_negate = _CCCL_REQUIRES_EXPR((_Tp), const _Tp __t)((!__t)); + +template +_CCCL_CONCEPT __has_bitwise_not = _CCCL_REQUIRES_EXPR((_Tp), const _Tp __t)((~__t)); + +template +_CCCL_CONCEPT __has_unary_plus = _CCCL_REQUIRES_EXPR((_Tp), const _Tp __t)((+__t)); + +template +_CCCL_CONCEPT __has_unary_minus = _CCCL_REQUIRES_EXPR((_Tp), const _Tp __t)((-__t)); + +// [simd.binary], binary operator constraints + +template +_CCCL_CONCEPT __has_binary_plus = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a + __b)); + +template +_CCCL_CONCEPT __has_binary_minus = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a - __b)); + +template +_CCCL_CONCEPT __has_multiplies = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a * __b)); + +template +_CCCL_CONCEPT __has_divides = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a / __b)); + +template +_CCCL_CONCEPT __has_modulo = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a % __b)); + +template +_CCCL_CONCEPT __has_bitwise_and = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a & __b)); + +template +_CCCL_CONCEPT __has_bitwise_or = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a | __b)); + +template +_CCCL_CONCEPT __has_bitwise_xor = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a ^ __b)); + +template +_CCCL_CONCEPT __has_shift_left = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a << __b)); + +template +_CCCL_CONCEPT __has_shift_right = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a >> __b)); + +template +_CCCL_CONCEPT __has_shift_left_size = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t << __simd_size_type{})); + +template +_CCCL_CONCEPT __has_shift_right_size = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t >> __simd_size_type{})); + +// [simd.comparison], comparison operator constraints + +template +_CCCL_CONCEPT __has_equal_to = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a == __b)); + +template +_CCCL_CONCEPT __has_not_equal_to = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a != __b)); + +template 
+_CCCL_CONCEPT __has_greater_equal = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a >= __b)); + +template +_CCCL_CONCEPT __has_less_equal = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a <= __b)); + +template +_CCCL_CONCEPT __has_greater = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a > __b)); + +template +_CCCL_CONCEPT __has_less = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a < __b)); +} // namespace cuda::experimental::simd + +#include + +#endif // _CUDAX___SIMD_CONCEPTS_H diff --git a/cudax/include/cuda/experimental/__simd/declaration.h b/cudax/include/cuda/experimental/__simd/declaration.h index 70ec87d2e5d..bc9d2487f8c 100644 --- a/cudax/include/cuda/experimental/__simd/declaration.h +++ b/cudax/include/cuda/experimental/__simd/declaration.h @@ -30,8 +30,6 @@ namespace cuda::experimental::simd { - - template > class basic_vec; @@ -46,6 +44,12 @@ using mask = basic_mask>; // specializations +template +struct __simd_storage; + +template +struct __simd_operations; + template <::cuda::std::size_t _Bytes, typename _Abi> struct __mask_storage; diff --git a/cudax/include/cuda/experimental/__simd/flag.h b/cudax/include/cuda/experimental/__simd/flag.h new file mode 100644 index 00000000000..d2aad6e8ba8 --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/flag.h @@ -0,0 +1,105 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_FLAG_H +#define _CUDAX___SIMD_FLAG_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include + +namespace cuda::experimental::simd +{ +// [simd.expos], exposition-only flag types + +struct __convert_flag +{}; + +struct __aligned_flag +{}; + +template <::cuda::std::size_t _Np> +struct __overaligned_flag +{ + static_assert(::cuda::is_power_of_two(_Np), "Overaligned flag requires a power-of-2 alignment"); +}; + +template +constexpr bool __is_flag_type_v = false; + +template <> +constexpr bool __is_flag_type_v<__convert_flag> = true; + +template <> +constexpr bool __is_flag_type_v<__aligned_flag> = true; + +template <::cuda::std::size_t _Np> +constexpr bool __is_flag_type_v<__overaligned_flag<_Np>> = true; + +// [simd.flags.overview], class template flags + +template +struct flags +{ + static_assert((true && ... && __is_flag_type_v<_Flags>), + "Every flag type must be one of convert_flag, aligned_flag, or overaligned_flag"); + + // [simd.flags.oper], flags operators + template + [[nodiscard]] _CCCL_API friend constexpr flags<_Flags..., _Other...> operator|(flags, flags<_Other...>) noexcept + { + return {}; + } +}; + +// [simd.flags], flag constants + +inline constexpr flags<> flag_default{}; +inline constexpr flags<__convert_flag> flag_convert{}; +inline constexpr flags<__aligned_flag> flag_aligned{}; + +template <::cuda::std::size_t _Np> +constexpr flags<__overaligned_flag<_Np>> flag_overaligned{}; + +template +constexpr bool __has_convert_flag_v = (false || ... || ::cuda::std::is_same_v<_Flags, __convert_flag>); + +template +constexpr bool __has_aligned_flag_v = (false || ... 
|| ::cuda::std::is_same_v<_Flags, __aligned_flag>); + +template +constexpr ::cuda::std::size_t __overaligned_value_v = 0; + +template <::cuda::std::size_t _Np> +constexpr ::cuda::std::size_t __overaligned_value_v<__overaligned_flag<_Np>> = _Np; + +template +constexpr bool __has_overaligned_flag_v = (false || ... || (__overaligned_value_v<_Flags> != 0)); + +template +constexpr ::cuda::std::size_t __overaligned_alignment_v = + (::cuda::std::size_t{0} | ... | __overaligned_value_v<_Flags>); +} // namespace cuda::experimental::simd + +#include + +#endif // _CUDAX___SIMD_FLAG_H diff --git a/cudax/include/cuda/experimental/__simd/specializations/fixed_size_impl.h b/cudax/include/cuda/experimental/__simd/specializations/fixed_size_simple_vec.h similarity index 64% rename from cudax/include/cuda/experimental/__simd/specializations/fixed_size_impl.h rename to cudax/include/cuda/experimental/__simd/specializations/fixed_size_simple_vec.h index 1a2f20831a2..624d373ee8a 100644 --- a/cudax/include/cuda/experimental/__simd/specializations/fixed_size_impl.h +++ b/cudax/include/cuda/experimental/__simd/specializations/fixed_size_simple_vec.h @@ -4,12 +4,12 @@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
// //===----------------------------------------------------------------------===// -#ifndef _CUDAX___SIMD_FIXED_SIZE_IMPL_H -#define _CUDAX___SIMD_FIXED_SIZE_IMPL_H +#ifndef _CUDAX___SIMD_SPECIALIZATIONS_FIXED_SIZE_SIMPLE_VEC_H +#define _CUDAX___SIMD_SPECIALIZATIONS_FIXED_SIZE_SIMPLE_VEC_H #include @@ -22,10 +22,7 @@ #endif // no system header #include -#include -#include #include -#include #include #include @@ -43,35 +40,33 @@ struct __fixed_size_simple }; } // namespace simd_abi +// Element-per-slot simd storage for fixed_size_simple ABI template struct __simd_storage<_Tp, simd_abi::__fixed_size_simple<_Np>> { using value_type = _Tp; _Tp __data[_Np]; - [[nodiscard]] _CCCL_API constexpr _Tp __get([[maybe_unused]] ::cuda::std::size_t __idx) const noexcept + [[nodiscard]] _CCCL_API constexpr _Tp __get(__simd_size_type __idx) const noexcept { - using ::cuda::std::size_t; - _CCCL_ASSERT(::cuda::in_range(__idx, size_t{0}, size_t{_Np}), "Index is out of bounds"); + _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds"); return __data[__idx]; } - _CCCL_API constexpr void __set([[maybe_unused]] ::cuda::std::size_t __idx, _Tp __v) noexcept + _CCCL_API constexpr void __set(__simd_size_type __idx, _Tp __v) noexcept { - using ::cuda::std::size_t; - _CCCL_ASSERT(::cuda::in_range(__idx, size_t{0}, size_t{_Np}), "Index is out of bounds"); + _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds"); __data[__idx] = __v; } }; -// Helper macros to generate repeated fixed-size operations. 
#define _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_STORAGE_TYPE, _NAME, _OP) \ [[nodiscard]] _CCCL_API static constexpr _STORAGE_TYPE _NAME( \ const _STORAGE_TYPE& __lhs, const _STORAGE_TYPE& __rhs) noexcept \ { \ _STORAGE_TYPE __result; \ _CCCL_PRAGMA_UNROLL_FULL() \ - for (int __i = 0; __i < _Np; ++__i) \ + for (__simd_size_type __i = 0; __i < _Np; ++__i) \ { \ __result.__data[__i] = (__lhs.__data[__i] _OP __rhs.__data[__i]); \ } \ @@ -84,22 +79,14 @@ struct __simd_storage<_Tp, simd_abi::__fixed_size_simple<_Np>> { \ _MaskStorage __result; \ _CCCL_PRAGMA_UNROLL_FULL() \ - for (int __i = 0; __i < _Np; ++__i) \ + for (__simd_size_type __i = 0; __i < _Np; ++__i) \ { \ __result.__data[__i] = (__lhs.__data[__i] _OP __rhs.__data[__i]); \ } \ return __result; \ } -#define _CUDAX_SIMD_FIXED_SIZE_BITWISE_OP(_STORAGE_TYPE, _NAME, _OP) \ - _CCCL_TEMPLATE(typename _Up = _Tp) \ - _CCCL_REQUIRES(::cuda::std::is_integral_v<_Up>) \ - _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_STORAGE_TYPE, _NAME, _OP) - -// ********************************************************************************************************************* -// * SIMD Arithmetic Operations -// ********************************************************************************************************************* - +// Simd operations for fixed_size_simple ABI template struct __simd_operations<_Tp, simd_abi::__fixed_size_simple<_Np>> { @@ -110,50 +97,32 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size_simple<_Np>> { _SimdStorage __result; _CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; ++__i) + for (__simd_size_type __i = 0; __i < _Np; ++__i) { __result.__data[__i] = __v; } return __result; } - template + template [[nodiscard]] _CCCL_API static constexpr _SimdStorage - __generate_init(_Generator&& __g, ::cuda::std::index_sequence<_Is...>) + __generate_init(_Generator&& __g, ::cuda::std::integer_sequence<__simd_size_type, _Is...>) { - return 
_SimdStorage{{__g(::cuda::std::integral_constant<::cuda::std::size_t, _Is>())...}}; + return _SimdStorage{{__g(::cuda::std::integral_constant<__simd_size_type, _Is>())...}}; } template [[nodiscard]] _CCCL_API static constexpr _SimdStorage __generate(_Generator&& __g) { - return __generate_init(__g, ::cuda::std::make_index_sequence<_Np>()); - } - - template - _CCCL_API static constexpr void __load(_SimdStorage& __s, const _Up* __mem) noexcept - { - _CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; __i++) - { - __s.__data[__i] = static_cast<_Tp>(__mem[__i]); - } + return __generate_init(__g, ::cuda::std::make_integer_sequence<__simd_size_type, _Np>()); } - template - _CCCL_API static constexpr void __store(const _SimdStorage& __s, _Up* __mem) noexcept - { - _CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; __i++) - { - __mem[__i] = static_cast<_Up>(__s.__data[__i]); - } - } + // Unary operations _CCCL_API static constexpr void __increment(_SimdStorage& __s) noexcept { _CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; __i++) + for (__simd_size_type __i = 0; __i < _Np; ++__i) { __s.__data[__i] += 1; } @@ -162,7 +131,7 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size_simple<_Np>> _CCCL_API static constexpr void __decrement(_SimdStorage& __s) noexcept { _CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; __i++) + for (__simd_size_type __i = 0; __i < _Np; ++__i) { __s.__data[__i] -= 1; } @@ -172,7 +141,7 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size_simple<_Np>> { _MaskStorage __result; _CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; __i++) + for (__simd_size_type __i = 0; __i < _Np; ++__i) { __result.__data[__i] = !__s.__data[__i]; } @@ -183,7 +152,7 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size_simple<_Np>> { _SimdStorage __result; _CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; __i++) + for (__simd_size_type __i = 0; __i < _Np; ++__i) { __result.__data[__i] = ~__s.__data[__i]; } @@ -194,13 
+163,15 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size_simple<_Np>> { _SimdStorage __result; _CCCL_PRAGMA_UNROLL_FULL() - for (int __i = 0; __i < _Np; __i++) + for (__simd_size_type __i = 0; __i < _Np; ++__i) { __result.__data[__i] = -__s.__data[__i]; } return __result; } + // Binary arithmetic operations + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_SimdStorage, __plus, +) _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_SimdStorage, __minus, -) @@ -209,6 +180,10 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size_simple<_Np>> _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_SimdStorage, __divides, /) + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_SimdStorage, __modulo, %) + + // Comparison operations + _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP(__equal_to, ==) _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP(__not_equal_to, !=) @@ -221,24 +196,23 @@ struct __simd_operations<_Tp, simd_abi::__fixed_size_simple<_Np>> _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP(__greater_equal, >=) - _CUDAX_SIMD_FIXED_SIZE_BITWISE_OP(_SimdStorage, __modulo, %) + // Bitwise and shift operations - _CUDAX_SIMD_FIXED_SIZE_BITWISE_OP(_SimdStorage, __bitwise_and, &) + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_SimdStorage, __bitwise_and, &) - _CUDAX_SIMD_FIXED_SIZE_BITWISE_OP(_SimdStorage, __bitwise_or, |) + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_SimdStorage, __bitwise_or, |) - _CUDAX_SIMD_FIXED_SIZE_BITWISE_OP(_SimdStorage, __bitwise_xor, ^) + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_SimdStorage, __bitwise_xor, ^) - _CUDAX_SIMD_FIXED_SIZE_BITWISE_OP(_SimdStorage, __shift_left, <<) + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_SimdStorage, __shift_left, <<) - _CUDAX_SIMD_FIXED_SIZE_BITWISE_OP(_SimdStorage, __shift_right, >>) + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_SimdStorage, __shift_right, >>) }; -#undef _CUDAX_SIMD_FIXED_SIZE_BITWISE_OP #undef _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP #undef _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP } // namespace cuda::experimental::simd #include -#endif // 
_CUDAX___SIMD_FIXED_SIZE_IMPL_H +#endif // _CUDAX___SIMD_SPECIALIZATIONS_FIXED_SIZE_SIMPLE_VEC_H diff --git a/cudax/include/cuda/experimental/__simd/to_remove/concepts.h b/cudax/include/cuda/experimental/__simd/to_remove/concepts.h deleted file mode 100644 index b0b3fca2c06..00000000000 --- a/cudax/include/cuda/experimental/__simd/to_remove/concepts.h +++ /dev/null @@ -1,210 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of CUDA Experimental in CUDA C++ Core Libraries, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#ifndef _CUDAX___SIMD_CONCEPTS_H -#define _CUDAX___SIMD_CONCEPTS_H - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#include -#include -#include -#include -#include - -#include - -#include - -namespace cuda::experimental::simd -{ -template -bool constexpr __is_value_preserving_broadcast_impl() -{ - // TODO - return true; -} - -template -_CCCL_CONCEPT __is_value_preserving_convertible = __is_value_preserving_broadcast_impl<_From, _To>(); - -template -_CCCL_CONCEPT __explicitly_convertible_to = - _CCCL_REQUIRES_EXPR((_To, _From))(requires(static_cast<_To>(::cuda::std::declval<_From>()))); - -template -_CCCL_CONCEPT __constexpr_wrapper_like = - ::cuda::std::convertible_to<_Tp, decltype(_Tp::value)> - && ::cuda::std::equality_comparable_with<_Tp, decltype(_Tp::value)> && (_Tp() == _Tp::value) - && (static_cast(_Tp()) == _Tp::value); - -template -_CCCL_CONCEPT 
__is_simd_ctor_explicit_from_value = - ::cuda::std::convertible_to<_Tp, _ValueType> - && ((!::cuda::std::is_arithmetic_v<_Tp> && !__constexpr_wrapper_like<_Tp>) - || (::cuda::std::is_arithmetic_v<_Tp> && __is_value_preserving_convertible<_Tp, _ValueType>) - || (__constexpr_wrapper_like<_Tp> && ::cuda::std::is_arithmetic_v<::cuda::std::remove_cvref_t<_Tp>> - && __is_value_preserving_convertible<_Tp, _ValueType>); - - -template -inline constexpr int __integer_conversion_rank = 0; - -template <> -inline constexpr int __integer_conversion_rank = 1; -template <> -inline constexpr int __integer_conversion_rank = 1; -template <> -inline constexpr int __integer_conversion_rank = 1; -template <> -inline constexpr int __integer_conversion_rank = 2; -template <> -inline constexpr int __integer_conversion_rank = 2; -template <> -inline constexpr int __integer_conversion_rank = 3; -template <> -inline constexpr int __integer_conversion_rank = 3; -template <> -inline constexpr int __integer_conversion_rank = 4; -template <> -inline constexpr int __integer_conversion_rank = 4; -template <> -inline constexpr int __integer_conversion_rank = 5; -template <> -inline constexpr int __integer_conversion_rank = 5; -#if defined(_CCCL_HAS_INT128) -template <> -inline constexpr int __integer_conversion_rank<__int128_t> = 6; -template <> -inline constexpr int __integer_conversion_rank<__uint128_t> = 6; -#endif // defined(_CCCL_HAS_INT128) - -template -inline constexpr int __fp_conversion_rank = 0; - -#if _CCCL_HAS_NVFP16() -template <> -inline constexpr int __fp_conversion_rank<__half> = 1; -#endif // _CCCL_HAS_NVFP16() -#if _CCCL_HAS_NVBF16() -template <> -inline constexpr int __fp_conversion_rank<__nv_bfloat16> = 1; -#endif // _CCCL_HAS_NVBF16() -template <> -inline constexpr int __fp_conversion_rank = 2; -template <> -inline constexpr int __fp_conversion_rank = 3; -#if _CCCL_HAS_LONG_DOUBLE() -template <> -inline constexpr int __fp_conversion_rank = 4; -#endif // _CCCL_HAS_LONG_DOUBLE() 
-#if _CCCL_HAS_FLOAT128() -template <> -inline constexpr int __fp_conversion_rank<__float128> = 5; -#endif // _CCCL_HAS_FLOAT128() - - -template -_CCCL_CONCEPT __is_simd_ctor_explicit_from_vec = - !__is_value_preserving_convertible<_Tp, _Up> || -(::cuda::std::is_integral_v<_Tp> && ::cuda::std::is_integral_v<_Up> && -__integer_conversion_rank<_Tp> > __integer_conversion_rank<_Up>) || -(::cuda::is_floating_point_v<_Tp> && ::cuda::is_floating_point_v<_Up> && -(__fp_conversion_rank<_Tp> > __fp_conversion_rank<_Up>)); - - - -template -_CCCL_CONCEPT __has_pre_increment = _CCCL_REQUIRES_EXPR((_Tp), _Tp& __t)((++__t)); - -template -_CCCL_CONCEPT __has_post_increment = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t++)); - -template -_CCCL_CONCEPT __has_pre_decrement = _CCCL_REQUIRES_EXPR((_Tp), _Tp& __t)((--__t)); - -template -_CCCL_CONCEPT __has_post_decrement = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t--)); - -template -_CCCL_CONCEPT __has_negate = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((!__t)); - -template -_CCCL_CONCEPT __has_bitwise_not = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((~__t)); - -template -_CCCL_CONCEPT __has_plus = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((+__t)); - -template -_CCCL_CONCEPT __has_unary_minus = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((-__t)); - -template -_CCCL_CONCEPT __has_minus = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t - __t)); - -template -_CCCL_CONCEPT __has_multiplies = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t * __t)); - -template -_CCCL_CONCEPT __has_divides = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t / __t)); - -template -_CCCL_CONCEPT __has_modulo = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t % __t)); - -template -_CCCL_CONCEPT __has_bitwise_and = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t & __t)); - -template -_CCCL_CONCEPT __has_bitwise_or = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t | __t)); - -template -_CCCL_CONCEPT __has_bitwise_xor = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t ^ __t)); - -template -_CCCL_CONCEPT __has_shift_left = _CCCL_REQUIRES_EXPR((_Tp), 
_Tp __t)((__t << __t)); - -template -_CCCL_CONCEPT __has_shift_right = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t >> __t)); - -template -_CCCL_CONCEPT __has_shift_left_size = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t << __simd_size_type{})); - -template -_CCCL_CONCEPT __has_shift_right_size = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t >> __simd_size_type{})); - -template -_CCCL_CONCEPT __has_equal_to = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t == __t)); - -template -_CCCL_CONCEPT __has_not_equal_to = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t != __t)); - -template -_CCCL_CONCEPT __has_greater_equal = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t >= __t)); - -template -_CCCL_CONCEPT __has_less_equal = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t <= __t)); - -template -_CCCL_CONCEPT __has_greater = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t > __t)); - -template -_CCCL_CONCEPT __has_less = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t < __t)); -} // namespace cuda::experimental::simd - -#include - -#endif // _CUDAX___SIMD_CONCEPTS_H diff --git a/cudax/include/cuda/experimental/__simd/utility.h b/cudax/include/cuda/experimental/__simd/utility.h index 98ae8f994ba..cf0029460dc 100644 --- a/cudax/include/cuda/experimental/__simd/utility.h +++ b/cudax/include/cuda/experimental/__simd/utility.h @@ -4,7 +4,7 @@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
// //===----------------------------------------------------------------------===// @@ -24,21 +24,17 @@ #include #include #include -#include -#include #include #include #include -#include +#include #include namespace cuda::experimental::simd { - - -template +template inline constexpr bool __is_non_narrowing_convertible_v = false; template @@ -46,8 +42,6 @@ inline constexpr bool __is_non_narrowing_convertible_v<_From, _To, ::cuda::std::void_t()})>> = true; - - template <::cuda::std::size_t _Bytes> constexpr bool __has_integer_from_v = (_Bytes == 1 || _Bytes == 2 || _Bytes == 4 || _Bytes == 8 @@ -59,27 +53,30 @@ constexpr bool __has_integer_from_v = template constexpr bool __is_abi_tag_v = false; -template -constexpr bool __is_abi_tag_v> = true; +template <__simd_size_type _Np> +constexpr bool __is_abi_tag_v> = true; -template +template constexpr bool __is_well_formed = false; -template +template constexpr bool __is_well_formed<_Tp, _Generator, _Idx, ::cuda::std::void_t()( - ::cuda::std::integral_constant<__simd_size_type, _Idx>()))>> = ::cuda::std:: - is_same_v<_Tp, decltype(::cuda::std::declval<_Generator>()(::cuda::std::integral_constant<__simd_size_type, _Idx>()))>; - -template -_CCCL_API constexpr bool __can_generate(::cuda::std::integer_sequence<__simd_size_type, _Indices...>) + ::cuda::std::integral_constant<__simd_size_type, _Idx>()))>> = + ::cuda::std::is_convertible_v< + decltype(::cuda::std::declval<_Generator>()(::cuda::std::integral_constant<__simd_size_type, _Idx>())), + _Tp>; + +template +[[nodiscard]] +_CCCL_API constexpr bool __can_generate(::cuda::std::integer_sequence<__simd_size_type, _Indices...>) noexcept { return (true && ... 
&& __is_well_formed<_Tp, _Generator, _Indices>); } -template +template constexpr bool __can_generate_v = ::cuda::experimental::simd::__can_generate<_Tp, _Generator>( ::cuda::std::make_integer_sequence<__simd_size_type, _Size>()); } // namespace cuda::experimental::simd From 48de3657aa0c76402f887efce36c5b09a3ae8564 Mon Sep 17 00:00:00 2001 From: fbusato Date: Tue, 31 Mar 2026 11:15:06 -0700 Subject: [PATCH 30/32] reduction implementation --- .../cuda/experimental/__simd/basic_mask.h | 8 + .../cuda/experimental/__simd/basic_vec.h | 1 + .../cuda/experimental/__simd/load_store.h | 0 .../cuda/experimental/__simd/reductions.h | 214 +++++++++++++++++ .../experimental/__simd/to_remove/reference.h | 216 ------------------ .../experimental/__simd/to_remove/traits.h | 201 ---------------- .../cuda/experimental/__simd/utility.h | 23 +- cudax/include/cuda/experimental/simd.cuh | 1 + 8 files changed, 231 insertions(+), 433 deletions(-) create mode 100644 cudax/include/cuda/experimental/__simd/load_store.h create mode 100644 cudax/include/cuda/experimental/__simd/reductions.h delete mode 100644 cudax/include/cuda/experimental/__simd/to_remove/reference.h delete mode 100644 cudax/include/cuda/experimental/__simd/to_remove/traits.h diff --git a/cudax/include/cuda/experimental/__simd/basic_mask.h b/cudax/include/cuda/experimental/__simd/basic_mask.h index 6619a7138cf..83c1b9315a4 100644 --- a/cudax/include/cuda/experimental/__simd/basic_mask.h +++ b/cudax/include/cuda/experimental/__simd/basic_mask.h @@ -157,6 +157,14 @@ class basic_mask : public __mask_operations<_Bytes, _Abi> return {_Impl::__bitwise_not(__s_), __storage_tag}; } + template <::cuda::std::size_t _Bytes> + static constexpr bool __has_integer_from_v = + (_Bytes == 1 || _Bytes == 2 || _Bytes == 4 || _Bytes == 8 +#if _CCCL_HAS_INT128() + || _Bytes == 16 +#endif // _CCCL_HAS_INT128() + ); + _CCCL_TEMPLATE(::cuda::std::size_t _B = _Bytes) _CCCL_REQUIRES(__has_integer_from_v<_B>) [[nodiscard]] _CCCL_API constexpr 
basic_vec<__integer_from<_B>, _Abi> operator+() const noexcept diff --git a/cudax/include/cuda/experimental/__simd/basic_vec.h b/cudax/include/cuda/experimental/__simd/basic_vec.h index dca39bcc332..0bd10dabfdd 100644 --- a/cudax/include/cuda/experimental/__simd/basic_vec.h +++ b/cudax/include/cuda/experimental/__simd/basic_vec.h @@ -539,6 +539,7 @@ basic_vec(_Range&&, _Ts...) -> basic_vec< _CCCL_TEMPLATE(::cuda::std::size_t _Bytes, typename _Abi) _CCCL_REQUIRES(__has_unary_plus>) basic_vec(basic_mask<_Bytes, _Abi>) -> basic_vec<__integer_from<_Bytes>, _Abi>; + } // namespace cuda::experimental::simd #include diff --git a/cudax/include/cuda/experimental/__simd/load_store.h b/cudax/include/cuda/experimental/__simd/load_store.h new file mode 100644 index 00000000000..e69de29bb2d diff --git a/cudax/include/cuda/experimental/__simd/reductions.h b/cudax/include/cuda/experimental/__simd/reductions.h new file mode 100644 index 00000000000..37785464113 --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/reductions.h @@ -0,0 +1,214 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_REDUCTIONS_H +#define _CUDAX___SIMD_REDUCTIONS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace cuda::experimental::simd +{ +// [simd.expos], reduction-binary-operation concept + +template +_CCCL_CONCEPT __reduction_binary_operation = _CCCL_REQUIRES_EXPR( + (_BinaryOp, _Tp), const _BinaryOp __binary_op, const vec<_Tp, 1> __v)(_Same_as(vec<_Tp, 1>) __binary_op(__v, __v)); + +template +constexpr bool __is_reduce_default_supported_operation_v = + ::cuda::std::is_same_v<_BinaryOp, ::cuda::std::plus<>> || ::cuda::std::is_same_v<_BinaryOp, ::cuda::std::multiplies<>> + || ::cuda::std::is_same_v<_BinaryOp, ::cuda::std::bit_and<>> + || ::cuda::std::is_same_v<_BinaryOp, ::cuda::std::bit_or<>> + || ::cuda::std::is_same_v<_BinaryOp, ::cuda::std::bit_xor<>>; + +template +[[nodiscard]] _CCCL_API constexpr _Tp __default_identity_element() noexcept +{ + if constexpr (::cuda::std::is_same_v<_BinaryOp, ::cuda::std::plus<>> + || ::cuda::std::is_same_v<_BinaryOp, ::cuda::std::bit_or<>> + || ::cuda::std::is_same_v<_BinaryOp, ::cuda::std::bit_xor<>>) + { + return _Tp(); + } + else if constexpr (::cuda::std::is_same_v<_BinaryOp, ::cuda::std::multiplies<>>) + { + return _Tp(1); + } + else if constexpr (::cuda::std::is_same_v<_BinaryOp, ::cuda::std::bit_and<>>) + { + return _Tp(~_Tp()); + } + else + { + static_assert(::cuda::std::__always_false_v<_Tp>, + "No default identity element for this BinaryOperation; provide one explicitly"); + return _Tp(); + } +} + +// [simd.reductions], reduce + +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename 
_BinaryOperation = ::cuda::std::plus<>) +_CCCL_REQUIRES(__reduction_binary_operation<_BinaryOperation, _Tp>) +[[nodiscard]] _CCCL_API constexpr _Tp +reduce(const basic_vec<_Tp, _Abi>& __x, _BinaryOperation __binary_op = ::cuda::std::plus<>{}) +{ + vec<_Tp, 1> __result{__x[0]}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 1; __i < __x.size; ++__i) + { + __result = __binary_op(__result, vec<_Tp, 1>{__x[__i]}); + } + return __result[0]; +} + +// We need two overloads: +// 1) An argument for identity_element is provided for the invocation +// 2) unless BinaryOperation is one of plus<>, multiplies<>, bit_and<>, bit_or<>, or bit_xor<> +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _BinaryOperation) +_CCCL_REQUIRES(__reduction_binary_operation<_BinaryOperation, _Tp>) +[[nodiscard]] _CCCL_API constexpr _Tp +reduce(const basic_vec<_Tp, _Abi>& __x, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask, + _BinaryOperation __binary_op, + ::cuda::std::type_identity_t<_Tp> __identity_element) +{ + vec<_Tp, 1> __result{__identity_element}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < __x.size; ++__i) + { + if (__mask[__i]) + { + __result = __binary_op(__result, vec<_Tp, 1>{__x[__i]}); + } + } + return __result[0]; +} + +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _BinaryOperation) +_CCCL_REQUIRES(__reduction_binary_operation<_BinaryOperation, _Tp> _CCCL_AND + __is_reduce_default_supported_operation_v<_BinaryOperation>) +[[nodiscard]] _CCCL_API constexpr _Tp +reduce(const basic_vec<_Tp, _Abi>& __x, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask, + _BinaryOperation __binary_op = ::cuda::std::plus<>{}) +{ + return ::cuda::experimental::simd::reduce( + __x, __mask, __binary_op, ::cuda::experimental::simd::__default_identity_element<_Tp, _BinaryOperation>()); +} + +// [simd.reductions], reduce_min + +_CCCL_TEMPLATE(typename _Tp, typename _Abi) +_CCCL_REQUIRES(::cuda::std::totally_ordered<_Tp>) +[[nodiscard]] _CCCL_API 
constexpr _Tp reduce_min(const basic_vec<_Tp, _Abi>& __x) noexcept +{ + auto __result = __x[0]; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 1; __i < __x.size; ++__i) + { + const auto __val = __x[__i]; + if (!(__result < __val)) + { + __result = __val; + } + } + return __result; +} + +_CCCL_TEMPLATE(typename _Tp, typename _Abi) +_CCCL_REQUIRES(::cuda::std::totally_ordered<_Tp>) +[[nodiscard]] _CCCL_API constexpr _Tp +reduce_min(const basic_vec<_Tp, _Abi>& __x, const typename basic_vec<_Tp, _Abi>::mask_type& __mask) noexcept +{ + auto __result = ::cuda::std::numeric_limits<_Tp>::max(); + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < __x.size; ++__i) + { + if (__mask[__i]) + { + const auto __val = __x[__i]; + if (!(__result < __val)) + { + __result = __val; + } + } + } + return __result; +} + +// [simd.reductions], reduce_max + +_CCCL_TEMPLATE(typename _Tp, typename _Abi) +_CCCL_REQUIRES(::cuda::std::totally_ordered<_Tp>) +[[nodiscard]] _CCCL_API constexpr _Tp reduce_max(const basic_vec<_Tp, _Abi>& __x) noexcept +{ + auto __result = __x[0]; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 1; __i < __x.size; ++__i) + { + const auto __val = __x[__i]; + if (!(__val < __result)) + { + __result = __val; + } + } + return __result; +} + +_CCCL_TEMPLATE(typename _Tp, typename _Abi) +_CCCL_REQUIRES(::cuda::std::totally_ordered<_Tp>) +[[nodiscard]] _CCCL_API constexpr _Tp +reduce_max(const basic_vec<_Tp, _Abi>& __x, const typename basic_vec<_Tp, _Abi>::mask_type& __mask) noexcept +{ + auto __result = ::cuda::std::numeric_limits<_Tp>::lowest(); + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < __x.size; ++__i) + { + if (__mask[__i]) + { + const auto __val = __x[__i]; + if (!(__val < __result)) + { + __result = __val; + } + } + } + return __result; +} + +// NOTE: mask reductions (all_of, any_of, none_of, reduce_count, reduce_min_index, reduce_max_index) +// and their bool scalar overloads are defined in basic_mask.h 
+} // namespace cuda::experimental::simd + +#include + +#endif // _CUDAX___SIMD_REDUCTIONS_H diff --git a/cudax/include/cuda/experimental/__simd/to_remove/reference.h b/cudax/include/cuda/experimental/__simd/to_remove/reference.h deleted file mode 100644 index 6d1f2646fe7..00000000000 --- a/cudax/include/cuda/experimental/__simd/to_remove/reference.h +++ /dev/null @@ -1,216 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of CUDA Experimental in CUDA C++ Core Libraries, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#ifndef _CUDAX___SIMD_REFERENCE_H -#define _CUDAX___SIMD_REFERENCE_H - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#include -#include -#include -#include -#include -#include - -#include - -namespace cuda::experimental::simd -{ -template -class __simd_reference -{ - // P1928R15: basic_vec is the primary SIMD vector type - template - friend class basic_vec; - - // P1928R15: basic_mask is the primary SIMD mask type (indexed by Bytes) - template <::cuda::std::size_t, typename> - friend class basic_mask; - - _Storage& __s_; - ::cuda::std::size_t __idx_; - - _CCCL_API __simd_reference(_Storage& __s, ::cuda::std::size_t __idx) - : __s_{__s} - , __idx_{__idx} - {} - - [[nodiscard]] _CCCL_API constexpr _Vp __get() const noexcept - { - return __s_.__get(__idx_); - } - - _CCCL_API constexpr void __set(_Vp __v) noexcept - { - __s_.__set(__idx_, __v); - } - -public: - using value_type = _Vp; - 
- __simd_reference() = delete; - __simd_reference(const __simd_reference&) = delete; - - _CCCL_API constexpr operator value_type() const noexcept - { - return __get(); - } - - _CCCL_TEMPLATE(typename _Up) - _CCCL_REQUIRES(::cuda::std::is_assignable_v) - _CCCL_API __simd_reference operator=(_Up&& __v) && noexcept - { - __set(static_cast(::cuda::std::forward<_Up>(__v))); - return {__s_, __idx_}; - } - - template - friend _CCCL_API void swap(__simd_reference<_Storage1, _Vp1>&& __a, __simd_reference<_Storage1, _Vp1>&& __b) noexcept; - - template - friend _CCCL_API void swap(_Vp1& __a, __simd_reference<_Storage1, _Vp1>&& __b) noexcept; - - template - friend _CCCL_API void swap(__simd_reference<_Storage1, _Vp1>&& __a, _Vp1& __b) noexcept; - - template () += ::cuda::std::declval<_Up>())> - _CCCL_API __simd_reference operator+=(_Up&& __v) && noexcept - { - __set(__get() + static_cast(::cuda::std::forward<_Up>(__v))); - return {__s_, __idx_}; - } - - template () -= ::cuda::std::declval<_Up>())> - _CCCL_API __simd_reference operator-=(_Up&& __v) && noexcept - { - __set(__get() - static_cast(::cuda::std::forward<_Up>(__v))); - return {__s_, __idx_}; - } - - template () *= ::cuda::std::declval<_Up>())> - _CCCL_API __simd_reference operator*=(_Up&& __v) && noexcept - { - __set(__get() * static_cast(::cuda::std::forward<_Up>(__v))); - return {__s_, __idx_}; - } - - template () /= ::cuda::std::declval<_Up>())> - _CCCL_API __simd_reference operator/=(_Up&& __v) && noexcept - { - __set(__get() / static_cast(::cuda::std::forward<_Up>(__v))); - return {__s_, __idx_}; - } - - template () %= ::cuda::std::declval<_Up>())> - _CCCL_API __simd_reference operator%=(_Up&& __v) && noexcept - { - __set(__get() % static_cast(::cuda::std::forward<_Up>(__v))); - return {__s_, __idx_}; - } - - template () &= ::cuda::std::declval<_Up>())> - _CCCL_API __simd_reference operator&=(_Up&& __v) && noexcept - { - __set(__get() & static_cast(::cuda::std::forward<_Up>(__v))); - return {__s_, __idx_}; - } - 
- template () |= ::cuda::std::declval<_Up>())> - _CCCL_API __simd_reference operator|=(_Up&& __v) && noexcept - { - __set(__get() | static_cast(::cuda::std::forward<_Up>(__v))); - return {__s_, __idx_}; - } - - template () ^= ::cuda::std::declval<_Up>())> - _CCCL_API __simd_reference operator^=(_Up&& __v) && noexcept - { - __set(__get() ^ static_cast(::cuda::std::forward<_Up>(__v))); - return {__s_, __idx_}; - } - - template () <<= ::cuda::std::declval<_Up>())> - _CCCL_API __simd_reference operator<<=(_Up&& __v) && noexcept - { - __set(__get() << static_cast(::cuda::std::forward<_Up>(__v))); - return {__s_, __idx_}; - } - - template () >>= ::cuda::std::declval<_Up>())> - _CCCL_API __simd_reference operator>>=(_Up&& __v) && noexcept - { - __set(__get() >> static_cast(::cuda::std::forward<_Up>(__v))); - return {__s_, __idx_}; - } - - _CCCL_API constexpr __simd_reference operator++() && noexcept - { - __set(__get() + 1); - return {__s_, __idx_}; - } - - _CCCL_API constexpr value_type operator++(int) && noexcept - { - auto __r = __get(); - __set(__get() + 1); - return __r; - } - - _CCCL_API constexpr __simd_reference operator--() && noexcept - { - __set(__get() - 1); - return {__s_, __idx_}; - } - - _CCCL_API constexpr value_type operator--(int) && noexcept - { - auto __r = __get(); - __set(__get() - 1); - return __r; - } -}; - -template -_CCCL_API void swap(__simd_reference<_Storage, _Vp>&& __a, __simd_reference<_Storage, _Vp>&& __b) noexcept -{ - _Vp __tmp(::cuda::std::move(__a)); - ::cuda::std::move(__a) = ::cuda::std::move(__b); - ::cuda::std::move(__b) = ::cuda::std::move(__tmp); -} - -template -_CCCL_API void swap(_Vp& __a, __simd_reference<_Storage, _Vp>&& __b) noexcept -{ - _Vp __tmp(::cuda::std::move(__a)); - __a = ::cuda::std::move(__b); - ::cuda::std::move(__b) = ::cuda::std::move(__tmp); -} - -template -_CCCL_API void swap(__simd_reference<_Storage, _Vp>&& __a, _Vp& __b) noexcept -{ - _Vp __tmp(::cuda::std::move(__a)); - ::cuda::std::move(__a) = 
::cuda::std::move(__b); - __b = ::cuda::std::move(__tmp); -} -} // namespace cuda::experimental::simd - -#include - -#endif // _CUDAX___SIMD_REFERENCE_H diff --git a/cudax/include/cuda/experimental/__simd/to_remove/traits.h b/cudax/include/cuda/experimental/__simd/to_remove/traits.h deleted file mode 100644 index 9690822cc73..00000000000 --- a/cudax/include/cuda/experimental/__simd/to_remove/traits.h +++ /dev/null @@ -1,201 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of CUDA Experimental in CUDA C++ Core Libraries, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#ifndef _CUDAX___SIMD_TRAITS_H -#define _CUDAX___SIMD_TRAITS_H - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#include -#include -#include - -#include - -#include - -namespace cuda::experimental::simd -{ -struct element_aligned_tag -{ - template - [[nodiscard]] _CCCL_API static constexpr _Ptr* __apply(_Ptr* __ptr) noexcept - { - return __ptr; - } -}; - -struct vector_aligned_tag -{ - template - [[nodiscard]] _CCCL_API static constexpr _Ptr* __apply(_Ptr* __ptr) noexcept - { - return __ptr; - } -}; - -template <::cuda::std::size_t _Alignment> -struct overaligned_tag -{ - template - [[nodiscard]] _CCCL_API static constexpr _Ptr* __apply(_Ptr* __ptr) noexcept - { - _CCCL_ASSERT(::cuda::std::is_sufficiently_aligned<_Alignment>(__ptr), - "Pointer does not satisfy overaligned_tag alignment requirement"); - return __ptr; - } -}; - -inline 
constexpr element_aligned_tag element_aligned{}; -inline constexpr vector_aligned_tag vector_aligned{}; - -template <::cuda::std::size_t _Alignment> -inline constexpr overaligned_tag<_Alignment> overaligned{}; - -template -inline constexpr bool is_abi_tag_v = false; - -template -struct is_abi_tag : ::cuda::std::bool_constant> -{}; - -template -inline constexpr bool is_abi_tag_v> = true; - -template -inline constexpr bool is_vec_v = false; - -template -struct is_vec : ::cuda::std::bool_constant> -{}; - -template -inline constexpr bool is_mask_v = false; - -template -struct is_mask : ::cuda::std::bool_constant> -{}; - -template -inline constexpr bool is_simd_flag_type_v = false; - -template -struct is_simd_flag_type : ::cuda::std::bool_constant> -{}; - -template , bool = (__is_vectorizable_v<_Tp> && is_abi_tag_v<_Abi>)> -struct simd_size : ::cuda::std::integral_constant<::cuda::std::size_t, _Abi::__simd_size> -{}; - -template -struct simd_size<_Tp, _Abi, false> -{ - static constexpr ::cuda::std::size_t value = 0; -}; - - -template -inline constexpr bool is_vec_v> = true; - -template <::cuda::std::size_t _Bytes, typename _Abi> -inline constexpr bool is_mask_v> = true; - -template <> -inline constexpr bool is_simd_flag_type_v = true; - -template <> -inline constexpr bool is_simd_flag_type_v = true; - -template <::cuda::std::size_t _Alignment> -inline constexpr bool is_simd_flag_type_v> = true; - -// Memory alignment queries -template -struct memory_alignment; - -// P1928R15: basic_vec memory alignment -template -struct memory_alignment, element_aligned_tag> - : ::cuda::std::integral_constant<::cuda::std::size_t, alignof(_Tp)> -{}; - -template -struct memory_alignment, vector_aligned_tag> - : ::cuda::std::integral_constant<::cuda::std::size_t, alignof(_Tp) * __simd_size_v<_Tp, _Abi>> -{}; - -template -struct memory_alignment, overaligned_tag<_Alignment>> - : ::cuda::std::integral_constant<::cuda::std::size_t, _Alignment> -{}; - -// P1928R15: basic_mask memory alignment 
(indexed by Bytes) -template <::cuda::std::size_t _Bytes, typename _Abi> -struct memory_alignment, element_aligned_tag> - : ::cuda::std::integral_constant<::cuda::std::size_t, alignof(bool)> -{}; - -template <::cuda::std::size_t _Bytes, typename _Abi> -struct memory_alignment, vector_aligned_tag> - : ::cuda::std::integral_constant<::cuda::std::size_t, alignof(bool) * _Abi::__simd_size> -{}; - -template <::cuda::std::size_t _Bytes, typename _Abi, ::cuda::std::size_t _Alignment> -struct memory_alignment, overaligned_tag<_Alignment>> - : ::cuda::std::integral_constant<::cuda::std::size_t, _Alignment> -{}; - -template -inline constexpr ::cuda::std::size_t memory_alignment_v = memory_alignment<_Tp, _Flags>::value; - -// Rebind simd element type -template -struct rebind_simd; - -// P1928R15: rebind for basic_vec -template -struct rebind_simd<_Tp, basic_vec<_Up, _Abi>> -{ - using type = basic_vec<_Tp, _Abi>; -}; - -// P1928R15: rebind for basic_mask (creates mask with sizeof(_Tp) bytes) -template -struct rebind_simd<_Tp, basic_mask<_Bytes, _Abi>> -{ - using type = basic_mask; -}; - -template -using rebind_simd_t = typename rebind_simd<_Tp, _Simd>::type; - -// P1928R15: mask_element_size trait - get the Bytes value from a mask -template -struct mask_element_size; - -template <::cuda::std::size_t _Bytes, typename _Abi> -struct mask_element_size> : ::cuda::std::integral_constant<::cuda::std::size_t, _Bytes> -{}; - -template -inline constexpr ::cuda::std::size_t mask_element_size_v = mask_element_size<_Tp>::value; -} // namespace cuda::experimental::simd - -#include - -#endif // _CUDAX___SIMD_TRAITS_H diff --git a/cudax/include/cuda/experimental/__simd/utility.h b/cudax/include/cuda/experimental/__simd/utility.h index cf0029460dc..9a34a71c31e 100644 --- a/cudax/include/cuda/experimental/__simd/utility.h +++ b/cudax/include/cuda/experimental/__simd/utility.h @@ -21,7 +21,6 @@ # pragma system_header #endif // no system header -#include #include #include #include @@ -34,21 
+33,13 @@ namespace cuda::experimental::simd { -template -inline constexpr bool __is_non_narrowing_convertible_v = false; - -template -inline constexpr bool - __is_non_narrowing_convertible_v<_From, _To, ::cuda::std::void_t()})>> = - true; - -template <::cuda::std::size_t _Bytes> -constexpr bool __has_integer_from_v = - (_Bytes == 1 || _Bytes == 2 || _Bytes == 4 || _Bytes == 8 -#if _CCCL_HAS_INT128() - || _Bytes == 16 -#endif // _CCCL_HAS_INT128() - ); +// template +// inline constexpr bool __is_non_narrowing_convertible_v = false; +// +// template +// inline constexpr bool +// __is_non_narrowing_convertible_v<_From, _To, ::cuda::std::void_t()})>> = +// true; template constexpr bool __is_abi_tag_v = false; diff --git a/cudax/include/cuda/experimental/simd.cuh b/cudax/include/cuda/experimental/simd.cuh index aecb5a96ae1..d51d2294bcf 100644 --- a/cudax/include/cuda/experimental/simd.cuh +++ b/cudax/include/cuda/experimental/simd.cuh @@ -13,5 +13,6 @@ #include #include +#include #endif // __CUDAX_SIMD___ From 6c1bf792ff71943c3b590040b47ba1fcb086dc1a Mon Sep 17 00:00:00 2001 From: fbusato Date: Tue, 31 Mar 2026 14:26:41 -0700 Subject: [PATCH 31/32] load and store --- .../cuda/experimental/__simd/basic_vec.h | 44 +- .../cuda/experimental/__simd/load_store.h | 453 ++++++++++++++++++ .../cuda/experimental/__simd/utility.h | 42 +- cudax/test/simd/simd.cu | 2 +- 4 files changed, 497 insertions(+), 44 deletions(-) diff --git a/cudax/include/cuda/experimental/__simd/basic_vec.h b/cudax/include/cuda/experimental/__simd/basic_vec.h index 0bd10dabfdd..4b8309d5e6e 100644 --- a/cudax/include/cuda/experimental/__simd/basic_vec.h +++ b/cudax/include/cuda/experimental/__simd/basic_vec.h @@ -21,14 +21,12 @@ # pragma system_header #endif // no system header -#include #include #include #include #include #include #include -#include #include #include @@ -37,7 +35,6 @@ #include #include #include -#include #include #include @@ -145,7 +142,7 @@ class basic_vec : public __simd_operations<_Tp, 
_Abi> _Range, _Size, ::cuda::std::void_t>)>> = - (__simd_size_type{::cuda::std::tuple_size_v<::cuda::std::remove_cvref_t<_Range>>} == _Size); + (__static_range_size_v<_Range> == _Size); template static constexpr bool __is_compatible_range_v = @@ -153,25 +150,6 @@ class basic_vec : public __simd_operations<_Tp, _Abi> && __range_static_size_matches_v<_Range, size()> && __is_vectorizable_v<::cuda::std::ranges::range_value_t<_Range>> && __explicitly_convertible_to>; - template - _CCCL_API constexpr static void - __assert_alignment([[maybe_unused]] const ::cuda::std::ranges::range_value_t<_Range>* __data) noexcept - { - _CCCL_IF_NOT_CONSTEVAL_DEFAULT - { - if constexpr (__has_aligned_flag_v<_Flags...>) - { - _CCCL_ASSERT(::cuda::is_aligned(__data, alignment_v>), - "flag_aligned requires data to be aligned to alignment_v>"); - } - else if constexpr (__has_overaligned_flag_v<_Flags...>) - { - _CCCL_ASSERT(::cuda::is_aligned(__data, __overaligned_alignment_v<_Flags...>), - "flag_overaligned requires data to be aligned to N"); - } - } - } - // [simd.ctor] range constructor _CCCL_TEMPLATE(typename _Range, typename... 
_Flags) _CCCL_REQUIRES(__is_compatible_range_v<_Range>) @@ -181,7 +159,8 @@ class basic_vec : public __simd_operations<_Tp, _Abi> || __is_value_preserving_v<::cuda::std::ranges::range_value_t<_Range>, value_type>, "Conversion from range_value_t to value_type is not value-preserving; use flag_convert"); const auto __data = ::cuda::std::ranges::data(__range); - __assert_alignment<_Range, _Flags...>(__data); + ::cuda::experimental::simd:: + __assert_load_store_alignment, _Flags...>(__data); _CCCL_PRAGMA_UNROLL_FULL() for (__simd_size_type __i = 0; __i < size; ++__i) { @@ -198,7 +177,8 @@ class basic_vec : public __simd_operations<_Tp, _Abi> || __is_value_preserving_v<::cuda::std::ranges::range_value_t<_Range>, value_type>, "Conversion from range_value_t to value_type is not value-preserving; use flag_convert"); const auto __data = ::cuda::std::ranges::data(__range); - __assert_alignment<_Range, _Flags...>(__data); + ::cuda::experimental::simd:: + __assert_load_store_alignment, _Flags...>(__data); _CCCL_PRAGMA_UNROLL_FULL() for (__simd_size_type __i = 0; __i < size; ++__i) { @@ -513,10 +493,6 @@ class basic_vec : public __simd_operations<_Tp, _Abi> // const mask_type&, const basic_vec&, const basic_vec&) noexcept; }; -// Proxy for ranges::size(r) is a constant expression -template -_CCCL_CONCEPT __has_static_size = _CCCL_REQUIRES_EXPR((_Range))((__simd_size_type{::cuda::std::tuple_size_v<_Range>})); - // [simd.ctor] deduction guide from contiguous sized range // Deduces vec, static_cast(ranges::size(r))> // * it is not possible to use the alias "vec" for the deduction guide @@ -524,11 +500,10 @@ _CCCL_CONCEPT __has_static_size = _CCCL_REQUIRES_EXPR((_Range))((__simd_size_typ // * where _Np is __simd_size_v<_Tp, tuple_size_v<_Range>> _CCCL_TEMPLATE(typename _Range, typename... 
_Ts) _CCCL_REQUIRES(::cuda::std::ranges::contiguous_range<_Range> _CCCL_AND ::cuda::std::ranges::sized_range<_Range> - _CCCL_AND __has_static_size<::cuda::std::remove_cvref_t<_Range>>) -basic_vec(_Range&&, _Ts...) -> basic_vec< - ::cuda::std::ranges::range_value_t<_Range>, - simd_abi::__deduce_abi_t<::cuda::std::ranges::range_value_t<_Range>, - __simd_size_type{::cuda::std::tuple_size_v<::cuda::std::remove_cvref_t<_Range>>}>>; + _CCCL_AND __has_static_size<_Range>) +basic_vec(_Range&&, _Ts...) + -> basic_vec<::cuda::std::ranges::range_value_t<_Range>, + simd_abi::__deduce_abi_t<::cuda::std::ranges::range_value_t<_Range>, __static_range_size_v<_Range>>>; // [simd.ctor] deduction guide from basic_mask // basic_vec<__integer_from, Abi> is equivalent to decltype(+k): @@ -539,7 +514,6 @@ basic_vec(_Range&&, _Ts...) -> basic_vec< _CCCL_TEMPLATE(::cuda::std::size_t _Bytes, typename _Abi) _CCCL_REQUIRES(__has_unary_plus>) basic_vec(basic_mask<_Bytes, _Abi>) -> basic_vec<__integer_from<_Bytes>, _Abi>; - } // namespace cuda::experimental::simd #include diff --git a/cudax/include/cuda/experimental/__simd/load_store.h b/cudax/include/cuda/experimental/__simd/load_store.h index e69de29bb2d..3113f9250a8 100644 --- a/cudax/include/cuda/experimental/__simd/load_store.h +++ b/cudax/include/cuda/experimental/__simd/load_store.h @@ -0,0 +1,453 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_LOAD_STORE_H +#define _CUDAX___SIMD_LOAD_STORE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +namespace cuda::experimental::simd +{ +// [simd.loadstore] helper: resolves default V template parameter for load functions +// When _Vp = void (default), resolves to basic_vec<_Up>; otherwise uses the explicit _Vp +template +struct __load_vec_type +{ + using type = _Vp; +}; + +template +struct __load_vec_type +{ + using type = basic_vec<_Up>; +}; + +template +using __load_vec_t = typename __load_vec_type<_Vp, _Up>::type; + +// [simd.loadstore] helper: core partial load from pointer + count + mask +template +[[nodiscard]] _CCCL_API constexpr _Result +__partial_load_from_ptr(const _Up* __ptr, __simd_size_type __count, const typename _Result::mask_type& __mask) +{ + using _Tp = typename _Result::value_type; + static_assert(::cuda::std::same_as<::cuda::std::remove_cvref_t<_Result>, _Result>, + "V must not be a reference or cv-qualified type"); + static_assert(__is_vectorizable_v<_Tp> && __is_abi_tag_v, + "V must be an enabled specialization of basic_vec"); + static_assert(__is_vectorizable_v<_Up>, "range_value_t must be a vectorizable type"); + static_assert(__explicitly_convertible_to<_Tp, _Up>, + "range_value_t must satisfy explicitly-convertible-to"); + static_assert(__has_convert_flag_v<_Flags...> || __is_value_preserving_v<_Up, _Tp>, + "Conversion from range_value_t to value_type is not value-preserving; use flag_convert"); + 
::cuda::experimental::simd::__assert_load_store_alignment<_Result, _Up, _Flags...>(__ptr); + _Result __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Result::size; ++__i) + { + if (__mask[__i] && __i < __count) + { + __result[__i] = static_cast<_Tp>(__ptr[__i]); + } + } + return __result; +} + +// [simd.loadstore] helper: core partial store to pointer + count + mask +template +_CCCL_API constexpr void __partial_store_to_ptr( + const basic_vec<_Tp, _Abi>& __v, + _Up* __ptr, + __simd_size_type __count, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask) +{ + static_assert(__is_vectorizable_v<_Up>, "range_value_t must be a vectorizable type"); + static_assert(__explicitly_convertible_to<_Up, _Tp>, + "value_type must satisfy explicitly-convertible-to>"); + static_assert(__has_convert_flag_v<_Flags...> || __is_value_preserving_v<_Tp, _Up>, + "Conversion from value_type to range_value_t is not value-preserving; use flag_convert"); + ::cuda::experimental::simd::__assert_load_store_alignment, _Up, _Flags...>(__ptr); + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < basic_vec<_Tp, _Abi>::size; ++__i) + { + if (__mask[__i] && __i < __count) + { + __ptr[__i] = static_cast<_Up>(__v[__i]); + } + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// [simd.loadstore] partial_load + +// partial_load: range, masked +_CCCL_TEMPLATE(typename _Vp = void, typename _Range, typename... 
_Flags) +_CCCL_REQUIRES(::cuda::std::ranges::contiguous_range<_Range> _CCCL_AND ::cuda::std::ranges::sized_range<_Range>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::ranges::range_value_t<_Range>> partial_load( + _Range&& __r, + const typename __load_vec_t<_Vp, ::cuda::std::ranges::range_value_t<_Range>>::mask_type& __mask, + flags<_Flags...> = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::ranges::range_value_t<_Range>>; + using _Up = ::cuda::std::ranges::range_value_t<_Range>; + return ::cuda::experimental::simd::__partial_load_from_ptr<_Result, _Up, _Flags...>( + ::cuda::std::ranges::data(__r), static_cast<__simd_size_type>(::cuda::std::ranges::size(__r)), __mask); +} + +// partial_load: range, no mask +_CCCL_TEMPLATE(typename _Vp = void, typename _Range, typename... _Flags) +_CCCL_REQUIRES(::cuda::std::ranges::contiguous_range<_Range> _CCCL_AND ::cuda::std::ranges::sized_range<_Range>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::ranges::range_value_t<_Range>> +partial_load(_Range&& __r, flags<_Flags...> __f = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::ranges::range_value_t<_Range>>; + return ::cuda::experimental::simd::partial_load<_Vp>( + ::cuda::std::forward<_Range>(__r), typename _Result::mask_type(true), __f); +} + +// partial_load: iterator + count, masked +_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename... 
_Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>> partial_load( + _Ip __first, + ::cuda::std::iter_difference_t<_Ip> __n, + const typename __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>::mask_type& __mask, + flags<_Flags...> = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>; + using _Up = ::cuda::std::iter_value_t<_Ip>; + return ::cuda::experimental::simd::__partial_load_from_ptr<_Result, _Up, _Flags...>( + ::cuda::std::to_address(__first), static_cast<__simd_size_type>(__n), __mask); +} + +// partial_load: iterator + count, no mask +_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename... _Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>> +partial_load(_Ip __first, ::cuda::std::iter_difference_t<_Ip> __n, flags<_Flags...> __f = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>; + return ::cuda::experimental::simd::partial_load<_Vp>(__first, __n, typename _Result::mask_type(true), __f); +} + +// partial_load: iterator + sentinel, masked +_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename _Sp, typename... 
_Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip> _CCCL_AND ::cuda::std::sized_sentinel_for<_Sp, _Ip>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>> partial_load( + _Ip __first, + _Sp __last, + const typename __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>::mask_type& __mask, + flags<_Flags...> = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>; + using _Up = ::cuda::std::iter_value_t<_Ip>; + return ::cuda::experimental::simd::__partial_load_from_ptr<_Result, _Up, _Flags...>( + ::cuda::std::to_address(__first), static_cast<__simd_size_type>(::cuda::std::distance(__first, __last)), __mask); +} + +// partial_load: iterator + sentinel, no mask +_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename _Sp, typename... _Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip> _CCCL_AND ::cuda::std::sized_sentinel_for<_Sp, _Ip>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>> +partial_load(_Ip __first, _Sp __last, flags<_Flags...> __f = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>; + return ::cuda::experimental::simd::partial_load<_Vp>(__first, __last, typename _Result::mask_type(true), __f); +} + +//---------------------------------------------------------------------------------------------------------------------- +// [simd.loadstore] unchecked_load + +// unchecked_load: range, masked +_CCCL_TEMPLATE(typename _Vp = void, typename _Range, typename... 
_Flags) +_CCCL_REQUIRES(::cuda::std::ranges::contiguous_range<_Range> _CCCL_AND ::cuda::std::ranges::sized_range<_Range>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::ranges::range_value_t<_Range>> unchecked_load( + _Range&& __r, + const typename __load_vec_t<_Vp, ::cuda::std::ranges::range_value_t<_Range>>::mask_type& __mask, + flags<_Flags...> __f = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::ranges::range_value_t<_Range>>; + if constexpr (__has_static_size<_Range>) + { + static_assert(__static_range_size_v<_Range> >= _Result::size(), + "unchecked_load requires ranges::size(r) >= V::size()"); + } + _CCCL_ASSERT(::cuda::std::cmp_greater_equal(::cuda::std::ranges::size(__r), _Result::size()), + "unchecked_load requires ranges::size(r) >= V::size()"); + return ::cuda::experimental::simd::partial_load<_Vp>(::cuda::std::forward<_Range>(__r), __mask, __f); +} + +// unchecked_load: range, no mask +_CCCL_TEMPLATE(typename _Vp = void, typename _Range, typename... _Flags) +_CCCL_REQUIRES(::cuda::std::ranges::contiguous_range<_Range> _CCCL_AND ::cuda::std::ranges::sized_range<_Range>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::ranges::range_value_t<_Range>> +unchecked_load(_Range&& __r, flags<_Flags...> __f = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::ranges::range_value_t<_Range>>; + return ::cuda::experimental::simd::unchecked_load<_Vp>( + ::cuda::std::forward<_Range>(__r), typename _Result::mask_type(true), __f); +} + +// unchecked_load: iterator + count, masked +_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename... 
_Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>> unchecked_load( + _Ip __first, + ::cuda::std::iter_difference_t<_Ip> __n, + const typename __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>::mask_type& __mask, + flags<_Flags...> __f = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>; + _CCCL_ASSERT(::cuda::std::cmp_greater_equal(__n, _Result::size()), "unchecked_load requires n >= V::size()"); + return ::cuda::experimental::simd::partial_load<_Vp>(__first, __n, __mask, __f); +} + +// unchecked_load: iterator + count, no mask (forwards to the masked overload with mask_type(true)) +_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename... _Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>> +unchecked_load(_Ip __first, ::cuda::std::iter_difference_t<_Ip> __n, flags<_Flags...> __f = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>; + return ::cuda::experimental::simd::unchecked_load<_Vp>(__first, __n, typename _Result::mask_type(true), __f); +} + +// unchecked_load: iterator + sentinel, masked (asserts distance(first, last) >= V::size(), then delegates to partial_load) +_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename _Sp, typename... 
_Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip> _CCCL_AND ::cuda::std::sized_sentinel_for<_Sp, _Ip>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>> unchecked_load( + _Ip __first, + _Sp __last, + const typename __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>::mask_type& __mask, + flags<_Flags...> __f = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>; + _CCCL_ASSERT(::cuda::std::cmp_greater_equal(::cuda::std::distance(__first, __last), _Result::size()), + "unchecked_load requires distance(first, last) >= V::size()"); + return ::cuda::experimental::simd::partial_load<_Vp>(__first, __last, __mask, __f); +} + +// unchecked_load: iterator + sentinel, no mask (forwards to the masked overload with mask_type(true)) +_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename _Sp, typename... _Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip> _CCCL_AND ::cuda::std::sized_sentinel_for<_Sp, _Ip>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>> +unchecked_load(_Ip __first, _Sp __last, flags<_Flags...> __f = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>; + return ::cuda::experimental::simd::unchecked_load<_Vp>(__first, __last, typename _Result::mask_type(true), __f); +} + +//---------------------------------------------------------------------------------------------------------------------- +// [simd.loadstore] partial_store + +// partial_store: range, masked (forwards ranges::data(r) and ranges::size(r) to __partial_store_to_ptr) +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Range, typename... 
_Flags) +_CCCL_REQUIRES(::cuda::std::ranges::contiguous_range<_Range> _CCCL_AND ::cuda::std::ranges::sized_range<_Range> + _CCCL_AND __explicitly_convertible_to<::cuda::std::ranges::range_value_t<_Range>, _Tp>) +_CCCL_API constexpr void partial_store( + const basic_vec<_Tp, _Abi>& __v, + _Range&& __r, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask, + flags<_Flags...> = {}) +{ + static_assert( + ::cuda::std::indirectly_writable<::cuda::std::ranges::iterator_t<_Range>, ::cuda::std::ranges::range_value_t<_Range>>, + "ranges::iterator_t must model indirectly_writable>"); + using _Up = ::cuda::std::ranges::range_value_t<_Range>; + ::cuda::experimental::simd::__partial_store_to_ptr<_Tp, _Abi, _Up, _Flags...>( + __v, ::cuda::std::ranges::data(__r), static_cast<__simd_size_type>(::cuda::std::ranges::size(__r)), __mask); +} + +// partial_store: range, no mask (forwards to the masked overload with mask_type(true)) +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Range, typename... _Flags) +_CCCL_REQUIRES(::cuda::std::ranges::contiguous_range<_Range> _CCCL_AND ::cuda::std::ranges::sized_range<_Range> + _CCCL_AND __explicitly_convertible_to<::cuda::std::ranges::range_value_t<_Range>, _Tp>) +_CCCL_API constexpr void partial_store(const basic_vec<_Tp, _Abi>& __v, _Range&& __r, flags<_Flags...> __f = {}) +{ + ::cuda::experimental::simd::partial_store( + __v, ::cuda::std::forward<_Range>(__r), typename basic_vec<_Tp, _Abi>::mask_type(true), __f); +} + +// partial_store: iterator + count, masked (forwards to_address(first) and n to __partial_store_to_ptr) +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename... 
_Flags) +_CCCL_REQUIRES( + ::cuda::std::contiguous_iterator<_Ip> _CCCL_AND __explicitly_convertible_to<::cuda::std::iter_value_t<_Ip>, _Tp>) +_CCCL_API constexpr void partial_store( + const basic_vec<_Tp, _Abi>& __v, + _Ip __first, + ::cuda::std::iter_difference_t<_Ip> __n, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask, + flags<_Flags...> = {}) +{ + static_assert(::cuda::std::indirectly_writable<_Ip, ::cuda::std::iter_value_t<_Ip>>, + "I must model indirectly_writable>"); + using _Up = ::cuda::std::iter_value_t<_Ip>; + ::cuda::experimental::simd::__partial_store_to_ptr<_Tp, _Abi, _Up, _Flags...>( + __v, ::cuda::std::to_address(__first), static_cast<__simd_size_type>(__n), __mask); +} + +// partial_store: iterator + count, no mask (forwards to the masked overload with mask_type(true)) +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename... _Flags) +_CCCL_REQUIRES( + ::cuda::std::contiguous_iterator<_Ip> _CCCL_AND __explicitly_convertible_to<::cuda::std::iter_value_t<_Ip>, _Tp>) +_CCCL_API constexpr void partial_store( + const basic_vec<_Tp, _Abi>& __v, _Ip __first, ::cuda::std::iter_difference_t<_Ip> __n, flags<_Flags...> __f = {}) +{ + ::cuda::experimental::simd::partial_store(__v, __first, __n, typename basic_vec<_Tp, _Abi>::mask_type(true), __f); +} + +// partial_store: iterator + sentinel, masked (forwards to_address(first) and distance(first, last) to __partial_store_to_ptr) +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename _Sp, typename... 
_Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip> _CCCL_AND ::cuda::std::sized_sentinel_for<_Sp, _Ip> _CCCL_AND + __explicitly_convertible_to<::cuda::std::iter_value_t<_Ip>, _Tp>) +_CCCL_API constexpr void partial_store( + const basic_vec<_Tp, _Abi>& __v, + _Ip __first, + _Sp __last, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask, + flags<_Flags...> = {}) +{ + static_assert(::cuda::std::indirectly_writable<_Ip, ::cuda::std::iter_value_t<_Ip>>, + "I must model indirectly_writable>"); + using _Up = ::cuda::std::iter_value_t<_Ip>; + ::cuda::experimental::simd::__partial_store_to_ptr<_Tp, _Abi, _Up, _Flags...>( + __v, + ::cuda::std::to_address(__first), + static_cast<__simd_size_type>(::cuda::std::distance(__first, __last)), + __mask); +} + +// partial_store: iterator + sentinel, no mask (forwards to the masked overload with mask_type(true)) +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename _Sp, typename... _Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip> _CCCL_AND ::cuda::std::sized_sentinel_for<_Sp, _Ip> _CCCL_AND + __explicitly_convertible_to<::cuda::std::iter_value_t<_Ip>, _Tp>) +_CCCL_API constexpr void +partial_store(const basic_vec<_Tp, _Abi>& __v, _Ip __first, _Sp __last, flags<_Flags...> __f = {}) +{ + ::cuda::experimental::simd::partial_store(__v, __first, __last, typename basic_vec<_Tp, _Abi>::mask_type(true), __f); +} + +//---------------------------------------------------------------------------------------------------------------------- +// [simd.loadstore] unchecked_store + +// unchecked_store: range, masked (checks ranges::size(r) >= V::size() statically when the range has a static size and via _CCCL_ASSERT, then delegates to partial_store) +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Range, typename... 
_Flags) +_CCCL_REQUIRES(::cuda::std::ranges::contiguous_range<_Range> _CCCL_AND ::cuda::std::ranges::sized_range<_Range> + _CCCL_AND __explicitly_convertible_to<::cuda::std::ranges::range_value_t<_Range>, _Tp>) +_CCCL_API constexpr void unchecked_store( + const basic_vec<_Tp, _Abi>& __v, + _Range&& __r, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask, + flags<_Flags...> __f = {}) +{ + if constexpr (__has_static_size<_Range>) + { + static_assert(__static_range_size_v<_Range> >= basic_vec<_Tp, _Abi>::size(), + "unchecked_store requires ranges::size(r) >= V::size()"); + } + _CCCL_ASSERT(::cuda::std::cmp_greater_equal(::cuda::std::ranges::size(__r), basic_vec<_Tp, _Abi>::size()), + "unchecked_store requires ranges::size(r) >= V::size()"); + ::cuda::experimental::simd::partial_store(__v, ::cuda::std::forward<_Range>(__r), __mask, __f); +} + +// unchecked_store: range, no mask (forwards to the masked overload with mask_type(true)) +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Range, typename... _Flags) +_CCCL_REQUIRES(::cuda::std::ranges::contiguous_range<_Range> _CCCL_AND ::cuda::std::ranges::sized_range<_Range> + _CCCL_AND __explicitly_convertible_to<::cuda::std::ranges::range_value_t<_Range>, _Tp>) +_CCCL_API constexpr void unchecked_store(const basic_vec<_Tp, _Abi>& __v, _Range&& __r, flags<_Flags...> __f = {}) +{ + ::cuda::experimental::simd::unchecked_store( + __v, ::cuda::std::forward<_Range>(__r), typename basic_vec<_Tp, _Abi>::mask_type(true), __f); +} + +// unchecked_store: iterator + count, masked (asserts n >= V::size(), then delegates to partial_store) +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename... 
_Flags) +_CCCL_REQUIRES( + ::cuda::std::contiguous_iterator<_Ip> _CCCL_AND __explicitly_convertible_to<::cuda::std::iter_value_t<_Ip>, _Tp>) +_CCCL_API constexpr void unchecked_store( + const basic_vec<_Tp, _Abi>& __v, + _Ip __first, + ::cuda::std::iter_difference_t<_Ip> __n, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask, + flags<_Flags...> __f = {}) +{ + _CCCL_ASSERT(::cuda::std::cmp_greater_equal(__n, basic_vec<_Tp, _Abi>::size()), "unchecked_store requires n >= V::size()"); + ::cuda::experimental::simd::partial_store(__v, __first, __n, __mask, __f); +} + +// unchecked_store: iterator + count, no mask (forwards to the masked overload with mask_type(true)) +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename... _Flags) +_CCCL_REQUIRES( + ::cuda::std::contiguous_iterator<_Ip> _CCCL_AND __explicitly_convertible_to<::cuda::std::iter_value_t<_Ip>, _Tp>) +_CCCL_API constexpr void unchecked_store( + const basic_vec<_Tp, _Abi>& __v, _Ip __first, ::cuda::std::iter_difference_t<_Ip> __n, flags<_Flags...> __f = {}) +{ + ::cuda::experimental::simd::unchecked_store(__v, __first, __n, typename basic_vec<_Tp, _Abi>::mask_type(true), __f); +} + +// unchecked_store: iterator + sentinel, masked (asserts distance(first, last) >= V::size(), then delegates to partial_store) +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename _Sp, typename... 
_Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip> _CCCL_AND ::cuda::std::sized_sentinel_for<_Sp, _Ip> _CCCL_AND + __explicitly_convertible_to<::cuda::std::iter_value_t<_Ip>, _Tp>) +_CCCL_API constexpr void unchecked_store( + const basic_vec<_Tp, _Abi>& __v, + _Ip __first, + _Sp __last, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask, + flags<_Flags...> __f = {}) +{ + _CCCL_ASSERT(::cuda::std::cmp_greater_equal(::cuda::std::distance(__first, __last), basic_vec<_Tp, _Abi>::size()), + "unchecked_store requires distance(first, last) >= V::size()"); + ::cuda::experimental::simd::partial_store(__v, __first, __last, __mask, __f); +} + +// unchecked_store: iterator + sentinel, no mask (forwards to the masked overload with mask_type(true)) +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename _Sp, typename... _Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip> _CCCL_AND ::cuda::std::sized_sentinel_for<_Sp, _Ip> _CCCL_AND + __explicitly_convertible_to<::cuda::std::iter_value_t<_Ip>, _Tp>) +_CCCL_API constexpr void +unchecked_store(const basic_vec<_Tp, _Abi>& __v, _Ip __first, _Sp __last, flags<_Flags...> __f = {}) +{ + ::cuda::experimental::simd::unchecked_store(__v, __first, __last, typename basic_vec<_Tp, _Abi>::mask_type(true), __f); +} +} // namespace cuda::experimental::simd + +#include + +#endif // _CUDAX___SIMD_LOAD_STORE_H diff --git a/cudax/include/cuda/experimental/__simd/utility.h b/cudax/include/cuda/experimental/__simd/utility.h index 9a34a71c31e..bf35ccc067e 100644 --- a/cudax/include/cuda/experimental/__simd/utility.h +++ b/cudax/include/cuda/experimental/__simd/utility.h @@ -21,26 +21,24 @@ # pragma system_header #endif // no system header +#include +#include +#include #include #include +#include #include #include #include #include +#include +#include #include namespace cuda::experimental::simd { -// template -// inline constexpr bool __is_non_narrowing_convertible_v = false; -// -// template -// inline constexpr bool -// 
__is_non_narrowing_convertible_v<_From, _To, 
::cuda::std::void_t()})>> = -// true; - template constexpr bool __is_abi_tag_v = false; @@ -70,6 +68,34 @@ _CCCL_API constexpr bool __can_generate(::cuda::std::integer_sequence<__simd_siz template constexpr bool __can_generate_v = ::cuda::experimental::simd::__can_generate<_Tp, _Generator>( ::cuda::std::make_integer_sequence<__simd_size_type, _Size>()); + +// Proxy for "ranges::size(r) is a constant expression": satisfied when tuple_size_v of the cvref-stripped range type can initialize a __simd_size_type +template +_CCCL_CONCEPT __has_static_size = + _CCCL_REQUIRES_EXPR((_Range))((__simd_size_type{::cuda::std::tuple_size_v<::cuda::std::remove_cvref_t<_Range>>})); + +template +constexpr __simd_size_type __static_range_size_v = + __simd_size_type{::cuda::std::tuple_size_v<::cuda::std::remove_cvref_t<_Range>>}; + +// [simd.flags] alignment assertion for load/store pointers: inside _CCCL_IF_NOT_CONSTEVAL_DEFAULT, asserts alignment_v<_Vec, _Up> for the aligned flag, otherwise __overaligned_alignment_v<_Flags...> for the overaligned flag +template +_CCCL_API constexpr void __assert_load_store_alignment([[maybe_unused]] const _Up* __data) noexcept +{ + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + if constexpr (__has_aligned_flag_v<_Flags...>) + { + _CCCL_ASSERT(::cuda::is_aligned(__data, alignment_v<_Vec, _Up>), + "flag_aligned requires data to be aligned to alignment_v>"); + } + else if constexpr (__has_overaligned_flag_v<_Flags...>) + { + _CCCL_ASSERT(::cuda::is_aligned(__data, __overaligned_alignment_v<_Flags...>), + "flag_overaligned requires data to be aligned to N"); + } + } +} } // namespace cuda::experimental::simd #include diff --git a/cudax/test/simd/simd.cu b/cudax/test/simd/simd.cu index 342ef4e5fd3..5b51fdf1fa2 100644 --- a/cudax/test/simd/simd.cu +++ b/cudax/test/simd/simd.cu @@ -127,7 +127,7 @@ C2H_CCCLRT_TEST("simd.construction_and_memory", "[simd][construction]") expect_equal(from_simd, ::cuda::std::array{false, true, true, true}); dp::vec 
assigned = simd_t(linear_index_gen{}); - assigned = generated; + assigned = generated; expect_equal(assigned, array_t{0, 2, 4, 6}); auto incremented = generated; From 8b69b68263ab011e838170c0a07df0c71453fcb2 Mon Sep 17 00:00:00 2001 From: fbusato Date: Tue, 31 Mar 2026 15:07:06 -0700 
Subject: [PATCH 32/32] a few fixes --- .../cuda/experimental/__simd/basic_mask.h | 43 ++++++++++--------- .../cuda/experimental/__simd/basic_vec.h | 27 ++++++------ .../cuda/experimental/__simd/concepts.h | 16 +++++-- .../cuda/experimental/__simd/exposition.h | 10 +++-- .../cuda/experimental/__simd/load_store.h | 2 +- .../cuda/experimental/__simd/reductions.h | 2 + .../cuda/experimental/__simd/type_traits.h | 2 + 7 files changed, 61 insertions(+), 41 deletions(-) diff --git a/cudax/include/cuda/experimental/__simd/basic_mask.h b/cudax/include/cuda/experimental/__simd/basic_mask.h index 83c1b9315a4..ec61a3bf9c4 100644 --- a/cudax/include/cuda/experimental/__simd/basic_mask.h +++ b/cudax/include/cuda/experimental/__simd/basic_mask.h @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -157,45 +158,45 @@ class basic_mask : public __mask_operations<_Bytes, _Abi> return {_Impl::__bitwise_not(__s_), __storage_tag}; } - template <::cuda::std::size_t _Bytes> + template <::cuda::std::size_t _ByteSize> static constexpr bool __has_integer_from_v = - (_Bytes == 1 || _Bytes == 2 || _Bytes == 4 || _Bytes == 8 + (_ByteSize == 1 || _ByteSize == 2 || _ByteSize == 4 || _ByteSize == 8 #if _CCCL_HAS_INT128() - || _Bytes == 16 + || _ByteSize == 16 #endif // _CCCL_HAS_INT128() ); - _CCCL_TEMPLATE(::cuda::std::size_t _B = _Bytes) - _CCCL_REQUIRES(__has_integer_from_v<_B>) - [[nodiscard]] _CCCL_API constexpr basic_vec<__integer_from<_B>, _Abi> operator+() const noexcept + _CCCL_TEMPLATE(::cuda::std::size_t _Bp = _Bytes) + _CCCL_REQUIRES(__has_integer_from_v<_Bp>) + [[nodiscard]] _CCCL_API constexpr basic_vec<__integer_from<_Bp>, _Abi> operator+() const noexcept { - return static_cast, _Abi>>(*this); + return static_cast, _Abi>>(*this); } - _CCCL_TEMPLATE(::cuda::std::size_t _B = _Bytes) - _CCCL_REQUIRES((!__has_integer_from_v<_B>) ) + _CCCL_TEMPLATE(::cuda::std::size_t _Bp = _Bytes) + _CCCL_REQUIRES((!__has_integer_from_v<_Bp>) ) _CCCL_API void operator+() const 
noexcept = delete; - _CCCL_TEMPLATE(::cuda::std::size_t _B = _Bytes) - _CCCL_REQUIRES(__has_integer_from_v<_B>) - [[nodiscard]] _CCCL_API constexpr basic_vec<__integer_from<_B>, _Abi> operator-() const noexcept + _CCCL_TEMPLATE(::cuda::std::size_t _Bp = _Bytes) + _CCCL_REQUIRES(__has_integer_from_v<_Bp>) + [[nodiscard]] _CCCL_API constexpr basic_vec<__integer_from<_Bp>, _Abi> operator-() const noexcept { - return -static_cast, _Abi>>(*this); + return -static_cast, _Abi>>(*this); } - _CCCL_TEMPLATE(::cuda::std::size_t _B = _Bytes) - _CCCL_REQUIRES((!__has_integer_from_v<_B>) ) + _CCCL_TEMPLATE(::cuda::std::size_t _Bp = _Bytes) + _CCCL_REQUIRES((!__has_integer_from_v<_Bp>) ) _CCCL_API void operator-() const noexcept = delete; - _CCCL_TEMPLATE(::cuda::std::size_t _B = _Bytes) - _CCCL_REQUIRES(__has_integer_from_v<_B>) - [[nodiscard]] _CCCL_API constexpr basic_vec<__integer_from<_B>, _Abi> operator~() const noexcept + _CCCL_TEMPLATE(::cuda::std::size_t _Bp = _Bytes) + _CCCL_REQUIRES(__has_integer_from_v<_Bp>) + [[nodiscard]] _CCCL_API constexpr basic_vec<__integer_from<_Bp>, _Abi> operator~() const noexcept { - return ~static_cast, _Abi>>(*this); + return ~static_cast, _Abi>>(*this); } - _CCCL_TEMPLATE(::cuda::std::size_t _B = _Bytes) - _CCCL_REQUIRES((!__has_integer_from_v<_B>) ) + _CCCL_TEMPLATE(::cuda::std::size_t _Bp = _Bytes) + _CCCL_REQUIRES((!__has_integer_from_v<_Bp>) ) _CCCL_API void operator~() const noexcept = delete; // [simd.mask.conv], basic_mask conversions diff --git a/cudax/include/cuda/experimental/__simd/basic_vec.h b/cudax/include/cuda/experimental/__simd/basic_vec.h index 4b8309d5e6e..dd241191aec 100644 --- a/cudax/include/cuda/experimental/__simd/basic_vec.h +++ b/cudax/include/cuda/experimental/__simd/basic_vec.h @@ -69,6 +69,12 @@ class basic_vec : public __simd_operations<_Tp, _Abi> using mask_type = basic_mask; using abi_type = _Abi; + // operator[] is const only. 
We need this function to set values + _CCCL_API constexpr void __set(__simd_size_type __i, value_type __v) noexcept + { + __s_.__set(__i, __v); + } + // TODO(fbusato): add simd-iterator // using iterator = simd-iterator; // using const_iterator = simd-iterator; @@ -134,20 +140,17 @@ class basic_vec : public __simd_operations<_Tp, _Abi> // [simd.ctor] range constructor - template - static constexpr bool __range_static_size_matches_v = false; - - template - static constexpr bool __range_static_size_matches_v< - _Range, - _Size, - ::cuda::std::void_t>)>> = - (__static_range_size_v<_Range> == _Size); + template + static constexpr bool __is_compatible_range_v = false; template - static constexpr bool __is_compatible_range_v = + static constexpr bool __is_compatible_range_v< + _Range, + ::cuda::std::void_t>::value), + ::cuda::std::ranges::range_value_t<_Range>>> = ::cuda::std::ranges::contiguous_range<_Range> && ::cuda::std::ranges::sized_range<_Range> - && __range_static_size_matches_v<_Range, size()> && __is_vectorizable_v<::cuda::std::ranges::range_value_t<_Range>> + && (__simd_size_type{::cuda::std::tuple_size<::cuda::std::remove_cvref_t<_Range>>::value} == size()) + && __is_vectorizable_v<::cuda::std::ranges::range_value_t<_Range>> && __explicitly_convertible_to>; // [simd.ctor] range constructor @@ -509,7 +512,7 @@ basic_vec(_Range&&, _Ts...) // basic_vec<__integer_from, Abi> is equivalent to decltype(+k): // * k has type basic_mask<_Bytes, _Abi> // * +k calls basic_mask::operator+() -// * the return type is basic_vec<__integer_from<_B>, _Abi> +// * the return type is basic_vec<__integer_from<_Bp>, _Abi> // The deduced type is equivalent to decltype(+k), i.e. 
basic_vec<__integer_from, Abi> _CCCL_TEMPLATE(::cuda::std::size_t _Bytes, typename _Abi) _CCCL_REQUIRES(__has_unary_plus>) diff --git a/cudax/include/cuda/experimental/__simd/concepts.h b/cudax/include/cuda/experimental/__simd/concepts.h index b36316dc827..c479ca2399a 100644 --- a/cudax/include/cuda/experimental/__simd/concepts.h +++ b/cudax/include/cuda/experimental/__simd/concepts.h @@ -31,6 +31,8 @@ #include #include #include +#include +#include #include @@ -42,7 +44,7 @@ namespace cuda::experimental::simd template _CCCL_CONCEPT __explicitly_convertible_to = - _CCCL_REQUIRES_EXPR((_To, _From))(requires(static_cast<_To>(::cuda::std::declval<_From>()))); + _CCCL_REQUIRES_EXPR((_To, _From))((static_cast<_To>(::cuda::std::declval<_From>()))); // [simd.expos], constexpr-wrapper-like concept @@ -64,15 +66,21 @@ constexpr bool __is_value_preserving_v = || (::cuda::std::is_integral_v<_From> && ::cuda::is_floating_point_v<_To> && ::cuda::std::numeric_limits<_From>::digits <= ::cuda::std::numeric_limits<_To>::digits); +template +constexpr bool __is_constexpr_wrapper_value_preserving_v = false; + +template +constexpr bool __is_constexpr_wrapper_value_preserving_v<_From, _ValueType, ::cuda::std::void_t> = + ::cuda::std::is_arithmetic_v<::cuda::std::remove_cvref_t> + && __is_value_preserving_v<::cuda::std::remove_cvref_t, _ValueType>; + // [simd.ctor] implicit value constructor template > _CCCL_CONCEPT __is_value_ctor_implicit = ::cuda::std::convertible_to<_Up, _ValueType> && ((!::cuda::std::is_arithmetic_v<_From> && !__constexpr_wrapper_like<_From>) || (::cuda::std::is_arithmetic_v<_From> && __is_value_preserving_v<_From, _ValueType>) - || (__constexpr_wrapper_like<_From> - && ::cuda::std::is_arithmetic_v<::cuda::std::remove_cvref_t> - && __is_value_preserving_v<_From, _ValueType>) ); + || (__constexpr_wrapper_like<_From> && __is_constexpr_wrapper_value_preserving_v<_From, _ValueType>) ); // [conv.rank], integer conversion rank for [simd.ctor] p7 diff --git 
a/cudax/include/cuda/experimental/__simd/exposition.h b/cudax/include/cuda/experimental/__simd/exposition.h index 816b4d54f4c..ae2eeba4857 100644 --- a/cudax/include/cuda/experimental/__simd/exposition.h +++ b/cudax/include/cuda/experimental/__simd/exposition.h @@ -21,9 +21,10 @@ # pragma system_header #endif // no system header +#include #include -#include #include +#include #include #include #include @@ -39,10 +40,13 @@ namespace cuda::experimental::simd template <::cuda::std::size_t _Bytes> using __integer_from = ::cuda::std::__make_nbit_int_t<_Bytes * 8, true>; +// Vectorizable types: all standard integer types, character types, and the types float and double ([basic.fundamental]); +// std::float16_t, std::float32_t, and std::float64_t if defined ([basic.extended.fp]); and +// TODO(fbusato): complex<T> where T is a vectorizable floating-point type. template constexpr bool __is_vectorizable_v = - ::cuda::std::is_arithmetic_v<_Tp> && !::cuda::std::is_const_v<_Tp> && !::cuda::std::is_volatile_v<_Tp> - && !::cuda::std::is_same_v<_Tp, bool>; + (::cuda::std::is_integral_v<_Tp> || ::cuda::is_floating_point_v<_Tp>) + && !::cuda::std::is_same_v<_Tp, bool> && !::cuda::std::is_const_v<_Tp> && !::cuda::std::is_volatile_v<_Tp>; template constexpr __simd_size_type __simd_size_v = 0; diff --git a/cudax/include/cuda/experimental/__simd/load_store.h b/cudax/include/cuda/experimental/__simd/load_store.h index 3113f9250a8..10cc75d6b6e 100644 --- a/cudax/include/cuda/experimental/__simd/load_store.h +++ b/cudax/include/cuda/experimental/__simd/load_store.h @@ -83,7 +83,7 @@ __partial_load_from_ptr(const _Up* __ptr, __simd_size_type __count, const typena { if (__mask[__i] && __i < __count) { - __result[__i] = static_cast<_Tp>(__ptr[__i]); + __result.__set(__i, static_cast<_Tp>(__ptr[__i])); } } return __result; diff --git a/cudax/include/cuda/experimental/__simd/reductions.h b/cudax/include/cuda/experimental/__simd/reductions.h index 37785464113..36bff50ab1a 100644 --- 
a/cudax/include/cuda/experimental/__simd/reductions.h +++ b/cudax/include/cuda/experimental/__simd/reductions.h @@ -131,6 +131,7 @@ _CCCL_TEMPLATE(typename _Tp, typename _Abi) _CCCL_REQUIRES(::cuda::std::totally_ordered<_Tp>) [[nodiscard]] _CCCL_API constexpr _Tp reduce_min(const basic_vec<_Tp, _Abi>& __x) noexcept { + static_assert(__x.size > 0, "Vector is empty"); auto __result = __x[0]; _CCCL_PRAGMA_UNROLL_FULL() for (__simd_size_type __i = 1; __i < __x.size; ++__i) @@ -171,6 +172,7 @@ _CCCL_TEMPLATE(typename _Tp, typename _Abi) _CCCL_REQUIRES(::cuda::std::totally_ordered<_Tp>) [[nodiscard]] _CCCL_API constexpr _Tp reduce_max(const basic_vec<_Tp, _Abi>& __x) noexcept { + static_assert(__x.size > 0, "Vector is empty"); auto __result = __x[0]; _CCCL_PRAGMA_UNROLL_FULL() for (__simd_size_type __i = 1; __i < __x.size; ++__i) diff --git a/cudax/include/cuda/experimental/__simd/type_traits.h b/cudax/include/cuda/experimental/__simd/type_traits.h index 3b1765c2697..9815420b1ac 100644 --- a/cudax/include/cuda/experimental/__simd/type_traits.h +++ b/cudax/include/cuda/experimental/__simd/type_traits.h @@ -21,6 +21,7 @@ # pragma system_header #endif // no system header +#include #include #include @@ -40,6 +41,7 @@ template struct alignment, _Up> : ::cuda::std::integral_constant<::cuda::std::size_t, alignof(_Up) * __simd_size_v<_Tp, _Abi>> { + static_assert(::cuda::__is_valid_alignment(alignof(_Up) * __simd_size_v<_Tp, _Abi>), "Alignment is not valid"); static_assert(__is_vectorizable_v<_Up>, "U must be a vectorizable type"); };