diff --git a/cudax/include/cuda/experimental/__simd/abi.h b/cudax/include/cuda/experimental/__simd/abi.h new file mode 100644 index 00000000000..687c89c9db6 --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/abi.h @@ -0,0 +1,51 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_ABI_H +#define _CUDAX___SIMD_ABI_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include + +namespace cuda::experimental::simd +{ +using __simd_size_type = ::cuda::std::ptrdiff_t; + +// [simd.expos.abi], simd ABI tags +namespace simd_abi +{ +template <__simd_size_type _Np> +struct __fixed_size_simple; // internal ABI tag + +template <__simd_size_type _Np> +using fixed_size_simple = __fixed_size_simple<_Np>; // implementation-defined ABI + +template +using native = fixed_size_simple<1>; // implementation-defined ABI + +template +using __deduce_abi_t = fixed_size_simple<_Np>; // exposition-only +} // namespace simd_abi +} // namespace cuda::experimental::simd + +#include + +#endif // _CUDAX___SIMD_ABI_H diff --git a/cudax/include/cuda/experimental/__simd/basic_mask.h b/cudax/include/cuda/experimental/__simd/basic_mask.h new file mode 100644 index 00000000000..ec61a3bf9c4 --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/basic_mask.h @@ -0,0 +1,434 @@ 
+//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_BASIC_MASK_H +#define _CUDAX___SIMD_BASIC_MASK_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +namespace cuda::experimental::simd +{ +// [simd.mask.class], class template basic_mask +template <::cuda::std::size_t _Bytes, typename _Abi> +class basic_mask : public __mask_operations<_Bytes, _Abi> +{ + static_assert(__is_abi_tag_v<_Abi>, "basic_mask requires a valid ABI tag"); + + template + friend class basic_vec; + + using _Impl = __mask_operations<_Bytes, _Abi>; + using _Storage = typename _Impl::_MaskStorage; + + _Storage __s_; + + struct __storage_tag_t + {}; + static constexpr __storage_tag_t __storage_tag{}; + + _CCCL_API constexpr basic_mask(_Storage __v, __storage_tag_t) noexcept + : __s_{__v} + {} + +public: + using value_type = bool; + using abi_type = _Abi; + + // TODO(fbusato): add simd-iterator + // using iterator = simd-iterator; + // using const_iterator = simd-iterator; + + // constexpr iterator begin() noexcept { return {*this, 0}; } + // constexpr const_iterator begin() const noexcept { return {*this, 0}; } + // constexpr const_iterator cbegin() const noexcept 
{ return {*this, 0}; } + // constexpr default_sentinel_t end() const noexcept { return {}; } + // constexpr default_sentinel_t cend() const noexcept { return {}; } + + static constexpr ::cuda::std::integral_constant<__simd_size_type, __simd_size_v<__integer_from<_Bytes>, _Abi>> size{}; + + static constexpr auto __usize = ::cuda::std::size_t{size}; + + _CCCL_HIDE_FROM_ABI basic_mask() noexcept = default; + + // [simd.mask.ctor], basic_mask constructors + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::same_as<_Up, value_type>) + _CCCL_API constexpr explicit basic_mask(_Up __v) noexcept + : __s_{_Impl::__broadcast(__v)} + {} + + _CCCL_TEMPLATE(::cuda::std::size_t _UBytes, typename _UAbi) + _CCCL_REQUIRES((__simd_size_v<__integer_from<_UBytes>, _UAbi> == size())) + _CCCL_API constexpr explicit basic_mask(const basic_mask<_UBytes, _UAbi>& __x) noexcept + { + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < size; ++__i) + { + __s_.__set(__i, __x[__i]); + } + } + + _CCCL_TEMPLATE(typename _Generator) + _CCCL_REQUIRES(__can_generate_v) + _CCCL_API constexpr explicit basic_mask(_Generator&& __g) + : __s_{_Impl::__generate(__g)} + {} + + _CCCL_TEMPLATE(typename _Tp) + _CCCL_REQUIRES(::cuda::std::same_as<_Tp, ::cuda::std::bitset<__usize>>) + _CCCL_API constexpr basic_mask(const _Tp& __b) noexcept + : __s_{_Impl::__broadcast(false)} + { + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < size; ++__i) + { + __s_.__set(__i, static_cast(__b[__i])); + } + } + + _CCCL_TEMPLATE(typename _Tp) + _CCCL_REQUIRES((::cuda::std::__cccl_is_unsigned_integer_v<_Tp> && !::cuda::std::same_as<_Tp, value_type>) ) + _CCCL_API constexpr explicit basic_mask(_Tp __val) noexcept + : __s_{_Impl::__broadcast(false)} + { + constexpr auto __num_bits = __simd_size_type{::cuda::std::__num_bits_v<_Tp>}; + constexpr auto __size_as_int = size(); + constexpr auto __m = __size_as_int < __num_bits ? 
__size_as_int : __num_bits; + using __uint8_array_t = ::cuda::std::array<::cuda::std::uint8_t, sizeof(_Tp)>; + const auto __val1 = ::cuda::std::bit_cast<__uint8_array_t>(__val); + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < __m; ++__i) + { + const auto __byte = __val1[__i / CHAR_BIT]; + __s_.__set(__i, static_cast((__byte >> (__i % CHAR_BIT)) & _Tp{1})); + } + } + + // [simd.mask.subscr], basic_mask subscript operators + + [[nodiscard]] _CCCL_API constexpr value_type operator[](__simd_size_type __i) const noexcept + { + _CCCL_ASSERT(::cuda::in_range(__i, __simd_size_type{0}, __simd_size_type{size}), "Index is out of bounds"); + return static_cast(__s_.__get(__i)); + } + + // TODO(fbusato): subscript with integral indices, requires permute() + // template + // constexpr resize_t operator[](const I& indices) const; + + // [simd.mask.unary], basic_mask unary operators + + [[nodiscard]] _CCCL_API constexpr basic_mask operator!() const noexcept + { + return {_Impl::__bitwise_not(__s_), __storage_tag}; + } + + template <::cuda::std::size_t _ByteSize> + static constexpr bool __has_integer_from_v = + (_ByteSize == 1 || _ByteSize == 2 || _ByteSize == 4 || _ByteSize == 8 +#if _CCCL_HAS_INT128() + || _ByteSize == 16 +#endif // _CCCL_HAS_INT128() + ); + + _CCCL_TEMPLATE(::cuda::std::size_t _Bp = _Bytes) + _CCCL_REQUIRES(__has_integer_from_v<_Bp>) + [[nodiscard]] _CCCL_API constexpr basic_vec<__integer_from<_Bp>, _Abi> operator+() const noexcept + { + return static_cast, _Abi>>(*this); + } + + _CCCL_TEMPLATE(::cuda::std::size_t _Bp = _Bytes) + _CCCL_REQUIRES((!__has_integer_from_v<_Bp>) ) + _CCCL_API void operator+() const noexcept = delete; + + _CCCL_TEMPLATE(::cuda::std::size_t _Bp = _Bytes) + _CCCL_REQUIRES(__has_integer_from_v<_Bp>) + [[nodiscard]] _CCCL_API constexpr basic_vec<__integer_from<_Bp>, _Abi> operator-() const noexcept + { + return -static_cast, _Abi>>(*this); + } + + _CCCL_TEMPLATE(::cuda::std::size_t _Bp = _Bytes) + 
_CCCL_REQUIRES((!__has_integer_from_v<_Bp>) ) + _CCCL_API void operator-() const noexcept = delete; + + _CCCL_TEMPLATE(::cuda::std::size_t _Bp = _Bytes) + _CCCL_REQUIRES(__has_integer_from_v<_Bp>) + [[nodiscard]] _CCCL_API constexpr basic_vec<__integer_from<_Bp>, _Abi> operator~() const noexcept + { + return ~static_cast, _Abi>>(*this); + } + + _CCCL_TEMPLATE(::cuda::std::size_t _Bp = _Bytes) + _CCCL_REQUIRES((!__has_integer_from_v<_Bp>) ) + _CCCL_API void operator~() const noexcept = delete; + + // [simd.mask.conv], basic_mask conversions + + _CCCL_TEMPLATE(typename _Up, typename _Ap) + _CCCL_REQUIRES((sizeof(_Up) != _Bytes && __simd_size_v<_Up, _Ap> == size())) + _CCCL_API constexpr explicit operator basic_vec<_Up, _Ap>() const noexcept + { + basic_vec<_Up, _Ap> __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < size; ++__i) + { + __result.__s_.__set(__i, static_cast<_Up>((*this)[__i])); + } + return __result; + } + + _CCCL_TEMPLATE(typename _Up, typename _Ap) + _CCCL_REQUIRES((sizeof(_Up) == _Bytes && __simd_size_v<_Up, _Ap> == size())) + _CCCL_API constexpr operator basic_vec<_Up, _Ap>() const noexcept + { + basic_vec<_Up, _Ap> __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < size; ++__i) + { + __result.__s_.__set(__i, static_cast<_Up>((*this)[__i])); + } + return __result; + } + + [[nodiscard]] _CCCL_API constexpr ::cuda::std::bitset<__usize> to_bitset() const noexcept + { + ::cuda::std::bitset<__usize> __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < size; ++__i) + { + __result.set(__i, (*this)[__i]); + } + return __result; + } + + [[nodiscard]] _CCCL_API constexpr unsigned long long to_ullong() const + { + constexpr __simd_size_type __nbits = ::cuda::std::__num_bits_v; + if constexpr (size > __nbits) + { + for (auto __i = __nbits; __i < size; ++__i) + { + _CCCL_ASSERT(!(*this)[__i], "Bit above unsigned long long width is set"); + } + } + return to_bitset().to_ullong(); + } + 
+ // [simd.mask.binary], basic_mask binary operators + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator&&(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return {_Impl::__logic_and(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator||(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return {_Impl::__logic_or(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator&(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return {_Impl::__bitwise_and(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator|(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return {_Impl::__bitwise_or(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator^(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return {_Impl::__bitwise_xor(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + // [simd.mask.cassign], basic_mask compound assignment + + _CCCL_API friend constexpr basic_mask& operator&=(basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return __lhs = __lhs & __rhs; + } + + _CCCL_API friend constexpr basic_mask& operator|=(basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return __lhs = __lhs | __rhs; + } + + _CCCL_API friend constexpr basic_mask& operator^=(basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return __lhs = __lhs ^ __rhs; + } + + // [simd.mask.comparison], basic_mask comparisons (element-wise) + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator==(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return !(__lhs ^ __rhs); + } + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator!=(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return __lhs ^ __rhs; + } + + [[nodiscard]] 
_CCCL_API friend constexpr basic_mask + operator>=(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return __lhs || !__rhs; + } + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator<=(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return !__lhs || __rhs; + } + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator>(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return __lhs && !__rhs; + } + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator<(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return !__lhs && __rhs; + } + + // [simd.mask.reductions], reductions + + [[nodiscard]] _CCCL_API friend constexpr bool all_of(const basic_mask& __k) noexcept + { + return _Impl::__all(__k.__s_); + } + + [[nodiscard]] _CCCL_API friend constexpr bool any_of(const basic_mask& __k) noexcept + { + return _Impl::__any(__k.__s_); + } + + [[nodiscard]] _CCCL_API friend constexpr bool none_of(const basic_mask& __k) noexcept + { + return !any_of(__k); + } + + [[nodiscard]] _CCCL_API friend constexpr __simd_size_type reduce_count(const basic_mask& __k) noexcept + { + return _Impl::__count(__k.__s_); + } + + [[nodiscard]] _CCCL_API friend constexpr __simd_size_type reduce_min_index(const basic_mask& __k) + { + _CCCL_ASSERT(any_of(__k), "No bits are set"); + return _Impl::__min_index(__k.__s_); + } + + [[nodiscard]] _CCCL_API friend constexpr __simd_size_type reduce_max_index(const basic_mask& __k) + { + _CCCL_ASSERT(any_of(__k), "No bits are set"); + return _Impl::__max_index(__k.__s_); + } + + // TODO(fbusato): [simd.mask.cond], basic_mask exposition only conditional operators + // friend constexpr basic_mask __simd_select_impl( + // const basic_mask&, const basic_mask&, const basic_mask&) noexcept; + // friend constexpr basic_mask __simd_select_impl( + // const basic_mask&, same_as auto, same_as auto) noexcept; + // template + // friend constexpr vec __simd_select_impl( + // const 
basic_mask&, const T0&, const T1&) noexcept; +}; + +// Scalar bool overloads + +_CCCL_TEMPLATE(typename _Tp) +_CCCL_REQUIRES(::cuda::std::same_as<_Tp, bool>) +[[nodiscard]] _CCCL_API constexpr bool all_of(_Tp __x) noexcept +{ + return __x; +} + +_CCCL_TEMPLATE(typename _Tp) +_CCCL_REQUIRES(::cuda::std::same_as<_Tp, bool>) +[[nodiscard]] _CCCL_API constexpr bool any_of(_Tp __x) noexcept +{ + return __x; +} + +_CCCL_TEMPLATE(typename _Tp) +_CCCL_REQUIRES(::cuda::std::same_as<_Tp, bool>) +[[nodiscard]] _CCCL_API constexpr bool none_of(_Tp __x) noexcept +{ + return !__x; +} + +_CCCL_TEMPLATE(typename _Tp) +_CCCL_REQUIRES(::cuda::std::same_as<_Tp, bool>) +[[nodiscard]] _CCCL_API constexpr __simd_size_type reduce_count(_Tp __x) noexcept +{ + return __x; +} + +_CCCL_TEMPLATE(typename _Tp) +_CCCL_REQUIRES(::cuda::std::same_as<_Tp, bool>) +[[nodiscard]] _CCCL_API constexpr __simd_size_type reduce_min_index(_Tp __x) +{ + _CCCL_ASSERT(__x, "No bits are set"); + return 0; +} + +_CCCL_TEMPLATE(typename _Tp) +_CCCL_REQUIRES(::cuda::std::same_as<_Tp, bool>) +[[nodiscard]] _CCCL_API constexpr __simd_size_type reduce_max_index(_Tp __x) +{ + _CCCL_ASSERT(__x, "No bits are set"); + return 0; +} +} // namespace cuda::experimental::simd + +#include + +#endif // _CUDAX___SIMD_BASIC_MASK_H diff --git a/cudax/include/cuda/experimental/__simd/basic_vec.h b/cudax/include/cuda/experimental/__simd/basic_vec.h new file mode 100644 index 00000000000..dd241191aec --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/basic_vec.h @@ -0,0 +1,524 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_BASIC_VEC_H +#define _CUDAX___SIMD_BASIC_VEC_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +namespace cuda::experimental::simd +{ +// [simd.class], class template basic_vec +template +class basic_vec : public __simd_operations<_Tp, _Abi> +{ + static_assert(__is_vectorizable_v<_Tp>, "basic_vec requires a vectorizable type"); + static_assert(__is_abi_tag_v<_Abi>, "basic_vec requires a valid ABI tag"); + + template <::cuda::std::size_t, typename> + friend class basic_mask; + + using _Impl = __simd_operations<_Tp, _Abi>; + using _Storage = typename _Impl::_SimdStorage; + + _Storage __s_; + + struct __storage_tag_t + {}; + static constexpr __storage_tag_t __storage_tag{}; + + _CCCL_API constexpr basic_vec(_Storage __s, __storage_tag_t) noexcept + : __s_{__s} + {} + +public: + using value_type = _Tp; + using mask_type = basic_mask; + using abi_type = _Abi; + + // operator[] is const only. 
We need this function to set values + _CCCL_API constexpr void __set(__simd_size_type __i, value_type __v) noexcept + { + __s_.__set(__i, __v); + } + + // TODO(fbusato): add simd-iterator + // using iterator = simd-iterator; + // using const_iterator = simd-iterator; + + // constexpr iterator begin() noexcept { return {*this, 0}; } + // constexpr const_iterator begin() const noexcept { return {*this, 0}; } + // constexpr const_iterator cbegin() const noexcept { return {*this, 0}; } + // constexpr default_sentinel_t end() const noexcept { return {}; } + // constexpr default_sentinel_t cend() const noexcept { return {}; } + + static constexpr ::cuda::std::integral_constant<__simd_size_type, __simd_size_v> size{}; + + _CCCL_HIDE_FROM_ABI basic_vec() noexcept = default; + + // [simd.ctor], basic_vec constructors + + // [simd.ctor] value broadcast constructor (explicit overload) + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES((__explicitly_convertible_to) _CCCL_AND(!__is_value_ctor_implicit<_Up, value_type>)) + _CCCL_API constexpr explicit basic_vec(_Up&& __v) noexcept + : __s_{_Impl::__broadcast(static_cast(__v))} + {} + + // [simd.ctor] value broadcast constructor (implicit overload) + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES((__explicitly_convertible_to) _CCCL_AND(__is_value_ctor_implicit<_Up, value_type>)) + _CCCL_API constexpr basic_vec(_Up&& __v) noexcept + : __s_{_Impl::__broadcast(static_cast(__v))} + {} + + // [simd.ctor] converting constructor from basic_vec (explicit overload) + _CCCL_TEMPLATE(typename _Up, typename _UAbi) + _CCCL_REQUIRES((__simd_size_v<_Up, _UAbi> == size()) _CCCL_AND(__explicitly_convertible_to) + _CCCL_AND(__is_vec_ctor_explicit<_Up, value_type>)) + _CCCL_API constexpr explicit basic_vec(const basic_vec<_Up, _UAbi>& __v) noexcept + { + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < size; ++__i) + { + __s_.__set(__i, static_cast(__v[__i])); + } + } + + // [simd.ctor] converting constructor from basic_vec (implicit 
overload) + _CCCL_TEMPLATE(typename _Up, typename _UAbi) + _CCCL_REQUIRES((__simd_size_v<_Up, _UAbi> == size()) _CCCL_AND(__explicitly_convertible_to) + _CCCL_AND(!__is_vec_ctor_explicit<_Up, value_type>)) + _CCCL_API constexpr basic_vec(const basic_vec<_Up, _UAbi>& __v) noexcept + { + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < size; ++__i) + { + __s_.__set(__i, static_cast(__v[__i])); + } + } + + // [simd.ctor] generator constructor + _CCCL_TEMPLATE(typename _Generator) + _CCCL_REQUIRES(__can_generate_v) + _CCCL_API constexpr explicit basic_vec(_Generator&& __g) + : __s_{_Impl::__generate(__g)} + {} + + // [simd.ctor] range constructor + + template + static constexpr bool __is_compatible_range_v = false; + + template + static constexpr bool __is_compatible_range_v< + _Range, + ::cuda::std::void_t>::value), + ::cuda::std::ranges::range_value_t<_Range>>> = + ::cuda::std::ranges::contiguous_range<_Range> && ::cuda::std::ranges::sized_range<_Range> + && (__simd_size_type{::cuda::std::tuple_size<::cuda::std::remove_cvref_t<_Range>>::value} == size()) + && __is_vectorizable_v<::cuda::std::ranges::range_value_t<_Range>> + && __explicitly_convertible_to>; + + // [simd.ctor] range constructor + _CCCL_TEMPLATE(typename _Range, typename... 
_Flags) + _CCCL_REQUIRES(__is_compatible_range_v<_Range>) + _CCCL_API constexpr basic_vec(_Range&& __range, flags<_Flags...> = {}) + { + static_assert(__has_convert_flag_v<_Flags...> + || __is_value_preserving_v<::cuda::std::ranges::range_value_t<_Range>, value_type>, + "Conversion from range_value_t to value_type is not value-preserving; use flag_convert"); + const auto __data = ::cuda::std::ranges::data(__range); + ::cuda::experimental::simd:: + __assert_load_store_alignment, _Flags...>(__data); + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < size; ++__i) + { + __s_.__set(__i, static_cast(__data[__i])); + } + } + + // [simd.ctor] masked range constructor + _CCCL_TEMPLATE(typename _Range, typename... _Flags) + _CCCL_REQUIRES(__is_compatible_range_v<_Range>) + _CCCL_API constexpr basic_vec(_Range&& __range, const mask_type& __mask, flags<_Flags...> = {}) + { + static_assert(__has_convert_flag_v<_Flags...> + || __is_value_preserving_v<::cuda::std::ranges::range_value_t<_Range>, value_type>, + "Conversion from range_value_t to value_type is not value-preserving; use flag_convert"); + const auto __data = ::cuda::std::ranges::data(__range); + ::cuda::experimental::simd:: + __assert_load_store_alignment, _Flags...>(__data); + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < size; ++__i) + { + __s_.__set(__i, __mask[__i] ? 
static_cast(__data[__i]) : value_type()); + } + } + + // TODO(fbusato): add complex constructor + // constexpr basic_vec(const real-type& __reals, const real-type& __imags = {}) noexcept; + + // [simd.subscr], basic_vec subscript operators + + [[nodiscard]] _CCCL_API constexpr value_type operator[](__simd_size_type __i) const + { + _CCCL_ASSERT(::cuda::in_range(__i, __simd_size_type{0}, __simd_size_type{size}), "Index is out of bounds"); + return __s_.__get(__i); + } + + // TODO(fbusato): subscript with integral indices, requires permute() + // template + // constexpr resize_t<_Idx::size(), basic_vec> operator[](const _Idx& __indices) const; + + // TODO(fbusato): [simd.complex.access], basic_vec complex accessors + // constexpr real-type real() const noexcept; + // constexpr real-type imag() const noexcept; + // constexpr void real(const real-type& __v) noexcept; + // constexpr void imag(const real-type& __v) noexcept; + + // [simd.unary], basic_vec unary operators + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_pre_increment<_Up>) + _CCCL_API constexpr basic_vec& operator++() noexcept + { + _Impl::__increment(__s_); + return *this; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_post_increment<_Up>) + [[nodiscard]] _CCCL_API constexpr basic_vec operator++(int) noexcept + { + const basic_vec __r = *this; + _Impl::__increment(__s_); + return __r; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_pre_decrement<_Up>) + _CCCL_API constexpr basic_vec& operator--() noexcept + { + _Impl::__decrement(__s_); + return *this; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_post_decrement<_Up>) + [[nodiscard]] _CCCL_API constexpr basic_vec operator--(int) noexcept + { + const basic_vec __r = *this; + _Impl::__decrement(__s_); + return __r; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_negate<_Up>) + [[nodiscard]] _CCCL_API constexpr mask_type operator!() const noexcept + { + return 
mask_type{_Impl::__negate(__s_), mask_type::__storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_bitwise_not<_Up>) + [[nodiscard]] _CCCL_API constexpr basic_vec operator~() const noexcept + { + return basic_vec{_Impl::__bitwise_not(__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_unary_plus<_Up>) + [[nodiscard]] _CCCL_API constexpr basic_vec operator+() const noexcept + { + return *this; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_unary_minus<_Up>) + [[nodiscard]] _CCCL_API constexpr basic_vec operator-() const noexcept + { + return basic_vec{_Impl::__unary_minus(__s_), __storage_tag}; + } + + // [simd.binary], basic_vec binary operators + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_binary_plus<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator+(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__plus(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_binary_minus<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator-(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__minus(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_multiplies<_Up>) + [[nodiscard]] + _CCCL_API friend constexpr basic_vec operator*(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__multiplies(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_divides<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator/(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__divides(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_modulo<_Up>) + [[nodiscard]] _CCCL_API friend constexpr 
basic_vec operator%(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__modulo(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_bitwise_and<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator&(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__bitwise_and(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_bitwise_or<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator|(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__bitwise_or(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_bitwise_xor<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator^(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__bitwise_xor(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_shift_left<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator<<(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__shift_left(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_shift_right<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator>>(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__shift_right(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_shift_left_size<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator<<(const basic_vec& __lhs, __simd_size_type __n) noexcept + { + return __lhs << basic_vec{__n}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_shift_right_size<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec 
operator>>(const basic_vec& __lhs, __simd_size_type __n) noexcept + { + return __lhs >> basic_vec{__n}; + } + + // [simd.cassign], basic_vec compound assignment + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_binary_plus<_Up>) + _CCCL_API friend constexpr basic_vec& operator+=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs + __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_binary_minus<_Up>) + _CCCL_API friend constexpr basic_vec& operator-=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs - __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_multiplies<_Up>) + _CCCL_API friend constexpr basic_vec& operator*=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs * __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_divides<_Up>) + _CCCL_API friend constexpr basic_vec& operator/=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs / __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_modulo<_Up>) + _CCCL_API friend constexpr basic_vec& operator%=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs % __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_bitwise_and<_Up>) + _CCCL_API friend constexpr basic_vec& operator&=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs & __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_bitwise_or<_Up>) + _CCCL_API friend constexpr basic_vec& operator|=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs | __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_bitwise_xor<_Up>) + _CCCL_API friend constexpr basic_vec& operator^=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs ^ __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + 
_CCCL_REQUIRES(__has_shift_left<_Up>) + _CCCL_API friend constexpr basic_vec& operator<<=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs << __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_shift_right<_Up>) + _CCCL_API friend constexpr basic_vec& operator>>=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs >> __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_shift_left_size<_Up>) + _CCCL_API friend constexpr basic_vec& operator<<=(basic_vec& __lhs, __simd_size_type __n) noexcept + { + return __lhs = __lhs << __n; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_shift_right_size<_Up>) + _CCCL_API friend constexpr basic_vec& operator>>=(basic_vec& __lhs, __simd_size_type __n) noexcept + { + return __lhs = __lhs >> __n; + } + + // [simd.comparison], basic_vec compare operators + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_equal_to<_Up>) + [[nodiscard]] _CCCL_API friend constexpr mask_type operator==(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return mask_type{_Impl::__equal_to(__lhs.__s_, __rhs.__s_), mask_type::__storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_not_equal_to<_Up>) + [[nodiscard]] _CCCL_API friend constexpr mask_type operator!=(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return mask_type{_Impl::__not_equal_to(__lhs.__s_, __rhs.__s_), mask_type::__storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_greater_equal<_Up>) + [[nodiscard]] _CCCL_API friend constexpr mask_type operator>=(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return mask_type{_Impl::__greater_equal(__lhs.__s_, __rhs.__s_), mask_type::__storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_less_equal<_Up>) + [[nodiscard]] _CCCL_API friend constexpr mask_type operator<=(const basic_vec& __lhs, const basic_vec& 
__rhs) noexcept + { + return mask_type{_Impl::__less_equal(__lhs.__s_, __rhs.__s_), mask_type::__storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_greater<_Up>) + [[nodiscard]] _CCCL_API friend constexpr mask_type operator>(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return mask_type{_Impl::__greater(__lhs.__s_, __rhs.__s_), mask_type::__storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_less<_Up>) + [[nodiscard]] _CCCL_API friend constexpr mask_type operator<(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return mask_type{_Impl::__less(__lhs.__s_, __rhs.__s_), mask_type::__storage_tag}; + } + + // TODO(fbusato): [simd.cond], basic_vec exposition-only conditional operators + // friend constexpr basic_vec __simd_select_impl( + // const mask_type&, const basic_vec&, const basic_vec&) noexcept; +}; + +// [simd.ctor] deduction guide from contiguous sized range +// Deduces vec, static_cast(ranges::size(r))> +// * it is not possible to use the alias "vec" for the deduction guide +// * "vec" is defined as basic_vec<_Tp, simd_abi::__deduce_abi_t<_Tp, _Np>> +// * where _Np is __simd_size_v<_Tp, tuple_size_v<_Range>> +_CCCL_TEMPLATE(typename _Range, typename... _Ts) +_CCCL_REQUIRES(::cuda::std::ranges::contiguous_range<_Range> _CCCL_AND ::cuda::std::ranges::sized_range<_Range> + _CCCL_AND __has_static_size<_Range>) +basic_vec(_Range&&, _Ts...) + -> basic_vec<::cuda::std::ranges::range_value_t<_Range>, + simd_abi::__deduce_abi_t<::cuda::std::ranges::range_value_t<_Range>, __static_range_size_v<_Range>>>; + +// [simd.ctor] deduction guide from basic_mask +// basic_vec<__integer_from, Abi> is equivalent to decltype(+k): +// * k has type basic_mask<_Bytes, _Abi> +// * +k calls basic_mask::operator+() +// * the return type is basic_vec<__integer_from<_Bp>, _Abi> +// The deduced type is equivalent to decltype(+k), i.e. 
basic_vec<__integer_from, Abi> +_CCCL_TEMPLATE(::cuda::std::size_t _Bytes, typename _Abi) +_CCCL_REQUIRES(__has_unary_plus>) +basic_vec(basic_mask<_Bytes, _Abi>) -> basic_vec<__integer_from<_Bytes>, _Abi>; +} // namespace cuda::experimental::simd + +#include + +#endif // _CUDAX___SIMD_BASIC_VEC_H diff --git a/cudax/include/cuda/experimental/__simd/concepts.h b/cudax/include/cuda/experimental/__simd/concepts.h new file mode 100644 index 00000000000..c479ca2399a --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/concepts.h @@ -0,0 +1,245 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_CONCEPTS_H +#define _CUDAX___SIMD_CONCEPTS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace cuda::experimental::simd +{ +// [simd.expos], explicitly-convertible-to concept + +template +_CCCL_CONCEPT __explicitly_convertible_to = + _CCCL_REQUIRES_EXPR((_To, _From))((static_cast<_To>(::cuda::std::declval<_From>()))); + +// [simd.expos], constexpr-wrapper-like concept + +template +_CCCL_CONCEPT __constexpr_wrapper_like = + ::cuda::std::convertible_to<_Tp, decltype(_Tp::value)> + && ::cuda::std::equality_comparable_with<_Tp, decltype(_Tp::value)> 
+ && ::cuda::std::bool_constant<(_Tp() == _Tp::value)>::value + && ::cuda::std::bool_constant<(static_cast(_Tp()) == _Tp::value)>::value; + +// (c++draft)The conversion from an arithmetic type U to a vectorizable type T is value-preserving if all possible +// values of U can be represented with type T. +template +constexpr bool __is_value_preserving_v = + (::cuda::std::is_integral_v<_From> && ::cuda::std::is_integral_v<_To> + && ::cuda::__is_integer_representable_v<_From, _To>) + || (::cuda::is_floating_point_v<_From> && ::cuda::is_floating_point_v<_To> + && ::cuda::std::__fp_is_implicit_conversion_v<_From, _To>) + || (::cuda::std::is_integral_v<_From> && ::cuda::is_floating_point_v<_To> + && ::cuda::std::numeric_limits<_From>::digits <= ::cuda::std::numeric_limits<_To>::digits); + +template +constexpr bool __is_constexpr_wrapper_value_preserving_v = false; + +template +constexpr bool __is_constexpr_wrapper_value_preserving_v<_From, _ValueType, ::cuda::std::void_t> = + ::cuda::std::is_arithmetic_v<::cuda::std::remove_cvref_t> + && __is_value_preserving_v<::cuda::std::remove_cvref_t, _ValueType>; + +// [simd.ctor] implicit value constructor +template > +_CCCL_CONCEPT __is_value_ctor_implicit = + ::cuda::std::convertible_to<_Up, _ValueType> + && ((!::cuda::std::is_arithmetic_v<_From> && !__constexpr_wrapper_like<_From>) + || (::cuda::std::is_arithmetic_v<_From> && __is_value_preserving_v<_From, _ValueType>) + || (__constexpr_wrapper_like<_From> && __is_constexpr_wrapper_value_preserving_v<_From, _ValueType>) ); + +// [conv.rank], integer conversion rank for [simd.ctor] p7 + +template +inline constexpr int __integer_conversion_rank = 0; + +template <> +inline constexpr int __integer_conversion_rank = 1; +template <> +inline constexpr int __integer_conversion_rank = 1; +template <> +inline constexpr int __integer_conversion_rank = 1; +template <> +inline constexpr int __integer_conversion_rank = 2; +template <> +inline constexpr int __integer_conversion_rank = 2; 
+template <> +inline constexpr int __integer_conversion_rank = 3; +template <> +inline constexpr int __integer_conversion_rank = 3; +template <> +inline constexpr int __integer_conversion_rank = 4; +template <> +inline constexpr int __integer_conversion_rank = 4; +template <> +inline constexpr int __integer_conversion_rank = 5; +template <> +inline constexpr int __integer_conversion_rank = 5; +#if _CCCL_HAS_INT128() +template <> +inline constexpr int __integer_conversion_rank<__int128_t> = 6; +template <> +inline constexpr int __integer_conversion_rank<__uint128_t> = 6; +#endif // _CCCL_HAS_INT128() + +// [conv.rank], floating-point conversion rank for [simd.ctor] p7 + +template +inline constexpr int __fp_conversion_rank = 0; + +#if _CCCL_HAS_NVFP16() +template <> +inline constexpr int __fp_conversion_rank<__half> = 1; +#endif // _CCCL_HAS_NVFP16() +#if _CCCL_HAS_NVBF16() +template <> +inline constexpr int __fp_conversion_rank<__nv_bfloat16> = 1; +#endif // _CCCL_HAS_NVBF16() +template <> +inline constexpr int __fp_conversion_rank = 2; +template <> +inline constexpr int __fp_conversion_rank = 3; +#if _CCCL_HAS_LONG_DOUBLE() +template <> +inline constexpr int __fp_conversion_rank = 4; +#endif // _CCCL_HAS_LONG_DOUBLE() +#if _CCCL_HAS_FLOAT128() +template <> +inline constexpr int __fp_conversion_rank<__float128> = 5; +#endif // _CCCL_HAS_FLOAT128() + +// [simd.ctor] p7: explicit(see below) for basic_vec(const basic_vec&) +// explicit evaluates to true if either: +// - conversion from U to value_type is not value-preserving, or +// - both U and value_type are integral and integer_conversion_rank(U) > rank(value_type), or +// - both U and value_type are floating-point and fp_conversion_rank(U) > rank(value_type) +template +constexpr bool __is_vec_ctor_explicit = + !__is_value_preserving_v<_Up, _ValueType> + || (::cuda::std::is_integral_v<_Up> && ::cuda::std::is_integral_v<_ValueType> + && __integer_conversion_rank<_Up> > __integer_conversion_rank<_ValueType>) + || 
(::cuda::is_floating_point_v<_Up> && ::cuda::is_floating_point_v<_ValueType> + && __fp_conversion_rank<_Up> > __fp_conversion_rank<_ValueType>); + +// [simd.unary], operator constraints + +template +_CCCL_CONCEPT __has_pre_increment = _CCCL_REQUIRES_EXPR((_Tp), _Tp& __t)((++__t)); + +template +_CCCL_CONCEPT __has_post_increment = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t++)); + +template +_CCCL_CONCEPT __has_pre_decrement = _CCCL_REQUIRES_EXPR((_Tp), _Tp& __t)((--__t)); + +template +_CCCL_CONCEPT __has_post_decrement = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t--)); + +template +_CCCL_CONCEPT __has_negate = _CCCL_REQUIRES_EXPR((_Tp), const _Tp __t)((!__t)); + +template +_CCCL_CONCEPT __has_bitwise_not = _CCCL_REQUIRES_EXPR((_Tp), const _Tp __t)((~__t)); + +template +_CCCL_CONCEPT __has_unary_plus = _CCCL_REQUIRES_EXPR((_Tp), const _Tp __t)((+__t)); + +template +_CCCL_CONCEPT __has_unary_minus = _CCCL_REQUIRES_EXPR((_Tp), const _Tp __t)((-__t)); + +// [simd.binary], binary operator constraints + +template +_CCCL_CONCEPT __has_binary_plus = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a + __b)); + +template +_CCCL_CONCEPT __has_binary_minus = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a - __b)); + +template +_CCCL_CONCEPT __has_multiplies = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a * __b)); + +template +_CCCL_CONCEPT __has_divides = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a / __b)); + +template +_CCCL_CONCEPT __has_modulo = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a % __b)); + +template +_CCCL_CONCEPT __has_bitwise_and = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a & __b)); + +template +_CCCL_CONCEPT __has_bitwise_or = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a | __b)); + +template +_CCCL_CONCEPT __has_bitwise_xor = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a ^ __b)); + +template +_CCCL_CONCEPT __has_shift_left = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a << __b)); + +template +_CCCL_CONCEPT __has_shift_right = 
_CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a >> __b)); + +template +_CCCL_CONCEPT __has_shift_left_size = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t << __simd_size_type{})); + +template +_CCCL_CONCEPT __has_shift_right_size = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t >> __simd_size_type{})); + +// [simd.comparison], comparison operator constraints + +template +_CCCL_CONCEPT __has_equal_to = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a == __b)); + +template +_CCCL_CONCEPT __has_not_equal_to = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a != __b)); + +template +_CCCL_CONCEPT __has_greater_equal = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a >= __b)); + +template +_CCCL_CONCEPT __has_less_equal = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a <= __b)); + +template +_CCCL_CONCEPT __has_greater = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a > __b)); + +template +_CCCL_CONCEPT __has_less = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a < __b)); +} // namespace cuda::experimental::simd + +#include + +#endif // _CUDAX___SIMD_CONCEPTS_H diff --git a/cudax/include/cuda/experimental/__simd/declaration.h b/cudax/include/cuda/experimental/__simd/declaration.h new file mode 100644 index 00000000000..bc9d2487f8c --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/declaration.h @@ -0,0 +1,62 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_DECLARATION_H +#define _CUDAX___SIMD_DECLARATION_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include +#include + +#include + +namespace cuda::experimental::simd +{ +template > +class basic_vec; + +template <::cuda::std::size_t _Bytes, typename _Abi = simd_abi::native<__integer_from<_Bytes>>> +class basic_mask; + +template >> +using vec = basic_vec<_Tp, simd_abi::__deduce_abi_t<_Tp, _Np>>; + +template >> +using mask = basic_mask>; + +// specializations + +template +struct __simd_storage; + +template +struct __simd_operations; + +template <::cuda::std::size_t _Bytes, typename _Abi> +struct __mask_storage; + +template <::cuda::std::size_t _Bytes, typename _Abi> +struct __mask_operations; +} // namespace cuda::experimental::simd + +#include + +#endif // _CUDAX___SIMD_DECLARATION_H diff --git a/cudax/include/cuda/experimental/__simd/exposition.h b/cudax/include/cuda/experimental/__simd/exposition.h new file mode 100644 index 00000000000..ae2eeba4857 --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/exposition.h @@ -0,0 +1,60 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_EXPOSITION_H +#define _CUDAX___SIMD_EXPOSITION_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace cuda::experimental::simd +{ +// [simd.expos], exposition-only helpers + +template <::cuda::std::size_t _Bytes> +using __integer_from = ::cuda::std::__make_nbit_int_t<_Bytes * 8, true>; + +// all standard integer types, character types, and the types float and double ([basic.fundamental]); +// std​::​float16_t, std​::​float32_t, and std​::​float64_t if defined ([basic.extended.fp]); and +// TODO(fbusato) complex where T is a vectorizable floating-point type. +template +constexpr bool __is_vectorizable_v = + (::cuda::std::is_integral_v<_Tp> || ::cuda::is_floating_point_v<_Tp>) + && !::cuda::std::is_same_v<_Tp, bool> && !::cuda::std::is_const_v<_Tp> && !::cuda::std::is_volatile_v<_Tp>; + +template +constexpr __simd_size_type __simd_size_v = 0; + +template +constexpr __simd_size_type __simd_size_v<_Tp, simd_abi::fixed_size_simple<_Np>> = _Np; +} // namespace cuda::experimental::simd + +#include + +#endif // _CUDAX___SIMD_EXPOSITION_H diff --git a/cudax/include/cuda/experimental/__simd/flag.h b/cudax/include/cuda/experimental/__simd/flag.h new file mode 100644 index 00000000000..d2aad6e8ba8 --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/flag.h @@ -0,0 +1,105 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_FLAG_H +#define _CUDAX___SIMD_FLAG_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include + +namespace cuda::experimental::simd +{ +// [simd.expos], exposition-only flag types + +struct __convert_flag +{}; + +struct __aligned_flag +{}; + +template <::cuda::std::size_t _Np> +struct __overaligned_flag +{ + static_assert(::cuda::is_power_of_two(_Np), "Overaligned flag requires a power-of-2 alignment"); +}; + +template +constexpr bool __is_flag_type_v = false; + +template <> +constexpr bool __is_flag_type_v<__convert_flag> = true; + +template <> +constexpr bool __is_flag_type_v<__aligned_flag> = true; + +template <::cuda::std::size_t _Np> +constexpr bool __is_flag_type_v<__overaligned_flag<_Np>> = true; + +// [simd.flags.overview], class template flags + +template +struct flags +{ + static_assert((true && ... && __is_flag_type_v<_Flags>), + "Every flag type must be one of convert_flag, aligned_flag, or overaligned_flag"); + + // [simd.flags.oper], flags operators + template + [[nodiscard]] _CCCL_API friend constexpr flags<_Flags..., _Other...> operator|(flags, flags<_Other...>) noexcept + { + return {}; + } +}; + +// [simd.flags], flag constants + +inline constexpr flags<> flag_default{}; +inline constexpr flags<__convert_flag> flag_convert{}; +inline constexpr flags<__aligned_flag> flag_aligned{}; + +template <::cuda::std::size_t _Np> +constexpr flags<__overaligned_flag<_Np>> flag_overaligned{}; + +template +constexpr bool __has_convert_flag_v = (false || ... 
|| ::cuda::std::is_same_v<_Flags, __convert_flag>); + +template +constexpr bool __has_aligned_flag_v = (false || ... || ::cuda::std::is_same_v<_Flags, __aligned_flag>); + +template +constexpr ::cuda::std::size_t __overaligned_value_v = 0; + +template <::cuda::std::size_t _Np> +constexpr ::cuda::std::size_t __overaligned_value_v<__overaligned_flag<_Np>> = _Np; + +template +constexpr bool __has_overaligned_flag_v = (false || ... || (__overaligned_value_v<_Flags> != 0)); + +template +constexpr ::cuda::std::size_t __overaligned_alignment_v = + (::cuda::std::size_t{0} | ... | __overaligned_value_v<_Flags>); +} // namespace cuda::experimental::simd + +#include + +#endif // _CUDAX___SIMD_FLAG_H diff --git a/cudax/include/cuda/experimental/__simd/load_store.h b/cudax/include/cuda/experimental/__simd/load_store.h new file mode 100644 index 00000000000..10cc75d6b6e --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/load_store.h @@ -0,0 +1,453 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_LOAD_STORE_H +#define _CUDAX___SIMD_LOAD_STORE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +namespace cuda::experimental::simd +{ +// [simd.loadstore] helper: resolves default V template parameter for load functions +// When _Vp = void (default), resolves to basic_vec<_Up>; otherwise uses the explicit _Vp +template +struct __load_vec_type +{ + using type = _Vp; +}; + +template +struct __load_vec_type +{ + using type = basic_vec<_Up>; +}; + +template +using __load_vec_t = typename __load_vec_type<_Vp, _Up>::type; + +// [simd.loadstore] helper: core partial load from pointer + count + mask +template +[[nodiscard]] _CCCL_API constexpr _Result +__partial_load_from_ptr(const _Up* __ptr, __simd_size_type __count, const typename _Result::mask_type& __mask) +{ + using _Tp = typename _Result::value_type; + static_assert(::cuda::std::same_as<::cuda::std::remove_cvref_t<_Result>, _Result>, + "V must not be a reference or cv-qualified type"); + static_assert(__is_vectorizable_v<_Tp> && __is_abi_tag_v, + "V must be an enabled specialization of basic_vec"); + static_assert(__is_vectorizable_v<_Up>, "range_value_t must be a vectorizable type"); + static_assert(__explicitly_convertible_to<_Tp, _Up>, + "range_value_t must satisfy explicitly-convertible-to"); + static_assert(__has_convert_flag_v<_Flags...> || __is_value_preserving_v<_Up, _Tp>, + "Conversion from range_value_t to value_type is not value-preserving; use flag_convert"); + 
::cuda::experimental::simd::__assert_load_store_alignment<_Result, _Up, _Flags...>(__ptr); + _Result __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Result::size; ++__i) + { + if (__mask[__i] && __i < __count) + { + __result.__set(__i, static_cast<_Tp>(__ptr[__i])); + } + } + return __result; +} + +// [simd.loadstore] helper: core partial store to pointer + count + mask +template +_CCCL_API constexpr void __partial_store_to_ptr( + const basic_vec<_Tp, _Abi>& __v, + _Up* __ptr, + __simd_size_type __count, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask) +{ + static_assert(__is_vectorizable_v<_Up>, "range_value_t must be a vectorizable type"); + static_assert(__explicitly_convertible_to<_Up, _Tp>, + "value_type must satisfy explicitly-convertible-to>"); + static_assert(__has_convert_flag_v<_Flags...> || __is_value_preserving_v<_Tp, _Up>, + "Conversion from value_type to range_value_t is not value-preserving; use flag_convert"); + ::cuda::experimental::simd::__assert_load_store_alignment, _Up, _Flags...>(__ptr); + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < basic_vec<_Tp, _Abi>::size; ++__i) + { + if (__mask[__i] && __i < __count) + { + __ptr[__i] = static_cast<_Up>(__v[__i]); + } + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// [simd.loadstore] partial_load + +// partial_load: range, masked +_CCCL_TEMPLATE(typename _Vp = void, typename _Range, typename... 
_Flags) +_CCCL_REQUIRES(::cuda::std::ranges::contiguous_range<_Range> _CCCL_AND ::cuda::std::ranges::sized_range<_Range>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::ranges::range_value_t<_Range>> partial_load( + _Range&& __r, + const typename __load_vec_t<_Vp, ::cuda::std::ranges::range_value_t<_Range>>::mask_type& __mask, + flags<_Flags...> = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::ranges::range_value_t<_Range>>; + using _Up = ::cuda::std::ranges::range_value_t<_Range>; + return ::cuda::experimental::simd::__partial_load_from_ptr<_Result, _Up, _Flags...>( + ::cuda::std::ranges::data(__r), static_cast<__simd_size_type>(::cuda::std::ranges::size(__r)), __mask); +} + +// partial_load: range, no mask +_CCCL_TEMPLATE(typename _Vp = void, typename _Range, typename... _Flags) +_CCCL_REQUIRES(::cuda::std::ranges::contiguous_range<_Range> _CCCL_AND ::cuda::std::ranges::sized_range<_Range>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::ranges::range_value_t<_Range>> +partial_load(_Range&& __r, flags<_Flags...> __f = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::ranges::range_value_t<_Range>>; + return ::cuda::experimental::simd::partial_load<_Vp>( + ::cuda::std::forward<_Range>(__r), typename _Result::mask_type(true), __f); +} + +// partial_load: iterator + count, masked +_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename... 
_Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>> partial_load( + _Ip __first, + ::cuda::std::iter_difference_t<_Ip> __n, + const typename __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>::mask_type& __mask, + flags<_Flags...> = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>; + using _Up = ::cuda::std::iter_value_t<_Ip>; + return ::cuda::experimental::simd::__partial_load_from_ptr<_Result, _Up, _Flags...>( + ::cuda::std::to_address(__first), static_cast<__simd_size_type>(__n), __mask); +} + +// partial_load: iterator + count, no mask +_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename... _Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>> +partial_load(_Ip __first, ::cuda::std::iter_difference_t<_Ip> __n, flags<_Flags...> __f = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>; + return ::cuda::experimental::simd::partial_load<_Vp>(__first, __n, typename _Result::mask_type(true), __f); +} + +// partial_load: iterator + sentinel, masked +_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename _Sp, typename... 
_Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip> _CCCL_AND ::cuda::std::sized_sentinel_for<_Sp, _Ip>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>> partial_load( + _Ip __first, + _Sp __last, + const typename __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>::mask_type& __mask, + flags<_Flags...> = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>; + using _Up = ::cuda::std::iter_value_t<_Ip>; + return ::cuda::experimental::simd::__partial_load_from_ptr<_Result, _Up, _Flags...>( + ::cuda::std::to_address(__first), static_cast<__simd_size_type>(::cuda::std::distance(__first, __last)), __mask); +} + +// partial_load: iterator + sentinel, no mask +_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename _Sp, typename... _Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip> _CCCL_AND ::cuda::std::sized_sentinel_for<_Sp, _Ip>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>> +partial_load(_Ip __first, _Sp __last, flags<_Flags...> __f = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>; + return ::cuda::experimental::simd::partial_load<_Vp>(__first, __last, typename _Result::mask_type(true), __f); +} + +//---------------------------------------------------------------------------------------------------------------------- +// [simd.loadstore] unchecked_load + +// unchecked_load: range, masked +_CCCL_TEMPLATE(typename _Vp = void, typename _Range, typename... 
_Flags) +_CCCL_REQUIRES(::cuda::std::ranges::contiguous_range<_Range> _CCCL_AND ::cuda::std::ranges::sized_range<_Range>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::ranges::range_value_t<_Range>> unchecked_load( + _Range&& __r, + const typename __load_vec_t<_Vp, ::cuda::std::ranges::range_value_t<_Range>>::mask_type& __mask, + flags<_Flags...> __f = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::ranges::range_value_t<_Range>>; + if constexpr (__has_static_size<_Range>) + { + static_assert(__static_range_size_v<_Range> >= _Result::size(), + "unchecked_load requires ranges::size(r) >= V::size()"); + } + _CCCL_ASSERT(::cuda::std::cmp_greater_equal(::cuda::std::ranges::size(__r), _Result::size()), + "unchecked_load requires ranges::size(r) >= V::size()"); + return ::cuda::experimental::simd::partial_load<_Vp>(::cuda::std::forward<_Range>(__r), __mask, __f); +} + +// unchecked_load: range, no mask +_CCCL_TEMPLATE(typename _Vp = void, typename _Range, typename... _Flags) +_CCCL_REQUIRES(::cuda::std::ranges::contiguous_range<_Range> _CCCL_AND ::cuda::std::ranges::sized_range<_Range>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::ranges::range_value_t<_Range>> +unchecked_load(_Range&& __r, flags<_Flags...> __f = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::ranges::range_value_t<_Range>>; + return ::cuda::experimental::simd::unchecked_load<_Vp>( + ::cuda::std::forward<_Range>(__r), typename _Result::mask_type(true), __f); +} + +// unchecked_load: iterator + count, masked +_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename... 
_Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>> unchecked_load( + _Ip __first, + ::cuda::std::iter_difference_t<_Ip> __n, + const typename __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>::mask_type& __mask, + flags<_Flags...> __f = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>; + _CCCL_ASSERT(::cuda::std::cmp_greater_equal(__n, _Result::size()), "unchecked_load requires n >= V::size()"); + return ::cuda::experimental::simd::partial_load<_Vp>(__first, __n, __mask, __f); +} + +// unchecked_load: iterator + count, no mask +_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename... _Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>> +unchecked_load(_Ip __first, ::cuda::std::iter_difference_t<_Ip> __n, flags<_Flags...> __f = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>; + return ::cuda::experimental::simd::unchecked_load<_Vp>(__first, __n, typename _Result::mask_type(true), __f); +} + +// unchecked_load: iterator + sentinel, masked +_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename _Sp, typename... 
_Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip> _CCCL_AND ::cuda::std::sized_sentinel_for<_Sp, _Ip>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>> unchecked_load( + _Ip __first, + _Sp __last, + const typename __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>::mask_type& __mask, + flags<_Flags...> __f = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>; + _CCCL_ASSERT(::cuda::std::cmp_greater_equal(::cuda::std::distance(__first, __last), _Result::size()), + "unchecked_load requires distance(first, last) >= V::size()"); + return ::cuda::experimental::simd::partial_load<_Vp>(__first, __last, __mask, __f); +} + +// unchecked_load: iterator + sentinel, no mask +_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename _Sp, typename... _Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip> _CCCL_AND ::cuda::std::sized_sentinel_for<_Sp, _Ip>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>> +unchecked_load(_Ip __first, _Sp __last, flags<_Flags...> __f = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>; + return ::cuda::experimental::simd::unchecked_load<_Vp>(__first, __last, typename _Result::mask_type(true), __f); +} + +//---------------------------------------------------------------------------------------------------------------------- +// [simd.loadstore] partial_store + +// partial_store: range, masked +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Range, typename... 
_Flags) +_CCCL_REQUIRES(::cuda::std::ranges::contiguous_range<_Range> _CCCL_AND ::cuda::std::ranges::sized_range<_Range> + _CCCL_AND __explicitly_convertible_to<::cuda::std::ranges::range_value_t<_Range>, _Tp>) +_CCCL_API constexpr void partial_store( + const basic_vec<_Tp, _Abi>& __v, + _Range&& __r, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask, + flags<_Flags...> = {}) +{ + static_assert( + ::cuda::std::indirectly_writable<::cuda::std::ranges::iterator_t<_Range>, ::cuda::std::ranges::range_value_t<_Range>>, + "ranges::iterator_t must model indirectly_writable>"); + using _Up = ::cuda::std::ranges::range_value_t<_Range>; + ::cuda::experimental::simd::__partial_store_to_ptr<_Tp, _Abi, _Up, _Flags...>( + __v, ::cuda::std::ranges::data(__r), static_cast<__simd_size_type>(::cuda::std::ranges::size(__r)), __mask); +} + +// partial_store: range, no mask +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Range, typename... _Flags) +_CCCL_REQUIRES(::cuda::std::ranges::contiguous_range<_Range> _CCCL_AND ::cuda::std::ranges::sized_range<_Range> + _CCCL_AND __explicitly_convertible_to<::cuda::std::ranges::range_value_t<_Range>, _Tp>) +_CCCL_API constexpr void partial_store(const basic_vec<_Tp, _Abi>& __v, _Range&& __r, flags<_Flags...> __f = {}) +{ + ::cuda::experimental::simd::partial_store( + __v, ::cuda::std::forward<_Range>(__r), typename basic_vec<_Tp, _Abi>::mask_type(true), __f); +} + +// partial_store: iterator + count, masked +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename... 
_Flags) +_CCCL_REQUIRES( + ::cuda::std::contiguous_iterator<_Ip> _CCCL_AND __explicitly_convertible_to<::cuda::std::iter_value_t<_Ip>, _Tp>) +_CCCL_API constexpr void partial_store( + const basic_vec<_Tp, _Abi>& __v, + _Ip __first, + ::cuda::std::iter_difference_t<_Ip> __n, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask, + flags<_Flags...> = {}) +{ + static_assert(::cuda::std::indirectly_writable<_Ip, ::cuda::std::iter_value_t<_Ip>>, + "I must model indirectly_writable>"); + using _Up = ::cuda::std::iter_value_t<_Ip>; + ::cuda::experimental::simd::__partial_store_to_ptr<_Tp, _Abi, _Up, _Flags...>( + __v, ::cuda::std::to_address(__first), static_cast<__simd_size_type>(__n), __mask); +} + +// partial_store: iterator + count, no mask +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename... _Flags) +_CCCL_REQUIRES( + ::cuda::std::contiguous_iterator<_Ip> _CCCL_AND __explicitly_convertible_to<::cuda::std::iter_value_t<_Ip>, _Tp>) +_CCCL_API constexpr void partial_store( + const basic_vec<_Tp, _Abi>& __v, _Ip __first, ::cuda::std::iter_difference_t<_Ip> __n, flags<_Flags...> __f = {}) +{ + ::cuda::experimental::simd::partial_store(__v, __first, __n, typename basic_vec<_Tp, _Abi>::mask_type(true), __f); +} + +// partial_store: iterator + sentinel, masked +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename _Sp, typename... 
_Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip> _CCCL_AND ::cuda::std::sized_sentinel_for<_Sp, _Ip> _CCCL_AND + __explicitly_convertible_to<::cuda::std::iter_value_t<_Ip>, _Tp>) +_CCCL_API constexpr void partial_store( + const basic_vec<_Tp, _Abi>& __v, + _Ip __first, + _Sp __last, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask, + flags<_Flags...> = {}) +{ + static_assert(::cuda::std::indirectly_writable<_Ip, ::cuda::std::iter_value_t<_Ip>>, + "I must model indirectly_writable>"); + using _Up = ::cuda::std::iter_value_t<_Ip>; + ::cuda::experimental::simd::__partial_store_to_ptr<_Tp, _Abi, _Up, _Flags...>( + __v, + ::cuda::std::to_address(__first), + static_cast<__simd_size_type>(::cuda::std::distance(__first, __last)), + __mask); +} + +// partial_store: iterator + sentinel, no mask +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename _Sp, typename... _Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip> _CCCL_AND ::cuda::std::sized_sentinel_for<_Sp, _Ip> _CCCL_AND + __explicitly_convertible_to<::cuda::std::iter_value_t<_Ip>, _Tp>) +_CCCL_API constexpr void +partial_store(const basic_vec<_Tp, _Abi>& __v, _Ip __first, _Sp __last, flags<_Flags...> __f = {}) +{ + ::cuda::experimental::simd::partial_store(__v, __first, __last, typename basic_vec<_Tp, _Abi>::mask_type(true), __f); +} + +//---------------------------------------------------------------------------------------------------------------------- +// [simd.loadstore] unchecked_store + +// unchecked_store: range, masked +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Range, typename... 
_Flags) +_CCCL_REQUIRES(::cuda::std::ranges::contiguous_range<_Range> _CCCL_AND ::cuda::std::ranges::sized_range<_Range> + _CCCL_AND __explicitly_convertible_to<::cuda::std::ranges::range_value_t<_Range>, _Tp>) +_CCCL_API constexpr void unchecked_store( + const basic_vec<_Tp, _Abi>& __v, + _Range&& __r, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask, + flags<_Flags...> __f = {}) +{ + if constexpr (__has_static_size<_Range>) + { + static_assert(__static_range_size_v<_Range> >= basic_vec<_Tp, _Abi>::size(), + "unchecked_store requires ranges::size(r) >= V::size()"); + } + _CCCL_ASSERT(::cuda::std::cmp_greater_equal(::cuda::std::ranges::size(__r), __v.size), + "unchecked_store requires ranges::size(r) >= V::size()"); + ::cuda::experimental::simd::partial_store(__v, ::cuda::std::forward<_Range>(__r), __mask, __f); +} + +// unchecked_store: range, no mask +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Range, typename... _Flags) +_CCCL_REQUIRES(::cuda::std::ranges::contiguous_range<_Range> _CCCL_AND ::cuda::std::ranges::sized_range<_Range> + _CCCL_AND __explicitly_convertible_to<::cuda::std::ranges::range_value_t<_Range>, _Tp>) +_CCCL_API constexpr void unchecked_store(const basic_vec<_Tp, _Abi>& __v, _Range&& __r, flags<_Flags...> __f = {}) +{ + ::cuda::experimental::simd::unchecked_store( + __v, ::cuda::std::forward<_Range>(__r), typename basic_vec<_Tp, _Abi>::mask_type(true), __f); +} + +// unchecked_store: iterator + count, masked +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename... 
_Flags) +_CCCL_REQUIRES( + ::cuda::std::contiguous_iterator<_Ip> _CCCL_AND __explicitly_convertible_to<::cuda::std::iter_value_t<_Ip>, _Tp>) +_CCCL_API constexpr void unchecked_store( + const basic_vec<_Tp, _Abi>& __v, + _Ip __first, + ::cuda::std::iter_difference_t<_Ip> __n, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask, + flags<_Flags...> __f = {}) +{ + _CCCL_ASSERT(::cuda::std::cmp_greater_equal(__n, __v.size), "unchecked_store requires n >= V::size()"); + ::cuda::experimental::simd::partial_store(__v, __first, __n, __mask, __f); +} + +// unchecked_store: iterator + count, no mask +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename... _Flags) +_CCCL_REQUIRES( + ::cuda::std::contiguous_iterator<_Ip> _CCCL_AND __explicitly_convertible_to<::cuda::std::iter_value_t<_Ip>, _Tp>) +_CCCL_API constexpr void unchecked_store( + const basic_vec<_Tp, _Abi>& __v, _Ip __first, ::cuda::std::iter_difference_t<_Ip> __n, flags<_Flags...> __f = {}) +{ + ::cuda::experimental::simd::unchecked_store(__v, __first, __n, typename basic_vec<_Tp, _Abi>::mask_type(true), __f); +} + +// unchecked_store: iterator + sentinel, masked +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename _Sp, typename... 
_Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip> _CCCL_AND ::cuda::std::sized_sentinel_for<_Sp, _Ip> _CCCL_AND + __explicitly_convertible_to<::cuda::std::iter_value_t<_Ip>, _Tp>) +_CCCL_API constexpr void unchecked_store( + const basic_vec<_Tp, _Abi>& __v, + _Ip __first, + _Sp __last, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask, + flags<_Flags...> __f = {}) +{ + _CCCL_ASSERT(::cuda::std::cmp_greater_equal(::cuda::std::distance(__first, __last), __v.size), + "unchecked_store requires distance(first, last) >= V::size()"); + ::cuda::experimental::simd::partial_store(__v, __first, __last, __mask, __f); +} + +// unchecked_store: iterator + sentinel, no mask +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename _Sp, typename... _Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip> _CCCL_AND ::cuda::std::sized_sentinel_for<_Sp, _Ip> _CCCL_AND + __explicitly_convertible_to<::cuda::std::iter_value_t<_Ip>, _Tp>) +_CCCL_API constexpr void +unchecked_store(const basic_vec<_Tp, _Abi>& __v, _Ip __first, _Sp __last, flags<_Flags...> __f = {}) +{ + ::cuda::experimental::simd::unchecked_store(__v, __first, __last, typename basic_vec<_Tp, _Abi>::mask_type(true), __f); +} +} // namespace cuda::experimental::simd + +#include + +#endif // _CUDAX___SIMD_LOAD_STORE_H diff --git a/cudax/include/cuda/experimental/__simd/reductions.h b/cudax/include/cuda/experimental/__simd/reductions.h new file mode 100644 index 00000000000..36bff50ab1a --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/reductions.h @@ -0,0 +1,216 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_REDUCTIONS_H +#define _CUDAX___SIMD_REDUCTIONS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace cuda::experimental::simd +{ +// [simd.expos], reduction-binary-operation concept + +template +_CCCL_CONCEPT __reduction_binary_operation = _CCCL_REQUIRES_EXPR( + (_BinaryOp, _Tp), const _BinaryOp __binary_op, const vec<_Tp, 1> __v)(_Same_as(vec<_Tp, 1>) __binary_op(__v, __v)); + +template +constexpr bool __is_reduce_default_supported_operation_v = + ::cuda::std::is_same_v<_BinaryOp, ::cuda::std::plus<>> || ::cuda::std::is_same_v<_BinaryOp, ::cuda::std::multiplies<>> + || ::cuda::std::is_same_v<_BinaryOp, ::cuda::std::bit_and<>> + || ::cuda::std::is_same_v<_BinaryOp, ::cuda::std::bit_or<>> + || ::cuda::std::is_same_v<_BinaryOp, ::cuda::std::bit_xor<>>; + +template +[[nodiscard]] _CCCL_API constexpr _Tp __default_identity_element() noexcept +{ + if constexpr (::cuda::std::is_same_v<_BinaryOp, ::cuda::std::plus<>> + || ::cuda::std::is_same_v<_BinaryOp, ::cuda::std::bit_or<>> + || ::cuda::std::is_same_v<_BinaryOp, ::cuda::std::bit_xor<>>) + { + return _Tp(); + } + else if constexpr (::cuda::std::is_same_v<_BinaryOp, ::cuda::std::multiplies<>>) + { + return _Tp(1); + } + else if constexpr (::cuda::std::is_same_v<_BinaryOp, ::cuda::std::bit_and<>>) + { + return _Tp(~_Tp()); + } + else + { + static_assert(::cuda::std::__always_false_v<_Tp>, + "No default identity element for this BinaryOperation; provide one explicitly"); + return _Tp(); + } +} + +// [simd.reductions], reduce + +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename 
_BinaryOperation = ::cuda::std::plus<>) +_CCCL_REQUIRES(__reduction_binary_operation<_BinaryOperation, _Tp>) +[[nodiscard]] _CCCL_API constexpr _Tp +reduce(const basic_vec<_Tp, _Abi>& __x, _BinaryOperation __binary_op = ::cuda::std::plus<>{}) +{ + vec<_Tp, 1> __result{__x[0]}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 1; __i < __x.size; ++__i) + { + __result = __binary_op(__result, vec<_Tp, 1>{__x[__i]}); + } + return __result[0]; +} + +// We need two overloads: +// 1) An argument for identity_element is provided for the invocation +// 2) unless BinaryOperation is one of plus<>, multiplies<>, bit_and<>, bit_or<>, or bit_xor<> +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _BinaryOperation) +_CCCL_REQUIRES(__reduction_binary_operation<_BinaryOperation, _Tp>) +[[nodiscard]] _CCCL_API constexpr _Tp +reduce(const basic_vec<_Tp, _Abi>& __x, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask, + _BinaryOperation __binary_op, + ::cuda::std::type_identity_t<_Tp> __identity_element) +{ + vec<_Tp, 1> __result{__identity_element}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < __x.size; ++__i) + { + if (__mask[__i]) + { + __result = __binary_op(__result, vec<_Tp, 1>{__x[__i]}); + } + } + return __result[0]; +} + +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _BinaryOperation) +_CCCL_REQUIRES(__reduction_binary_operation<_BinaryOperation, _Tp> _CCCL_AND + __is_reduce_default_supported_operation_v<_BinaryOperation>) +[[nodiscard]] _CCCL_API constexpr _Tp +reduce(const basic_vec<_Tp, _Abi>& __x, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask, + _BinaryOperation __binary_op = ::cuda::std::plus<>{}) +{ + return ::cuda::experimental::simd::reduce( + __x, __mask, __binary_op, ::cuda::experimental::simd::__default_identity_element<_Tp, _BinaryOperation>()); +} + +// [simd.reductions], reduce_min + +_CCCL_TEMPLATE(typename _Tp, typename _Abi) +_CCCL_REQUIRES(::cuda::std::totally_ordered<_Tp>) +[[nodiscard]] _CCCL_API 
constexpr _Tp reduce_min(const basic_vec<_Tp, _Abi>& __x) noexcept +{ + static_assert(__x.size > 0, "Vector is empty"); + auto __result = __x[0]; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 1; __i < __x.size; ++__i) + { + const auto __val = __x[__i]; + if (!(__result < __val)) + { + __result = __val; + } + } + return __result; +} + +_CCCL_TEMPLATE(typename _Tp, typename _Abi) +_CCCL_REQUIRES(::cuda::std::totally_ordered<_Tp>) +[[nodiscard]] _CCCL_API constexpr _Tp +reduce_min(const basic_vec<_Tp, _Abi>& __x, const typename basic_vec<_Tp, _Abi>::mask_type& __mask) noexcept +{ + auto __result = ::cuda::std::numeric_limits<_Tp>::max(); + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < __x.size; ++__i) + { + if (__mask[__i]) + { + const auto __val = __x[__i]; + if (!(__result < __val)) + { + __result = __val; + } + } + } + return __result; +} + +// [simd.reductions], reduce_max + +_CCCL_TEMPLATE(typename _Tp, typename _Abi) +_CCCL_REQUIRES(::cuda::std::totally_ordered<_Tp>) +[[nodiscard]] _CCCL_API constexpr _Tp reduce_max(const basic_vec<_Tp, _Abi>& __x) noexcept +{ + static_assert(__x.size > 0, "Vector is empty"); + auto __result = __x[0]; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 1; __i < __x.size; ++__i) + { + const auto __val = __x[__i]; + if (!(__val < __result)) + { + __result = __val; + } + } + return __result; +} + +_CCCL_TEMPLATE(typename _Tp, typename _Abi) +_CCCL_REQUIRES(::cuda::std::totally_ordered<_Tp>) +[[nodiscard]] _CCCL_API constexpr _Tp +reduce_max(const basic_vec<_Tp, _Abi>& __x, const typename basic_vec<_Tp, _Abi>::mask_type& __mask) noexcept +{ + auto __result = ::cuda::std::numeric_limits<_Tp>::lowest(); + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < __x.size; ++__i) + { + if (__mask[__i]) + { + const auto __val = __x[__i]; + if (!(__val < __result)) + { + __result = __val; + } + } + } + return __result; +} + +// NOTE: mask reductions (all_of, any_of, none_of, reduce_count, 
reduce_min_index, reduce_max_index) +// and their bool scalar overloads are defined in basic_mask.h +} // namespace cuda::experimental::simd + +#include + +#endif // _CUDAX___SIMD_REDUCTIONS_H diff --git a/cudax/include/cuda/experimental/__simd/specializations/fixed_size_simple_mask.h b/cudax/include/cuda/experimental/__simd/specializations/fixed_size_simple_mask.h new file mode 100644 index 00000000000..50434015934 --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/specializations/fixed_size_simple_mask.h @@ -0,0 +1,232 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_SPECIALIZATIONS_FIXED_SIZE_SIMPLE_MASK_H +#define _CUDAX___SIMD_SPECIALIZATIONS_FIXED_SIZE_SIMPLE_MASK_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include + +#include + +#include + +namespace cuda::experimental::simd +{ +// Bool-per-element mask storage for fixed_size_simple ABI +template <::cuda::std::size_t _Bytes, __simd_size_type _Np> +struct __mask_storage<_Bytes, simd_abi::__fixed_size_simple<_Np>> +{ + using value_type = bool; + static constexpr ::cuda::std::size_t __element_bytes = _Bytes; + + bool __data[_Np]; + + [[nodiscard]] _CCCL_API constexpr bool __get(__simd_size_type __idx) const noexcept + { + _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), 
"Index is out of bounds"); + return __data[__idx]; + } + + _CCCL_API constexpr void __set(__simd_size_type __idx, bool __v) noexcept + { + _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds"); + __data[__idx] = __v; + } +}; + +// Mask operations for fixed_size_simple ABI with bool-per-element storage +template <::cuda::std::size_t _Bytes, __simd_size_type _Np> +struct __mask_operations<_Bytes, simd_abi::__fixed_size_simple<_Np>> +{ + using _MaskStorage = __mask_storage<_Bytes, simd_abi::__fixed_size_simple<_Np>>; + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage __broadcast(bool __v) noexcept + { + _MaskStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = __v; + } + return __result; + } + + template + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __generate_init(_Generator&& __g, ::cuda::std::integer_sequence<__simd_size_type, _Is...>) + { + _MaskStorage __result; + ((__result.__data[_Is] = static_cast(__g(::cuda::std::integral_constant<__simd_size_type, _Is>()))), ...); + return __result; + } + + template + [[nodiscard]] _CCCL_API static constexpr _MaskStorage __generate(_Generator&& __g) + { + return __generate_init(__g, ::cuda::std::make_integer_sequence<__simd_size_type, _Np>()); + } + + // Logical operators (for operator&& and operator||) + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __logic_and(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept + { + _MaskStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = __lhs.__data[__i] && __rhs.__data[__i]; + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __logic_or(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept + { + _MaskStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + 
__result.__data[__i] = __lhs.__data[__i] || __rhs.__data[__i]; + } + return __result; + } + + // Bitwise operators (for operator&, operator|, operator^) + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __bitwise_and(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept + { + _MaskStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = __lhs.__data[__i] && __rhs.__data[__i]; + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __bitwise_or(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept + { + _MaskStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = __lhs.__data[__i] || __rhs.__data[__i]; + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __bitwise_xor(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept + { + _MaskStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = __lhs.__data[__i] != __rhs.__data[__i]; + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage __bitwise_not(const _MaskStorage& __s) noexcept + { + _MaskStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = !__s.__data[__i]; + } + return __result; + } + + // Reductions + + [[nodiscard]] _CCCL_API static constexpr bool __all(const _MaskStorage& __s) noexcept + { + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + if (!__s.__data[__i]) + { + return false; + } + } + return true; + } + + [[nodiscard]] _CCCL_API static constexpr bool __any(const _MaskStorage& __s) noexcept + { + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + if (__s.__data[__i]) + { + return true; + } + } + return false; + } + + [[nodiscard]] 
_CCCL_API static constexpr __simd_size_type __count(const _MaskStorage& __s) noexcept + { + __simd_size_type __count = 0; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __count += static_cast<__simd_size_type>(__s.__data[__i]); + } + return __count; + } + + [[nodiscard]] _CCCL_API static constexpr __simd_size_type __min_index(const _MaskStorage& __s) noexcept + { + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + if (__s.__data[__i]) + { + return __i; + } + } + _CCCL_UNREACHABLE(); + } + + [[nodiscard]] _CCCL_API static constexpr __simd_size_type __max_index(const _MaskStorage& __s) noexcept + { + for (__simd_size_type __i = _Np - 1; __i >= 0; --__i) + { + if (__s.__data[__i]) + { + return __i; + } + } + _CCCL_UNREACHABLE(); + } +}; +} // namespace cuda::experimental::simd + +#include + +#endif // _CUDAX___SIMD_SPECIALIZATIONS_FIXED_SIZE_SIMPLE_MASK_H diff --git a/cudax/include/cuda/experimental/__simd/specializations/fixed_size_simple_vec.h b/cudax/include/cuda/experimental/__simd/specializations/fixed_size_simple_vec.h new file mode 100644 index 00000000000..624d373ee8a --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/specializations/fixed_size_simple_vec.h @@ -0,0 +1,218 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_SPECIALIZATIONS_FIXED_SIZE_SIMPLE_VEC_H +#define _CUDAX___SIMD_SPECIALIZATIONS_FIXED_SIZE_SIMPLE_VEC_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include + +#include + +namespace cuda::experimental::simd +{ +namespace simd_abi +{ +template <__simd_size_type _Np> +struct __fixed_size_simple +{ + static constexpr __simd_size_type __simd_size = _Np; +}; +} // namespace simd_abi + +// Element-per-slot simd storage for fixed_size_simple ABI +template +struct __simd_storage<_Tp, simd_abi::__fixed_size_simple<_Np>> +{ + using value_type = _Tp; + _Tp __data[_Np]; + + [[nodiscard]] _CCCL_API constexpr _Tp __get(__simd_size_type __idx) const noexcept + { + _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds"); + return __data[__idx]; + } + + _CCCL_API constexpr void __set(__simd_size_type __idx, _Tp __v) noexcept + { + _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds"); + __data[__idx] = __v; + } +}; + +#define _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_STORAGE_TYPE, _NAME, _OP) \ + [[nodiscard]] _CCCL_API static constexpr _STORAGE_TYPE _NAME( \ + const _STORAGE_TYPE& __lhs, const _STORAGE_TYPE& __rhs) noexcept \ + { \ + _STORAGE_TYPE __result; \ + _CCCL_PRAGMA_UNROLL_FULL() \ + for (__simd_size_type __i = 0; __i < _Np; ++__i) \ + { \ + __result.__data[__i] = (__lhs.__data[__i] _OP __rhs.__data[__i]); \ + } \ + return __result; \ + } + +#define _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP(_NAME, _OP) \ + [[nodiscard]] _CCCL_API static constexpr _MaskStorage _NAME( \ + const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept \ + { \ + 
_MaskStorage __result; \ + _CCCL_PRAGMA_UNROLL_FULL() \ + for (__simd_size_type __i = 0; __i < _Np; ++__i) \ + { \ + __result.__data[__i] = (__lhs.__data[__i] _OP __rhs.__data[__i]); \ + } \ + return __result; \ + } + +// Simd operations for fixed_size_simple ABI +template +struct __simd_operations<_Tp, simd_abi::__fixed_size_simple<_Np>> +{ + using _SimdStorage = __simd_storage<_Tp, simd_abi::__fixed_size_simple<_Np>>; + using _MaskStorage = __mask_storage>; + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage __broadcast(_Tp __v) noexcept + { + _SimdStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = __v; + } + return __result; + } + + template + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __generate_init(_Generator&& __g, ::cuda::std::integer_sequence<__simd_size_type, _Is...>) + { + return _SimdStorage{{__g(::cuda::std::integral_constant<__simd_size_type, _Is>())...}}; + } + + template + [[nodiscard]] _CCCL_API static constexpr _SimdStorage __generate(_Generator&& __g) + { + return __generate_init(__g, ::cuda::std::make_integer_sequence<__simd_size_type, _Np>()); + } + + // Unary operations + + _CCCL_API static constexpr void __increment(_SimdStorage& __s) noexcept + { + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __s.__data[__i] += 1; + } + } + + _CCCL_API static constexpr void __decrement(_SimdStorage& __s) noexcept + { + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __s.__data[__i] -= 1; + } + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage __negate(const _SimdStorage& __s) noexcept + { + _MaskStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = !__s.__data[__i]; + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage __bitwise_not(const _SimdStorage& __s) noexcept + { + 
_SimdStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = ~__s.__data[__i]; + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage __unary_minus(const _SimdStorage& __s) noexcept + { + _SimdStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = -__s.__data[__i]; + } + return __result; + } + + // Binary arithmetic operations + + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_SimdStorage, __plus, +) + + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_SimdStorage, __minus, -) + + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_SimdStorage, __multiplies, *) + + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_SimdStorage, __divides, /) + + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_SimdStorage, __modulo, %) + + // Comparison operations + + _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP(__equal_to, ==) + + _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP(__not_equal_to, !=) + + _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP(__less, <) + + _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP(__less_equal, <=) + + _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP(__greater, >) + + _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP(__greater_equal, >=) + + // Bitwise and shift operations + + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_SimdStorage, __bitwise_and, &) + + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_SimdStorage, __bitwise_or, |) + + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_SimdStorage, __bitwise_xor, ^) + + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_SimdStorage, __shift_left, <<) + + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_SimdStorage, __shift_right, >>) +}; + +#undef _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP +#undef _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP +} // namespace cuda::experimental::simd + +#include + +#endif // _CUDAX___SIMD_SPECIALIZATIONS_FIXED_SIZE_SIMPLE_VEC_H diff --git a/cudax/include/cuda/experimental/__simd/type_traits.h 
b/cudax/include/cuda/experimental/__simd/type_traits.h new file mode 100644 index 00000000000..9815420b1ac --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/type_traits.h @@ -0,0 +1,97 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_TYPE_TRAITS_H +#define _CUDAX___SIMD_TYPE_TRAITS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include +#include +#include + +#include + +namespace cuda::experimental::simd +{ +// [simd.traits], alignment +template +struct alignment; + +template +struct alignment, _Up> + : ::cuda::std::integral_constant<::cuda::std::size_t, alignof(_Up) * __simd_size_v<_Tp, _Abi>> +{ + static_assert(::cuda::__is_valid_alignment(alignof(_Up) * __simd_size_v<_Tp, _Abi>), "Alignment is not valid"); + static_assert(__is_vectorizable_v<_Up>, "U must be a vectorizable type"); +}; + +template +constexpr ::cuda::std::size_t alignment_v = alignment<_Tp, _Up>::value; + +// [simd.traits], rebind +template +struct rebind; + +template +struct rebind<_Tp, basic_vec<_Up, _Abi>> +{ + static_assert(__is_vectorizable_v<_Tp>, "T must be a vectorizable type"); + using type = basic_vec<_Tp, simd_abi::__deduce_abi_t<_Tp, __simd_size_v<_Up, _Abi>>>; +}; + +template +struct rebind<_Tp, basic_mask<_Bytes, _Abi>> +{ + 
static_assert(__is_vectorizable_v<_Tp>, "T must be a vectorizable type"); + using __integer_t = __integer_from; + using __integer_bytes_t = __integer_from<_Bytes>; + + using type = basic_mask>>; +}; + +template +using rebind_t = typename rebind<_Tp, _Vp>::type; + +// [simd.traits], resize +template <__simd_size_type _Np, typename _Vp> +struct resize; + +template <__simd_size_type _Np, typename _Tp, typename _Abi> +struct resize<_Np, basic_vec<_Tp, _Abi>> +{ + using type = basic_vec<_Tp, simd_abi::__deduce_abi_t<_Tp, _Np>>; +}; + +template <__simd_size_type _Np, ::cuda::std::size_t _Bytes, typename _Abi> +struct resize<_Np, basic_mask<_Bytes, _Abi>> +{ + using type = basic_mask<_Bytes, simd_abi::__deduce_abi_t<__integer_from<_Bytes>, _Np>>; +}; + +template <__simd_size_type _Np, typename _Vp> +using resize_t = typename resize<_Np, _Vp>::type; +} // namespace cuda::experimental::simd + +#include + +#endif // _CUDAX___SIMD_TYPE_TRAITS_H diff --git a/cudax/include/cuda/experimental/__simd/utility.h b/cudax/include/cuda/experimental/__simd/utility.h new file mode 100644 index 00000000000..bf35ccc067e --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/utility.h @@ -0,0 +1,103 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_UTILITY_H +#define _CUDAX___SIMD_UTILITY_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +namespace cuda::experimental::simd +{ +template +constexpr bool __is_abi_tag_v = false; + +template <__simd_size_type _Np> +constexpr bool __is_abi_tag_v> = true; + +template +constexpr bool __is_well_formed = false; + +template +constexpr bool __is_well_formed<_Tp, + _Generator, + _Idx, + ::cuda::std::void_t()( + ::cuda::std::integral_constant<__simd_size_type, _Idx>()))>> = + ::cuda::std::is_convertible_v< + decltype(::cuda::std::declval<_Generator>()(::cuda::std::integral_constant<__simd_size_type, _Idx>())), + _Tp>; + +template +[[nodiscard]] +_CCCL_API constexpr bool __can_generate(::cuda::std::integer_sequence<__simd_size_type, _Indices...>) noexcept +{ + return (true && ... 
&& __is_well_formed<_Tp, _Generator, _Indices>); +} + +template +constexpr bool __can_generate_v = ::cuda::experimental::simd::__can_generate<_Tp, _Generator>( + ::cuda::std::make_integer_sequence<__simd_size_type, _Size>()); + +// Proxy for ranges::size(r) is a constant expression +template +_CCCL_CONCEPT __has_static_size = + _CCCL_REQUIRES_EXPR((_Range))((__simd_size_type{::cuda::std::tuple_size_v<::cuda::std::remove_cvref_t<_Range>>})); + +template +constexpr __simd_size_type __static_range_size_v = + __simd_size_type{::cuda::std::tuple_size_v<::cuda::std::remove_cvref_t<_Range>>}; + +// [simd.flags] alignment assertion for load/store pointers +template +_CCCL_API constexpr void __assert_load_store_alignment([[maybe_unused]] const _Up* __data) noexcept +{ + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + if constexpr (__has_aligned_flag_v<_Flags...>) + { + _CCCL_ASSERT(::cuda::is_aligned(__data, alignment_v<_Vec, _Up>), + "flag_aligned requires data to be aligned to alignment_v>"); + } + else if constexpr (__has_overaligned_flag_v<_Flags...>) + { + _CCCL_ASSERT(::cuda::is_aligned(__data, __overaligned_alignment_v<_Flags...>), + "flag_overaligned requires data to be aligned to N"); + } + } +} +} // namespace cuda::experimental::simd + +#include + +#endif // _CUDAX___SIMD_UTILITY_H diff --git a/cudax/include/cuda/experimental/simd.cuh b/cudax/include/cuda/experimental/simd.cuh new file mode 100644 index 00000000000..d51d2294bcf --- /dev/null +++ b/cudax/include/cuda/experimental/simd.cuh @@ -0,0 +1,18 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef __CUDAX_SIMD___ +#define __CUDAX_SIMD___ + +#include +#include +#include + +#endif // __CUDAX_SIMD___ diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt index 9ae7be85039..c4eaeefbe77 100644 --- a/cudax/test/CMakeLists.txt +++ b/cudax/test/CMakeLists.txt @@ -119,6 +119,10 @@ cudax_add_catch2_test(test_target hierarchy_groups hierarchy/group.cu ) +cudax_add_catch2_test(test_target simd + simd/simd.cu +) + if (cudax_ENABLE_CUFILE) cudax_add_catch2_test(test_target cufile.driver_attributes cufile/driver_attributes.cu diff --git a/cudax/test/simd/simd.cu b/cudax/test/simd/simd.cu new file mode 100644 index 00000000000..5b51fdf1fa2 --- /dev/null +++ b/cudax/test/simd/simd.cu @@ -0,0 +1,345 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+//
+//===----------------------------------------------------------------------===//
+
+// NOTE(review): all '<...>' spans in this file (include targets and template
+// argument lists) were stripped by a whitespace-mangled paste. They have been
+// reconstructed from usage and from the expected values asserted below;
+// verify the exact spellings against the original patch.
+#include <cuda/std/array>
+#include <cuda/std/cstddef>
+#include <cuda/std/type_traits>
+#include <cuda/std/utility>
+
+#include <cuda/experimental/simd.cuh>
+
+#include <testing.cuh>
+
+namespace dp = cuda::experimental::simd;
+
+namespace
+{
+// Generator functor: element i gets value i.
+struct linear_index_gen
+{
+  template <typename Index>
+  __host__ __device__ constexpr int operator()(Index idx) const
+  {
+    return static_cast<int>(idx);
+  }
+};
+
+// Generator functor: element i gets value 2 * i.
+struct linear_index_x2_gen
+{
+  template <typename Index>
+  __host__ __device__ constexpr int operator()(Index idx) const
+  {
+    return static_cast<int>(idx * 2);
+  }
+};
+
+// Generator functor: true for even element indices, false for odd ones.
+struct alternating_mask_gen
+{
+  template <typename Index>
+  __host__ __device__ constexpr bool operator()(Index idx) const
+  {
+    return (idx % 2) == 0;
+  }
+};
+} // namespace
+
+// Element-wise comparison of a simd vec/mask against an expected array of the
+// same width; the width mismatch is rejected at compile time.
+template <typename Simd, typename T, ::cuda::std::size_t N>
+__host__ __device__ void expect_equal(const Simd& actual, const ::cuda::std::array<T, N>& expected)
+{
+  static_assert(N == Simd::size(), "Mismatch between expected values and simd width");
+  for (size_t i = 0; i < N; ++i)
+  {
+    CUDAX_REQUIRE(actual[i] == expected[i]);
+  }
+}
+
+template <typename T, ::cuda::std::size_t N>
+using simd_array_t = ::cuda::std::array<T, N>;
+
+C2H_CCCLRT_TEST("simd.traits", "[simd][traits]")
+{
+  using abi_t    = dp::simd_abi::fixed_size_simple<4>;
+  using simd_t   = dp::vec<int, 4>;
+  using mask_t   = dp::mask<int, 4>;
+  using other_t  = dp::vec<float, 4>;
+  using rebind_t = dp::rebind_simd_t<float, simd_t>;
+
+  STATIC_REQUIRE(dp::is_abi_tag_v<abi_t>);
+  STATIC_REQUIRE(!dp::is_abi_tag_v<int>);
+  // __simd_size_v yields the element count for a valid (type, abi) pair and 0
+  // otherwise — constrained by the original "== 4 / == 4 / == 0" expectations.
+  STATIC_REQUIRE(dp::__simd_size_v<int, abi_t> == 4);
+  STATIC_REQUIRE(dp::__simd_size_v<float, abi_t> == 4);
+  STATIC_REQUIRE(dp::__simd_size_v<int, int> == 0);
+
+  STATIC_REQUIRE(dp::is_vec_v<simd_t>);
+  STATIC_REQUIRE(!dp::is_vec_v<int>);
+  STATIC_REQUIRE(dp::is_mask_v<mask_t>);
+  STATIC_REQUIRE(!dp::is_mask_v<simd_t>);
+
+  // NOTE(review): flag-type spellings were lost in the paste; reconstructed
+  // from the dp::overaligned<64> usage below — verify.
+  STATIC_REQUIRE(dp::is_simd_flag_type_v<decltype(dp::flag_default)>);
+  STATIC_REQUIRE(dp::is_simd_flag_type_v<decltype(dp::flag_aligned)>);
+  STATIC_REQUIRE(dp::is_simd_flag_type_v<decltype(dp::overaligned<64>)>);
+
+  STATIC_REQUIRE(simd_t::size() == 4);
+  STATIC_REQUIRE(mask_t::size() == simd_t::size());
+  // Alignment traits: per-element vs whole-vector; vec<int, 32> pins the
+  // "== 128" case (4 * 32). Argument spellings reconstructed — verify.
+  STATIC_REQUIRE(dp::memory_alignment_v<simd_t, int> == alignof(int));
+  STATIC_REQUIRE(dp::memory_alignment_v<simd_t> == alignof(int) * simd_t::size());
+  STATIC_REQUIRE(dp::memory_alignment_v<dp::vec<int, 32>> == 128);
+  STATIC_REQUIRE(dp::memory_alignment_v<mask_t, bool> == alignof(bool));
+  STATIC_REQUIRE(dp::memory_alignment_v<mask_t> == alignof(bool) * mask_t::size());
+
+  STATIC_REQUIRE(::cuda::std::is_same_v<rebind_t, other_t>);
+}
+
+C2H_CCCLRT_TEST("simd.construction_and_memory", "[simd][construction]")
+{
+  constexpr auto size = 4;
+  using simd_t        = dp::vec<int, size>;
+  using mask_t        = simd_t::mask_type;
+  using array_t       = simd_array_t<int, size>;
+
+  // Broadcast constructor replicates the scalar into every lane.
+  simd_t broadcast(7);
+  expect_equal(broadcast, array_t{7, 7, 7, 7});
+
+  // Generator constructor invokes the functor once per lane index.
+  simd_t generated(linear_index_x2_gen{});
+  expect_equal(generated, array_t{0, 2, 4, 6});
+
+  alignas(64) int storage[size] = {0, 1, 2, 3};
+  simd_t from_ptr(storage, dp::overaligned<64>);
+  expect_equal(from_ptr, array_t{0, 1, 2, 3});
+
+  // copy_to / copy_from round-trip through an overaligned buffer.
+  alignas(64) int roundtrip[size] = {};
+  generated.copy_to(roundtrip, dp::overaligned<64>);
+
+  simd_t loaded;
+  loaded.copy_from(roundtrip, dp::overaligned<64>);
+  expect_equal(loaded, array_t{0, 2, 4, 6});
+
+  // Converting constructor: int lanes widened to float.
+  dp::vec<float, size> widened(generated);
+  expect_equal(widened, ::cuda::std::array{0.0f, 2.0f, 4.0f, 6.0f});
+
+  mask_t from_simd = (generated != simd_t(0));
+  expect_equal(from_simd, ::cuda::std::array{false, true, true, true});
+
+  dp::vec<int, size> assigned = simd_t(linear_index_gen{});
+  assigned                    = generated;
+  expect_equal(assigned, array_t{0, 2, 4, 6});
+
+  auto incremented = generated;
+  ++incremented;
+  expect_equal(incremented, array_t{1, 3, 5, 7});
+
+  auto decremented = incremented;
+  decremented--;
+  expect_equal(decremented, array_t{0, 2, 4, 6});
+}
+
+C2H_CCCLRT_TEST("simd.arithmetic_and_comparisons", "[simd][arithmetic]")
+{
+  using simd_t  = dp::vec<int, 4>;
+  using mask_t  = simd_t::mask_type;
+  using array_t = simd_array_t<int, 4>;
+
+  simd_t lhs(linear_index_gen{}); // {0, 1, 2, 3}
+  simd_t rhs(2);
+
+  auto sum = lhs + rhs;
+  expect_equal(sum, array_t{2, 3, 4, 5});
+
+  auto difference = sum - 1;
+  expect_equal(difference, array_t{1, 2, 3, 4});
+
+  // Mixed vec/scalar operands on either side.
+  auto vec_plus_scalar = lhs + 5;
+  expect_equal(vec_plus_scalar, array_t{5, 6, 7, 8});
+
+  auto scalar_plus_vec = 5 + lhs;
+  expect_equal(scalar_plus_vec, array_t{5, 6, 7, 8});
+
+  auto scalar_minus_vec = 5 - lhs;
+  expect_equal(scalar_minus_vec, array_t{5, 4, 3, 2});
+
+  auto product = lhs * rhs;
+  expect_equal(product, array_t{0, 2, 4, 6});
+
+  auto quotient = product / rhs;
+  expect_equal(quotient, array_t{0, 1, 2, 3});
+
+  auto modulo = product % rhs;
+  expect_equal(modulo, array_t{0, 0, 0, 0});
+
+  auto bit_and = product & simd_t(3);
+  expect_equal(bit_and, array_t{0, 2, 0, 2});
+
+  auto bit_or = bit_and | simd_t(4);
+  expect_equal(bit_or, array_t{4, 6, 4, 6});
+
+  auto bit_xor = bit_and ^ simd_t(1);
+  expect_equal(bit_xor, array_t{1, 3, 1, 3});
+
+  auto vec_or_scalar = lhs | 1;
+  expect_equal(vec_or_scalar, array_t{1, 1, 3, 3});
+
+  auto scalar_or_vec = 1 | lhs;
+  expect_equal(scalar_or_vec, array_t{1, 1, 3, 3});
+
+  auto shift_left = simd_t(1) << lhs;
+  expect_equal(shift_left, array_t{1, 2, 4, 8});
+
+  auto shift_right = shift_left >> simd_t(1);
+  expect_equal(shift_right, array_t{0, 1, 2, 4});
+
+  auto vector_shift_scalar = lhs << 1;
+  expect_equal(vector_shift_scalar, array_t{0, 2, 4, 6});
+
+  auto scalar_shift_vector = 1 << lhs;
+  expect_equal(scalar_shift_vector, array_t{1, 2, 4, 8});
+
+  auto compound = lhs;
+  compound += rhs;
+  compound -= rhs;
+  expect_equal(compound, array_t{0, 1, 2, 3});
+
+  // 3 & 2 == 2; 2 | 4 == 6; 6 ^ 1 == 7 in every lane.
+  auto bitwise_compound = simd_t(3);
+  bitwise_compound &= rhs;
+  bitwise_compound |= simd_t(4);
+  bitwise_compound ^= simd_t(1);
+  expect_equal(bitwise_compound, array_t{7, 7, 7, 7});
+
+  auto shift_compound = simd_t(1);
+  shift_compound <<= rhs;
+  shift_compound >>= rhs;
+  expect_equal(shift_compound, array_t{1, 1, 1, 1});
+
+  mask_t eq_mask = (lhs == lhs);
+  CUDAX_REQUIRE(dp::all_of(eq_mask));
+  mask_t lt_mask = (lhs < 2);
+  CUDAX_REQUIRE(dp::reduce_count(lt_mask) == 2);
+
+  mask_t scalar_first_lt = (2 <= lhs);
+  CUDAX_REQUIRE(dp::reduce_count(scalar_first_lt) == 2);
+
+  mask_t scalar_eq_rhs = (lhs == 1);
+  CUDAX_REQUIRE(dp::reduce_count(scalar_eq_rhs) == 1);
+
+  mask_t scalar_eq_lhs = (1 == lhs);
+  CUDAX_REQUIRE(dp::reduce_count(scalar_eq_lhs) == 1);
+
+  mask_t ge_mask = (lhs >= 1);
+  CUDAX_REQUIRE(dp::any_of(ge_mask));
+  CUDAX_REQUIRE(!dp::none_of(ge_mask));
+
+  auto negated = -lhs;
+  expect_equal(negated, array_t{0, -1, -2, -3});
+
+  auto bitwise_not = ~lhs;
+  expect_equal(bitwise_not, array_t{-1, -2, -3, -4});
+}
+
+C2H_CCCLRT_TEST("simd.mask", "[simd][mask]")
+{
+  using mask_t           = dp::mask<int, 4>;
+  using simd_t           = dp::vec<int, 4>;
+  using mask_array_t     = ::cuda::std::array<bool, 4>;
+  using simd_array_typed = simd_array_t<int, 4>;
+
+  mask_t alternating(alternating_mask_gen{}); // {T, F, T, F}
+  expect_equal(alternating, mask_array_t{true, false, true, false});
+  CUDAX_REQUIRE(dp::reduce_count(alternating) == 2);
+  CUDAX_REQUIRE(dp::any_of(alternating));
+  CUDAX_REQUIRE(!dp::all_of(alternating));
+  CUDAX_REQUIRE(!dp::none_of(alternating));
+
+  mask_t inverted = !alternating;
+  expect_equal(inverted, mask_array_t{false, true, false, true});
+
+  mask_t zero = alternating & inverted;
+  CUDAX_REQUIRE(dp::none_of(zero));
+
+  mask_t combined = alternating | inverted;
+  CUDAX_REQUIRE(dp::all_of(combined));
+
+  // Explicit conversion to a vec: true -> 1, false -> 0.
+  auto vec_from_mask = static_cast<simd_t>(alternating);
+  expect_equal(vec_from_mask, simd_array_typed{1, 0, 1, 0});
+
+  mask_t xor_mask = alternating ^ inverted;
+  CUDAX_REQUIRE(dp::all_of(xor_mask));
+
+  mask_t assigned = alternating;
+  assigned ^= inverted;
+  CUDAX_REQUIRE(dp::all_of(assigned));
+
+  assigned &= combined;
+  CUDAX_REQUIRE(dp::all_of(assigned));
+
+  mask_t or_test(false);
+  or_test |= alternating;
+  CUDAX_REQUIRE(dp::all_of(or_test == alternating));
+
+  mask_t broadcast_true(true);
+  CUDAX_REQUIRE(dp::all_of(broadcast_true));
+
+  mask_t broadcast_false(false);
+  CUDAX_REQUIRE(dp::none_of(broadcast_false));
+
+  // Element-wise comparison operators
+  mask_t eq_result = (alternating == alternating);
+  CUDAX_REQUIRE(dp::all_of(eq_result));
+
+  mask_t ne_result = (alternating != inverted);
+  CUDAX_REQUIRE(dp::all_of(ne_result));
+
+  mask_t a(true);
+  mask_t b(false);
+  CUDAX_REQUIRE(dp::all_of(a >= b));
+  CUDAX_REQUIRE(dp::all_of(b <= a));
+  CUDAX_REQUIRE(dp::all_of(a > b));
+  CUDAX_REQUIRE(dp::all_of(b < a));
+  CUDAX_REQUIRE(dp::all_of(a >= a));
+  CUDAX_REQUIRE(dp::all_of(a <= a));
+  CUDAX_REQUIRE(dp::none_of(a > a));
+  CUDAX_REQUIRE(dp::none_of(a < a));
+
+  // Unsigned integer constructor: bit i of the value selects element i
+  // (LSB first) — matches the {F, T, F, T} expectation for 0b1010.
+  mask_t from_bits(static_cast<unsigned>(0b1010));
+  expect_equal(from_bits, mask_array_t{false, true, false, true});
+
+  // Unary operators returning basic_vec
+  mask_t pos_input(true);
+  auto plus_result = +pos_input;
+  CUDAX_REQUIRE(dp::all_of(plus_result == simd_t(1)));
+
+  auto minus_result = -pos_input;
+  CUDAX_REQUIRE(dp::all_of(minus_result == simd_t(-1)));
+
+  // Logical operators
+  mask_t logical_and = alternating && inverted;
+  CUDAX_REQUIRE(dp::none_of(logical_and));
+
+  mask_t logical_or = alternating || inverted;
+  CUDAX_REQUIRE(dp::all_of(logical_or));
+}
+
+C2H_CCCLRT_TEST("simd.reference", "[simd][reference]")
+{
+  using simd_t  = dp::vec<int, 4>;
+  using array_t = simd_array_t<int, 4>;
+
+  // operator[] returns a writable reference proxy.
+  simd_t values(linear_index_gen{});
+  values[2] += 5;
+  expect_equal(values, array_t{0, 1, 7, 3});
+
+  using ::cuda::std::swap;
+
+  // swap works proxy<->proxy and proxy<->scalar in either argument order:
+  // {0,1,7,3} -> {3,1,7,0} -> {3,42,7,0} (scalar=1) -> {3,42,1,0} (scalar=7).
+  swap(values[0], values[3]);
+  int scalar = 42;
+  swap(values[1], scalar);
+  swap(scalar, values[2]);
+
+  expect_equal(values, array_t{3, 42, 1, 0});
+  CUDAX_REQUIRE(scalar == 7);
+}