diff --git a/cudax/include/cuda/experimental/__simd/abi.h b/cudax/include/cuda/experimental/__simd/abi.h new file mode 100644 index 00000000000..687c89c9db6 --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/abi.h @@ -0,0 +1,51 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_ABI_H +#define _CUDAX___SIMD_ABI_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include + +namespace cuda::experimental::simd +{ +using __simd_size_type = ::cuda::std::ptrdiff_t; + +// [simd.expos.abi], simd ABI tags +namespace simd_abi +{ +template <__simd_size_type _Np> +struct __fixed_size_simple; // internal ABI tag + +template <__simd_size_type _Np> +using fixed_size_simple = __fixed_size_simple<_Np>; // implementation-defined ABI + +template +using native = fixed_size_simple<1>; // implementation-defined ABI + +template +using __deduce_abi_t = fixed_size_simple<_Np>; // exposition-only +} // namespace simd_abi +} // namespace cuda::experimental::simd + +#include + +#endif // _CUDAX___SIMD_ABI_H diff --git a/cudax/include/cuda/experimental/__simd/basic_mask.h b/cudax/include/cuda/experimental/__simd/basic_mask.h new file mode 100644 index 00000000000..ec61a3bf9c4 --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/basic_mask.h @@ -0,0 +1,434 @@ 
+//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_BASIC_MASK_H +#define _CUDAX___SIMD_BASIC_MASK_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +namespace cuda::experimental::simd +{ +// [simd.mask.class], class template basic_mask +template <::cuda::std::size_t _Bytes, typename _Abi> +class basic_mask : public __mask_operations<_Bytes, _Abi> +{ + static_assert(__is_abi_tag_v<_Abi>, "basic_mask requires a valid ABI tag"); + + template + friend class basic_vec; + + using _Impl = __mask_operations<_Bytes, _Abi>; + using _Storage = typename _Impl::_MaskStorage; + + _Storage __s_; + + struct __storage_tag_t + {}; + static constexpr __storage_tag_t __storage_tag{}; + + _CCCL_API constexpr basic_mask(_Storage __v, __storage_tag_t) noexcept + : __s_{__v} + {} + +public: + using value_type = bool; + using abi_type = _Abi; + + // TODO(fbusato): add simd-iterator + // using iterator = simd-iterator; + // using const_iterator = simd-iterator; + + // constexpr iterator begin() noexcept { return {*this, 0}; } + // constexpr const_iterator begin() const noexcept { return {*this, 0}; } + // constexpr const_iterator cbegin() const noexcept 
{ return {*this, 0}; } + // constexpr default_sentinel_t end() const noexcept { return {}; } + // constexpr default_sentinel_t cend() const noexcept { return {}; } + + static constexpr ::cuda::std::integral_constant<__simd_size_type, __simd_size_v<__integer_from<_Bytes>, _Abi>> size{}; + + static constexpr auto __usize = ::cuda::std::size_t{size}; + + _CCCL_HIDE_FROM_ABI basic_mask() noexcept = default; + + // [simd.mask.ctor], basic_mask constructors + + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES(::cuda::std::same_as<_Up, value_type>) + _CCCL_API constexpr explicit basic_mask(_Up __v) noexcept + : __s_{_Impl::__broadcast(__v)} + {} + + _CCCL_TEMPLATE(::cuda::std::size_t _UBytes, typename _UAbi) + _CCCL_REQUIRES((__simd_size_v<__integer_from<_UBytes>, _UAbi> == size())) + _CCCL_API constexpr explicit basic_mask(const basic_mask<_UBytes, _UAbi>& __x) noexcept + { + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < size; ++__i) + { + __s_.__set(__i, __x[__i]); + } + } + + _CCCL_TEMPLATE(typename _Generator) + _CCCL_REQUIRES(__can_generate_v) + _CCCL_API constexpr explicit basic_mask(_Generator&& __g) + : __s_{_Impl::__generate(__g)} + {} + + _CCCL_TEMPLATE(typename _Tp) + _CCCL_REQUIRES(::cuda::std::same_as<_Tp, ::cuda::std::bitset<__usize>>) + _CCCL_API constexpr basic_mask(const _Tp& __b) noexcept + : __s_{_Impl::__broadcast(false)} + { + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < size; ++__i) + { + __s_.__set(__i, static_cast(__b[__i])); + } + } + + _CCCL_TEMPLATE(typename _Tp) + _CCCL_REQUIRES((::cuda::std::__cccl_is_unsigned_integer_v<_Tp> && !::cuda::std::same_as<_Tp, value_type>) ) + _CCCL_API constexpr explicit basic_mask(_Tp __val) noexcept + : __s_{_Impl::__broadcast(false)} + { + constexpr auto __num_bits = __simd_size_type{::cuda::std::__num_bits_v<_Tp>}; + constexpr auto __size_as_int = size(); + constexpr auto __m = __size_as_int < __num_bits ? 
__size_as_int : __num_bits; + using __uint8_array_t = ::cuda::std::array<::cuda::std::uint8_t, sizeof(_Tp)>; + const auto __val1 = ::cuda::std::bit_cast<__uint8_array_t>(__val); + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < __m; ++__i) + { + const auto __byte = __val1[__i / CHAR_BIT]; + __s_.__set(__i, static_cast((__byte >> (__i % CHAR_BIT)) & _Tp{1})); + } + } + + // [simd.mask.subscr], basic_mask subscript operators + + [[nodiscard]] _CCCL_API constexpr value_type operator[](__simd_size_type __i) const noexcept + { + _CCCL_ASSERT(::cuda::in_range(__i, __simd_size_type{0}, __simd_size_type{size}), "Index is out of bounds"); + return static_cast(__s_.__get(__i)); + } + + // TODO(fbusato): subscript with integral indices, requires permute() + // template + // constexpr resize_t operator[](const I& indices) const; + + // [simd.mask.unary], basic_mask unary operators + + [[nodiscard]] _CCCL_API constexpr basic_mask operator!() const noexcept + { + return {_Impl::__bitwise_not(__s_), __storage_tag}; + } + + template <::cuda::std::size_t _ByteSize> + static constexpr bool __has_integer_from_v = + (_ByteSize == 1 || _ByteSize == 2 || _ByteSize == 4 || _ByteSize == 8 +#if _CCCL_HAS_INT128() + || _ByteSize == 16 +#endif // _CCCL_HAS_INT128() + ); + + _CCCL_TEMPLATE(::cuda::std::size_t _Bp = _Bytes) + _CCCL_REQUIRES(__has_integer_from_v<_Bp>) + [[nodiscard]] _CCCL_API constexpr basic_vec<__integer_from<_Bp>, _Abi> operator+() const noexcept + { + return static_cast, _Abi>>(*this); + } + + _CCCL_TEMPLATE(::cuda::std::size_t _Bp = _Bytes) + _CCCL_REQUIRES((!__has_integer_from_v<_Bp>) ) + _CCCL_API void operator+() const noexcept = delete; + + _CCCL_TEMPLATE(::cuda::std::size_t _Bp = _Bytes) + _CCCL_REQUIRES(__has_integer_from_v<_Bp>) + [[nodiscard]] _CCCL_API constexpr basic_vec<__integer_from<_Bp>, _Abi> operator-() const noexcept + { + return -static_cast, _Abi>>(*this); + } + + _CCCL_TEMPLATE(::cuda::std::size_t _Bp = _Bytes) + 
_CCCL_REQUIRES((!__has_integer_from_v<_Bp>) ) + _CCCL_API void operator-() const noexcept = delete; + + _CCCL_TEMPLATE(::cuda::std::size_t _Bp = _Bytes) + _CCCL_REQUIRES(__has_integer_from_v<_Bp>) + [[nodiscard]] _CCCL_API constexpr basic_vec<__integer_from<_Bp>, _Abi> operator~() const noexcept + { + return ~static_cast, _Abi>>(*this); + } + + _CCCL_TEMPLATE(::cuda::std::size_t _Bp = _Bytes) + _CCCL_REQUIRES((!__has_integer_from_v<_Bp>) ) + _CCCL_API void operator~() const noexcept = delete; + + // [simd.mask.conv], basic_mask conversions + + _CCCL_TEMPLATE(typename _Up, typename _Ap) + _CCCL_REQUIRES((sizeof(_Up) != _Bytes && __simd_size_v<_Up, _Ap> == size())) + _CCCL_API constexpr explicit operator basic_vec<_Up, _Ap>() const noexcept + { + basic_vec<_Up, _Ap> __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < size; ++__i) + { + __result.__s_.__set(__i, static_cast<_Up>((*this)[__i])); + } + return __result; + } + + _CCCL_TEMPLATE(typename _Up, typename _Ap) + _CCCL_REQUIRES((sizeof(_Up) == _Bytes && __simd_size_v<_Up, _Ap> == size())) + _CCCL_API constexpr operator basic_vec<_Up, _Ap>() const noexcept + { + basic_vec<_Up, _Ap> __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < size; ++__i) + { + __result.__s_.__set(__i, static_cast<_Up>((*this)[__i])); + } + return __result; + } + + [[nodiscard]] _CCCL_API constexpr ::cuda::std::bitset<__usize> to_bitset() const noexcept + { + ::cuda::std::bitset<__usize> __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < size; ++__i) + { + __result.set(__i, (*this)[__i]); + } + return __result; + } + + [[nodiscard]] _CCCL_API constexpr unsigned long long to_ullong() const + { + constexpr __simd_size_type __nbits = ::cuda::std::__num_bits_v; + if constexpr (size > __nbits) + { + for (auto __i = __nbits; __i < size; ++__i) + { + _CCCL_ASSERT(!(*this)[__i], "Bit above unsigned long long width is set"); + } + } + return to_bitset().to_ullong(); + } + 
+ // [simd.mask.binary], basic_mask binary operators + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator&&(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return {_Impl::__logic_and(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator||(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return {_Impl::__logic_or(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator&(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return {_Impl::__bitwise_and(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator|(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return {_Impl::__bitwise_or(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator^(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return {_Impl::__bitwise_xor(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + // [simd.mask.cassign], basic_mask compound assignment + + _CCCL_API friend constexpr basic_mask& operator&=(basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return __lhs = __lhs & __rhs; + } + + _CCCL_API friend constexpr basic_mask& operator|=(basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return __lhs = __lhs | __rhs; + } + + _CCCL_API friend constexpr basic_mask& operator^=(basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return __lhs = __lhs ^ __rhs; + } + + // [simd.mask.comparison], basic_mask comparisons (element-wise) + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator==(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return !(__lhs ^ __rhs); + } + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator!=(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return __lhs ^ __rhs; + } + + [[nodiscard]] 
_CCCL_API friend constexpr basic_mask + operator>=(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return __lhs || !__rhs; + } + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator<=(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return !__lhs || __rhs; + } + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator>(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return __lhs && !__rhs; + } + + [[nodiscard]] _CCCL_API friend constexpr basic_mask + operator<(const basic_mask& __lhs, const basic_mask& __rhs) noexcept + { + return !__lhs && __rhs; + } + + // [simd.mask.reductions], reductions + + [[nodiscard]] _CCCL_API friend constexpr bool all_of(const basic_mask& __k) noexcept + { + return _Impl::__all(__k.__s_); + } + + [[nodiscard]] _CCCL_API friend constexpr bool any_of(const basic_mask& __k) noexcept + { + return _Impl::__any(__k.__s_); + } + + [[nodiscard]] _CCCL_API friend constexpr bool none_of(const basic_mask& __k) noexcept + { + return !any_of(__k); + } + + [[nodiscard]] _CCCL_API friend constexpr __simd_size_type reduce_count(const basic_mask& __k) noexcept + { + return _Impl::__count(__k.__s_); + } + + [[nodiscard]] _CCCL_API friend constexpr __simd_size_type reduce_min_index(const basic_mask& __k) + { + _CCCL_ASSERT(any_of(__k), "No bits are set"); + return _Impl::__min_index(__k.__s_); + } + + [[nodiscard]] _CCCL_API friend constexpr __simd_size_type reduce_max_index(const basic_mask& __k) + { + _CCCL_ASSERT(any_of(__k), "No bits are set"); + return _Impl::__max_index(__k.__s_); + } + + // TODO(fbusato): [simd.mask.cond], basic_mask exposition only conditional operators + // friend constexpr basic_mask __simd_select_impl( + // const basic_mask&, const basic_mask&, const basic_mask&) noexcept; + // friend constexpr basic_mask __simd_select_impl( + // const basic_mask&, same_as auto, same_as auto) noexcept; + // template + // friend constexpr vec __simd_select_impl( + // const 
basic_mask&, const T0&, const T1&) noexcept; +}; + +// Scalar bool overloads + +_CCCL_TEMPLATE(typename _Tp) +_CCCL_REQUIRES(::cuda::std::same_as<_Tp, bool>) +[[nodiscard]] _CCCL_API constexpr bool all_of(_Tp __x) noexcept +{ + return __x; +} + +_CCCL_TEMPLATE(typename _Tp) +_CCCL_REQUIRES(::cuda::std::same_as<_Tp, bool>) +[[nodiscard]] _CCCL_API constexpr bool any_of(_Tp __x) noexcept +{ + return __x; +} + +_CCCL_TEMPLATE(typename _Tp) +_CCCL_REQUIRES(::cuda::std::same_as<_Tp, bool>) +[[nodiscard]] _CCCL_API constexpr bool none_of(_Tp __x) noexcept +{ + return !__x; +} + +_CCCL_TEMPLATE(typename _Tp) +_CCCL_REQUIRES(::cuda::std::same_as<_Tp, bool>) +[[nodiscard]] _CCCL_API constexpr __simd_size_type reduce_count(_Tp __x) noexcept +{ + return __x; +} + +_CCCL_TEMPLATE(typename _Tp) +_CCCL_REQUIRES(::cuda::std::same_as<_Tp, bool>) +[[nodiscard]] _CCCL_API constexpr __simd_size_type reduce_min_index(_Tp __x) +{ + _CCCL_ASSERT(__x, "No bits are set"); + return 0; +} + +_CCCL_TEMPLATE(typename _Tp) +_CCCL_REQUIRES(::cuda::std::same_as<_Tp, bool>) +[[nodiscard]] _CCCL_API constexpr __simd_size_type reduce_max_index(_Tp __x) +{ + _CCCL_ASSERT(__x, "No bits are set"); + return 0; +} +} // namespace cuda::experimental::simd + +#include + +#endif // _CUDAX___SIMD_BASIC_MASK_H diff --git a/cudax/include/cuda/experimental/__simd/basic_vec.h b/cudax/include/cuda/experimental/__simd/basic_vec.h new file mode 100644 index 00000000000..dd241191aec --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/basic_vec.h @@ -0,0 +1,524 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_BASIC_VEC_H +#define _CUDAX___SIMD_BASIC_VEC_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +namespace cuda::experimental::simd +{ +// [simd.class], class template basic_vec +template +class basic_vec : public __simd_operations<_Tp, _Abi> +{ + static_assert(__is_vectorizable_v<_Tp>, "basic_vec requires a vectorizable type"); + static_assert(__is_abi_tag_v<_Abi>, "basic_vec requires a valid ABI tag"); + + template <::cuda::std::size_t, typename> + friend class basic_mask; + + using _Impl = __simd_operations<_Tp, _Abi>; + using _Storage = typename _Impl::_SimdStorage; + + _Storage __s_; + + struct __storage_tag_t + {}; + static constexpr __storage_tag_t __storage_tag{}; + + _CCCL_API constexpr basic_vec(_Storage __s, __storage_tag_t) noexcept + : __s_{__s} + {} + +public: + using value_type = _Tp; + using mask_type = basic_mask; + using abi_type = _Abi; + + // operator[] is const only. 
We need this function to set values + _CCCL_API constexpr void __set(__simd_size_type __i, value_type __v) noexcept + { + __s_.__set(__i, __v); + } + + // TODO(fbusato): add simd-iterator + // using iterator = simd-iterator; + // using const_iterator = simd-iterator; + + // constexpr iterator begin() noexcept { return {*this, 0}; } + // constexpr const_iterator begin() const noexcept { return {*this, 0}; } + // constexpr const_iterator cbegin() const noexcept { return {*this, 0}; } + // constexpr default_sentinel_t end() const noexcept { return {}; } + // constexpr default_sentinel_t cend() const noexcept { return {}; } + + static constexpr ::cuda::std::integral_constant<__simd_size_type, __simd_size_v> size{}; + + _CCCL_HIDE_FROM_ABI basic_vec() noexcept = default; + + // [simd.ctor], basic_vec constructors + + // [simd.ctor] value broadcast constructor (explicit overload) + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES((__explicitly_convertible_to) _CCCL_AND(!__is_value_ctor_implicit<_Up, value_type>)) + _CCCL_API constexpr explicit basic_vec(_Up&& __v) noexcept + : __s_{_Impl::__broadcast(static_cast(__v))} + {} + + // [simd.ctor] value broadcast constructor (implicit overload) + _CCCL_TEMPLATE(typename _Up) + _CCCL_REQUIRES((__explicitly_convertible_to) _CCCL_AND(__is_value_ctor_implicit<_Up, value_type>)) + _CCCL_API constexpr basic_vec(_Up&& __v) noexcept + : __s_{_Impl::__broadcast(static_cast(__v))} + {} + + // [simd.ctor] converting constructor from basic_vec (explicit overload) + _CCCL_TEMPLATE(typename _Up, typename _UAbi) + _CCCL_REQUIRES((__simd_size_v<_Up, _UAbi> == size()) _CCCL_AND(__explicitly_convertible_to) + _CCCL_AND(__is_vec_ctor_explicit<_Up, value_type>)) + _CCCL_API constexpr explicit basic_vec(const basic_vec<_Up, _UAbi>& __v) noexcept + { + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < size; ++__i) + { + __s_.__set(__i, static_cast(__v[__i])); + } + } + + // [simd.ctor] converting constructor from basic_vec (implicit 
overload) + _CCCL_TEMPLATE(typename _Up, typename _UAbi) + _CCCL_REQUIRES((__simd_size_v<_Up, _UAbi> == size()) _CCCL_AND(__explicitly_convertible_to) + _CCCL_AND(!__is_vec_ctor_explicit<_Up, value_type>)) + _CCCL_API constexpr basic_vec(const basic_vec<_Up, _UAbi>& __v) noexcept + { + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < size; ++__i) + { + __s_.__set(__i, static_cast(__v[__i])); + } + } + + // [simd.ctor] generator constructor + _CCCL_TEMPLATE(typename _Generator) + _CCCL_REQUIRES(__can_generate_v) + _CCCL_API constexpr explicit basic_vec(_Generator&& __g) + : __s_{_Impl::__generate(__g)} + {} + + // [simd.ctor] range constructor + + template + static constexpr bool __is_compatible_range_v = false; + + template + static constexpr bool __is_compatible_range_v< + _Range, + ::cuda::std::void_t>::value), + ::cuda::std::ranges::range_value_t<_Range>>> = + ::cuda::std::ranges::contiguous_range<_Range> && ::cuda::std::ranges::sized_range<_Range> + && (__simd_size_type{::cuda::std::tuple_size<::cuda::std::remove_cvref_t<_Range>>::value} == size()) + && __is_vectorizable_v<::cuda::std::ranges::range_value_t<_Range>> + && __explicitly_convertible_to>; + + // [simd.ctor] range constructor + _CCCL_TEMPLATE(typename _Range, typename... 
_Flags) + _CCCL_REQUIRES(__is_compatible_range_v<_Range>) + _CCCL_API constexpr basic_vec(_Range&& __range, flags<_Flags...> = {}) + { + static_assert(__has_convert_flag_v<_Flags...> + || __is_value_preserving_v<::cuda::std::ranges::range_value_t<_Range>, value_type>, + "Conversion from range_value_t to value_type is not value-preserving; use flag_convert"); + const auto __data = ::cuda::std::ranges::data(__range); + ::cuda::experimental::simd:: + __assert_load_store_alignment, _Flags...>(__data); + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < size; ++__i) + { + __s_.__set(__i, static_cast(__data[__i])); + } + } + + // [simd.ctor] masked range constructor + _CCCL_TEMPLATE(typename _Range, typename... _Flags) + _CCCL_REQUIRES(__is_compatible_range_v<_Range>) + _CCCL_API constexpr basic_vec(_Range&& __range, const mask_type& __mask, flags<_Flags...> = {}) + { + static_assert(__has_convert_flag_v<_Flags...> + || __is_value_preserving_v<::cuda::std::ranges::range_value_t<_Range>, value_type>, + "Conversion from range_value_t to value_type is not value-preserving; use flag_convert"); + const auto __data = ::cuda::std::ranges::data(__range); + ::cuda::experimental::simd:: + __assert_load_store_alignment, _Flags...>(__data); + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < size; ++__i) + { + __s_.__set(__i, __mask[__i] ? 
static_cast(__data[__i]) : value_type()); + } + } + + // TODO(fbusato): add complex constructor + // constexpr basic_vec(const real-type& __reals, const real-type& __imags = {}) noexcept; + + // [simd.subscr], basic_vec subscript operators + + [[nodiscard]] _CCCL_API constexpr value_type operator[](__simd_size_type __i) const + { + _CCCL_ASSERT(::cuda::in_range(__i, __simd_size_type{0}, __simd_size_type{size}), "Index is out of bounds"); + return __s_.__get(__i); + } + + // TODO(fbusato): subscript with integral indices, requires permute() + // template + // constexpr resize_t<_Idx::size(), basic_vec> operator[](const _Idx& __indices) const; + + // TODO(fbusato): [simd.complex.access], basic_vec complex accessors + // constexpr real-type real() const noexcept; + // constexpr real-type imag() const noexcept; + // constexpr void real(const real-type& __v) noexcept; + // constexpr void imag(const real-type& __v) noexcept; + + // [simd.unary], basic_vec unary operators + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_pre_increment<_Up>) + _CCCL_API constexpr basic_vec& operator++() noexcept + { + _Impl::__increment(__s_); + return *this; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_post_increment<_Up>) + [[nodiscard]] _CCCL_API constexpr basic_vec operator++(int) noexcept + { + const basic_vec __r = *this; + _Impl::__increment(__s_); + return __r; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_pre_decrement<_Up>) + _CCCL_API constexpr basic_vec& operator--() noexcept + { + _Impl::__decrement(__s_); + return *this; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_post_decrement<_Up>) + [[nodiscard]] _CCCL_API constexpr basic_vec operator--(int) noexcept + { + const basic_vec __r = *this; + _Impl::__decrement(__s_); + return __r; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_negate<_Up>) + [[nodiscard]] _CCCL_API constexpr mask_type operator!() const noexcept + { + return 
mask_type{_Impl::__negate(__s_), mask_type::__storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_bitwise_not<_Up>) + [[nodiscard]] _CCCL_API constexpr basic_vec operator~() const noexcept + { + return basic_vec{_Impl::__bitwise_not(__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_unary_plus<_Up>) + [[nodiscard]] _CCCL_API constexpr basic_vec operator+() const noexcept + { + return *this; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_unary_minus<_Up>) + [[nodiscard]] _CCCL_API constexpr basic_vec operator-() const noexcept + { + return basic_vec{_Impl::__unary_minus(__s_), __storage_tag}; + } + + // [simd.binary], basic_vec binary operators + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_binary_plus<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator+(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__plus(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_binary_minus<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator-(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__minus(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_multiplies<_Up>) + [[nodiscard]] + _CCCL_API friend constexpr basic_vec operator*(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__multiplies(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_divides<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator/(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__divides(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_modulo<_Up>) + [[nodiscard]] _CCCL_API friend constexpr 
basic_vec operator%(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__modulo(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_bitwise_and<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator&(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__bitwise_and(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_bitwise_or<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator|(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__bitwise_or(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_bitwise_xor<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator^(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__bitwise_xor(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_shift_left<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator<<(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__shift_left(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_shift_right<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator>>(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return basic_vec{_Impl::__shift_right(__lhs.__s_, __rhs.__s_), __storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_shift_left_size<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec operator<<(const basic_vec& __lhs, __simd_size_type __n) noexcept + { + return __lhs << basic_vec{__n}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_shift_right_size<_Up>) + [[nodiscard]] _CCCL_API friend constexpr basic_vec 
operator>>(const basic_vec& __lhs, __simd_size_type __n) noexcept + { + return __lhs >> basic_vec{__n}; + } + + // [simd.cassign], basic_vec compound assignment + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_binary_plus<_Up>) + _CCCL_API friend constexpr basic_vec& operator+=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs + __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_binary_minus<_Up>) + _CCCL_API friend constexpr basic_vec& operator-=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs - __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_multiplies<_Up>) + _CCCL_API friend constexpr basic_vec& operator*=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs * __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_divides<_Up>) + _CCCL_API friend constexpr basic_vec& operator/=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs / __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_modulo<_Up>) + _CCCL_API friend constexpr basic_vec& operator%=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs % __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_bitwise_and<_Up>) + _CCCL_API friend constexpr basic_vec& operator&=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs & __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_bitwise_or<_Up>) + _CCCL_API friend constexpr basic_vec& operator|=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs | __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_bitwise_xor<_Up>) + _CCCL_API friend constexpr basic_vec& operator^=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs ^ __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + 
_CCCL_REQUIRES(__has_shift_left<_Up>) + _CCCL_API friend constexpr basic_vec& operator<<=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs << __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_shift_right<_Up>) + _CCCL_API friend constexpr basic_vec& operator>>=(basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return __lhs = __lhs >> __rhs; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_shift_left_size<_Up>) + _CCCL_API friend constexpr basic_vec& operator<<=(basic_vec& __lhs, __simd_size_type __n) noexcept + { + return __lhs = __lhs << __n; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_shift_right_size<_Up>) + _CCCL_API friend constexpr basic_vec& operator>>=(basic_vec& __lhs, __simd_size_type __n) noexcept + { + return __lhs = __lhs >> __n; + } + + // [simd.comparison], basic_vec compare operators + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_equal_to<_Up>) + [[nodiscard]] _CCCL_API friend constexpr mask_type operator==(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return mask_type{_Impl::__equal_to(__lhs.__s_, __rhs.__s_), mask_type::__storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_not_equal_to<_Up>) + [[nodiscard]] _CCCL_API friend constexpr mask_type operator!=(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return mask_type{_Impl::__not_equal_to(__lhs.__s_, __rhs.__s_), mask_type::__storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_greater_equal<_Up>) + [[nodiscard]] _CCCL_API friend constexpr mask_type operator>=(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return mask_type{_Impl::__greater_equal(__lhs.__s_, __rhs.__s_), mask_type::__storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_less_equal<_Up>) + [[nodiscard]] _CCCL_API friend constexpr mask_type operator<=(const basic_vec& __lhs, const basic_vec& 
__rhs) noexcept + { + return mask_type{_Impl::__less_equal(__lhs.__s_, __rhs.__s_), mask_type::__storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_greater<_Up>) + [[nodiscard]] _CCCL_API friend constexpr mask_type operator>(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return mask_type{_Impl::__greater(__lhs.__s_, __rhs.__s_), mask_type::__storage_tag}; + } + + _CCCL_TEMPLATE(typename _Up = _Tp) + _CCCL_REQUIRES(__has_less<_Up>) + [[nodiscard]] _CCCL_API friend constexpr mask_type operator<(const basic_vec& __lhs, const basic_vec& __rhs) noexcept + { + return mask_type{_Impl::__less(__lhs.__s_, __rhs.__s_), mask_type::__storage_tag}; + } + + // TODO(fbusato): [simd.cond], basic_vec exposition-only conditional operators + // friend constexpr basic_vec __simd_select_impl( + // const mask_type&, const basic_vec&, const basic_vec&) noexcept; +}; + +// [simd.ctor] deduction guide from contiguous sized range +// Deduces vec, static_cast(ranges::size(r))> +// * it is not possible to use the alias "vec" for the deduction guide +// * "vec" is defined as basic_vec<_Tp, simd_abi::__deduce_abi_t<_Tp, _Np>> +// * where _Np is __simd_size_v<_Tp, tuple_size_v<_Range>> +_CCCL_TEMPLATE(typename _Range, typename... _Ts) +_CCCL_REQUIRES(::cuda::std::ranges::contiguous_range<_Range> _CCCL_AND ::cuda::std::ranges::sized_range<_Range> + _CCCL_AND __has_static_size<_Range>) +basic_vec(_Range&&, _Ts...) + -> basic_vec<::cuda::std::ranges::range_value_t<_Range>, + simd_abi::__deduce_abi_t<::cuda::std::ranges::range_value_t<_Range>, __static_range_size_v<_Range>>>; + +// [simd.ctor] deduction guide from basic_mask +// basic_vec<__integer_from, Abi> is equivalent to decltype(+k): +// * k has type basic_mask<_Bytes, _Abi> +// * +k calls basic_mask::operator+() +// * the return type is basic_vec<__integer_from<_Bp>, _Abi> +// The deduced type is equivalent to decltype(+k), i.e. 
basic_vec<__integer_from, Abi> +_CCCL_TEMPLATE(::cuda::std::size_t _Bytes, typename _Abi) +_CCCL_REQUIRES(__has_unary_plus>) +basic_vec(basic_mask<_Bytes, _Abi>) -> basic_vec<__integer_from<_Bytes>, _Abi>; +} // namespace cuda::experimental::simd + +#include + +#endif // _CUDAX___SIMD_BASIC_VEC_H diff --git a/cudax/include/cuda/experimental/__simd/concepts.h b/cudax/include/cuda/experimental/__simd/concepts.h new file mode 100644 index 00000000000..c479ca2399a --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/concepts.h @@ -0,0 +1,245 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_CONCEPTS_H +#define _CUDAX___SIMD_CONCEPTS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace cuda::experimental::simd +{ +// [simd.expos], explicitly-convertible-to concept + +template +_CCCL_CONCEPT __explicitly_convertible_to = + _CCCL_REQUIRES_EXPR((_To, _From))((static_cast<_To>(::cuda::std::declval<_From>()))); + +// [simd.expos], constexpr-wrapper-like concept + +template +_CCCL_CONCEPT __constexpr_wrapper_like = + ::cuda::std::convertible_to<_Tp, decltype(_Tp::value)> + && ::cuda::std::equality_comparable_with<_Tp, decltype(_Tp::value)> 
+ && ::cuda::std::bool_constant<(_Tp() == _Tp::value)>::value + && ::cuda::std::bool_constant<(static_cast(_Tp()) == _Tp::value)>::value; + +// (c++draft)The conversion from an arithmetic type U to a vectorizable type T is value-preserving if all possible +// values of U can be represented with type T. +template +constexpr bool __is_value_preserving_v = + (::cuda::std::is_integral_v<_From> && ::cuda::std::is_integral_v<_To> + && ::cuda::__is_integer_representable_v<_From, _To>) + || (::cuda::is_floating_point_v<_From> && ::cuda::is_floating_point_v<_To> + && ::cuda::std::__fp_is_implicit_conversion_v<_From, _To>) + || (::cuda::std::is_integral_v<_From> && ::cuda::is_floating_point_v<_To> + && ::cuda::std::numeric_limits<_From>::digits <= ::cuda::std::numeric_limits<_To>::digits); + +template +constexpr bool __is_constexpr_wrapper_value_preserving_v = false; + +template +constexpr bool __is_constexpr_wrapper_value_preserving_v<_From, _ValueType, ::cuda::std::void_t> = + ::cuda::std::is_arithmetic_v<::cuda::std::remove_cvref_t> + && __is_value_preserving_v<::cuda::std::remove_cvref_t, _ValueType>; + +// [simd.ctor] implicit value constructor +template > +_CCCL_CONCEPT __is_value_ctor_implicit = + ::cuda::std::convertible_to<_Up, _ValueType> + && ((!::cuda::std::is_arithmetic_v<_From> && !__constexpr_wrapper_like<_From>) + || (::cuda::std::is_arithmetic_v<_From> && __is_value_preserving_v<_From, _ValueType>) + || (__constexpr_wrapper_like<_From> && __is_constexpr_wrapper_value_preserving_v<_From, _ValueType>) ); + +// [conv.rank], integer conversion rank for [simd.ctor] p7 + +template +inline constexpr int __integer_conversion_rank = 0; + +template <> +inline constexpr int __integer_conversion_rank = 1; +template <> +inline constexpr int __integer_conversion_rank = 1; +template <> +inline constexpr int __integer_conversion_rank = 1; +template <> +inline constexpr int __integer_conversion_rank = 2; +template <> +inline constexpr int __integer_conversion_rank = 2; 
+template <> +inline constexpr int __integer_conversion_rank = 3; +template <> +inline constexpr int __integer_conversion_rank = 3; +template <> +inline constexpr int __integer_conversion_rank = 4; +template <> +inline constexpr int __integer_conversion_rank = 4; +template <> +inline constexpr int __integer_conversion_rank = 5; +template <> +inline constexpr int __integer_conversion_rank = 5; +#if _CCCL_HAS_INT128() +template <> +inline constexpr int __integer_conversion_rank<__int128_t> = 6; +template <> +inline constexpr int __integer_conversion_rank<__uint128_t> = 6; +#endif // _CCCL_HAS_INT128() + +// [conv.rank], floating-point conversion rank for [simd.ctor] p7 + +template +inline constexpr int __fp_conversion_rank = 0; + +#if _CCCL_HAS_NVFP16() +template <> +inline constexpr int __fp_conversion_rank<__half> = 1; +#endif // _CCCL_HAS_NVFP16() +#if _CCCL_HAS_NVBF16() +template <> +inline constexpr int __fp_conversion_rank<__nv_bfloat16> = 1; +#endif // _CCCL_HAS_NVBF16() +template <> +inline constexpr int __fp_conversion_rank = 2; +template <> +inline constexpr int __fp_conversion_rank = 3; +#if _CCCL_HAS_LONG_DOUBLE() +template <> +inline constexpr int __fp_conversion_rank = 4; +#endif // _CCCL_HAS_LONG_DOUBLE() +#if _CCCL_HAS_FLOAT128() +template <> +inline constexpr int __fp_conversion_rank<__float128> = 5; +#endif // _CCCL_HAS_FLOAT128() + +// [simd.ctor] p7: explicit(see below) for basic_vec(const basic_vec&) +// explicit evaluates to true if either: +// - conversion from U to value_type is not value-preserving, or +// - both U and value_type are integral and integer_conversion_rank(U) > rank(value_type), or +// - both U and value_type are floating-point and fp_conversion_rank(U) > rank(value_type) +template +constexpr bool __is_vec_ctor_explicit = + !__is_value_preserving_v<_Up, _ValueType> + || (::cuda::std::is_integral_v<_Up> && ::cuda::std::is_integral_v<_ValueType> + && __integer_conversion_rank<_Up> > __integer_conversion_rank<_ValueType>) + || 
(::cuda::is_floating_point_v<_Up> && ::cuda::is_floating_point_v<_ValueType> + && __fp_conversion_rank<_Up> > __fp_conversion_rank<_ValueType>); + +// [simd.unary], operator constraints + +template +_CCCL_CONCEPT __has_pre_increment = _CCCL_REQUIRES_EXPR((_Tp), _Tp& __t)((++__t)); + +template +_CCCL_CONCEPT __has_post_increment = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t++)); + +template +_CCCL_CONCEPT __has_pre_decrement = _CCCL_REQUIRES_EXPR((_Tp), _Tp& __t)((--__t)); + +template +_CCCL_CONCEPT __has_post_decrement = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t--)); + +template +_CCCL_CONCEPT __has_negate = _CCCL_REQUIRES_EXPR((_Tp), const _Tp __t)((!__t)); + +template +_CCCL_CONCEPT __has_bitwise_not = _CCCL_REQUIRES_EXPR((_Tp), const _Tp __t)((~__t)); + +template +_CCCL_CONCEPT __has_unary_plus = _CCCL_REQUIRES_EXPR((_Tp), const _Tp __t)((+__t)); + +template +_CCCL_CONCEPT __has_unary_minus = _CCCL_REQUIRES_EXPR((_Tp), const _Tp __t)((-__t)); + +// [simd.binary], binary operator constraints + +template +_CCCL_CONCEPT __has_binary_plus = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a + __b)); + +template +_CCCL_CONCEPT __has_binary_minus = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a - __b)); + +template +_CCCL_CONCEPT __has_multiplies = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a * __b)); + +template +_CCCL_CONCEPT __has_divides = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a / __b)); + +template +_CCCL_CONCEPT __has_modulo = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a % __b)); + +template +_CCCL_CONCEPT __has_bitwise_and = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a & __b)); + +template +_CCCL_CONCEPT __has_bitwise_or = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a | __b)); + +template +_CCCL_CONCEPT __has_bitwise_xor = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a ^ __b)); + +template +_CCCL_CONCEPT __has_shift_left = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a << __b)); + +template +_CCCL_CONCEPT __has_shift_right = 
_CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a >> __b)); + +template +_CCCL_CONCEPT __has_shift_left_size = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t << __simd_size_type{})); + +template +_CCCL_CONCEPT __has_shift_right_size = _CCCL_REQUIRES_EXPR((_Tp), _Tp __t)((__t >> __simd_size_type{})); + +// [simd.comparison], comparison operator constraints + +template +_CCCL_CONCEPT __has_equal_to = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a == __b)); + +template +_CCCL_CONCEPT __has_not_equal_to = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a != __b)); + +template +_CCCL_CONCEPT __has_greater_equal = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a >= __b)); + +template +_CCCL_CONCEPT __has_less_equal = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a <= __b)); + +template +_CCCL_CONCEPT __has_greater = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a > __b)); + +template +_CCCL_CONCEPT __has_less = _CCCL_REQUIRES_EXPR((_Tp), _Tp __a, _Tp __b)((__a < __b)); +} // namespace cuda::experimental::simd + +#include + +#endif // _CUDAX___SIMD_CONCEPTS_H diff --git a/cudax/include/cuda/experimental/__simd/declaration.h b/cudax/include/cuda/experimental/__simd/declaration.h new file mode 100644 index 00000000000..bc9d2487f8c --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/declaration.h @@ -0,0 +1,62 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_DECLARATION_H +#define _CUDAX___SIMD_DECLARATION_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include +#include + +#include + +namespace cuda::experimental::simd +{ +template > +class basic_vec; + +template <::cuda::std::size_t _Bytes, typename _Abi = simd_abi::native<__integer_from<_Bytes>>> +class basic_mask; + +template >> +using vec = basic_vec<_Tp, simd_abi::__deduce_abi_t<_Tp, _Np>>; + +template >> +using mask = basic_mask>; + +// specializations + +template +struct __simd_storage; + +template +struct __simd_operations; + +template <::cuda::std::size_t _Bytes, typename _Abi> +struct __mask_storage; + +template <::cuda::std::size_t _Bytes, typename _Abi> +struct __mask_operations; +} // namespace cuda::experimental::simd + +#include + +#endif // _CUDAX___SIMD_DECLARATION_H diff --git a/cudax/include/cuda/experimental/__simd/exposition.h b/cudax/include/cuda/experimental/__simd/exposition.h new file mode 100644 index 00000000000..ae2eeba4857 --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/exposition.h @@ -0,0 +1,60 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_EXPOSITION_H +#define _CUDAX___SIMD_EXPOSITION_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace cuda::experimental::simd +{ +// [simd.expos], exposition-only helpers + +template <::cuda::std::size_t _Bytes> +using __integer_from = ::cuda::std::__make_nbit_int_t<_Bytes * 8, true>; + +// all standard integer types, character types, and the types float and double ([basic.fundamental]); +// std​::​float16_t, std​::​float32_t, and std​::​float64_t if defined ([basic.extended.fp]); and +// TODO(fbusato) complex where T is a vectorizable floating-point type. +template +constexpr bool __is_vectorizable_v = + (::cuda::std::is_integral_v<_Tp> || ::cuda::is_floating_point_v<_Tp>) + && !::cuda::std::is_same_v<_Tp, bool> && !::cuda::std::is_const_v<_Tp> && !::cuda::std::is_volatile_v<_Tp>; + +template +constexpr __simd_size_type __simd_size_v = 0; + +template +constexpr __simd_size_type __simd_size_v<_Tp, simd_abi::fixed_size_simple<_Np>> = _Np; +} // namespace cuda::experimental::simd + +#include + +#endif // _CUDAX___SIMD_EXPOSITION_H diff --git a/cudax/include/cuda/experimental/__simd/flag.h b/cudax/include/cuda/experimental/__simd/flag.h new file mode 100644 index 00000000000..d2aad6e8ba8 --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/flag.h @@ -0,0 +1,105 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_FLAG_H +#define _CUDAX___SIMD_FLAG_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include + +namespace cuda::experimental::simd +{ +// [simd.expos], exposition-only flag types + +struct __convert_flag +{}; + +struct __aligned_flag +{}; + +template <::cuda::std::size_t _Np> +struct __overaligned_flag +{ + static_assert(::cuda::is_power_of_two(_Np), "Overaligned flag requires a power-of-2 alignment"); +}; + +template +constexpr bool __is_flag_type_v = false; + +template <> +constexpr bool __is_flag_type_v<__convert_flag> = true; + +template <> +constexpr bool __is_flag_type_v<__aligned_flag> = true; + +template <::cuda::std::size_t _Np> +constexpr bool __is_flag_type_v<__overaligned_flag<_Np>> = true; + +// [simd.flags.overview], class template flags + +template +struct flags +{ + static_assert((true && ... && __is_flag_type_v<_Flags>), + "Every flag type must be one of convert_flag, aligned_flag, or overaligned_flag"); + + // [simd.flags.oper], flags operators + template + [[nodiscard]] _CCCL_API friend constexpr flags<_Flags..., _Other...> operator|(flags, flags<_Other...>) noexcept + { + return {}; + } +}; + +// [simd.flags], flag constants + +inline constexpr flags<> flag_default{}; +inline constexpr flags<__convert_flag> flag_convert{}; +inline constexpr flags<__aligned_flag> flag_aligned{}; + +template <::cuda::std::size_t _Np> +constexpr flags<__overaligned_flag<_Np>> flag_overaligned{}; + +template +constexpr bool __has_convert_flag_v = (false || ... 
|| ::cuda::std::is_same_v<_Flags, __convert_flag>); + +template +constexpr bool __has_aligned_flag_v = (false || ... || ::cuda::std::is_same_v<_Flags, __aligned_flag>); + +template +constexpr ::cuda::std::size_t __overaligned_value_v = 0; + +template <::cuda::std::size_t _Np> +constexpr ::cuda::std::size_t __overaligned_value_v<__overaligned_flag<_Np>> = _Np; + +template +constexpr bool __has_overaligned_flag_v = (false || ... || (__overaligned_value_v<_Flags> != 0)); + +template +constexpr ::cuda::std::size_t __overaligned_alignment_v = + (::cuda::std::size_t{0} | ... | __overaligned_value_v<_Flags>); +} // namespace cuda::experimental::simd + +#include + +#endif // _CUDAX___SIMD_FLAG_H diff --git a/cudax/include/cuda/experimental/__simd/load_store.h b/cudax/include/cuda/experimental/__simd/load_store.h new file mode 100644 index 00000000000..10cc75d6b6e --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/load_store.h @@ -0,0 +1,453 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_LOAD_STORE_H +#define _CUDAX___SIMD_LOAD_STORE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +namespace cuda::experimental::simd +{ +// [simd.loadstore] helper: resolves default V template parameter for load functions +// When _Vp = void (default), resolves to basic_vec<_Up>; otherwise uses the explicit _Vp +template +struct __load_vec_type +{ + using type = _Vp; +}; + +template +struct __load_vec_type +{ + using type = basic_vec<_Up>; +}; + +template +using __load_vec_t = typename __load_vec_type<_Vp, _Up>::type; + +// [simd.loadstore] helper: core partial load from pointer + count + mask +template +[[nodiscard]] _CCCL_API constexpr _Result +__partial_load_from_ptr(const _Up* __ptr, __simd_size_type __count, const typename _Result::mask_type& __mask) +{ + using _Tp = typename _Result::value_type; + static_assert(::cuda::std::same_as<::cuda::std::remove_cvref_t<_Result>, _Result>, + "V must not be a reference or cv-qualified type"); + static_assert(__is_vectorizable_v<_Tp> && __is_abi_tag_v, + "V must be an enabled specialization of basic_vec"); + static_assert(__is_vectorizable_v<_Up>, "range_value_t must be a vectorizable type"); + static_assert(__explicitly_convertible_to<_Tp, _Up>, + "range_value_t must satisfy explicitly-convertible-to"); + static_assert(__has_convert_flag_v<_Flags...> || __is_value_preserving_v<_Up, _Tp>, + "Conversion from range_value_t to value_type is not value-preserving; use flag_convert"); + 
::cuda::experimental::simd::__assert_load_store_alignment<_Result, _Up, _Flags...>(__ptr); + _Result __result{}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Result::size; ++__i) + { + if (__mask[__i] && __i < __count) + { + __result.__set(__i, static_cast<_Tp>(__ptr[__i])); + } + } + return __result; +} + +// [simd.loadstore] helper: core partial store to pointer + count + mask +template +_CCCL_API constexpr void __partial_store_to_ptr( + const basic_vec<_Tp, _Abi>& __v, + _Up* __ptr, + __simd_size_type __count, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask) +{ + static_assert(__is_vectorizable_v<_Up>, "range_value_t must be a vectorizable type"); + static_assert(__explicitly_convertible_to<_Up, _Tp>, + "value_type must satisfy explicitly-convertible-to>"); + static_assert(__has_convert_flag_v<_Flags...> || __is_value_preserving_v<_Tp, _Up>, + "Conversion from value_type to range_value_t is not value-preserving; use flag_convert"); + ::cuda::experimental::simd::__assert_load_store_alignment, _Up, _Flags...>(__ptr); + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < basic_vec<_Tp, _Abi>::size; ++__i) + { + if (__mask[__i] && __i < __count) + { + __ptr[__i] = static_cast<_Up>(__v[__i]); + } + } +} + +//---------------------------------------------------------------------------------------------------------------------- +// [simd.loadstore] partial_load + +// partial_load: range, masked +_CCCL_TEMPLATE(typename _Vp = void, typename _Range, typename... 
_Flags) +_CCCL_REQUIRES(::cuda::std::ranges::contiguous_range<_Range> _CCCL_AND ::cuda::std::ranges::sized_range<_Range>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::ranges::range_value_t<_Range>> partial_load( + _Range&& __r, + const typename __load_vec_t<_Vp, ::cuda::std::ranges::range_value_t<_Range>>::mask_type& __mask, + flags<_Flags...> = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::ranges::range_value_t<_Range>>; + using _Up = ::cuda::std::ranges::range_value_t<_Range>; + return ::cuda::experimental::simd::__partial_load_from_ptr<_Result, _Up, _Flags...>( + ::cuda::std::ranges::data(__r), static_cast<__simd_size_type>(::cuda::std::ranges::size(__r)), __mask); +} + +// partial_load: range, no mask +_CCCL_TEMPLATE(typename _Vp = void, typename _Range, typename... _Flags) +_CCCL_REQUIRES(::cuda::std::ranges::contiguous_range<_Range> _CCCL_AND ::cuda::std::ranges::sized_range<_Range>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::ranges::range_value_t<_Range>> +partial_load(_Range&& __r, flags<_Flags...> __f = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::ranges::range_value_t<_Range>>; + return ::cuda::experimental::simd::partial_load<_Vp>( + ::cuda::std::forward<_Range>(__r), typename _Result::mask_type(true), __f); +} + +// partial_load: iterator + count, masked +_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename... 
_Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>> partial_load( + _Ip __first, + ::cuda::std::iter_difference_t<_Ip> __n, + const typename __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>::mask_type& __mask, + flags<_Flags...> = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>; + using _Up = ::cuda::std::iter_value_t<_Ip>; + return ::cuda::experimental::simd::__partial_load_from_ptr<_Result, _Up, _Flags...>( + ::cuda::std::to_address(__first), static_cast<__simd_size_type>(__n), __mask); +} + +// partial_load: iterator + count, no mask +_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename... _Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>> +partial_load(_Ip __first, ::cuda::std::iter_difference_t<_Ip> __n, flags<_Flags...> __f = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>; + return ::cuda::experimental::simd::partial_load<_Vp>(__first, __n, typename _Result::mask_type(true), __f); +} + +// partial_load: iterator + sentinel, masked +_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename _Sp, typename... 
_Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip> _CCCL_AND ::cuda::std::sized_sentinel_for<_Sp, _Ip>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>> partial_load( + _Ip __first, + _Sp __last, + const typename __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>::mask_type& __mask, + flags<_Flags...> = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>; + using _Up = ::cuda::std::iter_value_t<_Ip>; + return ::cuda::experimental::simd::__partial_load_from_ptr<_Result, _Up, _Flags...>( + ::cuda::std::to_address(__first), static_cast<__simd_size_type>(::cuda::std::distance(__first, __last)), __mask); +} + +// partial_load: iterator + sentinel, no mask +_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename _Sp, typename... _Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip> _CCCL_AND ::cuda::std::sized_sentinel_for<_Sp, _Ip>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>> +partial_load(_Ip __first, _Sp __last, flags<_Flags...> __f = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>; + return ::cuda::experimental::simd::partial_load<_Vp>(__first, __last, typename _Result::mask_type(true), __f); +} + +//---------------------------------------------------------------------------------------------------------------------- +// [simd.loadstore] unchecked_load + +// unchecked_load: range, masked +_CCCL_TEMPLATE(typename _Vp = void, typename _Range, typename... 
_Flags) +_CCCL_REQUIRES(::cuda::std::ranges::contiguous_range<_Range> _CCCL_AND ::cuda::std::ranges::sized_range<_Range>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::ranges::range_value_t<_Range>> unchecked_load( + _Range&& __r, + const typename __load_vec_t<_Vp, ::cuda::std::ranges::range_value_t<_Range>>::mask_type& __mask, + flags<_Flags...> __f = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::ranges::range_value_t<_Range>>; + if constexpr (__has_static_size<_Range>) + { + static_assert(__static_range_size_v<_Range> >= _Result::size(), + "unchecked_load requires ranges::size(r) >= V::size()"); + } + _CCCL_ASSERT(::cuda::std::cmp_greater_equal(::cuda::std::ranges::size(__r), _Result::size()), + "unchecked_load requires ranges::size(r) >= V::size()"); + return ::cuda::experimental::simd::partial_load<_Vp>(::cuda::std::forward<_Range>(__r), __mask, __f); +} + +// unchecked_load: range, no mask +_CCCL_TEMPLATE(typename _Vp = void, typename _Range, typename... _Flags) +_CCCL_REQUIRES(::cuda::std::ranges::contiguous_range<_Range> _CCCL_AND ::cuda::std::ranges::sized_range<_Range>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::ranges::range_value_t<_Range>> +unchecked_load(_Range&& __r, flags<_Flags...> __f = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::ranges::range_value_t<_Range>>; + return ::cuda::experimental::simd::unchecked_load<_Vp>( + ::cuda::std::forward<_Range>(__r), typename _Result::mask_type(true), __f); +} + +// unchecked_load: iterator + count, masked +_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename... 
_Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>> unchecked_load( + _Ip __first, + ::cuda::std::iter_difference_t<_Ip> __n, + const typename __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>::mask_type& __mask, + flags<_Flags...> __f = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>; + _CCCL_ASSERT(::cuda::std::cmp_greater_equal(__n, _Result::size()), "unchecked_load requires n >= V::size()"); + return ::cuda::experimental::simd::partial_load<_Vp>(__first, __n, __mask, __f); +} + +// unchecked_load: iterator + count, no mask +_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename... _Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>> +unchecked_load(_Ip __first, ::cuda::std::iter_difference_t<_Ip> __n, flags<_Flags...> __f = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>; + return ::cuda::experimental::simd::unchecked_load<_Vp>(__first, __n, typename _Result::mask_type(true), __f); +} + +// unchecked_load: iterator + sentinel, masked +_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename _Sp, typename... 
_Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip> _CCCL_AND ::cuda::std::sized_sentinel_for<_Sp, _Ip>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>> unchecked_load( + _Ip __first, + _Sp __last, + const typename __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>::mask_type& __mask, + flags<_Flags...> __f = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>; + _CCCL_ASSERT(::cuda::std::cmp_greater_equal(::cuda::std::distance(__first, __last), _Result::size()), + "unchecked_load requires distance(first, last) >= V::size()"); + return ::cuda::experimental::simd::partial_load<_Vp>(__first, __last, __mask, __f); +} + +// unchecked_load: iterator + sentinel, no mask +_CCCL_TEMPLATE(typename _Vp = void, typename _Ip, typename _Sp, typename... _Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip> _CCCL_AND ::cuda::std::sized_sentinel_for<_Sp, _Ip>) +[[nodiscard]] _CCCL_API constexpr __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>> +unchecked_load(_Ip __first, _Sp __last, flags<_Flags...> __f = {}) +{ + using _Result = __load_vec_t<_Vp, ::cuda::std::iter_value_t<_Ip>>; + return ::cuda::experimental::simd::unchecked_load<_Vp>(__first, __last, typename _Result::mask_type(true), __f); +} + +//---------------------------------------------------------------------------------------------------------------------- +// [simd.loadstore] partial_store + +// partial_store: range, masked +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Range, typename... 
_Flags) +_CCCL_REQUIRES(::cuda::std::ranges::contiguous_range<_Range> _CCCL_AND ::cuda::std::ranges::sized_range<_Range> + _CCCL_AND __explicitly_convertible_to<::cuda::std::ranges::range_value_t<_Range>, _Tp>) +_CCCL_API constexpr void partial_store( + const basic_vec<_Tp, _Abi>& __v, + _Range&& __r, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask, + flags<_Flags...> = {}) +{ + static_assert( + ::cuda::std::indirectly_writable<::cuda::std::ranges::iterator_t<_Range>, ::cuda::std::ranges::range_value_t<_Range>>, + "ranges::iterator_t must model indirectly_writable>"); + using _Up = ::cuda::std::ranges::range_value_t<_Range>; + ::cuda::experimental::simd::__partial_store_to_ptr<_Tp, _Abi, _Up, _Flags...>( + __v, ::cuda::std::ranges::data(__r), static_cast<__simd_size_type>(::cuda::std::ranges::size(__r)), __mask); +} + +// partial_store: range, no mask +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Range, typename... _Flags) +_CCCL_REQUIRES(::cuda::std::ranges::contiguous_range<_Range> _CCCL_AND ::cuda::std::ranges::sized_range<_Range> + _CCCL_AND __explicitly_convertible_to<::cuda::std::ranges::range_value_t<_Range>, _Tp>) +_CCCL_API constexpr void partial_store(const basic_vec<_Tp, _Abi>& __v, _Range&& __r, flags<_Flags...> __f = {}) +{ + ::cuda::experimental::simd::partial_store( + __v, ::cuda::std::forward<_Range>(__r), typename basic_vec<_Tp, _Abi>::mask_type(true), __f); +} + +// partial_store: iterator + count, masked +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename... 
_Flags) +_CCCL_REQUIRES( + ::cuda::std::contiguous_iterator<_Ip> _CCCL_AND __explicitly_convertible_to<::cuda::std::iter_value_t<_Ip>, _Tp>) +_CCCL_API constexpr void partial_store( + const basic_vec<_Tp, _Abi>& __v, + _Ip __first, + ::cuda::std::iter_difference_t<_Ip> __n, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask, + flags<_Flags...> = {}) +{ + static_assert(::cuda::std::indirectly_writable<_Ip, ::cuda::std::iter_value_t<_Ip>>, + "I must model indirectly_writable>"); + using _Up = ::cuda::std::iter_value_t<_Ip>; + ::cuda::experimental::simd::__partial_store_to_ptr<_Tp, _Abi, _Up, _Flags...>( + __v, ::cuda::std::to_address(__first), static_cast<__simd_size_type>(__n), __mask); +} + +// partial_store: iterator + count, no mask +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename... _Flags) +_CCCL_REQUIRES( + ::cuda::std::contiguous_iterator<_Ip> _CCCL_AND __explicitly_convertible_to<::cuda::std::iter_value_t<_Ip>, _Tp>) +_CCCL_API constexpr void partial_store( + const basic_vec<_Tp, _Abi>& __v, _Ip __first, ::cuda::std::iter_difference_t<_Ip> __n, flags<_Flags...> __f = {}) +{ + ::cuda::experimental::simd::partial_store(__v, __first, __n, typename basic_vec<_Tp, _Abi>::mask_type(true), __f); +} + +// partial_store: iterator + sentinel, masked +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename _Sp, typename... 
_Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip> _CCCL_AND ::cuda::std::sized_sentinel_for<_Sp, _Ip> _CCCL_AND + __explicitly_convertible_to<::cuda::std::iter_value_t<_Ip>, _Tp>) +_CCCL_API constexpr void partial_store( + const basic_vec<_Tp, _Abi>& __v, + _Ip __first, + _Sp __last, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask, + flags<_Flags...> = {}) +{ + static_assert(::cuda::std::indirectly_writable<_Ip, ::cuda::std::iter_value_t<_Ip>>, + "I must model indirectly_writable>"); + using _Up = ::cuda::std::iter_value_t<_Ip>; + ::cuda::experimental::simd::__partial_store_to_ptr<_Tp, _Abi, _Up, _Flags...>( + __v, + ::cuda::std::to_address(__first), + static_cast<__simd_size_type>(::cuda::std::distance(__first, __last)), + __mask); +} + +// partial_store: iterator + sentinel, no mask +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename _Sp, typename... _Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip> _CCCL_AND ::cuda::std::sized_sentinel_for<_Sp, _Ip> _CCCL_AND + __explicitly_convertible_to<::cuda::std::iter_value_t<_Ip>, _Tp>) +_CCCL_API constexpr void +partial_store(const basic_vec<_Tp, _Abi>& __v, _Ip __first, _Sp __last, flags<_Flags...> __f = {}) +{ + ::cuda::experimental::simd::partial_store(__v, __first, __last, typename basic_vec<_Tp, _Abi>::mask_type(true), __f); +} + +//---------------------------------------------------------------------------------------------------------------------- +// [simd.loadstore] unchecked_store + +// unchecked_store: range, masked +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Range, typename... 
_Flags) +_CCCL_REQUIRES(::cuda::std::ranges::contiguous_range<_Range> _CCCL_AND ::cuda::std::ranges::sized_range<_Range> + _CCCL_AND __explicitly_convertible_to<::cuda::std::ranges::range_value_t<_Range>, _Tp>) +_CCCL_API constexpr void unchecked_store( + const basic_vec<_Tp, _Abi>& __v, + _Range&& __r, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask, + flags<_Flags...> __f = {}) +{ + if constexpr (__has_static_size<_Range>) + { + static_assert(__static_range_size_v<_Range> >= basic_vec<_Tp, _Abi>::size(), + "unchecked_store requires ranges::size(r) >= V::size()"); + } + _CCCL_ASSERT(::cuda::std::cmp_greater_equal(::cuda::std::ranges::size(__r), __v.size), + "unchecked_store requires ranges::size(r) >= V::size()"); + ::cuda::experimental::simd::partial_store(__v, ::cuda::std::forward<_Range>(__r), __mask, __f); +} + +// unchecked_store: range, no mask +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Range, typename... _Flags) +_CCCL_REQUIRES(::cuda::std::ranges::contiguous_range<_Range> _CCCL_AND ::cuda::std::ranges::sized_range<_Range> + _CCCL_AND __explicitly_convertible_to<::cuda::std::ranges::range_value_t<_Range>, _Tp>) +_CCCL_API constexpr void unchecked_store(const basic_vec<_Tp, _Abi>& __v, _Range&& __r, flags<_Flags...> __f = {}) +{ + ::cuda::experimental::simd::unchecked_store( + __v, ::cuda::std::forward<_Range>(__r), typename basic_vec<_Tp, _Abi>::mask_type(true), __f); +} + +// unchecked_store: iterator + count, masked +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename... 
_Flags) +_CCCL_REQUIRES( + ::cuda::std::contiguous_iterator<_Ip> _CCCL_AND __explicitly_convertible_to<::cuda::std::iter_value_t<_Ip>, _Tp>) +_CCCL_API constexpr void unchecked_store( + const basic_vec<_Tp, _Abi>& __v, + _Ip __first, + ::cuda::std::iter_difference_t<_Ip> __n, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask, + flags<_Flags...> __f = {}) +{ + _CCCL_ASSERT(::cuda::std::cmp_greater_equal(__n, __v.size), "unchecked_store requires n >= V::size()"); + ::cuda::experimental::simd::partial_store(__v, __first, __n, __mask, __f); +} + +// unchecked_store: iterator + count, no mask +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename... _Flags) +_CCCL_REQUIRES( + ::cuda::std::contiguous_iterator<_Ip> _CCCL_AND __explicitly_convertible_to<::cuda::std::iter_value_t<_Ip>, _Tp>) +_CCCL_API constexpr void unchecked_store( + const basic_vec<_Tp, _Abi>& __v, _Ip __first, ::cuda::std::iter_difference_t<_Ip> __n, flags<_Flags...> __f = {}) +{ + ::cuda::experimental::simd::unchecked_store(__v, __first, __n, typename basic_vec<_Tp, _Abi>::mask_type(true), __f); +} + +// unchecked_store: iterator + sentinel, masked +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename _Sp, typename... 
_Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip> _CCCL_AND ::cuda::std::sized_sentinel_for<_Sp, _Ip> _CCCL_AND + __explicitly_convertible_to<::cuda::std::iter_value_t<_Ip>, _Tp>) +_CCCL_API constexpr void unchecked_store( + const basic_vec<_Tp, _Abi>& __v, + _Ip __first, + _Sp __last, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask, + flags<_Flags...> __f = {}) +{ + _CCCL_ASSERT(::cuda::std::cmp_greater_equal(::cuda::std::distance(__first, __last), __v.size), + "unchecked_store requires distance(first, last) >= V::size()"); + ::cuda::experimental::simd::partial_store(__v, __first, __last, __mask, __f); +} + +// unchecked_store: iterator + sentinel, no mask +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _Ip, typename _Sp, typename... _Flags) +_CCCL_REQUIRES(::cuda::std::contiguous_iterator<_Ip> _CCCL_AND ::cuda::std::sized_sentinel_for<_Sp, _Ip> _CCCL_AND + __explicitly_convertible_to<::cuda::std::iter_value_t<_Ip>, _Tp>) +_CCCL_API constexpr void +unchecked_store(const basic_vec<_Tp, _Abi>& __v, _Ip __first, _Sp __last, flags<_Flags...> __f = {}) +{ + ::cuda::experimental::simd::unchecked_store(__v, __first, __last, typename basic_vec<_Tp, _Abi>::mask_type(true), __f); +} +} // namespace cuda::experimental::simd + +#include + +#endif // _CUDAX___SIMD_LOAD_STORE_H diff --git a/cudax/include/cuda/experimental/__simd/reductions.h b/cudax/include/cuda/experimental/__simd/reductions.h new file mode 100644 index 00000000000..36bff50ab1a --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/reductions.h @@ -0,0 +1,216 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_REDUCTIONS_H +#define _CUDAX___SIMD_REDUCTIONS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace cuda::experimental::simd +{ +// [simd.expos], reduction-binary-operation concept + +template +_CCCL_CONCEPT __reduction_binary_operation = _CCCL_REQUIRES_EXPR( + (_BinaryOp, _Tp), const _BinaryOp __binary_op, const vec<_Tp, 1> __v)(_Same_as(vec<_Tp, 1>) __binary_op(__v, __v)); + +template +constexpr bool __is_reduce_default_supported_operation_v = + ::cuda::std::is_same_v<_BinaryOp, ::cuda::std::plus<>> || ::cuda::std::is_same_v<_BinaryOp, ::cuda::std::multiplies<>> + || ::cuda::std::is_same_v<_BinaryOp, ::cuda::std::bit_and<>> + || ::cuda::std::is_same_v<_BinaryOp, ::cuda::std::bit_or<>> + || ::cuda::std::is_same_v<_BinaryOp, ::cuda::std::bit_xor<>>; + +template +[[nodiscard]] _CCCL_API constexpr _Tp __default_identity_element() noexcept +{ + if constexpr (::cuda::std::is_same_v<_BinaryOp, ::cuda::std::plus<>> + || ::cuda::std::is_same_v<_BinaryOp, ::cuda::std::bit_or<>> + || ::cuda::std::is_same_v<_BinaryOp, ::cuda::std::bit_xor<>>) + { + return _Tp(); + } + else if constexpr (::cuda::std::is_same_v<_BinaryOp, ::cuda::std::multiplies<>>) + { + return _Tp(1); + } + else if constexpr (::cuda::std::is_same_v<_BinaryOp, ::cuda::std::bit_and<>>) + { + return _Tp(~_Tp()); + } + else + { + static_assert(::cuda::std::__always_false_v<_Tp>, + "No default identity element for this BinaryOperation; provide one explicitly"); + return _Tp(); + } +} + +// [simd.reductions], reduce + +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename 
_BinaryOperation = ::cuda::std::plus<>) +_CCCL_REQUIRES(__reduction_binary_operation<_BinaryOperation, _Tp>) +[[nodiscard]] _CCCL_API constexpr _Tp +reduce(const basic_vec<_Tp, _Abi>& __x, _BinaryOperation __binary_op = ::cuda::std::plus<>{}) +{ + vec<_Tp, 1> __result{__x[0]}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 1; __i < __x.size; ++__i) + { + __result = __binary_op(__result, vec<_Tp, 1>{__x[__i]}); + } + return __result[0]; +} + +// We need two overloads: +// 1) An argument for identity_element is provided for the invocation +// 2) unless BinaryOperation is one of plus<>, multiplies<>, bit_and<>, bit_or<>, or bit_xor<> +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _BinaryOperation) +_CCCL_REQUIRES(__reduction_binary_operation<_BinaryOperation, _Tp>) +[[nodiscard]] _CCCL_API constexpr _Tp +reduce(const basic_vec<_Tp, _Abi>& __x, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask, + _BinaryOperation __binary_op, + ::cuda::std::type_identity_t<_Tp> __identity_element) +{ + vec<_Tp, 1> __result{__identity_element}; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < __x.size; ++__i) + { + if (__mask[__i]) + { + __result = __binary_op(__result, vec<_Tp, 1>{__x[__i]}); + } + } + return __result[0]; +} + +_CCCL_TEMPLATE(typename _Tp, typename _Abi, typename _BinaryOperation) +_CCCL_REQUIRES(__reduction_binary_operation<_BinaryOperation, _Tp> _CCCL_AND + __is_reduce_default_supported_operation_v<_BinaryOperation>) +[[nodiscard]] _CCCL_API constexpr _Tp +reduce(const basic_vec<_Tp, _Abi>& __x, + const typename basic_vec<_Tp, _Abi>::mask_type& __mask, + _BinaryOperation __binary_op = ::cuda::std::plus<>{}) +{ + return ::cuda::experimental::simd::reduce( + __x, __mask, __binary_op, ::cuda::experimental::simd::__default_identity_element<_Tp, _BinaryOperation>()); +} + +// [simd.reductions], reduce_min + +_CCCL_TEMPLATE(typename _Tp, typename _Abi) +_CCCL_REQUIRES(::cuda::std::totally_ordered<_Tp>) +[[nodiscard]] _CCCL_API 
constexpr _Tp reduce_min(const basic_vec<_Tp, _Abi>& __x) noexcept +{ + static_assert(__x.size > 0, "Vector is empty"); + auto __result = __x[0]; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 1; __i < __x.size; ++__i) + { + const auto __val = __x[__i]; + if (!(__result < __val)) + { + __result = __val; + } + } + return __result; +} + +_CCCL_TEMPLATE(typename _Tp, typename _Abi) +_CCCL_REQUIRES(::cuda::std::totally_ordered<_Tp>) +[[nodiscard]] _CCCL_API constexpr _Tp +reduce_min(const basic_vec<_Tp, _Abi>& __x, const typename basic_vec<_Tp, _Abi>::mask_type& __mask) noexcept +{ + auto __result = ::cuda::std::numeric_limits<_Tp>::max(); + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < __x.size; ++__i) + { + if (__mask[__i]) + { + const auto __val = __x[__i]; + if (!(__result < __val)) + { + __result = __val; + } + } + } + return __result; +} + +// [simd.reductions], reduce_max + +_CCCL_TEMPLATE(typename _Tp, typename _Abi) +_CCCL_REQUIRES(::cuda::std::totally_ordered<_Tp>) +[[nodiscard]] _CCCL_API constexpr _Tp reduce_max(const basic_vec<_Tp, _Abi>& __x) noexcept +{ + static_assert(__x.size > 0, "Vector is empty"); + auto __result = __x[0]; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 1; __i < __x.size; ++__i) + { + const auto __val = __x[__i]; + if (!(__val < __result)) + { + __result = __val; + } + } + return __result; +} + +_CCCL_TEMPLATE(typename _Tp, typename _Abi) +_CCCL_REQUIRES(::cuda::std::totally_ordered<_Tp>) +[[nodiscard]] _CCCL_API constexpr _Tp +reduce_max(const basic_vec<_Tp, _Abi>& __x, const typename basic_vec<_Tp, _Abi>::mask_type& __mask) noexcept +{ + auto __result = ::cuda::std::numeric_limits<_Tp>::lowest(); + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < __x.size; ++__i) + { + if (__mask[__i]) + { + const auto __val = __x[__i]; + if (!(__val < __result)) + { + __result = __val; + } + } + } + return __result; +} + +// NOTE: mask reductions (all_of, any_of, none_of, reduce_count, 
reduce_min_index, reduce_max_index) +// and their bool scalar overloads are defined in basic_mask.h +} // namespace cuda::experimental::simd + +#include + +#endif // _CUDAX___SIMD_REDUCTIONS_H diff --git a/cudax/include/cuda/experimental/__simd/specializations/fixed_size_simple_mask.h b/cudax/include/cuda/experimental/__simd/specializations/fixed_size_simple_mask.h new file mode 100644 index 00000000000..50434015934 --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/specializations/fixed_size_simple_mask.h @@ -0,0 +1,232 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_SPECIALIZATIONS_FIXED_SIZE_SIMPLE_MASK_H +#define _CUDAX___SIMD_SPECIALIZATIONS_FIXED_SIZE_SIMPLE_MASK_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include + +#include + +#include + +namespace cuda::experimental::simd +{ +// Bool-per-element mask storage for fixed_size_simple ABI +template <::cuda::std::size_t _Bytes, __simd_size_type _Np> +struct __mask_storage<_Bytes, simd_abi::__fixed_size_simple<_Np>> +{ + using value_type = bool; + static constexpr ::cuda::std::size_t __element_bytes = _Bytes; + + bool __data[_Np]; + + [[nodiscard]] _CCCL_API constexpr bool __get(__simd_size_type __idx) const noexcept + { + _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), 
"Index is out of bounds"); + return __data[__idx]; + } + + _CCCL_API constexpr void __set(__simd_size_type __idx, bool __v) noexcept + { + _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds"); + __data[__idx] = __v; + } +}; + +// Mask operations for fixed_size_simple ABI with bool-per-element storage +template <::cuda::std::size_t _Bytes, __simd_size_type _Np> +struct __mask_operations<_Bytes, simd_abi::__fixed_size_simple<_Np>> +{ + using _MaskStorage = __mask_storage<_Bytes, simd_abi::__fixed_size_simple<_Np>>; + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage __broadcast(bool __v) noexcept + { + _MaskStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = __v; + } + return __result; + } + + template + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __generate_init(_Generator&& __g, ::cuda::std::integer_sequence<__simd_size_type, _Is...>) + { + _MaskStorage __result; + ((__result.__data[_Is] = static_cast(__g(::cuda::std::integral_constant<__simd_size_type, _Is>()))), ...); + return __result; + } + + template + [[nodiscard]] _CCCL_API static constexpr _MaskStorage __generate(_Generator&& __g) + { + return __generate_init(__g, ::cuda::std::make_integer_sequence<__simd_size_type, _Np>()); + } + + // Logical operators (for operator&& and operator||) + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __logic_and(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept + { + _MaskStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = __lhs.__data[__i] && __rhs.__data[__i]; + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __logic_or(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept + { + _MaskStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + 
__result.__data[__i] = __lhs.__data[__i] || __rhs.__data[__i]; + } + return __result; + } + + // Bitwise operators (for operator&, operator|, operator^) + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __bitwise_and(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept + { + _MaskStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = __lhs.__data[__i] && __rhs.__data[__i]; + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __bitwise_or(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept + { + _MaskStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = __lhs.__data[__i] || __rhs.__data[__i]; + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage + __bitwise_xor(const _MaskStorage& __lhs, const _MaskStorage& __rhs) noexcept + { + _MaskStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = __lhs.__data[__i] != __rhs.__data[__i]; + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage __bitwise_not(const _MaskStorage& __s) noexcept + { + _MaskStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = !__s.__data[__i]; + } + return __result; + } + + // Reductions + + [[nodiscard]] _CCCL_API static constexpr bool __all(const _MaskStorage& __s) noexcept + { + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + if (!__s.__data[__i]) + { + return false; + } + } + return true; + } + + [[nodiscard]] _CCCL_API static constexpr bool __any(const _MaskStorage& __s) noexcept + { + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + if (__s.__data[__i]) + { + return true; + } + } + return false; + } + + [[nodiscard]] 
_CCCL_API static constexpr __simd_size_type __count(const _MaskStorage& __s) noexcept + { + __simd_size_type __count = 0; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __count += static_cast<__simd_size_type>(__s.__data[__i]); + } + return __count; + } + + [[nodiscard]] _CCCL_API static constexpr __simd_size_type __min_index(const _MaskStorage& __s) noexcept + { + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + if (__s.__data[__i]) + { + return __i; + } + } + _CCCL_UNREACHABLE(); + } + + [[nodiscard]] _CCCL_API static constexpr __simd_size_type __max_index(const _MaskStorage& __s) noexcept + { + for (__simd_size_type __i = _Np - 1; __i >= 0; --__i) + { + if (__s.__data[__i]) + { + return __i; + } + } + _CCCL_UNREACHABLE(); + } +}; +} // namespace cuda::experimental::simd + +#include + +#endif // _CUDAX___SIMD_SPECIALIZATIONS_FIXED_SIZE_SIMPLE_MASK_H diff --git a/cudax/include/cuda/experimental/__simd/specializations/fixed_size_simple_vec.h b/cudax/include/cuda/experimental/__simd/specializations/fixed_size_simple_vec.h new file mode 100644 index 00000000000..624d373ee8a --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/specializations/fixed_size_simple_vec.h @@ -0,0 +1,218 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_SPECIALIZATIONS_FIXED_SIZE_SIMPLE_VEC_H +#define _CUDAX___SIMD_SPECIALIZATIONS_FIXED_SIZE_SIMPLE_VEC_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include + +#include + +namespace cuda::experimental::simd +{ +namespace simd_abi +{ +template <__simd_size_type _Np> +struct __fixed_size_simple +{ + static constexpr __simd_size_type __simd_size = _Np; +}; +} // namespace simd_abi + +// Element-per-slot simd storage for fixed_size_simple ABI +template +struct __simd_storage<_Tp, simd_abi::__fixed_size_simple<_Np>> +{ + using value_type = _Tp; + _Tp __data[_Np]; + + [[nodiscard]] _CCCL_API constexpr _Tp __get(__simd_size_type __idx) const noexcept + { + _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds"); + return __data[__idx]; + } + + _CCCL_API constexpr void __set(__simd_size_type __idx, _Tp __v) noexcept + { + _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds"); + __data[__idx] = __v; + } +}; + +#define _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_STORAGE_TYPE, _NAME, _OP) \ + [[nodiscard]] _CCCL_API static constexpr _STORAGE_TYPE _NAME( \ + const _STORAGE_TYPE& __lhs, const _STORAGE_TYPE& __rhs) noexcept \ + { \ + _STORAGE_TYPE __result; \ + _CCCL_PRAGMA_UNROLL_FULL() \ + for (__simd_size_type __i = 0; __i < _Np; ++__i) \ + { \ + __result.__data[__i] = (__lhs.__data[__i] _OP __rhs.__data[__i]); \ + } \ + return __result; \ + } + +#define _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP(_NAME, _OP) \ + [[nodiscard]] _CCCL_API static constexpr _MaskStorage _NAME( \ + const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept \ + { \ + 
_MaskStorage __result; \ + _CCCL_PRAGMA_UNROLL_FULL() \ + for (__simd_size_type __i = 0; __i < _Np; ++__i) \ + { \ + __result.__data[__i] = (__lhs.__data[__i] _OP __rhs.__data[__i]); \ + } \ + return __result; \ + } + +// Simd operations for fixed_size_simple ABI +template +struct __simd_operations<_Tp, simd_abi::__fixed_size_simple<_Np>> +{ + using _SimdStorage = __simd_storage<_Tp, simd_abi::__fixed_size_simple<_Np>>; + using _MaskStorage = __mask_storage>; + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage __broadcast(_Tp __v) noexcept + { + _SimdStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = __v; + } + return __result; + } + + template + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __generate_init(_Generator&& __g, ::cuda::std::integer_sequence<__simd_size_type, _Is...>) + { + return _SimdStorage{{__g(::cuda::std::integral_constant<__simd_size_type, _Is>())...}}; + } + + template + [[nodiscard]] _CCCL_API static constexpr _SimdStorage __generate(_Generator&& __g) + { + return __generate_init(__g, ::cuda::std::make_integer_sequence<__simd_size_type, _Np>()); + } + + // Unary operations + + _CCCL_API static constexpr void __increment(_SimdStorage& __s) noexcept + { + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __s.__data[__i] += 1; + } + } + + _CCCL_API static constexpr void __decrement(_SimdStorage& __s) noexcept + { + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __s.__data[__i] -= 1; + } + } + + [[nodiscard]] _CCCL_API static constexpr _MaskStorage __negate(const _SimdStorage& __s) noexcept + { + _MaskStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = !__s.__data[__i]; + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage __bitwise_not(const _SimdStorage& __s) noexcept + { + 
_SimdStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = ~__s.__data[__i]; + } + return __result; + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage __unary_minus(const _SimdStorage& __s) noexcept + { + _SimdStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = -__s.__data[__i]; + } + return __result; + } + + // Binary arithmetic operations + + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_SimdStorage, __plus, +) + + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_SimdStorage, __minus, -) + + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_SimdStorage, __multiplies, *) + + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_SimdStorage, __divides, /) + + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_SimdStorage, __modulo, %) + + // Comparison operations + + _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP(__equal_to, ==) + + _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP(__not_equal_to, !=) + + _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP(__less, <) + + _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP(__less_equal, <=) + + _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP(__greater, >) + + _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP(__greater_equal, >=) + + // Bitwise and shift operations + + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_SimdStorage, __bitwise_and, &) + + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_SimdStorage, __bitwise_or, |) + + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_SimdStorage, __bitwise_xor, ^) + + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_SimdStorage, __shift_left, <<) + + _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP(_SimdStorage, __shift_right, >>) +}; + +#undef _CUDAX_SIMD_FIXED_SIZE_BINARY_STORAGE_OP +#undef _CUDAX_SIMD_FIXED_SIZE_BINARY_CMP_OP +} // namespace cuda::experimental::simd + +#include + +#endif // _CUDAX___SIMD_SPECIALIZATIONS_FIXED_SIZE_SIMPLE_VEC_H diff --git a/cudax/include/cuda/experimental/__simd/type_traits.h 
b/cudax/include/cuda/experimental/__simd/type_traits.h new file mode 100644 index 00000000000..9815420b1ac --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/type_traits.h @@ -0,0 +1,97 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_TYPE_TRAITS_H +#define _CUDAX___SIMD_TYPE_TRAITS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include +#include +#include + +#include + +namespace cuda::experimental::simd +{ +// [simd.traits], alignment +template +struct alignment; + +template +struct alignment, _Up> + : ::cuda::std::integral_constant<::cuda::std::size_t, alignof(_Up) * __simd_size_v<_Tp, _Abi>> +{ + static_assert(::cuda::__is_valid_alignment(alignof(_Up) * __simd_size_v<_Tp, _Abi>), "Alignment is not valid"); + static_assert(__is_vectorizable_v<_Up>, "U must be a vectorizable type"); +}; + +template +constexpr ::cuda::std::size_t alignment_v = alignment<_Tp, _Up>::value; + +// [simd.traits], rebind +template +struct rebind; + +template +struct rebind<_Tp, basic_vec<_Up, _Abi>> +{ + static_assert(__is_vectorizable_v<_Tp>, "T must be a vectorizable type"); + using type = basic_vec<_Tp, simd_abi::__deduce_abi_t<_Tp, __simd_size_v<_Up, _Abi>>>; +}; + +template +struct rebind<_Tp, basic_mask<_Bytes, _Abi>> +{ + 
static_assert(__is_vectorizable_v<_Tp>, "T must be a vectorizable type"); + using __integer_t = __integer_from; + using __integer_bytes_t = __integer_from<_Bytes>; + + using type = basic_mask>>; +}; + +template +using rebind_t = typename rebind<_Tp, _Vp>::type; + +// [simd.traits], resize +template <__simd_size_type _Np, typename _Vp> +struct resize; + +template <__simd_size_type _Np, typename _Tp, typename _Abi> +struct resize<_Np, basic_vec<_Tp, _Abi>> +{ + using type = basic_vec<_Tp, simd_abi::__deduce_abi_t<_Tp, _Np>>; +}; + +template <__simd_size_type _Np, ::cuda::std::size_t _Bytes, typename _Abi> +struct resize<_Np, basic_mask<_Bytes, _Abi>> +{ + using type = basic_mask<_Bytes, simd_abi::__deduce_abi_t<__integer_from<_Bytes>, _Np>>; +}; + +template <__simd_size_type _Np, typename _Vp> +using resize_t = typename resize<_Np, _Vp>::type; +} // namespace cuda::experimental::simd + +#include + +#endif // _CUDAX___SIMD_TYPE_TRAITS_H diff --git a/cudax/include/cuda/experimental/__simd/utility.h b/cudax/include/cuda/experimental/__simd/utility.h new file mode 100644 index 00000000000..bf35ccc067e --- /dev/null +++ b/cudax/include/cuda/experimental/__simd/utility.h @@ -0,0 +1,103 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX___SIMD_UTILITY_H +#define _CUDAX___SIMD_UTILITY_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +namespace cuda::experimental::simd +{ +template +constexpr bool __is_abi_tag_v = false; + +template <__simd_size_type _Np> +constexpr bool __is_abi_tag_v> = true; + +template +constexpr bool __is_well_formed = false; + +template +constexpr bool __is_well_formed<_Tp, + _Generator, + _Idx, + ::cuda::std::void_t()( + ::cuda::std::integral_constant<__simd_size_type, _Idx>()))>> = + ::cuda::std::is_convertible_v< + decltype(::cuda::std::declval<_Generator>()(::cuda::std::integral_constant<__simd_size_type, _Idx>())), + _Tp>; + +template +[[nodiscard]] +_CCCL_API constexpr bool __can_generate(::cuda::std::integer_sequence<__simd_size_type, _Indices...>) noexcept +{ + return (true && ... 
&& __is_well_formed<_Tp, _Generator, _Indices>); +} + +template +constexpr bool __can_generate_v = ::cuda::experimental::simd::__can_generate<_Tp, _Generator>( + ::cuda::std::make_integer_sequence<__simd_size_type, _Size>()); + +// Proxy for ranges::size(r) is a constant expression +template +_CCCL_CONCEPT __has_static_size = + _CCCL_REQUIRES_EXPR((_Range))((__simd_size_type{::cuda::std::tuple_size_v<::cuda::std::remove_cvref_t<_Range>>})); + +template +constexpr __simd_size_type __static_range_size_v = + __simd_size_type{::cuda::std::tuple_size_v<::cuda::std::remove_cvref_t<_Range>>}; + +// [simd.flags] alignment assertion for load/store pointers +template +_CCCL_API constexpr void __assert_load_store_alignment([[maybe_unused]] const _Up* __data) noexcept +{ + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + if constexpr (__has_aligned_flag_v<_Flags...>) + { + _CCCL_ASSERT(::cuda::is_aligned(__data, alignment_v<_Vec, _Up>), + "flag_aligned requires data to be aligned to alignment_v>"); + } + else if constexpr (__has_overaligned_flag_v<_Flags...>) + { + _CCCL_ASSERT(::cuda::is_aligned(__data, __overaligned_alignment_v<_Flags...>), + "flag_overaligned requires data to be aligned to N"); + } + } +} +} // namespace cuda::experimental::simd + +#include + +#endif // _CUDAX___SIMD_UTILITY_H diff --git a/cudax/include/cuda/experimental/simd.cuh b/cudax/include/cuda/experimental/simd.cuh new file mode 100644 index 00000000000..d51d2294bcf --- /dev/null +++ b/cudax/include/cuda/experimental/simd.cuh @@ -0,0 +1,18 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef __CUDAX_SIMD___ +#define __CUDAX_SIMD___ + +#include +#include +#include + +#endif // __CUDAX_SIMD___ diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt index 9ae7be85039..c4eaeefbe77 100644 --- a/cudax/test/CMakeLists.txt +++ b/cudax/test/CMakeLists.txt @@ -119,6 +119,10 @@ cudax_add_catch2_test(test_target hierarchy_groups hierarchy/group.cu ) +cudax_add_catch2_test(test_target simd + simd/simd.cu +) + if (cudax_ENABLE_CUFILE) cudax_add_catch2_test(test_target cufile.driver_attributes cufile/driver_attributes.cu diff --git a/cudax/test/simd/simd.cu b/cudax/test/simd/simd.cu new file mode 100644 index 00000000000..5b51fdf1fa2 --- /dev/null +++ b/cudax/test/simd/simd.cu @@ -0,0 +1,345 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+//
+//===----------------------------------------------------------------------===//
+
+// NOTE(review): all '<...>' spans in this file (include targets and template
+// argument lists) were stripped by a whitespace-mangled paste. They have been
+// reconstructed from usage and from the expected values asserted below;
+// verify the exact spellings against the original patch.
+#include <cuda/std/array>
+#include <cuda/std/cstddef>
+#include <cuda/std/type_traits>
+#include <cuda/std/utility>
+
+#include <cuda/experimental/simd.cuh>
+
+#include <testing.cuh>
+
+namespace dp = cuda::experimental::simd;
+
+namespace
+{
+// Generator functor: element i gets value i.
+struct linear_index_gen
+{
+  template <typename Index>
+  __host__ __device__ constexpr int operator()(Index idx) const
+  {
+    return static_cast<int>(idx);
+  }
+};
+
+// Generator functor: element i gets value 2 * i.
+struct linear_index_x2_gen
+{
+  template <typename Index>
+  __host__ __device__ constexpr int operator()(Index idx) const
+  {
+    return static_cast<int>(idx * 2);
+  }
+};
+
+// Generator functor: true for even element indices, false for odd ones.
+struct alternating_mask_gen
+{
+  template <typename Index>
+  __host__ __device__ constexpr bool operator()(Index idx) const
+  {
+    return (idx % 2) == 0;
+  }
+};
+} // namespace
+
+// Element-wise comparison of a simd vec/mask against an expected array of the
+// same width; the width mismatch is rejected at compile time.
+template <typename Simd, typename T, ::cuda::std::size_t N>
+__host__ __device__ void expect_equal(const Simd& actual, const ::cuda::std::array<T, N>& expected)
+{
+  static_assert(N == Simd::size(), "Mismatch between expected values and simd width");
+  for (size_t i = 0; i < N; ++i)
+  {
+    CUDAX_REQUIRE(actual[i] == expected[i]);
+  }
+}
+
+template <typename T, ::cuda::std::size_t N>
+using simd_array_t = ::cuda::std::array<T, N>;
+
+C2H_CCCLRT_TEST("simd.traits", "[simd][traits]")
+{
+  using abi_t    = dp::simd_abi::fixed_size_simple<4>;
+  using simd_t   = dp::vec<int, 4>;
+  using mask_t   = dp::mask<int, 4>;
+  using other_t  = dp::vec<float, 4>;
+  using rebind_t = dp::rebind_simd_t<float, simd_t>;
+
+  STATIC_REQUIRE(dp::is_abi_tag_v<abi_t>);
+  STATIC_REQUIRE(!dp::is_abi_tag_v<int>);
+  // __simd_size_v yields the element count for a valid (type, abi) pair and 0
+  // otherwise — constrained by the original "== 4 / == 4 / == 0" expectations.
+  STATIC_REQUIRE(dp::__simd_size_v<int, abi_t> == 4);
+  STATIC_REQUIRE(dp::__simd_size_v<float, abi_t> == 4);
+  STATIC_REQUIRE(dp::__simd_size_v<int, int> == 0);
+
+  STATIC_REQUIRE(dp::is_vec_v<simd_t>);
+  STATIC_REQUIRE(!dp::is_vec_v<int>);
+  STATIC_REQUIRE(dp::is_mask_v<mask_t>);
+  STATIC_REQUIRE(!dp::is_mask_v<simd_t>);
+
+  // NOTE(review): flag-type spellings were lost in the paste; reconstructed
+  // from the dp::overaligned<64> usage below — verify.
+  STATIC_REQUIRE(dp::is_simd_flag_type_v<decltype(dp::flag_default)>);
+  STATIC_REQUIRE(dp::is_simd_flag_type_v<decltype(dp::flag_aligned)>);
+  STATIC_REQUIRE(dp::is_simd_flag_type_v<decltype(dp::overaligned<64>)>);
+
+  STATIC_REQUIRE(simd_t::size() == 4);
+  STATIC_REQUIRE(mask_t::size() == simd_t::size());
+  // Alignment traits: per-element vs whole-vector; vec<int, 32> pins the
+  // "== 128" case (4 * 32). Argument spellings reconstructed — verify.
+  STATIC_REQUIRE(dp::memory_alignment_v<simd_t, int> == alignof(int));
+  STATIC_REQUIRE(dp::memory_alignment_v<simd_t> == alignof(int) * simd_t::size());
+  STATIC_REQUIRE(dp::memory_alignment_v<dp::vec<int, 32>> == 128);
+  STATIC_REQUIRE(dp::memory_alignment_v<mask_t, bool> == alignof(bool));
+  STATIC_REQUIRE(dp::memory_alignment_v<mask_t> == alignof(bool) * mask_t::size());
+
+  STATIC_REQUIRE(::cuda::std::is_same_v<rebind_t, other_t>);
+}
+
+C2H_CCCLRT_TEST("simd.construction_and_memory", "[simd][construction]")
+{
+  constexpr auto size = 4;
+  using simd_t        = dp::vec<int, size>;
+  using mask_t        = simd_t::mask_type;
+  using array_t       = simd_array_t<int, size>;
+
+  // Broadcast constructor replicates the scalar into every lane.
+  simd_t broadcast(7);
+  expect_equal(broadcast, array_t{7, 7, 7, 7});
+
+  // Generator constructor invokes the functor once per lane index.
+  simd_t generated(linear_index_x2_gen{});
+  expect_equal(generated, array_t{0, 2, 4, 6});
+
+  alignas(64) int storage[size] = {0, 1, 2, 3};
+  simd_t from_ptr(storage, dp::overaligned<64>);
+  expect_equal(from_ptr, array_t{0, 1, 2, 3});
+
+  // copy_to / copy_from round-trip through an overaligned buffer.
+  alignas(64) int roundtrip[size] = {};
+  generated.copy_to(roundtrip, dp::overaligned<64>);
+
+  simd_t loaded;
+  loaded.copy_from(roundtrip, dp::overaligned<64>);
+  expect_equal(loaded, array_t{0, 2, 4, 6});
+
+  // Converting constructor: int lanes widened to float.
+  dp::vec<float, size> widened(generated);
+  expect_equal(widened, ::cuda::std::array{0.0f, 2.0f, 4.0f, 6.0f});
+
+  mask_t from_simd = (generated != simd_t(0));
+  expect_equal(from_simd, ::cuda::std::array{false, true, true, true});
+
+  dp::vec<int, size> assigned = simd_t(linear_index_gen{});
+  assigned                    = generated;
+  expect_equal(assigned, array_t{0, 2, 4, 6});
+
+  auto incremented = generated;
+  ++incremented;
+  expect_equal(incremented, array_t{1, 3, 5, 7});
+
+  auto decremented = incremented;
+  decremented--;
+  expect_equal(decremented, array_t{0, 2, 4, 6});
+}
+
+C2H_CCCLRT_TEST("simd.arithmetic_and_comparisons", "[simd][arithmetic]")
+{
+  using simd_t  = dp::vec<int, 4>;
+  using mask_t  = simd_t::mask_type;
+  using array_t = simd_array_t<int, 4>;
+
+  simd_t lhs(linear_index_gen{}); // {0, 1, 2, 3}
+  simd_t rhs(2);
+
+  auto sum = lhs + rhs;
+  expect_equal(sum, array_t{2, 3, 4, 5});
+
+  auto difference = sum - 1;
+  expect_equal(difference, array_t{1, 2, 3, 4});
+
+  // Mixed vec/scalar operands on either side.
+  auto vec_plus_scalar = lhs + 5;
+  expect_equal(vec_plus_scalar, array_t{5, 6, 7, 8});
+
+  auto scalar_plus_vec = 5 + lhs;
+  expect_equal(scalar_plus_vec, array_t{5, 6, 7, 8});
+
+  auto scalar_minus_vec = 5 - lhs;
+  expect_equal(scalar_minus_vec, array_t{5, 4, 3, 2});
+
+  auto product = lhs * rhs;
+  expect_equal(product, array_t{0, 2, 4, 6});
+
+  auto quotient = product / rhs;
+  expect_equal(quotient, array_t{0, 1, 2, 3});
+
+  auto modulo = product % rhs;
+  expect_equal(modulo, array_t{0, 0, 0, 0});
+
+  auto bit_and = product & simd_t(3);
+  expect_equal(bit_and, array_t{0, 2, 0, 2});
+
+  auto bit_or = bit_and | simd_t(4);
+  expect_equal(bit_or, array_t{4, 6, 4, 6});
+
+  auto bit_xor = bit_and ^ simd_t(1);
+  expect_equal(bit_xor, array_t{1, 3, 1, 3});
+
+  auto vec_or_scalar = lhs | 1;
+  expect_equal(vec_or_scalar, array_t{1, 1, 3, 3});
+
+  auto scalar_or_vec = 1 | lhs;
+  expect_equal(scalar_or_vec, array_t{1, 1, 3, 3});
+
+  auto shift_left = simd_t(1) << lhs;
+  expect_equal(shift_left, array_t{1, 2, 4, 8});
+
+  auto shift_right = shift_left >> simd_t(1);
+  expect_equal(shift_right, array_t{0, 1, 2, 4});
+
+  auto vector_shift_scalar = lhs << 1;
+  expect_equal(vector_shift_scalar, array_t{0, 2, 4, 6});
+
+  auto scalar_shift_vector = 1 << lhs;
+  expect_equal(scalar_shift_vector, array_t{1, 2, 4, 8});
+
+  auto compound = lhs;
+  compound += rhs;
+  compound -= rhs;
+  expect_equal(compound, array_t{0, 1, 2, 3});
+
+  // 3 & 2 == 2; 2 | 4 == 6; 6 ^ 1 == 7 in every lane.
+  auto bitwise_compound = simd_t(3);
+  bitwise_compound &= rhs;
+  bitwise_compound |= simd_t(4);
+  bitwise_compound ^= simd_t(1);
+  expect_equal(bitwise_compound, array_t{7, 7, 7, 7});
+
+  auto shift_compound = simd_t(1);
+  shift_compound <<= rhs;
+  shift_compound >>= rhs;
+  expect_equal(shift_compound, array_t{1, 1, 1, 1});
+
+  mask_t eq_mask = (lhs == lhs);
+  CUDAX_REQUIRE(dp::all_of(eq_mask));
+  mask_t lt_mask = (lhs < 2);
+  CUDAX_REQUIRE(dp::reduce_count(lt_mask) == 2);
+
+  mask_t scalar_first_lt = (2 <= lhs);
+  CUDAX_REQUIRE(dp::reduce_count(scalar_first_lt) == 2);
+
+  mask_t scalar_eq_rhs = (lhs == 1);
+  CUDAX_REQUIRE(dp::reduce_count(scalar_eq_rhs) == 1);
+
+  mask_t scalar_eq_lhs = (1 == lhs);
+  CUDAX_REQUIRE(dp::reduce_count(scalar_eq_lhs) == 1);
+
+  mask_t ge_mask = (lhs >= 1);
+  CUDAX_REQUIRE(dp::any_of(ge_mask));
+  CUDAX_REQUIRE(!dp::none_of(ge_mask));
+
+  auto negated = -lhs;
+  expect_equal(negated, array_t{0, -1, -2, -3});
+
+  auto bitwise_not = ~lhs;
+  expect_equal(bitwise_not, array_t{-1, -2, -3, -4});
+}
+
+C2H_CCCLRT_TEST("simd.mask", "[simd][mask]")
+{
+  using mask_t           = dp::mask<int, 4>;
+  using simd_t           = dp::vec<int, 4>;
+  using mask_array_t     = ::cuda::std::array<bool, 4>;
+  using simd_array_typed = simd_array_t<int, 4>;
+
+  mask_t alternating(alternating_mask_gen{}); // {T, F, T, F}
+  expect_equal(alternating, mask_array_t{true, false, true, false});
+  CUDAX_REQUIRE(dp::reduce_count(alternating) == 2);
+  CUDAX_REQUIRE(dp::any_of(alternating));
+  CUDAX_REQUIRE(!dp::all_of(alternating));
+  CUDAX_REQUIRE(!dp::none_of(alternating));
+
+  mask_t inverted = !alternating;
+  expect_equal(inverted, mask_array_t{false, true, false, true});
+
+  mask_t zero = alternating & inverted;
+  CUDAX_REQUIRE(dp::none_of(zero));
+
+  mask_t combined = alternating | inverted;
+  CUDAX_REQUIRE(dp::all_of(combined));
+
+  // Explicit conversion to a vec: true -> 1, false -> 0.
+  auto vec_from_mask = static_cast<simd_t>(alternating);
+  expect_equal(vec_from_mask, simd_array_typed{1, 0, 1, 0});
+
+  mask_t xor_mask = alternating ^ inverted;
+  CUDAX_REQUIRE(dp::all_of(xor_mask));
+
+  mask_t assigned = alternating;
+  assigned ^= inverted;
+  CUDAX_REQUIRE(dp::all_of(assigned));
+
+  assigned &= combined;
+  CUDAX_REQUIRE(dp::all_of(assigned));
+
+  mask_t or_test(false);
+  or_test |= alternating;
+  CUDAX_REQUIRE(dp::all_of(or_test == alternating));
+
+  mask_t broadcast_true(true);
+  CUDAX_REQUIRE(dp::all_of(broadcast_true));
+
+  mask_t broadcast_false(false);
+  CUDAX_REQUIRE(dp::none_of(broadcast_false));
+
+  // Element-wise comparison operators
+  mask_t eq_result = (alternating == alternating);
+  CUDAX_REQUIRE(dp::all_of(eq_result));
+
+  mask_t ne_result = (alternating != inverted);
+  CUDAX_REQUIRE(dp::all_of(ne_result));
+
+  mask_t a(true);
+  mask_t b(false);
+  CUDAX_REQUIRE(dp::all_of(a >= b));
+  CUDAX_REQUIRE(dp::all_of(b <= a));
+  CUDAX_REQUIRE(dp::all_of(a > b));
+  CUDAX_REQUIRE(dp::all_of(b < a));
+  CUDAX_REQUIRE(dp::all_of(a >= a));
+  CUDAX_REQUIRE(dp::all_of(a <= a));
+  CUDAX_REQUIRE(dp::none_of(a > a));
+  CUDAX_REQUIRE(dp::none_of(a < a));
+
+  // Unsigned integer constructor: bit i of the value selects element i
+  // (LSB first) — matches the {F, T, F, T} expectation for 0b1010.
+  mask_t from_bits(static_cast<unsigned>(0b1010));
+  expect_equal(from_bits, mask_array_t{false, true, false, true});
+
+  // Unary operators returning basic_vec
+  mask_t pos_input(true);
+  auto plus_result = +pos_input;
+  CUDAX_REQUIRE(dp::all_of(plus_result == simd_t(1)));
+
+  auto minus_result = -pos_input;
+  CUDAX_REQUIRE(dp::all_of(minus_result == simd_t(-1)));
+
+  // Logical operators
+  mask_t logical_and = alternating && inverted;
+  CUDAX_REQUIRE(dp::none_of(logical_and));
+
+  mask_t logical_or = alternating || inverted;
+  CUDAX_REQUIRE(dp::all_of(logical_or));
+}
+
+C2H_CCCLRT_TEST("simd.reference", "[simd][reference]")
+{
+  using simd_t  = dp::vec<int, 4>;
+  using array_t = simd_array_t<int, 4>;
+
+  // operator[] returns a writable reference proxy.
+  simd_t values(linear_index_gen{});
+  values[2] += 5;
+  expect_equal(values, array_t{0, 1, 7, 3});
+
+  using ::cuda::std::swap;
+
+  // swap works proxy<->proxy and proxy<->scalar in either argument order:
+  // {0,1,7,3} -> {3,1,7,0} -> {3,42,7,0} (scalar=1) -> {3,42,1,0} (scalar=7).
+  swap(values[0], values[3]);
+  int scalar = 42;
+  swap(values[1], scalar);
+  swap(scalar, values[2]);
+
+  expect_equal(values, array_t{3, 42, 1, 0});
+  CUDAX_REQUIRE(scalar == 7);
+}