diff --git a/libcudacxx/benchmarks/bench/partition/basic.cu b/libcudacxx/benchmarks/bench/partition/basic.cu new file mode 100644 index 00000000000..a19a64eb59b --- /dev/null +++ b/libcudacxx/benchmarks/bench/partition/basic.cu @@ -0,0 +1,48 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include +#include +#include + +#include "nvbench_helper.cuh" + +template +static void basic(nvbench::state& state, nvbench::type_list) +{ + using select_op_t = less_then_t; + + const auto elements = static_cast(state.get_int64("Elements")); + const bit_entropy entropy = str_to_entropy(state.get_string("Entropy")); + + const T val = lerp_min_max(entropy_to_probability(entropy)); + select_op_t select_op{val}; + + thrust::device_vector input = generate(elements); + + state.add_element_count(elements); + state.add_global_memory_reads(elements); + state.add_global_memory_writes(elements); + + caching_allocator_t alloc{}; + state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { + do_not_optimize(cuda::std::partition(cuda_policy(alloc, launch), input.begin(), input.end(), select_op)); + }); +} + +NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types)) + .set_name("base") + .set_type_axes_names({"T{ct}"}) + .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) + .add_string_axis("Entropy", {"1.000", "0.544", "0.000"}); diff --git a/libcudacxx/benchmarks/bench/partition_copy/basic.cu b/libcudacxx/benchmarks/bench/partition_copy/basic.cu 
new file mode 100644 index 00000000000..71569fa28ec --- /dev/null +++ b/libcudacxx/benchmarks/bench/partition_copy/basic.cu @@ -0,0 +1,55 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include +#include +#include + +#include "nvbench_helper.cuh" + +template +static void basic(nvbench::state& state, nvbench::type_list) +{ + using select_op_t = less_then_t; + + const auto elements = static_cast(state.get_int64("Elements")); + const bit_entropy entropy = str_to_entropy(state.get_string("Entropy")); + + const T val = lerp_min_max(entropy_to_probability(entropy)); + select_op_t select_op{val}; + + thrust::device_vector input = generate(elements); + thrust::device_vector output(elements); + + state.add_element_count(elements); + state.add_global_memory_reads(elements); + state.add_global_memory_writes(elements); + + caching_allocator_t alloc{}; + state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { + do_not_optimize(cuda::std::partition_copy( + cuda_policy(alloc, launch), + input.begin(), + input.end(), + output.begin(), + cuda::std::make_reverse_iterator(output.begin() + elements), + select_op)); + }); +} + +NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types)) + .set_name("base") + .set_type_axes_names({"T{ct}"}) + .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) + .add_string_axis("Entropy", {"1.000", "0.544", "0.000"}); diff --git a/libcudacxx/benchmarks/bench/stable_partition/basic.cu 
b/libcudacxx/benchmarks/bench/stable_partition/basic.cu new file mode 100644 index 00000000000..62f9d653e87 --- /dev/null +++ b/libcudacxx/benchmarks/bench/stable_partition/basic.cu @@ -0,0 +1,48 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include +#include +#include + +#include "nvbench_helper.cuh" + +template +static void basic(nvbench::state& state, nvbench::type_list) +{ + using select_op_t = less_then_t; + + const auto elements = static_cast(state.get_int64("Elements")); + const bit_entropy entropy = str_to_entropy(state.get_string("Entropy")); + + const T val = lerp_min_max(entropy_to_probability(entropy)); + select_op_t select_op{val}; + + thrust::device_vector input = generate(elements); + + state.add_element_count(elements); + state.add_global_memory_reads(elements); + state.add_global_memory_writes(elements); + + caching_allocator_t alloc{}; + state.exec( + nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + do_not_optimize(cuda::std::stable_partition(cuda_policy(alloc, launch), input.begin(), input.end(), select_op)); + }); +} + +NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types)) + .set_name("base") + .set_type_axes_names({"T{ct}"}) + .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) + .add_string_axis("Entropy", {"1.000", "0.544", "0.000"}); diff --git a/libcudacxx/include/cuda/std/__pstl/cuda/partition.h b/libcudacxx/include/cuda/std/__pstl/cuda/partition.h new file mode 100644 index 
00000000000..25b014e7447 --- /dev/null +++ b/libcudacxx/include/cuda/std/__pstl/cuda/partition.h @@ -0,0 +1,169 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___PSTL_CUDA_PARTITION_H +#define _CUDA_STD___PSTL_CUDA_PARTITION_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_HAS_BACKEND_CUDA() + +_CCCL_DIAG_PUSH +_CCCL_DIAG_SUPPRESS_CLANG("-Wshadow") +_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-local-typedef") +_CCCL_DIAG_SUPPRESS_GCC("-Wattributes") +_CCCL_DIAG_SUPPRESS_NVHPC(attribute_requires_external_linkage) + +# include +# include + +_CCCL_DIAG_POP + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION + +_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT + +template <> +struct __pstl_dispatch<__pstl_algorithm::__partition, __execution_backend::__cuda> +{ + template + [[nodiscard]] _CCCL_HOST_API static size_t + __par_impl(const _Policy& __policy, _InputIterator __first, _InputIterator __last, _UnaryPred __pred) + { + using _OffsetType = size_t; + using value_type = iter_value_t<_InputIterator>; + + _OffsetType __ret; + const auto __count = static_cast<_OffsetType>(::cuda::std::distance(__first, __last)); + + // 
Determine temporary device storage requirements for device_partition + size_t __num_bytes = 0; + _CCCL_TRY_CUDA_API( + CUB_NS_QUALIFIER::DevicePartition::If, + "__pstl_cuda_partition: determination of device storage for cub::DevicePartition::If failed", + static_cast(nullptr), + __num_bytes, + static_cast(nullptr), + __first, + static_cast<_OffsetType*>(nullptr), + __count, + __pred, + 0); + + // Allocate memory for result + auto __stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStreamPerThread}, __policy); + { + __temporary_storage<_OffsetType, value_type> __storage{__policy, __num_bytes, 1, __count}; + + // Partition cannot run inplace, so we need to first copy the input into temporary storage + _CCCL_TRY_CUDA_API( + CUB_NS_QUALIFIER::DeviceTransform::TransformIf, + "__pstl_cuda_partition: kernel launch of cub::DeviceTransform::TransformIf failed", + tuple<_InputIterator>{__first}, + __storage.template __get_ptr<1>(), + __count, + CUB_NS_QUALIFIER::detail::transform::always_true_predicate{}, + identity{}, + __stream.get()); + + // Run the kernel, the standard requires that the input and output range do not overlap + _CCCL_TRY_CUDA_API( + CUB_NS_QUALIFIER::DevicePartition::If, + "__pstl_cuda_partition: kernel launch of cub::DevicePartition::If failed", + __storage.__get_temp_storage(), + __num_bytes, + __storage.template __get_raw_ptr<1>(), + ::cuda::std::move(__first), + __storage.template __get_ptr<0>(), + __count, + ::cuda::std::move(__pred), + __stream.get()); + + // Copy the result back from storage + _CCCL_TRY_CUDA_API( + ::cudaMemcpyAsync, + "__pstl_cuda_partition: copy of result from device to host failed", + ::cuda::std::addressof(__ret), + __storage.template __get_ptr<0>(), + sizeof(_OffsetType), + ::cudaMemcpyDefault, + __stream.get()); + } + + __stream.sync(); + return static_cast(__ret); + } + + _CCCL_TEMPLATE(class _Policy, class _InputIterator, class _UnaryPred) + _CCCL_REQUIRES(__has_forward_traversal<_InputIterator>) + 
[[nodiscard]] _CCCL_HOST_API size_t operator()( + [[maybe_unused]] const _Policy& __policy, _InputIterator __first, _InputIterator __last, _UnaryPred __pred) const + { + if constexpr (::cuda::std::__has_random_access_traversal<_InputIterator>) + { + try + { + return __par_impl(__policy, ::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__pred)); + } + catch (const ::cuda::cuda_error& __err) + { + if (__err.status() == cudaErrorMemoryAllocation) + { + _CCCL_THROW(::std::bad_alloc); + } + else + { + throw __err; + } + } + } + else + { + static_assert(__always_false_v<_Policy>, "CUDA backend of cuda::std::partition requires random access iterators"); + return ::cuda::std::partition(::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__pred)); + } + } +}; + +_CCCL_END_NAMESPACE_ARCH_DEPENDENT + +_CCCL_END_NAMESPACE_CUDA_STD_EXECUTION + +# include + +#endif // _CCCL_HAS_BACKEND_CUDA() + +#endif // _CUDA_STD___PSTL_CUDA_PARTITION_H diff --git a/libcudacxx/include/cuda/std/__pstl/cuda/partition_copy.h b/libcudacxx/include/cuda/std/__pstl/cuda/partition_copy.h new file mode 100644 index 00000000000..8068fb36da2 --- /dev/null +++ b/libcudacxx/include/cuda/std/__pstl/cuda/partition_copy.h @@ -0,0 +1,184 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___PSTL_CUDA_PARTITION_COPY_H +#define _CUDA_STD___PSTL_CUDA_PARTITION_COPY_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_HAS_BACKEND_CUDA() + +_CCCL_DIAG_PUSH +_CCCL_DIAG_SUPPRESS_CLANG("-Wshadow") +_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-local-typedef") +_CCCL_DIAG_SUPPRESS_GCC("-Wattributes") +_CCCL_DIAG_SUPPRESS_NVHPC(attribute_requires_external_linkage) + +# include + +_CCCL_DIAG_POP + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION + +_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT + +template <> +struct __pstl_dispatch<__pstl_algorithm::__partition_copy, __execution_backend::__cuda> +{ + template + [[nodiscard]] _CCCL_HOST_API static size_t __par_impl( + const _Policy& __policy, + _InputIterator __first, + _InputIterator __last, + _OutputIterator1 __result_true, + _OutputIterator2 __result_false, + _UnaryPred __pred) + { + using _OffsetType = size_t; + using __output_wrapper_t = + CUB_NS_QUALIFIER::detail::select::partition_distinct_output_t<_OutputIterator1, _OutputIterator2>; + __output_wrapper_t __result{::cuda::std::move(__result_true), ::cuda::std::move(__result_false)}; + + _OffsetType __ret; + const auto __count = static_cast<_OffsetType>(::cuda::std::distance(__first, __last)); + + // Determine temporary device storage requirements for device_partition + size_t __num_bytes = 0; + _CCCL_TRY_CUDA_API( + CUB_NS_QUALIFIER::DevicePartition::If, + "__pstl_cuda_partition_copy: determination of device storage for cub::DevicePartition::If failed", + 
static_cast(nullptr), + __num_bytes, + __first, + __result, + static_cast<_OffsetType*>(nullptr), + __count, + __pred, + 0); + + // Allocate memory for result + auto __stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStreamPerThread}, __policy); + { + __temporary_storage<_OffsetType> __storage{__policy, __num_bytes, 1}; + + // Run the kernel, the standard requires that the input and output range do not overlap + _CCCL_TRY_CUDA_API( + CUB_NS_QUALIFIER::DevicePartition::If, + "__pstl_cuda_partition_copy: kernel launch of cub::DevicePartition::If failed", + __storage.__get_temp_storage(), + __num_bytes, + ::cuda::std::move(__first), + ::cuda::std::move(__result), + __storage.template __get_ptr<0>(), + __count, + ::cuda::std::move(__pred), + __stream.get()); + + // Copy the result back from storage + _CCCL_TRY_CUDA_API( + ::cudaMemcpyAsync, + "__pstl_cuda_partition_copy: copy of result from device to host failed", + ::cuda::std::addressof(__ret), + __storage.template __get_ptr<0>(), + sizeof(_OffsetType), + ::cudaMemcpyDefault, + __stream.get()); + } + + __stream.sync(); + return static_cast(__ret); + } + + _CCCL_TEMPLATE(class _Policy, class _InputIterator, class _OutputIterator1, class _OutputIterator2, class _UnaryPred) + _CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND __has_forward_traversal<_OutputIterator1> _CCCL_AND + __has_forward_traversal<_OutputIterator2>) + [[nodiscard]] _CCCL_HOST_API size_t operator()( + [[maybe_unused]] const _Policy& __policy, + _InputIterator __first, + _InputIterator __last, + _OutputIterator1 __result_true, + _OutputIterator2 __result_false, + _UnaryPred __pred) const + { + if constexpr (::cuda::std::__has_random_access_traversal<_InputIterator> + && ::cuda::std::__has_random_access_traversal<_OutputIterator1> + && ::cuda::std::__has_random_access_traversal<_OutputIterator2>) + { + try + { + return __par_impl( + __policy, + ::cuda::std::move(__first), + ::cuda::std::move(__last), + 
::cuda::std::move(__result_true), + ::cuda::std::move(__result_false), + ::cuda::std::move(__pred)); + } + catch (const ::cuda::cuda_error& __err) + { + if (__err.status() == cudaErrorMemoryAllocation) + { + _CCCL_THROW(::std::bad_alloc); + } + else + { + throw __err; + } + } + } + else + { + static_assert(__always_false_v<_Policy>, + "CUDA backend of cuda::std::partition_copy requires random access iterators"); + return ::cuda::std::partition_copy( + ::cuda::std::move(__first), + ::cuda::std::move(__last), + ::cuda::std::move(__result_true), + ::cuda::std::move(__result_false), + ::cuda::std::move(__pred)); + } + } +}; + +_CCCL_END_NAMESPACE_ARCH_DEPENDENT + +_CCCL_END_NAMESPACE_CUDA_STD_EXECUTION + +# include + +#endif // _CCCL_HAS_BACKEND_CUDA() + +#endif // _CUDA_STD___PSTL_CUDA_PARTITION_COPY_H diff --git a/libcudacxx/include/cuda/std/__pstl/cuda/stable_partition.h b/libcudacxx/include/cuda/std/__pstl/cuda/stable_partition.h new file mode 100644 index 00000000000..d3c8a09ef1e --- /dev/null +++ b/libcudacxx/include/cuda/std/__pstl/cuda/stable_partition.h @@ -0,0 +1,176 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___PSTL_CUDA_STABLE_PARTITION_H +#define _CUDA_STD___PSTL_CUDA_STABLE_PARTITION_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_HAS_BACKEND_CUDA() + +_CCCL_DIAG_PUSH +_CCCL_DIAG_SUPPRESS_CLANG("-Wshadow") +_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-local-typedef") +_CCCL_DIAG_SUPPRESS_GCC("-Wattributes") +_CCCL_DIAG_SUPPRESS_NVHPC(attribute_requires_external_linkage) + +# include +# include + +_CCCL_DIAG_POP + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION + +_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT + +template <> +struct __pstl_dispatch<__pstl_algorithm::__stable_partition, __execution_backend::__cuda> +{ + template + [[nodiscard]] _CCCL_HOST_API static size_t + __par_impl(const _Policy& __policy, _InputIterator __first, _InputIterator __last, _UnaryPred __pred) + { + using _OffsetType = size_t; + using value_type = iter_value_t<_InputIterator>; + + _OffsetType __ret; + const auto __count = static_cast<_OffsetType>(::cuda::std::distance(__first, __last)); + + // Determine temporary device storage requirements for device_stable_partition + size_t __num_bytes = 0; + _CCCL_TRY_CUDA_API( + CUB_NS_QUALIFIER::DevicePartition::If, + "__pstl_cuda_stable_partition: determination of device storage for cub::DevicePartition::If failed", + static_cast(nullptr), + __num_bytes, + static_cast(nullptr), + __first, + static_cast<_OffsetType*>(nullptr), + __count, + __pred, + 0); + + // Allocate memory for result + auto __stream = ::cuda::__call_or(::cuda::get_stream, 
::cuda::stream_ref{cudaStreamPerThread}, __policy); + { + __temporary_storage<_OffsetType, value_type> __storage{__policy, __num_bytes, 1, __count}; + + // Partition cannot run inplace, so we need to first copy the input into temporary storage + _CCCL_TRY_CUDA_API( + CUB_NS_QUALIFIER::DeviceTransform::TransformIf, + "__pstl_cuda_stable_partition: kernel launch of cub::DeviceTransform::TransformIf failed", + tuple<_InputIterator>{__first}, + __storage.template __get_ptr<1>(), + __count, + CUB_NS_QUALIFIER::detail::transform::always_true_predicate{}, + identity{}, + __stream.get()); + + // Run the kernel, the standard requires that the input and output range do not overlap + _CCCL_TRY_CUDA_API( + CUB_NS_QUALIFIER::DevicePartition::If, + "__pstl_cuda_stable_partition: kernel launch of cub::DevicePartition::If failed", + __storage.__get_temp_storage(), + __num_bytes, + __storage.template __get_raw_ptr<1>(), + __first, + __storage.template __get_ptr<0>(), + __count, + ::cuda::std::move(__pred), + __stream.get()); + + // Copy the result back from storage + _CCCL_TRY_CUDA_API( + ::cudaMemcpyAsync, + "__pstl_cuda_stable_partition: copy of result from device to host failed", + ::cuda::std::addressof(__ret), + __storage.template __get_ptr<0>(), + sizeof(_OffsetType), + ::cudaMemcpyDefault, + __stream.get()); + } + + __stream.sync(); + + // Need to reverse the elements in the second partition + const auto __mid = __first + static_cast>(__ret); + ::cuda::std::reverse(__policy, __mid, __last); + return static_cast(__ret); + } + + _CCCL_TEMPLATE(class _Policy, class _InputIterator, class _UnaryPred) + _CCCL_REQUIRES(__has_forward_traversal<_InputIterator>) + [[nodiscard]] _CCCL_HOST_API size_t operator()( + [[maybe_unused]] const _Policy& __policy, _InputIterator __first, _InputIterator __last, _UnaryPred __pred) const + { + if constexpr (::cuda::std::__has_random_access_traversal<_InputIterator>) + { + try + { + return __par_impl(__policy, ::cuda::std::move(__first), 
::cuda::std::move(__last), ::cuda::std::move(__pred)); + } + catch (const ::cuda::cuda_error& __err) + { + if (__err.status() == cudaErrorMemoryAllocation) + { + _CCCL_THROW(::std::bad_alloc); + } + else + { + throw __err; + } + } + } + else + { + static_assert(__always_false_v<_Policy>, + "CUDA backend of cuda::std::stable_partition requires random access iterators"); + return ::cuda::std::stable_partition( + ::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__pred)); + } + } +}; + +_CCCL_END_NAMESPACE_ARCH_DEPENDENT + +_CCCL_END_NAMESPACE_CUDA_STD_EXECUTION + +# include + +#endif // _CCCL_HAS_BACKEND_CUDA() + +#endif // _CUDA_STD___PSTL_CUDA_STABLE_PARTITION_H diff --git a/libcudacxx/include/cuda/std/__pstl/dispatch.h b/libcudacxx/include/cuda/std/__pstl/dispatch.h index 9b8d68ac0a9..4dca9963b70 100644 --- a/libcudacxx/include/cuda/std/__pstl/dispatch.h +++ b/libcudacxx/include/cuda/std/__pstl/dispatch.h @@ -41,8 +41,11 @@ enum class __pstl_algorithm __generate_n, __inclusive_scan, __merge, + __partition, + __partition_copy, __reduce, __remove_if, + __stable_partition, __transform, __transform_reduce, __unique, diff --git a/libcudacxx/include/cuda/std/__pstl/partition.h b/libcudacxx/include/cuda/std/__pstl/partition.h new file mode 100644 index 00000000000..9c6f8263706 --- /dev/null +++ b/libcudacxx/include/cuda/std/__pstl/partition.h @@ -0,0 +1,89 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___PSTL_PARTITION_H +#define _CUDA_STD___PSTL_PARTITION_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if !_CCCL_COMPILER(NVRTC) + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# if _CCCL_HAS_BACKEND_CUDA() +# include +# endif // _CCCL_HAS_BACKEND_CUDA() + +# include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD + +_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT + +_CCCL_TEMPLATE(class _Policy, class _InputIterator, class _UnaryPred) +_CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND is_execution_policy_v<_Policy>) +_CCCL_HOST_API _InputIterator +partition([[maybe_unused]] const _Policy& __policy, _InputIterator __first, _InputIterator __last, _UnaryPred __pred) +{ + static_assert(indirect_unary_predicate<_UnaryPred, _InputIterator>, + "cuda::std::partition: UnaryPred must satisfy indirect_unary_predicate"); + + [[maybe_unused]] auto __dispatch = + ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__partition, _Policy>(); + if constexpr (::cuda::std::execution::__pstl_can_dispatch) + { + _CCCL_NVTX_RANGE_SCOPE("cuda::std::partition"); + + if (__first == __last) + { + return __first; + } + + auto __result = __first; + const size_t __num_selected = + __dispatch(__policy, ::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__pred)); + + return __result + static_cast>(__num_selected); + } + else + { + static_assert(__always_false_v<_Policy>, "Parallel cuda::std::partition requires at least one selected backend"); + return ::cuda::std::partition(::cuda::std::move(__first), ::cuda::std::move(__last), 
::cuda::std::move(__pred)); + } +} + +_CCCL_END_NAMESPACE_ARCH_DEPENDENT + +_CCCL_END_NAMESPACE_CUDA_STD + +# include + +#endif // !_CCCL_COMPILER(NVRTC) + +#endif // _CUDA_STD___PSTL_PARTITION_H diff --git a/libcudacxx/include/cuda/std/__pstl/partition_copy.h b/libcudacxx/include/cuda/std/__pstl/partition_copy.h new file mode 100644 index 00000000000..228e11cb0c6 --- /dev/null +++ b/libcudacxx/include/cuda/std/__pstl/partition_copy.h @@ -0,0 +1,111 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___PSTL_PARTITION_COPY_H +#define _CUDA_STD___PSTL_PARTITION_COPY_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if !_CCCL_COMPILER(NVRTC) + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# if _CCCL_HAS_BACKEND_CUDA() +# include +# endif // _CCCL_HAS_BACKEND_CUDA() + +# include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD + +_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT + +_CCCL_TEMPLATE(class _Policy, class _InputIterator, class _OutputIterator1, class _OutputIterator2, class _UnaryPred) +_CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND __has_forward_traversal<_OutputIterator1> _CCCL_AND + __has_forward_traversal<_OutputIterator2> _CCCL_AND is_execution_policy_v<_Policy>) 
+_CCCL_HOST_API pair<_OutputIterator1, _OutputIterator2> partition_copy( + [[maybe_unused]] const _Policy& __policy, + _InputIterator __first, + _InputIterator __last, + _OutputIterator1 __result_true, + _OutputIterator2 __result_false, + _UnaryPred __pred) +{ + static_assert(indirect_unary_predicate<_UnaryPred, _InputIterator>, + "cuda::std::partition_copy: UnaryPred must satisfy indirect_unary_predicate"); + + [[maybe_unused]] auto __dispatch = + ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__partition_copy, _Policy>(); + if constexpr (::cuda::std::execution::__pstl_can_dispatch) + { + _CCCL_NVTX_RANGE_SCOPE("cuda::std::partition_copy"); + + if (__first == __last) + { + return pair{__result_true, __result_false}; + } + + const auto __count = ::cuda::std::distance(__first, __last); + const size_t __num_selected = __dispatch( + __policy, + ::cuda::std::move(__first), + ::cuda::std::move(__last), + __result_true, + __result_false, + ::cuda::std::move(__pred)); + + const auto __num_not_selected = __count - static_cast>(__num_selected); + + return pair{__result_true + static_cast>(__num_selected), + __result_false + static_cast>(__num_not_selected)}; + } + else + { + static_assert(__always_false_v<_Policy>, + "Parallel cuda::std::partition_copy requires at least one selected backend"); + return ::cuda::std::partition_copy( + ::cuda::std::move(__first), + ::cuda::std::move(__last), + ::cuda::std::move(__result_true), + ::cuda::std::move(__result_false), + ::cuda::std::move(__pred)); + } +} + +_CCCL_END_NAMESPACE_ARCH_DEPENDENT + +_CCCL_END_NAMESPACE_CUDA_STD + +# include + +#endif // !_CCCL_COMPILER(NVRTC) + +#endif // _CUDA_STD___PSTL_PARTITION_COPY_H diff --git a/libcudacxx/include/cuda/std/__pstl/stable_partition.h b/libcudacxx/include/cuda/std/__pstl/stable_partition.h new file mode 100644 index 00000000000..ac84ae0f66b --- /dev/null +++ b/libcudacxx/include/cuda/std/__pstl/stable_partition.h @@ -0,0 +1,92 @@ 
+//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___PSTL_STABLE_PARTITION_H +#define _CUDA_STD___PSTL_STABLE_PARTITION_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if !_CCCL_COMPILER(NVRTC) + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# if _CCCL_HAS_BACKEND_CUDA() +# include +# endif // _CCCL_HAS_BACKEND_CUDA() + +# include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD + +_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT + +_CCCL_TEMPLATE(class _Policy, class _InputIterator, class _UnaryPred) +_CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND is_execution_policy_v<_Policy>) +_CCCL_HOST_API _InputIterator stable_partition( + [[maybe_unused]] const _Policy& __policy, _InputIterator __first, _InputIterator __last, _UnaryPred __pred) +{ + static_assert(indirect_unary_predicate<_UnaryPred, _InputIterator>, + "cuda::std::stable_partition: UnaryPred must satisfy indirect_unary_predicate"); + + [[maybe_unused]] auto __dispatch = + ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__stable_partition, + _Policy>(); + if constexpr (::cuda::std::execution::__pstl_can_dispatch) + { + _CCCL_NVTX_RANGE_SCOPE("cuda::std::stable_partition"); + + if (__first == __last) + { 
+ return __first; + } + + auto __result = __first; + const size_t __num_selected = + __dispatch(__policy, ::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__pred)); + + return __result + static_cast>(__num_selected); + } + else + { + static_assert(__always_false_v<_Policy>, + "Parallel cuda::std::stable_partition requires at least one selected backend"); + return ::cuda::std::stable_partition( + ::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__pred)); + } +} + +_CCCL_END_NAMESPACE_ARCH_DEPENDENT + +_CCCL_END_NAMESPACE_CUDA_STD + +# include + +#endif // !_CCCL_COMPILER(NVRTC) + +#endif // _CUDA_STD___PSTL_STABLE_PARTITION_H diff --git a/libcudacxx/include/cuda/std/execution b/libcudacxx/include/cuda/std/execution index 9a883862e32..a0849b882ed 100644 --- a/libcudacxx/include/cuda/std/execution +++ b/libcudacxx/include/cuda/std/execution @@ -54,6 +54,8 @@ # include # include # include +# include +# include # include # include # include @@ -65,6 +67,7 @@ # include # include # include +# include # include # include # include diff --git a/libcudacxx/test/libcudacxx/std/algorithms/alg.modifying/alg.partitions/pstl_partition.cu b/libcudacxx/test/libcudacxx/std/algorithms/alg.modifying/alg.partitions/pstl_partition.cu new file mode 100644 index 00000000000..208578fb584 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/algorithms/alg.modifying/alg.partitions/pstl_partition.cu @@ -0,0 +1,107 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
//
//===----------------------------------------------------------------------===//

// NOTE(review): this interface comment originally read `void partition(...)`;
// corrected to return InputIterator, which is how the test itself uses it
// (`auto res = cuda::std::partition(...)`).
// template <class Policy, class InputIterator, class UnaryPredicate>
// InputIterator partition(const Policy& policy,
//                         InputIterator first,
//                         InputIterator last,
//                         UnaryPredicate pred);

// NOTE(review): all `#include <...>` targets and template argument lists in
// this file were stripped by extraction and are reconstructed — confirm
// against the sibling PSTL partition tests upstream.
#include <cuda/std/algorithm>
#include <cuda/std/iterator>
#include <cuda/std/utility>

#include <cuda/iterator>
#include <cuda/memory_resource>
#include <cuda/stream>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>

#include <c2h/catch2_test_helper.h>
#include <testing.cuh>

// Number of elements every test case operates on (even, so the two partitions
// are the same size).
inline constexpr int size = 1000;

// Device-side predicate selecting even values; templated on the argument type
// so the "converting predicate" cases can force an implicit conversion.
template <class T>
struct is_even
{
  [[nodiscard]] __device__ constexpr bool operator()(T value) const noexcept
  {
    return value % 2 == 0;
  }
};

// Runs every cuda::std::partition scenario under the given execution policy.
template <class Policy>
void test_partition(const Policy& policy, thrust::device_vector<int>& input)
{
  { // Empty does not access anything
    auto res = cuda::std::partition(policy, static_cast<int*>(nullptr), static_cast<int*>(nullptr), is_even<int>{});
    CHECK(res == nullptr);
  }

  const auto mid = size / 2;
  thrust::sequence(input.begin(), input.end(), 0);
  { // With matching predicate
    auto res = cuda::std::partition(policy, input.begin(), input.end(), is_even<int>{});
    CHECK(res == cuda::std::next(input.begin(), mid));
    CHECK(cuda::std::equal(policy, input.begin(), res, cuda::strided_iterator{cuda::counting_iterator{0}, 2}));

    // Implementation detail, we copy the unselected elements in reverse order
    CHECK(cuda::std::equal(policy, res, input.end(), cuda::strided_iterator{cuda::counting_iterator{size - 1}, -2}));
  }

  thrust::sequence(input.begin(), input.end(), 0);
  { // With converting predicate
    // NOTE(review): the predicate's template argument was lost to extraction;
    // `short` forces the int -> short conversion this case is named for — confirm.
    auto res = cuda::std::partition(policy, input.begin(), input.end(), is_even<short>{});
    CHECK(res == cuda::std::next(input.begin(), mid));
    CHECK(cuda::std::equal(policy, input.begin(), res, cuda::strided_iterator{cuda::counting_iterator{0}, 2}));

    // Implementation detail, we copy the unselected elements in reverse order
    CHECK(cuda::std::equal(policy, res, input.end(), cuda::strided_iterator{cuda::counting_iterator{size - 1}, -2}));
  }
}

C2H_TEST("cuda::std::partition", "[parallel algorithm]")
{
  thrust::device_vector<int> input(size, thrust::no_init);

  SECTION("with default stream")
  {
    const auto policy = cuda::execution::gpu;

    test_partition(policy, input);
  }

  SECTION("with provided stream")
  {
    cuda::stream stream{cuda::device_ref{0}};
    const auto policy = cuda::execution::gpu.with(cuda::get_stream, stream);

    test_partition(policy, input);
  }

  SECTION("with provided memory_resource")
  {
    cuda::device_memory_pool_ref device_resource = cuda::device_default_memory_pool(cuda::device_ref{0});
    const auto policy = cuda::execution::gpu.with(cuda::mr::get_memory_resource, device_resource);

    test_partition(policy, input);
  }

  SECTION("with provided stream and memory_resource")
  {
    cuda::stream stream{cuda::device_ref{0}};
    cuda::device_memory_pool_ref device_resource = cuda::device_default_memory_pool(stream.device());
    const auto policy =
      cuda::execution::gpu.with(cuda::mr::get_memory_resource, device_resource).with(cuda::get_stream, stream);

    test_partition(policy, input);
  }
}
diff --git a/libcudacxx/test/libcudacxx/std/algorithms/alg.modifying/alg.partitions/pstl_partition_copy.cu b/libcudacxx/test/libcudacxx/std/algorithms/alg.modifying/alg.partitions/pstl_partition_copy.cu
new file mode 100644
index 00000000000..d167ea714a0
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/algorithms/alg.modifying/alg.partitions/pstl_partition_copy.cu
@@ -0,0 +1,177 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

// NOTE(review): this interface comment originally read
// `void partition_copy_copy(...)` — fixed the doubled name, and the return
// type: the test checks `res.first`/`res.second`, i.e. a pair of iterators.
// template <class Policy, class InputIterator, class OutputIterator1,
//           class OutputIterator2, class UnaryPredicate>
// pair<OutputIterator1, OutputIterator2>
// partition_copy(const Policy& policy,
//                InputIterator first,
//                InputIterator last,
//                OutputIterator1 result_true,
//                OutputIterator2 result_false,
//                UnaryPredicate pred);

// NOTE(review): all `#include <...>` targets and template argument lists in
// this file were stripped by extraction and are reconstructed — confirm
// against the sibling PSTL partition tests upstream.
#include <cuda/std/algorithm>
#include <cuda/std/iterator>
#include <cuda/std/utility>

#include <cuda/iterator>
#include <cuda/memory_resource>
#include <cuda/stream>
#include <thrust/device_vector.h>
#include <thrust/fill.h>
#include <thrust/sequence.h>

#include <c2h/catch2_test_helper.h>
#include <testing.cuh>

// Number of input elements; even, so output_true/output_false are size/2 each.
inline constexpr int size = 1000;

// Device-side predicate selecting even values; templated on the argument type
// so the "converting predicate" cases can force an implicit conversion.
template <class T>
struct is_even
{
  [[nodiscard]] __device__ constexpr bool operator()(T value) const noexcept
  {
    return value % 2 == 0;
  }
};

// Runs every cuda::std::partition_copy scenario under the given policy.
// `input` holds 0..size-1; the two outputs each hold size/2 elements and are
// re-poisoned with -1 before every scenario so stale results cannot pass.
template <class Policy>
void test_partition_copy(const Policy& policy,
                         const thrust::device_vector<int>& input,
                         thrust::device_vector<int>& output_true,
                         thrust::device_vector<int>& output_false)
{
  { // Empty does not access anything
    auto res = cuda::std::partition_copy(
      policy,
      static_cast<int*>(nullptr),
      static_cast<int*>(nullptr),
      static_cast<int*>(nullptr),
      static_cast<int*>(nullptr),
      is_even<int>{});
    CHECK(res.first == nullptr);
    CHECK(res.second == nullptr);
  }

  cuda::std::fill(policy, output_true.begin(), output_true.end(), -1);
  cuda::std::fill(policy, output_false.begin(), output_false.end(), -1);
  { // With contiguous input
    auto res = cuda::std::partition_copy(
      policy, input.begin(), input.end(), output_true.begin(), output_false.begin(), is_even<int>{});
    CHECK(res == cuda::std::pair{output_true.end(), output_false.end()});
    CHECK(cuda::std::equal(
      policy, output_true.begin(), output_true.end(), cuda::strided_iterator{cuda::counting_iterator{0}, 2}));
    CHECK(cuda::std::equal(
      policy, output_false.begin(), output_false.end(), cuda::strided_iterator{cuda::counting_iterator{1}, 2}));
  }

  cuda::std::fill(policy, output_true.begin(), output_true.end(), -1);
  cuda::std::fill(policy, output_false.begin(), output_false.end(), -1);
  { // With random access input
    auto res = cuda::std::partition_copy(
      policy,
      cuda::counting_iterator{0},
      cuda::counting_iterator{size},
      output_true.begin(),
      output_false.begin(),
      is_even<int>{});
    CHECK(res == cuda::std::pair{output_true.end(), output_false.end()});
    CHECK(cuda::std::equal(
      policy, output_true.begin(), output_true.end(), cuda::strided_iterator{cuda::counting_iterator{0}, 2}));
    CHECK(cuda::std::equal(
      policy, output_false.begin(), output_false.end(), cuda::strided_iterator{cuda::counting_iterator{1}, 2}));
  }

  cuda::std::fill(policy, output_true.begin(), output_true.end(), -1);
  cuda::std::fill(policy, output_false.begin(), output_false.end(), -1);
  { // With different input type
    // NOTE(review): the counting_iterator value type was lost to extraction;
    // `short` matches the case name ("different input type") — confirm.
    auto res = cuda::std::partition_copy(
      policy,
      cuda::counting_iterator<short>{0},
      cuda::counting_iterator<short>{size},
      output_true.begin(),
      output_false.begin(),
      is_even<short>{});
    CHECK(res == cuda::std::pair{output_true.end(), output_false.end()});
    CHECK(cuda::std::equal(
      policy, output_true.begin(), output_true.end(), cuda::strided_iterator{cuda::counting_iterator{0}, 2}));
    CHECK(cuda::std::equal(
      policy, output_false.begin(), output_false.end(), cuda::strided_iterator{cuda::counting_iterator{1}, 2}));
  }

  cuda::std::fill(policy, output_true.begin(), output_true.end(), -1);
  cuda::std::fill(policy, output_false.begin(), output_false.end(), -1);
  { // With contiguous input, converting predicate
    auto res = cuda::std::partition_copy(
      policy, input.begin(), input.end(), output_true.begin(), output_false.begin(), is_even<short>{});
    CHECK(res == cuda::std::pair{output_true.end(), output_false.end()});
    CHECK(cuda::std::equal(
      policy, output_true.begin(), output_true.end(), cuda::strided_iterator{cuda::counting_iterator{0}, 2}));
    CHECK(cuda::std::equal(
      policy, output_false.begin(), output_false.end(), cuda::strided_iterator{cuda::counting_iterator{1}, 2}));
  }

  cuda::std::fill(policy, output_true.begin(), output_true.end(), -1);
  cuda::std::fill(policy, output_false.begin(), output_false.end(), -1);
  { // With different input type, converting predicate
    auto res = cuda::std::partition_copy(
      policy,
      cuda::counting_iterator<short>{0},
      cuda::counting_iterator<short>{size},
      output_true.begin(),
      output_false.begin(),
      is_even<int>{});
    CHECK(res == cuda::std::pair{output_true.end(), output_false.end()});
    CHECK(cuda::std::equal(
      policy, output_true.begin(), output_true.end(), cuda::strided_iterator{cuda::counting_iterator{0}, 2}));
    CHECK(cuda::std::equal(
      policy, output_false.begin(), output_false.end(), cuda::strided_iterator{cuda::counting_iterator{1}, 2}));
  }
}

C2H_TEST("cuda::std::partition_copy", "[parallel algorithm]")
{
  thrust::device_vector<int> input(size, thrust::no_init);
  thrust::device_vector<int> output_true(size / 2, thrust::no_init);
  thrust::device_vector<int> output_false(size / 2, thrust::no_init);
  thrust::sequence(input.begin(), input.end(), 0);

  SECTION("with default stream")
  {
    const auto policy = cuda::execution::gpu;

    test_partition_copy(policy, input, output_true, output_false);
  }

  SECTION("with provided stream")
  {
    cuda::stream stream{cuda::device_ref{0}};
    const auto policy = cuda::execution::gpu.with(cuda::get_stream, stream);

    test_partition_copy(policy, input, output_true, output_false);
  }

  SECTION("with provided memory_resource")
  {
    cuda::device_memory_pool_ref device_resource = cuda::device_default_memory_pool(cuda::device_ref{0});
    const auto policy = cuda::execution::gpu.with(cuda::mr::get_memory_resource, device_resource);

    test_partition_copy(policy, input, output_true, output_false);
  }

  SECTION("with provided stream and memory_resource")
  {
    cuda::stream stream{cuda::device_ref{0}};
    cuda::device_memory_pool_ref device_resource = cuda::device_default_memory_pool(stream.device());
    const auto policy =
      cuda::execution::gpu.with(cuda::mr::get_memory_resource, device_resource).with(cuda::get_stream, stream);

    test_partition_copy(policy, input, output_true, output_false);
  }
}
diff --git
a/libcudacxx/test/libcudacxx/std/algorithms/alg.modifying/alg.partitions/pstl_stable_partition.cu b/libcudacxx/test/libcudacxx/std/algorithms/alg.modifying/alg.partitions/pstl_stable_partition.cu
new file mode 100644
index 00000000000..7af84536493
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/algorithms/alg.modifying/alg.partitions/pstl_stable_partition.cu
@@ -0,0 +1,104 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

// NOTE(review): this interface comment originally read
// `void stable_partition(...)`; corrected to return InputIterator, which is
// how the test itself uses it.
// template <class Policy, class InputIterator, class UnaryPredicate>
// InputIterator stable_partition(const Policy& policy,
//                                InputIterator first,
//                                InputIterator last,
//                                UnaryPredicate pred);

// NOTE(review): all `#include <...>` targets and template argument lists in
// this file were stripped by extraction and are reconstructed — confirm
// against the sibling PSTL partition tests upstream.
#include <cuda/std/algorithm>
#include <cuda/std/iterator>

#include <cuda/iterator>
#include <cuda/memory_resource>
#include <cuda/stream>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>

#include <c2h/catch2_test_helper.h>
#include <testing.cuh>

// Number of elements every test case operates on (even, so the two partitions
// are the same size).
inline constexpr int size = 1000;

// Device-side predicate selecting even values; templated on the argument type
// so the "converting predicate" case can force an implicit conversion.
template <class T>
struct is_even
{
  [[nodiscard]] __device__ constexpr bool operator()(T value) const noexcept
  {
    return value % 2 == 0;
  }
};

// Runs every cuda::std::stable_partition scenario under the given policy.
// Unlike plain partition, stability is asserted: the "false" half must keep
// the original ascending order (1, 3, 5, ...), not a reversed copy.
template <class Policy>
void test_partition(const Policy& policy, thrust::device_vector<int>& input)
{
  { // Empty does not access anything
    auto res =
      cuda::std::stable_partition(policy, static_cast<int*>(nullptr), static_cast<int*>(nullptr), is_even<int>{});
    CHECK(res == nullptr);
  }

  const auto mid = size / 2;
  thrust::sequence(input.begin(), input.end(), 0);
  { // With matching predicate
    auto res = cuda::std::stable_partition(policy, input.begin(), input.end(), is_even<int>{});
    CHECK(res == cuda::std::next(input.begin(), mid));
    CHECK(cuda::std::equal(policy, input.begin(), res, cuda::strided_iterator{cuda::counting_iterator{0}, 2}));
    CHECK(cuda::std::equal(policy, res, input.end(), cuda::strided_iterator{cuda::counting_iterator{1}, 2}));
  }

  thrust::sequence(input.begin(), input.end(), 0);
  { // With converting predicate
    // NOTE(review): the predicate's template argument was lost to extraction;
    // `short` forces the int -> short conversion this case is named for — confirm.
    auto res = cuda::std::stable_partition(policy, input.begin(), input.end(), is_even<short>{});
    CHECK(res == cuda::std::next(input.begin(), mid));
    CHECK(cuda::std::equal(policy, input.begin(), res, cuda::strided_iterator{cuda::counting_iterator{0}, 2}));
    CHECK(cuda::std::equal(policy, res, input.end(), cuda::strided_iterator{cuda::counting_iterator{1}, 2}));
  }
}

C2H_TEST("cuda::std::stable_partition", "[parallel algorithm]")
{
  thrust::device_vector<int> input(size, thrust::no_init);

  SECTION("with default stream")
  {
    const auto policy = cuda::execution::gpu;

    test_partition(policy, input);
  }

  SECTION("with provided stream")
  {
    cuda::stream stream{cuda::device_ref{0}};
    const auto policy = cuda::execution::gpu.with(cuda::get_stream, stream);

    test_partition(policy, input);
  }

  SECTION("with provided memory_resource")
  {
    cuda::device_memory_pool_ref device_resource = cuda::device_default_memory_pool(cuda::device_ref{0});
    const auto policy = cuda::execution::gpu.with(cuda::mr::get_memory_resource, device_resource);

    test_partition(policy, input);
  }

  SECTION("with provided stream and memory_resource")
  {
    cuda::stream stream{cuda::device_ref{0}};
    cuda::device_memory_pool_ref device_resource = cuda::device_default_memory_pool(stream.device());
    const auto policy =
      cuda::execution::gpu.with(cuda::mr::get_memory_resource, device_resource).with(cuda::get_stream, stream);

    test_partition(policy, input);
  }
}