Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions libcudacxx/benchmarks/bench/partition/basic.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#include <thrust/device_vector.h>
#include <thrust/partition.h>

#include <cuda/memory_pool>
#include <cuda/std/execution>
#include <cuda/stream>

#include "nvbench_helper.cuh"

template <typename T>
static void basic(nvbench::state& state, nvbench::type_list<T>)
{
using select_op_t = less_then_t<T>;

const auto elements = static_cast<std::size_t>(state.get_int64("Elements"));
const bit_entropy entropy = str_to_entropy(state.get_string("Entropy"));

const T val = lerp_min_max<T>(entropy_to_probability(entropy));
select_op_t select_op{val};

thrust::device_vector<T> input = generate(elements);

state.add_element_count(elements);
state.add_global_memory_reads<T>(elements);
state.add_global_memory_writes<T>(elements);

caching_allocator_t alloc{};
state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
[&](nvbench::launch& launch) {
do_not_optimize(cuda::std::partition(cuda_policy(alloc, launch), input.begin(), input.end(), select_op));
});
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
.set_name("base")
.set_type_axes_names({"T{ct}"})
.add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4))
.add_string_axis("Entropy", {"1.000", "0.544", "0.000"});
55 changes: 55 additions & 0 deletions libcudacxx/benchmarks/bench/partition_copy/basic.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#include <thrust/device_vector.h>
#include <thrust/partition.h>

#include <cuda/memory_pool>
#include <cuda/std/execution>
#include <cuda/stream>

#include "nvbench_helper.cuh"

template <typename T>
static void basic(nvbench::state& state, nvbench::type_list<T>)
{
using select_op_t = less_then_t<T>;

const auto elements = static_cast<std::size_t>(state.get_int64("Elements"));
const bit_entropy entropy = str_to_entropy(state.get_string("Entropy"));

const T val = lerp_min_max<T>(entropy_to_probability(entropy));
select_op_t select_op{val};

thrust::device_vector<T> input = generate(elements);
thrust::device_vector<T> output(elements);

state.add_element_count(elements);
state.add_global_memory_reads<T>(elements);
state.add_global_memory_writes<T>(elements);

caching_allocator_t alloc{};
state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
[&](nvbench::launch& launch) {
do_not_optimize(cuda::std::partition_copy(
cuda_policy(alloc, launch),
input.begin(),
input.end(),
output.begin(),
cuda::std::make_reverse_iterator(output.begin() + elements),
select_op));
});
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
.set_name("base")
.set_type_axes_names({"T{ct}"})
.add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4))
.add_string_axis("Entropy", {"1.000", "0.544", "0.000"});
48 changes: 48 additions & 0 deletions libcudacxx/benchmarks/bench/stable_partition/basic.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#include <thrust/device_vector.h>
#include <thrust/partition.h>

#include <cuda/memory_pool>
#include <cuda/std/execution>
#include <cuda/stream>

#include "nvbench_helper.cuh"

template <typename T>
static void basic(nvbench::state& state, nvbench::type_list<T>)
{
using select_op_t = less_then_t<T>;

const auto elements = static_cast<std::size_t>(state.get_int64("Elements"));
const bit_entropy entropy = str_to_entropy(state.get_string("Entropy"));

const T val = lerp_min_max<T>(entropy_to_probability(entropy));
select_op_t select_op{val};

thrust::device_vector<T> input = generate(elements);

state.add_element_count(elements);
state.add_global_memory_reads<T>(elements);
state.add_global_memory_writes<T>(elements);

caching_allocator_t alloc{};
state.exec(
nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
do_not_optimize(cuda::std::stable_partition(cuda_policy(alloc, launch), input.begin(), input.end(), select_op));
});
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
.set_name("base")
.set_type_axes_names({"T{ct}"})
.add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4))
.add_string_axis("Entropy", {"1.000", "0.544", "0.000"});
169 changes: 169 additions & 0 deletions libcudacxx/include/cuda/std/__pstl/cuda/partition.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_STD___PSTL_CUDA_PARTITION_H
#define _CUDA_STD___PSTL_CUDA_PARTITION_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#if _CCCL_HAS_BACKEND_CUDA()

_CCCL_DIAG_PUSH
_CCCL_DIAG_SUPPRESS_CLANG("-Wshadow")
_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-local-typedef")
_CCCL_DIAG_SUPPRESS_GCC("-Wattributes")
_CCCL_DIAG_SUPPRESS_NVHPC(attribute_requires_external_linkage)

# include <cub/device/device_partition.cuh>
# include <cub/device/device_transform.cuh>

_CCCL_DIAG_POP

# include <cuda/__execution/policy.h>
# include <cuda/__functional/call_or.h>
# include <cuda/__stream/get_stream.h>
# include <cuda/__stream/stream_ref.h>
# include <cuda/std/__algorithm/partition.h>
# include <cuda/std/__exception/cuda_error.h>
# include <cuda/std/__exception/exception_macros.h>
# include <cuda/std/__execution/env.h>
# include <cuda/std/__execution/policy.h>
# include <cuda/std/__functional/identity.h>
# include <cuda/std/__iterator/iterator_traits.h>
# include <cuda/std/__pstl/cuda/temporary_storage.h>
# include <cuda/std/__pstl/dispatch.h>
# include <cuda/std/__type_traits/always_false.h>
# include <cuda/std/__utility/move.h>

# include <cuda/std/__cccl/prologue.h>

_CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION

_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT

template <>
struct __pstl_dispatch<__pstl_algorithm::__partition, __execution_backend::__cuda>
{
template <class _Policy, class _InputIterator, class _UnaryPred>
[[nodiscard]] _CCCL_HOST_API static size_t
__par_impl(const _Policy& __policy, _InputIterator __first, _InputIterator __last, _UnaryPred __pred)
{
using _OffsetType = size_t;
using value_type = iter_value_t<_InputIterator>;

_OffsetType __ret;
const auto __count = static_cast<_OffsetType>(::cuda::std::distance(__first, __last));

// Determine temporary device storage requirements for device_partition
size_t __num_bytes = 0;
_CCCL_TRY_CUDA_API(
CUB_NS_QUALIFIER::DevicePartition::If,
"__pstl_cuda_partition: determination of device storage for cub::DevicePartition::If failed",
static_cast<void*>(nullptr),
__num_bytes,
static_cast<value_type*>(nullptr),
__first,
static_cast<_OffsetType*>(nullptr),
__count,
__pred,
0);

// Allocate memory for result
auto __stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStreamPerThread}, __policy);
{
__temporary_storage<_OffsetType, value_type> __storage{__policy, __num_bytes, 1, __count};

// Partition cannot run inplace, so we need to first copy the input into temporary storage
_CCCL_TRY_CUDA_API(
CUB_NS_QUALIFIER::DeviceTransform::TransformIf,
"__pstl_cuda_partition: kernel launch of cub::DeviceTransform::TransformIf failed",
tuple<_InputIterator>{__first},
__storage.template __get_ptr<1>(),
__count,
CUB_NS_QUALIFIER::detail::transform::always_true_predicate{},
identity{},
__stream.get());

// Run the kernel, the standard requires that the input and output range do not overlap
_CCCL_TRY_CUDA_API(
CUB_NS_QUALIFIER::DevicePartition::If,
"__pstl_cuda_partition: kernel launch of cub::DevicePartition::If failed",
__storage.__get_temp_storage(),
__num_bytes,
__storage.template __get_raw_ptr<1>(),
::cuda::std::move(__first),
__storage.template __get_ptr<0>(),
__count,
::cuda::std::move(__pred),
__stream.get());

// Copy the result back from storage
_CCCL_TRY_CUDA_API(
::cudaMemcpyAsync,
"__pstl_cuda_partition: copy of result from device to host failed",
::cuda::std::addressof(__ret),
__storage.template __get_ptr<0>(),
sizeof(_OffsetType),
::cudaMemcpyDefault,
__stream.get());
}

__stream.sync();
return static_cast<size_t>(__ret);
}

_CCCL_TEMPLATE(class _Policy, class _InputIterator, class _UnaryPred)
_CCCL_REQUIRES(__has_forward_traversal<_InputIterator>)
[[nodiscard]] _CCCL_HOST_API size_t operator()(
[[maybe_unused]] const _Policy& __policy, _InputIterator __first, _InputIterator __last, _UnaryPred __pred) const
{
if constexpr (::cuda::std::__has_random_access_traversal<_InputIterator>)
{
try
{
return __par_impl(__policy, ::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__pred));
}
catch (const ::cuda::cuda_error& __err)
{
if (__err.status() == cudaErrorMemoryAllocation)
{
_CCCL_THROW(::std::bad_alloc);
}
else
{
throw __err;
}
}
}
else
{
static_assert(__always_false_v<_Policy>, "CUDA backend of cuda::std::partition requires random access iterators");
return ::cuda::std::partition(::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__pred));
}
}
};

_CCCL_END_NAMESPACE_ARCH_DEPENDENT

_CCCL_END_NAMESPACE_CUDA_STD_EXECUTION

# include <cuda/std/__cccl/epilogue.h>

#endif // _CCCL_HAS_BACKEND_CUDA()

#endif // _CUDA_STD___PSTL_CUDA_PARTITION_H
Loading
Loading