diff --git a/libcudacxx/benchmarks/bench/partition/basic.cu b/libcudacxx/benchmarks/bench/partition/basic.cu new file mode 100644 index 00000000000..a19a64eb59b --- /dev/null +++ b/libcudacxx/benchmarks/bench/partition/basic.cu @@ -0,0 +1,48 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include +#include +#include + +#include "nvbench_helper.cuh" + +template +static void basic(nvbench::state& state, nvbench::type_list) +{ + using select_op_t = less_then_t; + + const auto elements = static_cast(state.get_int64("Elements")); + const bit_entropy entropy = str_to_entropy(state.get_string("Entropy")); + + const T val = lerp_min_max(entropy_to_probability(entropy)); + select_op_t select_op{val}; + + thrust::device_vector input = generate(elements); + + state.add_element_count(elements); + state.add_global_memory_reads(elements); + state.add_global_memory_writes(elements); + + caching_allocator_t alloc{}; + state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { + do_not_optimize(cuda::std::partition(cuda_policy(alloc, launch), input.begin(), input.end(), select_op)); + }); +} + +NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types)) + .set_name("base") + .set_type_axes_names({"T{ct}"}) + .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) + .add_string_axis("Entropy", {"1.000", "0.544", "0.000"}); diff --git a/libcudacxx/benchmarks/bench/partition_copy/basic.cu b/libcudacxx/benchmarks/bench/partition_copy/basic.cu 
new file mode 100644 index 00000000000..71569fa28ec --- /dev/null +++ b/libcudacxx/benchmarks/bench/partition_copy/basic.cu @@ -0,0 +1,55 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include +#include +#include + +#include "nvbench_helper.cuh" + +template +static void basic(nvbench::state& state, nvbench::type_list) +{ + using select_op_t = less_then_t; + + const auto elements = static_cast(state.get_int64("Elements")); + const bit_entropy entropy = str_to_entropy(state.get_string("Entropy")); + + const T val = lerp_min_max(entropy_to_probability(entropy)); + select_op_t select_op{val}; + + thrust::device_vector input = generate(elements); + thrust::device_vector output(elements); + + state.add_element_count(elements); + state.add_global_memory_reads(elements); + state.add_global_memory_writes(elements); + + caching_allocator_t alloc{}; + state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { + do_not_optimize(cuda::std::partition_copy( + cuda_policy(alloc, launch), + input.begin(), + input.end(), + output.begin(), + cuda::std::make_reverse_iterator(output.begin() + elements), + select_op)); + }); +} + +NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types)) + .set_name("base") + .set_type_axes_names({"T{ct}"}) + .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) + .add_string_axis("Entropy", {"1.000", "0.544", "0.000"}); diff --git a/libcudacxx/benchmarks/bench/stable_partition/basic.cu 
b/libcudacxx/benchmarks/bench/stable_partition/basic.cu new file mode 100644 index 00000000000..62f9d653e87 --- /dev/null +++ b/libcudacxx/benchmarks/bench/stable_partition/basic.cu @@ -0,0 +1,48 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include +#include +#include + +#include "nvbench_helper.cuh" + +template +static void basic(nvbench::state& state, nvbench::type_list) +{ + using select_op_t = less_then_t; + + const auto elements = static_cast(state.get_int64("Elements")); + const bit_entropy entropy = str_to_entropy(state.get_string("Entropy")); + + const T val = lerp_min_max(entropy_to_probability(entropy)); + select_op_t select_op{val}; + + thrust::device_vector input = generate(elements); + + state.add_element_count(elements); + state.add_global_memory_reads(elements); + state.add_global_memory_writes(elements); + + caching_allocator_t alloc{}; + state.exec( + nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + do_not_optimize(cuda::std::stable_partition(cuda_policy(alloc, launch), input.begin(), input.end(), select_op)); + }); +} + +NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types)) + .set_name("base") + .set_type_axes_names({"T{ct}"}) + .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) + .add_string_axis("Entropy", {"1.000", "0.544", "0.000"}); diff --git a/libcudacxx/include/cuda/std/__pstl/cuda/partition.h b/libcudacxx/include/cuda/std/__pstl/cuda/partition.h new file mode 100644 index 
00000000000..25b014e7447 --- /dev/null +++ b/libcudacxx/include/cuda/std/__pstl/cuda/partition.h @@ -0,0 +1,169 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___PSTL_CUDA_PARTITION_H +#define _CUDA_STD___PSTL_CUDA_PARTITION_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_HAS_BACKEND_CUDA() + +_CCCL_DIAG_PUSH +_CCCL_DIAG_SUPPRESS_CLANG("-Wshadow") +_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-local-typedef") +_CCCL_DIAG_SUPPRESS_GCC("-Wattributes") +_CCCL_DIAG_SUPPRESS_NVHPC(attribute_requires_external_linkage) + +# include +# include + +_CCCL_DIAG_POP + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION + +_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT + +template <> +struct __pstl_dispatch<__pstl_algorithm::__partition, __execution_backend::__cuda> +{ + template + [[nodiscard]] _CCCL_HOST_API static size_t + __par_impl(const _Policy& __policy, _InputIterator __first, _InputIterator __last, _UnaryPred __pred) + { + using _OffsetType = size_t; + using value_type = iter_value_t<_InputIterator>; + + _OffsetType __ret; + const auto __count = static_cast<_OffsetType>(::cuda::std::distance(__first, __last)); + + // 
Determine temporary device storage requirements for device_partition + size_t __num_bytes = 0; + _CCCL_TRY_CUDA_API( + CUB_NS_QUALIFIER::DevicePartition::If, + "__pstl_cuda_partition: determination of device storage for cub::DevicePartition::If failed", + static_cast(nullptr), + __num_bytes, + static_cast(nullptr), + __first, + static_cast<_OffsetType*>(nullptr), + __count, + __pred, + 0); + + // Allocate memory for result + auto __stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStreamPerThread}, __policy); + { + __temporary_storage<_OffsetType, value_type> __storage{__policy, __num_bytes, 1, __count}; + + // Partition cannot run inplace, so we need to first copy the input into temporary storage + _CCCL_TRY_CUDA_API( + CUB_NS_QUALIFIER::DeviceTransform::TransformIf, + "__pstl_cuda_partition: kernel launch of cub::DeviceTransform::TransformIf failed", + tuple<_InputIterator>{__first}, + __storage.template __get_ptr<1>(), + __count, + CUB_NS_QUALIFIER::detail::transform::always_true_predicate{}, + identity{}, + __stream.get()); + + // Run the kernel, the standard requires that the input and output range do not overlap + _CCCL_TRY_CUDA_API( + CUB_NS_QUALIFIER::DevicePartition::If, + "__pstl_cuda_partition: kernel launch of cub::DevicePartition::If failed", + __storage.__get_temp_storage(), + __num_bytes, + __storage.template __get_raw_ptr<1>(), + ::cuda::std::move(__first), + __storage.template __get_ptr<0>(), + __count, + ::cuda::std::move(__pred), + __stream.get()); + + // Copy the result back from storage + _CCCL_TRY_CUDA_API( + ::cudaMemcpyAsync, + "__pstl_cuda_partition: copy of result from device to host failed", + ::cuda::std::addressof(__ret), + __storage.template __get_ptr<0>(), + sizeof(_OffsetType), + ::cudaMemcpyDefault, + __stream.get()); + } + + __stream.sync(); + return static_cast(__ret); + } + + _CCCL_TEMPLATE(class _Policy, class _InputIterator, class _UnaryPred) + _CCCL_REQUIRES(__has_forward_traversal<_InputIterator>) + 
[[nodiscard]] _CCCL_HOST_API size_t operator()( + [[maybe_unused]] const _Policy& __policy, _InputIterator __first, _InputIterator __last, _UnaryPred __pred) const + { + if constexpr (::cuda::std::__has_random_access_traversal<_InputIterator>) + { + try + { + return __par_impl(__policy, ::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__pred)); + } + catch (const ::cuda::cuda_error& __err) + { + if (__err.status() == cudaErrorMemoryAllocation) + { + _CCCL_THROW(::std::bad_alloc); + } + else + { + throw __err; + } + } + } + else + { + static_assert(__always_false_v<_Policy>, "CUDA backend of cuda::std::partition requires random access iterators"); + return ::cuda::std::partition(::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__pred)); + } + } +}; + +_CCCL_END_NAMESPACE_ARCH_DEPENDENT + +_CCCL_END_NAMESPACE_CUDA_STD_EXECUTION + +# include + +#endif // _CCCL_HAS_BACKEND_CUDA() + +#endif // _CUDA_STD___PSTL_CUDA_PARTITION_H diff --git a/libcudacxx/include/cuda/std/__pstl/cuda/partition_copy.h b/libcudacxx/include/cuda/std/__pstl/cuda/partition_copy.h new file mode 100644 index 00000000000..8068fb36da2 --- /dev/null +++ b/libcudacxx/include/cuda/std/__pstl/cuda/partition_copy.h @@ -0,0 +1,184 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___PSTL_CUDA_PARTITION_COPY_H +#define _CUDA_STD___PSTL_CUDA_PARTITION_COPY_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_HAS_BACKEND_CUDA() + +_CCCL_DIAG_PUSH +_CCCL_DIAG_SUPPRESS_CLANG("-Wshadow") +_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-local-typedef") +_CCCL_DIAG_SUPPRESS_GCC("-Wattributes") +_CCCL_DIAG_SUPPRESS_NVHPC(attribute_requires_external_linkage) + +# include + +_CCCL_DIAG_POP + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION + +_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT + +template <> +struct __pstl_dispatch<__pstl_algorithm::__partition_copy, __execution_backend::__cuda> +{ + template + [[nodiscard]] _CCCL_HOST_API static size_t __par_impl( + const _Policy& __policy, + _InputIterator __first, + _InputIterator __last, + _OutputIterator1 __result_true, + _OutputIterator2 __result_false, + _UnaryPred __pred) + { + using _OffsetType = size_t; + using __output_wrapper_t = + CUB_NS_QUALIFIER::detail::select::partition_distinct_output_t<_OutputIterator1, _OutputIterator2>; + __output_wrapper_t __result{::cuda::std::move(__result_true), ::cuda::std::move(__result_false)}; + + _OffsetType __ret; + const auto __count = static_cast<_OffsetType>(::cuda::std::distance(__first, __last)); + + // Determine temporary device storage requirements for device_partition + size_t __num_bytes = 0; + _CCCL_TRY_CUDA_API( + CUB_NS_QUALIFIER::DevicePartition::If, + "__pstl_cuda_partition_copy: determination of device storage for cub::DevicePartition::If failed", + 
static_cast(nullptr), + __num_bytes, + __first, + __result, + static_cast<_OffsetType*>(nullptr), + __count, + __pred, + 0); + + // Allocate memory for result + auto __stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStreamPerThread}, __policy); + { + __temporary_storage<_OffsetType> __storage{__policy, __num_bytes, 1}; + + // Run the kernel, the standard requires that the input and output range do not overlap + _CCCL_TRY_CUDA_API( + CUB_NS_QUALIFIER::DevicePartition::If, + "__pstl_cuda_partition_copy: kernel launch of cub::DevicePartition::If failed", + __storage.__get_temp_storage(), + __num_bytes, + ::cuda::std::move(__first), + ::cuda::std::move(__result), + __storage.template __get_ptr<0>(), + __count, + ::cuda::std::move(__pred), + __stream.get()); + + // Copy the result back from storage + _CCCL_TRY_CUDA_API( + ::cudaMemcpyAsync, + "__pstl_cuda_partition_copy: copy of result from device to host failed", + ::cuda::std::addressof(__ret), + __storage.template __get_ptr<0>(), + sizeof(_OffsetType), + ::cudaMemcpyDefault, + __stream.get()); + } + + __stream.sync(); + return static_cast(__ret); + } + + _CCCL_TEMPLATE(class _Policy, class _InputIterator, class _OutputIterator1, class _OutputIterator2, class _UnaryPred) + _CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND __has_forward_traversal<_OutputIterator1> _CCCL_AND + __has_forward_traversal<_OutputIterator2>) + [[nodiscard]] _CCCL_HOST_API size_t operator()( + [[maybe_unused]] const _Policy& __policy, + _InputIterator __first, + _InputIterator __last, + _OutputIterator1 __result_true, + _OutputIterator2 __result_false, + _UnaryPred __pred) const + { + if constexpr (::cuda::std::__has_random_access_traversal<_InputIterator> + && ::cuda::std::__has_random_access_traversal<_OutputIterator1> + && ::cuda::std::__has_random_access_traversal<_OutputIterator2>) + { + try + { + return __par_impl( + __policy, + ::cuda::std::move(__first), + ::cuda::std::move(__last), + 
::cuda::std::move(__result_true), + ::cuda::std::move(__result_false), + ::cuda::std::move(__pred)); + } + catch (const ::cuda::cuda_error& __err) + { + if (__err.status() == cudaErrorMemoryAllocation) + { + _CCCL_THROW(::std::bad_alloc); + } + else + { + throw __err; + } + } + } + else + { + static_assert(__always_false_v<_Policy>, + "CUDA backend of cuda::std::partition_copy requires random access iterators"); + return ::cuda::std::partition_copy( + ::cuda::std::move(__first), + ::cuda::std::move(__last), + ::cuda::std::move(__result_true), + ::cuda::std::move(__result_false), + ::cuda::std::move(__pred)); + } + } +}; + +_CCCL_END_NAMESPACE_ARCH_DEPENDENT + +_CCCL_END_NAMESPACE_CUDA_STD_EXECUTION + +# include + +#endif // _CCCL_HAS_BACKEND_CUDA() + +#endif // _CUDA_STD___PSTL_CUDA_PARTITION_COPY_H diff --git a/libcudacxx/include/cuda/std/__pstl/cuda/stable_partition.h b/libcudacxx/include/cuda/std/__pstl/cuda/stable_partition.h new file mode 100644 index 00000000000..d3c8a09ef1e --- /dev/null +++ b/libcudacxx/include/cuda/std/__pstl/cuda/stable_partition.h @@ -0,0 +1,176 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___PSTL_CUDA_STABLE_PARTITION_H +#define _CUDA_STD___PSTL_CUDA_STABLE_PARTITION_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_HAS_BACKEND_CUDA() + +_CCCL_DIAG_PUSH +_CCCL_DIAG_SUPPRESS_CLANG("-Wshadow") +_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-local-typedef") +_CCCL_DIAG_SUPPRESS_GCC("-Wattributes") +_CCCL_DIAG_SUPPRESS_NVHPC(attribute_requires_external_linkage) + +# include +# include + +_CCCL_DIAG_POP + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION + +_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT + +template <> +struct __pstl_dispatch<__pstl_algorithm::__stable_partition, __execution_backend::__cuda> +{ + template + [[nodiscard]] _CCCL_HOST_API static size_t + __par_impl(const _Policy& __policy, _InputIterator __first, _InputIterator __last, _UnaryPred __pred) + { + using _OffsetType = size_t; + using value_type = iter_value_t<_InputIterator>; + + _OffsetType __ret; + const auto __count = static_cast<_OffsetType>(::cuda::std::distance(__first, __last)); + + // Determine temporary device storage requirements for device_stable_partition + size_t __num_bytes = 0; + _CCCL_TRY_CUDA_API( + CUB_NS_QUALIFIER::DevicePartition::If, + "__pstl_cuda_stable_partition: determination of device storage for cub::DevicePartition::If failed", + static_cast(nullptr), + __num_bytes, + static_cast(nullptr), + __first, + static_cast<_OffsetType*>(nullptr), + __count, + __pred, + 0); + + // Allocate memory for result + auto __stream = ::cuda::__call_or(::cuda::get_stream, 
::cuda::stream_ref{cudaStreamPerThread}, __policy); + { + __temporary_storage<_OffsetType, value_type> __storage{__policy, __num_bytes, 1, __count}; + + // Partition cannot run inplace, so we need to first copy the input into temporary storage + _CCCL_TRY_CUDA_API( + CUB_NS_QUALIFIER::DeviceTransform::TransformIf, + "__pstl_cuda_stable_partition: kernel launch of cub::DeviceTransform::TransformIf failed", + tuple<_InputIterator>{__first}, + __storage.template __get_ptr<1>(), + __count, + CUB_NS_QUALIFIER::detail::transform::always_true_predicate{}, + identity{}, + __stream.get()); + + // Run the kernel, the standard requires that the input and output range do not overlap + _CCCL_TRY_CUDA_API( + CUB_NS_QUALIFIER::DevicePartition::If, + "__pstl_cuda_stable_partition: kernel launch of cub::DevicePartition::If failed", + __storage.__get_temp_storage(), + __num_bytes, + __storage.template __get_raw_ptr<1>(), + __first, + __storage.template __get_ptr<0>(), + __count, + ::cuda::std::move(__pred), + __stream.get()); + + // Copy the result back from storage + _CCCL_TRY_CUDA_API( + ::cudaMemcpyAsync, + "__pstl_cuda_stable_partition: copy of result from device to host failed", + ::cuda::std::addressof(__ret), + __storage.template __get_ptr<0>(), + sizeof(_OffsetType), + ::cudaMemcpyDefault, + __stream.get()); + } + + __stream.sync(); + + // Need to reverse the elements in the second partition + const auto __mid = __first + static_cast>(__ret); + ::cuda::std::reverse(__policy, __mid, __last); + return static_cast(__ret); + } + + _CCCL_TEMPLATE(class _Policy, class _InputIterator, class _UnaryPred) + _CCCL_REQUIRES(__has_forward_traversal<_InputIterator>) + [[nodiscard]] _CCCL_HOST_API size_t operator()( + [[maybe_unused]] const _Policy& __policy, _InputIterator __first, _InputIterator __last, _UnaryPred __pred) const + { + if constexpr (::cuda::std::__has_random_access_traversal<_InputIterator>) + { + try + { + return __par_impl(__policy, ::cuda::std::move(__first), 
::cuda::std::move(__last), ::cuda::std::move(__pred)); + } + catch (const ::cuda::cuda_error& __err) + { + if (__err.status() == cudaErrorMemoryAllocation) + { + _CCCL_THROW(::std::bad_alloc); + } + else + { + throw __err; + } + } + } + else + { + static_assert(__always_false_v<_Policy>, + "CUDA backend of cuda::std::stable_partition requires random access iterators"); + return ::cuda::std::stable_partition( + ::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__pred)); + } + } +}; + +_CCCL_END_NAMESPACE_ARCH_DEPENDENT + +_CCCL_END_NAMESPACE_CUDA_STD_EXECUTION + +# include + +#endif // _CCCL_HAS_BACKEND_CUDA() + +#endif // _CUDA_STD___PSTL_CUDA_STABLE_PARTITION_H diff --git a/libcudacxx/include/cuda/std/__pstl/dispatch.h b/libcudacxx/include/cuda/std/__pstl/dispatch.h index 9b8d68ac0a9..4dca9963b70 100644 --- a/libcudacxx/include/cuda/std/__pstl/dispatch.h +++ b/libcudacxx/include/cuda/std/__pstl/dispatch.h @@ -41,8 +41,11 @@ enum class __pstl_algorithm __generate_n, __inclusive_scan, __merge, + __partition, + __partition_copy, __reduce, __remove_if, + __stable_partition, __transform, __transform_reduce, __unique, diff --git a/libcudacxx/include/cuda/std/__pstl/partition.h b/libcudacxx/include/cuda/std/__pstl/partition.h new file mode 100644 index 00000000000..9c6f8263706 --- /dev/null +++ b/libcudacxx/include/cuda/std/__pstl/partition.h @@ -0,0 +1,89 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___PSTL_PARTITION_H +#define _CUDA_STD___PSTL_PARTITION_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if !_CCCL_COMPILER(NVRTC) + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# if _CCCL_HAS_BACKEND_CUDA() +# include +# endif // _CCCL_HAS_BACKEND_CUDA() + +# include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD + +_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT + +_CCCL_TEMPLATE(class _Policy, class _InputIterator, class _UnaryPred) +_CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND is_execution_policy_v<_Policy>) +_CCCL_HOST_API _InputIterator +partition([[maybe_unused]] const _Policy& __policy, _InputIterator __first, _InputIterator __last, _UnaryPred __pred) +{ + static_assert(indirect_unary_predicate<_UnaryPred, _InputIterator>, + "cuda::std::partition: UnaryPred must satisfy indirect_unary_predicate"); + + [[maybe_unused]] auto __dispatch = + ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__partition, _Policy>(); + if constexpr (::cuda::std::execution::__pstl_can_dispatch) + { + _CCCL_NVTX_RANGE_SCOPE("cuda::std::partition"); + + if (__first == __last) + { + return __first; + } + + auto __result = __first; + const size_t __num_selected = + __dispatch(__policy, ::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__pred)); + + return __result + static_cast>(__num_selected); + } + else + { + static_assert(__always_false_v<_Policy>, "Parallel cuda::std::partition requires at least one selected backend"); + return ::cuda::std::partition(::cuda::std::move(__first), ::cuda::std::move(__last), 
::cuda::std::move(__pred)); + } +} + +_CCCL_END_NAMESPACE_ARCH_DEPENDENT + +_CCCL_END_NAMESPACE_CUDA_STD + +# include + +#endif // !_CCCL_COMPILER(NVRTC) + +#endif // _CUDA_STD___PSTL_PARTITION_H diff --git a/libcudacxx/include/cuda/std/__pstl/partition_copy.h b/libcudacxx/include/cuda/std/__pstl/partition_copy.h new file mode 100644 index 00000000000..228e11cb0c6 --- /dev/null +++ b/libcudacxx/include/cuda/std/__pstl/partition_copy.h @@ -0,0 +1,111 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___PSTL_PARTITION_COPY_H +#define _CUDA_STD___PSTL_PARTITION_COPY_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if !_CCCL_COMPILER(NVRTC) + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# if _CCCL_HAS_BACKEND_CUDA() +# include +# endif // _CCCL_HAS_BACKEND_CUDA() + +# include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD + +_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT + +_CCCL_TEMPLATE(class _Policy, class _InputIterator, class _OutputIterator1, class _OutputIterator2, class _UnaryPred) +_CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND __has_forward_traversal<_OutputIterator1> _CCCL_AND + __has_forward_traversal<_OutputIterator2> _CCCL_AND is_execution_policy_v<_Policy>) 
+_CCCL_HOST_API pair<_OutputIterator1, _OutputIterator2> partition_copy( + [[maybe_unused]] const _Policy& __policy, + _InputIterator __first, + _InputIterator __last, + _OutputIterator1 __result_true, + _OutputIterator2 __result_false, + _UnaryPred __pred) +{ + static_assert(indirect_unary_predicate<_UnaryPred, _InputIterator>, + "cuda::std::partition_copy: UnaryPred must satisfy indirect_unary_predicate"); + + [[maybe_unused]] auto __dispatch = + ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__partition_copy, _Policy>(); + if constexpr (::cuda::std::execution::__pstl_can_dispatch) + { + _CCCL_NVTX_RANGE_SCOPE("cuda::std::partition_copy"); + + if (__first == __last) + { + return pair{__result_true, __result_false}; + } + + const auto __count = ::cuda::std::distance(__first, __last); + const size_t __num_selected = __dispatch( + __policy, + ::cuda::std::move(__first), + ::cuda::std::move(__last), + __result_true, + __result_false, + ::cuda::std::move(__pred)); + + const auto __num_not_selected = __count - static_cast>(__num_selected); + + return pair{__result_true + static_cast>(__num_selected), + __result_false + static_cast>(__num_not_selected)}; + } + else + { + static_assert(__always_false_v<_Policy>, + "Parallel cuda::std::partition_copy requires at least one selected backend"); + return ::cuda::std::partition_copy( + ::cuda::std::move(__first), + ::cuda::std::move(__last), + ::cuda::std::move(__result_true), + ::cuda::std::move(__result_false), + ::cuda::std::move(__pred)); + } +} + +_CCCL_END_NAMESPACE_ARCH_DEPENDENT + +_CCCL_END_NAMESPACE_CUDA_STD + +# include + +#endif // !_CCCL_COMPILER(NVRTC) + +#endif // _CUDA_STD___PSTL_PARTITION_COPY_H diff --git a/libcudacxx/include/cuda/std/__pstl/stable_partition.h b/libcudacxx/include/cuda/std/__pstl/stable_partition.h new file mode 100644 index 00000000000..ac84ae0f66b --- /dev/null +++ b/libcudacxx/include/cuda/std/__pstl/stable_partition.h @@ -0,0 +1,92 @@ 
+//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___PSTL_STABLE_PARTITION_H +#define _CUDA_STD___PSTL_STABLE_PARTITION_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if !_CCCL_COMPILER(NVRTC) + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# if _CCCL_HAS_BACKEND_CUDA() +# include +# endif // _CCCL_HAS_BACKEND_CUDA() + +# include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD + +_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT + +_CCCL_TEMPLATE(class _Policy, class _InputIterator, class _UnaryPred) +_CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND is_execution_policy_v<_Policy>) +_CCCL_HOST_API _InputIterator stable_partition( + [[maybe_unused]] const _Policy& __policy, _InputIterator __first, _InputIterator __last, _UnaryPred __pred) +{ + static_assert(indirect_unary_predicate<_UnaryPred, _InputIterator>, + "cuda::std::stable_partition: UnaryPred must satisfy indirect_unary_predicate"); + + [[maybe_unused]] auto __dispatch = + ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__stable_partition, + _Policy>(); + if constexpr (::cuda::std::execution::__pstl_can_dispatch) + { + _CCCL_NVTX_RANGE_SCOPE("cuda::std::stable_partition"); + + if (__first == __last) + { 
+ return __first; + } + + auto __result = __first; + const size_t __num_selected = + __dispatch(__policy, ::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__pred)); + + return __result + static_cast>(__num_selected); + } + else + { + static_assert(__always_false_v<_Policy>, + "Parallel cuda::std::stable_partition requires at least one selected backend"); + return ::cuda::std::stable_partition( + ::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__pred)); + } +} + +_CCCL_END_NAMESPACE_ARCH_DEPENDENT + +_CCCL_END_NAMESPACE_CUDA_STD + +# include + +#endif // !_CCCL_COMPILER(NVRTC) + +#endif // _CUDA_STD___PSTL_STABLE_PARTITION_H diff --git a/libcudacxx/include/cuda/std/execution b/libcudacxx/include/cuda/std/execution index 9a883862e32..a0849b882ed 100644 --- a/libcudacxx/include/cuda/std/execution +++ b/libcudacxx/include/cuda/std/execution @@ -54,6 +54,8 @@ # include # include # include +# include +# include # include # include # include @@ -65,6 +67,7 @@ # include # include # include +# include # include # include # include diff --git a/libcudacxx/test/libcudacxx/std/algorithms/alg.modifying/alg.partitions/pstl_partition.cu b/libcudacxx/test/libcudacxx/std/algorithms/alg.modifying/alg.partitions/pstl_partition.cu new file mode 100644 index 00000000000..208578fb584 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/algorithms/alg.modifying/alg.partitions/pstl_partition.cu @@ -0,0 +1,107 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
//
//===----------------------------------------------------------------------===//

// NOTE(review): this interface comment originally read `void partition(...)`;
// corrected to return InputIterator, which is how the test itself uses it
// (`auto res = cuda::std::partition(...)`).
// template <class Policy, class InputIterator, class UnaryPredicate>
// InputIterator partition(const Policy& policy,
//                         InputIterator first,
//                         InputIterator last,
//                         UnaryPredicate pred);

// NOTE(review): all `#include <...>` targets and template argument lists in
// this file were stripped by extraction and are reconstructed — confirm
// against the sibling PSTL partition tests upstream.
#include <cuda/std/algorithm>
#include <cuda/std/iterator>
#include <cuda/std/utility>

#include <cuda/iterator>
#include <cuda/memory_resource>
#include <cuda/stream>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>

#include <c2h/catch2_test_helper.h>
#include <testing.cuh>

// Number of elements every test case operates on (even, so the two partitions
// are the same size).
inline constexpr int size = 1000;

// Device-side predicate selecting even values; templated on the argument type
// so the "converting predicate" cases can force an implicit conversion.
template <class T>
struct is_even
{
  [[nodiscard]] __device__ constexpr bool operator()(T value) const noexcept
  {
    return value % 2 == 0;
  }
};

// Runs every cuda::std::partition scenario under the given execution policy.
template <class Policy>
void test_partition(const Policy& policy, thrust::device_vector<int>& input)
{
  { // Empty does not access anything
    auto res = cuda::std::partition(policy, static_cast<int*>(nullptr), static_cast<int*>(nullptr), is_even<int>{});
    CHECK(res == nullptr);
  }

  const auto mid = size / 2;
  thrust::sequence(input.begin(), input.end(), 0);
  { // With matching predicate
    auto res = cuda::std::partition(policy, input.begin(), input.end(), is_even<int>{});
    CHECK(res == cuda::std::next(input.begin(), mid));
    CHECK(cuda::std::equal(policy, input.begin(), res, cuda::strided_iterator{cuda::counting_iterator{0}, 2}));

    // Implementation detail, we copy the unselected elements in reverse order
    CHECK(cuda::std::equal(policy, res, input.end(), cuda::strided_iterator{cuda::counting_iterator{size - 1}, -2}));
  }

  thrust::sequence(input.begin(), input.end(), 0);
  { // With converting predicate
    // NOTE(review): the predicate's template argument was lost to extraction;
    // `short` forces the int -> short conversion this case is named for — confirm.
    auto res = cuda::std::partition(policy, input.begin(), input.end(), is_even<short>{});
    CHECK(res == cuda::std::next(input.begin(), mid));
    CHECK(cuda::std::equal(policy, input.begin(), res, cuda::strided_iterator{cuda::counting_iterator{0}, 2}));

    // Implementation detail, we copy the unselected elements in reverse order
    CHECK(cuda::std::equal(policy, res, input.end(), cuda::strided_iterator{cuda::counting_iterator{size - 1}, -2}));
  }
}

C2H_TEST("cuda::std::partition", "[parallel algorithm]")
{
  thrust::device_vector<int> input(size, thrust::no_init);

  SECTION("with default stream")
  {
    const auto policy = cuda::execution::gpu;

    test_partition(policy, input);
  }

  SECTION("with provided stream")
  {
    cuda::stream stream{cuda::device_ref{0}};
    const auto policy = cuda::execution::gpu.with(cuda::get_stream, stream);

    test_partition(policy, input);
  }

  SECTION("with provided memory_resource")
  {
    cuda::device_memory_pool_ref device_resource = cuda::device_default_memory_pool(cuda::device_ref{0});
    const auto policy = cuda::execution::gpu.with(cuda::mr::get_memory_resource, device_resource);

    test_partition(policy, input);
  }

  SECTION("with provided stream and memory_resource")
  {
    cuda::stream stream{cuda::device_ref{0}};
    cuda::device_memory_pool_ref device_resource = cuda::device_default_memory_pool(stream.device());
    const auto policy =
      cuda::execution::gpu.with(cuda::mr::get_memory_resource, device_resource).with(cuda::get_stream, stream);

    test_partition(policy, input);
  }
}
diff --git a/libcudacxx/test/libcudacxx/std/algorithms/alg.modifying/alg.partitions/pstl_partition_copy.cu b/libcudacxx/test/libcudacxx/std/algorithms/alg.modifying/alg.partitions/pstl_partition_copy.cu
new file mode 100644
index 00000000000..d167ea714a0
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/algorithms/alg.modifying/alg.partitions/pstl_partition_copy.cu
@@ -0,0 +1,177 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

// NOTE(review): this interface comment originally read
// `void partition_copy_copy(...)` — fixed the doubled name, and the return
// type: the test checks `res.first`/`res.second`, i.e. a pair of iterators.
// template <class Policy, class InputIterator, class OutputIterator1,
//           class OutputIterator2, class UnaryPredicate>
// pair<OutputIterator1, OutputIterator2>
// partition_copy(const Policy& policy,
//                InputIterator first,
//                InputIterator last,
//                OutputIterator1 result_true,
//                OutputIterator2 result_false,
//                UnaryPredicate pred);

// NOTE(review): all `#include <...>` targets and template argument lists in
// this file were stripped by extraction and are reconstructed — confirm
// against the sibling PSTL partition tests upstream.
#include <cuda/std/algorithm>
#include <cuda/std/iterator>
#include <cuda/std/utility>

#include <cuda/iterator>
#include <cuda/memory_resource>
#include <cuda/stream>
#include <thrust/device_vector.h>
#include <thrust/fill.h>
#include <thrust/sequence.h>

#include <c2h/catch2_test_helper.h>
#include <testing.cuh>

// Number of input elements; even, so output_true/output_false are size/2 each.
inline constexpr int size = 1000;

// Device-side predicate selecting even values; templated on the argument type
// so the "converting predicate" cases can force an implicit conversion.
template <class T>
struct is_even
{
  [[nodiscard]] __device__ constexpr bool operator()(T value) const noexcept
  {
    return value % 2 == 0;
  }
};

// Runs every cuda::std::partition_copy scenario under the given policy.
// `input` holds 0..size-1; the two outputs each hold size/2 elements and are
// re-poisoned with -1 before every scenario so stale results cannot pass.
template <class Policy>
void test_partition_copy(const Policy& policy,
                         const thrust::device_vector<int>& input,
                         thrust::device_vector<int>& output_true,
                         thrust::device_vector<int>& output_false)
{
  { // Empty does not access anything
    auto res = cuda::std::partition_copy(
      policy,
      static_cast<int*>(nullptr),
      static_cast<int*>(nullptr),
      static_cast<int*>(nullptr),
      static_cast<int*>(nullptr),
      is_even<int>{});
    CHECK(res.first == nullptr);
    CHECK(res.second == nullptr);
  }

  cuda::std::fill(policy, output_true.begin(), output_true.end(), -1);
  cuda::std::fill(policy, output_false.begin(), output_false.end(), -1);
  { // With contiguous input
    auto res = cuda::std::partition_copy(
      policy, input.begin(), input.end(), output_true.begin(), output_false.begin(), is_even<int>{});
    CHECK(res == cuda::std::pair{output_true.end(), output_false.end()});
    CHECK(cuda::std::equal(
      policy, output_true.begin(), output_true.end(), cuda::strided_iterator{cuda::counting_iterator{0}, 2}));
    CHECK(cuda::std::equal(
      policy, output_false.begin(), output_false.end(), cuda::strided_iterator{cuda::counting_iterator{1}, 2}));
  }

  cuda::std::fill(policy, output_true.begin(), output_true.end(), -1);
  cuda::std::fill(policy, output_false.begin(), output_false.end(), -1);
  { // With random access input
    auto res = cuda::std::partition_copy(
      policy,
      cuda::counting_iterator{0},
      cuda::counting_iterator{size},
      output_true.begin(),
      output_false.begin(),
      is_even<int>{});
    CHECK(res == cuda::std::pair{output_true.end(), output_false.end()});
    CHECK(cuda::std::equal(
      policy, output_true.begin(), output_true.end(), cuda::strided_iterator{cuda::counting_iterator{0}, 2}));
    CHECK(cuda::std::equal(
      policy, output_false.begin(), output_false.end(), cuda::strided_iterator{cuda::counting_iterator{1}, 2}));
  }

  cuda::std::fill(policy, output_true.begin(), output_true.end(), -1);
  cuda::std::fill(policy, output_false.begin(), output_false.end(), -1);
  { // With different input type
    // NOTE(review): the counting_iterator value type was lost to extraction;
    // `short` matches the case name ("different input type") — confirm.
    auto res = cuda::std::partition_copy(
      policy,
      cuda::counting_iterator<short>{0},
      cuda::counting_iterator<short>{size},
      output_true.begin(),
      output_false.begin(),
      is_even<short>{});
    CHECK(res == cuda::std::pair{output_true.end(), output_false.end()});
    CHECK(cuda::std::equal(
      policy, output_true.begin(), output_true.end(), cuda::strided_iterator{cuda::counting_iterator{0}, 2}));
    CHECK(cuda::std::equal(
      policy, output_false.begin(), output_false.end(), cuda::strided_iterator{cuda::counting_iterator{1}, 2}));
  }

  cuda::std::fill(policy, output_true.begin(), output_true.end(), -1);
  cuda::std::fill(policy, output_false.begin(), output_false.end(), -1);
  { // With contiguous input, converting predicate
    auto res = cuda::std::partition_copy(
      policy, input.begin(), input.end(), output_true.begin(), output_false.begin(), is_even<short>{});
    CHECK(res == cuda::std::pair{output_true.end(), output_false.end()});
    CHECK(cuda::std::equal(
      policy, output_true.begin(), output_true.end(), cuda::strided_iterator{cuda::counting_iterator{0}, 2}));
    CHECK(cuda::std::equal(
      policy, output_false.begin(), output_false.end(), cuda::strided_iterator{cuda::counting_iterator{1}, 2}));
  }

  cuda::std::fill(policy, output_true.begin(), output_true.end(), -1);
  cuda::std::fill(policy, output_false.begin(), output_false.end(), -1);
  { // With different input type, converting predicate
    auto res = cuda::std::partition_copy(
      policy,
      cuda::counting_iterator<short>{0},
      cuda::counting_iterator<short>{size},
      output_true.begin(),
      output_false.begin(),
      is_even<int>{});
    CHECK(res == cuda::std::pair{output_true.end(), output_false.end()});
    CHECK(cuda::std::equal(
      policy, output_true.begin(), output_true.end(), cuda::strided_iterator{cuda::counting_iterator{0}, 2}));
    CHECK(cuda::std::equal(
      policy, output_false.begin(), output_false.end(), cuda::strided_iterator{cuda::counting_iterator{1}, 2}));
  }
}

C2H_TEST("cuda::std::partition_copy", "[parallel algorithm]")
{
  thrust::device_vector<int> input(size, thrust::no_init);
  thrust::device_vector<int> output_true(size / 2, thrust::no_init);
  thrust::device_vector<int> output_false(size / 2, thrust::no_init);
  thrust::sequence(input.begin(), input.end(), 0);

  SECTION("with default stream")
  {
    const auto policy = cuda::execution::gpu;

    test_partition_copy(policy, input, output_true, output_false);
  }

  SECTION("with provided stream")
  {
    cuda::stream stream{cuda::device_ref{0}};
    const auto policy = cuda::execution::gpu.with(cuda::get_stream, stream);

    test_partition_copy(policy, input, output_true, output_false);
  }

  SECTION("with provided memory_resource")
  {
    cuda::device_memory_pool_ref device_resource = cuda::device_default_memory_pool(cuda::device_ref{0});
    const auto policy = cuda::execution::gpu.with(cuda::mr::get_memory_resource, device_resource);

    test_partition_copy(policy, input, output_true, output_false);
  }

  SECTION("with provided stream and memory_resource")
  {
    cuda::stream stream{cuda::device_ref{0}};
    cuda::device_memory_pool_ref device_resource = cuda::device_default_memory_pool(stream.device());
    const auto policy =
      cuda::execution::gpu.with(cuda::mr::get_memory_resource, device_resource).with(cuda::get_stream, stream);

    test_partition_copy(policy, input, output_true, output_false);
  }
}
diff --git
a/libcudacxx/test/libcudacxx/std/algorithms/alg.modifying/alg.partitions/pstl_stable_partition.cu b/libcudacxx/test/libcudacxx/std/algorithms/alg.modifying/alg.partitions/pstl_stable_partition.cu
new file mode 100644
index 00000000000..7af84536493
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/algorithms/alg.modifying/alg.partitions/pstl_stable_partition.cu
@@ -0,0 +1,104 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

// NOTE(review): this interface comment originally read
// `void stable_partition(...)`; corrected to return InputIterator, which is
// how the test itself uses it.
// template <class Policy, class InputIterator, class UnaryPredicate>
// InputIterator stable_partition(const Policy& policy,
//                                InputIterator first,
//                                InputIterator last,
//                                UnaryPredicate pred);

// NOTE(review): all `#include <...>` targets and template argument lists in
// this file were stripped by extraction and are reconstructed — confirm
// against the sibling PSTL partition tests upstream.
#include <cuda/std/algorithm>
#include <cuda/std/iterator>

#include <cuda/iterator>
#include <cuda/memory_resource>
#include <cuda/stream>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>

#include <c2h/catch2_test_helper.h>
#include <testing.cuh>

// Number of elements every test case operates on (even, so the two partitions
// are the same size).
inline constexpr int size = 1000;

// Device-side predicate selecting even values; templated on the argument type
// so the "converting predicate" case can force an implicit conversion.
template <class T>
struct is_even
{
  [[nodiscard]] __device__ constexpr bool operator()(T value) const noexcept
  {
    return value % 2 == 0;
  }
};

// Runs every cuda::std::stable_partition scenario under the given policy.
// Unlike plain partition, stability is asserted: the "false" half must keep
// the original ascending order (1, 3, 5, ...), not a reversed copy.
template <class Policy>
void test_partition(const Policy& policy, thrust::device_vector<int>& input)
{
  { // Empty does not access anything
    auto res =
      cuda::std::stable_partition(policy, static_cast<int*>(nullptr), static_cast<int*>(nullptr), is_even<int>{});
    CHECK(res == nullptr);
  }

  const auto mid = size / 2;
  thrust::sequence(input.begin(), input.end(), 0);
  { // With matching predicate
    auto res = cuda::std::stable_partition(policy, input.begin(), input.end(), is_even<int>{});
    CHECK(res == cuda::std::next(input.begin(), mid));
    CHECK(cuda::std::equal(policy, input.begin(), res, cuda::strided_iterator{cuda::counting_iterator{0}, 2}));
    CHECK(cuda::std::equal(policy, res, input.end(), cuda::strided_iterator{cuda::counting_iterator{1}, 2}));
  }

  thrust::sequence(input.begin(), input.end(), 0);
  { // With converting predicate
    // NOTE(review): the predicate's template argument was lost to extraction;
    // `short` forces the int -> short conversion this case is named for — confirm.
    auto res = cuda::std::stable_partition(policy, input.begin(), input.end(), is_even<short>{});
    CHECK(res == cuda::std::next(input.begin(), mid));
    CHECK(cuda::std::equal(policy, input.begin(), res, cuda::strided_iterator{cuda::counting_iterator{0}, 2}));
    CHECK(cuda::std::equal(policy, res, input.end(), cuda::strided_iterator{cuda::counting_iterator{1}, 2}));
  }
}

C2H_TEST("cuda::std::stable_partition", "[parallel algorithm]")
{
  thrust::device_vector<int> input(size, thrust::no_init);

  SECTION("with default stream")
  {
    const auto policy = cuda::execution::gpu;

    test_partition(policy, input);
  }

  SECTION("with provided stream")
  {
    cuda::stream stream{cuda::device_ref{0}};
    const auto policy = cuda::execution::gpu.with(cuda::get_stream, stream);

    test_partition(policy, input);
  }

  SECTION("with provided memory_resource")
  {
    cuda::device_memory_pool_ref device_resource = cuda::device_default_memory_pool(cuda::device_ref{0});
    const auto policy = cuda::execution::gpu.with(cuda::mr::get_memory_resource, device_resource);

    test_partition(policy, input);
  }

  SECTION("with provided stream and memory_resource")
  {
    cuda::stream stream{cuda::device_ref{0}};
    cuda::device_memory_pool_ref device_resource = cuda::device_default_memory_pool(stream.device());
    const auto policy =
      cuda::execution::gpu.with(cuda::mr::get_memory_resource, device_resource).with(cuda::get_stream, stream);

    test_partition(policy, input);
  }
}