Skip to content

Commit

Permalink
Merge branch 'dev' into fix-full-occupancy
Browse files Browse the repository at this point in the history
  • Loading branch information
PointKernel authored Dec 3, 2024
2 parents 740dbae + a4fb985 commit 0b60143
Show file tree
Hide file tree
Showing 23 changed files with 596 additions and 919 deletions.
9 changes: 5 additions & 4 deletions include/cuco/detail/bloom_filter/bloom_filter_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ class bloom_filter_impl {
auto const grid_size =
cuco::detail::grid_size(num_keys, cg_size, cuco::detail::default_stride(), block_size);

detail::add_if_n<cg_size, block_size>
detail::bloom_filter_ns::add_if_n<cg_size, block_size>
<<<grid_size, block_size, 0, stream.get()>>>(first, num_keys, stencil, pred, *this);
}

Expand Down Expand Up @@ -303,8 +303,9 @@ class bloom_filter_impl {
auto const grid_size =
cuco::detail::grid_size(num_keys, cg_size, cuco::detail::default_stride(), block_size);

detail::contains_if_n<cg_size, block_size><<<grid_size, block_size, 0, stream.get()>>>(
first, num_keys, stencil, pred, output_begin, *this);
detail::bloom_filter_ns::contains_if_n<cg_size, block_size>
<<<grid_size, block_size, 0, stream.get()>>>(
first, num_keys, stencil, pred, output_begin, *this);
}

[[nodiscard]] __host__ __device__ constexpr word_type* data() noexcept { return words_; }
Expand Down Expand Up @@ -365,4 +366,4 @@ class bloom_filter_impl {
policy_type policy_;
};

} // namespace cuco::detail
} // namespace cuco::detail
4 changes: 2 additions & 2 deletions include/cuco/detail/bloom_filter/kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
#include <cstdint>
#include <iterator>

namespace cuco::detail {
namespace cuco::detail::bloom_filter_ns {

CUCO_SUPPRESS_KERNEL_WARNINGS

Expand Down Expand Up @@ -89,4 +89,4 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first,
}
}

} // namespace cuco::detail
} // namespace cuco::detail::bloom_filter_ns
4 changes: 2 additions & 2 deletions include/cuco/detail/open_addressing/functors.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#include <cuco/detail/bitwise_compare.cuh>
#include <cuco/detail/pair/traits.hpp>

namespace cuco::open_addressing_ns::detail {
namespace cuco::detail::open_addressing_ns {

/**
* @brief Device functor returning the content of the slot indexed by `idx`
Expand Down Expand Up @@ -107,4 +107,4 @@ struct slot_is_filled {
}
};

} // namespace cuco::open_addressing_ns::detail
} // namespace cuco::detail::open_addressing_ns
4 changes: 2 additions & 2 deletions include/cuco/detail/open_addressing/kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

#include <iterator>

namespace cuco::detail {
namespace cuco::detail::open_addressing_ns {
CUCO_SUPPRESS_KERNEL_WARNINGS

/**
Expand Down Expand Up @@ -729,4 +729,4 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void rehash(
}
}

} // namespace cuco::detail
} // namespace cuco::detail::open_addressing_ns
35 changes: 18 additions & 17 deletions include/cuco/detail/open_addressing/open_addressing_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,7 @@ class open_addressing_impl {

auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

detail::insert_if_n<cg_size, cuco::detail::default_block_size()>
detail::open_addressing_ns::insert_if_n<cg_size, cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
first, num_keys, stencil, pred, counter.data(), container_ref);

Expand Down Expand Up @@ -384,7 +384,7 @@ class open_addressing_impl {

auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

detail::insert_if_n<cg_size, cuco::detail::default_block_size()>
detail::open_addressing_ns::insert_if_n<cg_size, cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
first, num_keys, stencil, pred, container_ref);
}
Expand Down Expand Up @@ -426,7 +426,7 @@ class open_addressing_impl {

auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

detail::insert_and_find<cg_size, cuco::detail::default_block_size()>
detail::open_addressing_ns::insert_and_find<cg_size, cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
first, num_keys, found_begin, inserted_begin, container_ref);
}
Expand Down Expand Up @@ -466,7 +466,7 @@ class open_addressing_impl {

auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

detail::erase<cg_size, cuco::detail::default_block_size()>
detail::open_addressing_ns::erase<cg_size, cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
first, num_keys, container_ref);
}
Expand Down Expand Up @@ -540,7 +540,7 @@ class open_addressing_impl {

auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

detail::contains_if_n<cg_size, cuco::detail::default_block_size()>
detail::open_addressing_ns::contains_if_n<cg_size, cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
first, num_keys, stencil, pred, output_begin, container_ref);
}
Expand Down Expand Up @@ -615,7 +615,7 @@ class open_addressing_impl {

auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

detail::find_if_n<cg_size, cuco::detail::default_block_size()>
detail::open_addressing_ns::find_if_n<cg_size, cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
first, num_keys, stencil, pred, output_begin, container_ref);
}
Expand Down Expand Up @@ -789,8 +789,8 @@ class open_addressing_impl {
std::min(static_cast<cuco::detail::index_type>(this->capacity()) - offset, stride);
auto const begin = thrust::make_transform_iterator(
thrust::counting_iterator{static_cast<size_type>(offset)},
open_addressing_ns::detail::get_slot<has_payload, storage_ref_type>(this->storage_ref()));
auto const is_filled = open_addressing_ns::detail::slot_is_filled<has_payload, key_type>{
detail::open_addressing_ns::get_slot<has_payload, storage_ref_type>(this->storage_ref()));
auto const is_filled = detail::open_addressing_ns::slot_is_filled<has_payload, key_type>{
this->empty_key_sentinel(), this->erased_key_sentinel()};

std::size_t temp_storage_bytes = 0;
Expand Down Expand Up @@ -844,7 +844,7 @@ class open_addressing_impl {
template <typename CallbackOp>
void for_each_async(CallbackOp&& callback_op, cuda::stream_ref stream) const
{
auto const is_filled = open_addressing_ns::detail::slot_is_filled<has_payload, key_type>{
auto const is_filled = detail::open_addressing_ns::slot_is_filled<has_payload, key_type>{
this->empty_key_sentinel(), this->erased_key_sentinel()};

auto storage_ref = this->storage_ref();
Expand Down Expand Up @@ -886,7 +886,7 @@ class open_addressing_impl {

auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

detail::for_each_n<cg_size, cuco::detail::default_block_size()>
detail::open_addressing_ns::for_each_n<cg_size, cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
first, num_keys, std::forward<CallbackOp>(callback_op), container_ref);
}
Expand All @@ -907,12 +907,12 @@ class open_addressing_impl {
counter.reset(stream);

auto const grid_size = cuco::detail::grid_size(storage_.num_buckets());
auto const is_filled = open_addressing_ns::detail::slot_is_filled<has_payload, key_type>{
auto const is_filled = detail::open_addressing_ns::slot_is_filled<has_payload, key_type>{
this->empty_key_sentinel(), this->erased_key_sentinel()};

// TODO: custom kernel to be replaced by cub::DeviceReduce::Sum when cub version is bumped to
// v2.1.0
detail::size<cuco::detail::default_block_size()>
detail::open_addressing_ns::size<cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
storage_.ref(), is_filled, counter.data());

Expand Down Expand Up @@ -1014,10 +1014,10 @@ class open_addressing_impl {
auto constexpr block_size = cuco::detail::default_block_size();
auto constexpr stride = cuco::detail::default_stride();
auto const grid_size = cuco::detail::grid_size(num_buckets, 1, stride, block_size);
auto const is_filled = open_addressing_ns::detail::slot_is_filled<has_payload, key_type>{
auto const is_filled = detail::open_addressing_ns::slot_is_filled<has_payload, key_type>{
this->empty_key_sentinel(), this->erased_key_sentinel()};

detail::rehash<block_size><<<grid_size, block_size, 0, stream.get()>>>(
detail::open_addressing_ns::rehash<block_size><<<grid_size, block_size, 0, stream.get()>>>(
old_storage.ref(), container.ref(op::insert), is_filled);
}

Expand Down Expand Up @@ -1120,7 +1120,7 @@ class open_addressing_impl {

auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

detail::count<IsOuter, cg_size, cuco::detail::default_block_size()>
detail::open_addressing_ns::count<IsOuter, cg_size, cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
first, num_keys, counter.data(), container_ref);

Expand Down Expand Up @@ -1180,8 +1180,9 @@ class open_addressing_impl {
auto constexpr grid_stride = 1;
auto const grid_size = cuco::detail::grid_size(n, cg_size, grid_stride, block_size);

detail::retrieve<IsOuter, block_size><<<grid_size, block_size, 0, stream.get()>>>(
first, n, output_probe, output_match, counter.data(), container_ref);
detail::open_addressing_ns::retrieve<IsOuter, block_size>
<<<grid_size, block_size, 0, stream.get()>>>(
first, n, output_probe, output_match, counter.data(), container_ref);

auto const num_retrieved = counter.load_to_host(stream.get());

Expand Down
4 changes: 2 additions & 2 deletions include/cuco/detail/static_map/helpers.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#include <cuco/detail/static_map/kernels.cuh>
#include <cuco/detail/utility/cuda.cuh>

namespace cuco::static_map_ns::detail {
namespace cuco::detail::static_map_ns {

/**
* @brief Dispatches to shared memory map kernel if `num_elements_per_thread > 2`, else
Expand Down Expand Up @@ -112,4 +112,4 @@ void dispatch_insert_or_apply(
first, num, init, op, ref);
}
}
} // namespace cuco::static_map_ns::detail
} // namespace cuco::detail::static_map_ns
4 changes: 2 additions & 2 deletions include/cuco/detail/static_map/kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

#include <iterator>

namespace cuco::static_map_ns::detail {
namespace cuco::detail::static_map_ns {
CUCO_SUPPRESS_KERNEL_WARNINGS

// TODO use insert_or_assign internally
Expand Down Expand Up @@ -262,4 +262,4 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem(
}
}
}
} // namespace cuco::static_map_ns::detail
} // namespace cuco::detail::static_map_ns
26 changes: 23 additions & 3 deletions include/cuco/detail/static_map/static_map.inl
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Stora

auto const grid_size = cuco::detail::grid_size(num, cg_size);

static_map_ns::detail::insert_or_assign<cg_size, cuco::detail::default_block_size()>
detail::static_map_ns::insert_or_assign<cg_size, cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
first, num, ref(op::insert_or_assign));
}
Expand Down Expand Up @@ -335,7 +335,7 @@ void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Stora
{
auto constexpr has_init = false;
auto const init = this->empty_value_sentinel(); // use empty_sentinel as unused init value
static_map_ns::detail::dispatch_insert_or_apply<has_init, cg_size, Allocator>(
detail::static_map_ns::dispatch_insert_or_apply<has_init, cg_size, Allocator>(
first, last, init, op, ref(op::insert_or_apply), stream);
}

Expand All @@ -353,7 +353,7 @@ void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Stora
InputIt first, InputIt last, Init init, Op op, cuda::stream_ref stream) noexcept
{
auto constexpr has_init = true;
static_map_ns::detail::dispatch_insert_or_apply<has_init, cg_size, Allocator>(
detail::static_map_ns::dispatch_insert_or_apply<has_init, cg_size, Allocator>(
first, last, init, op, ref(op::insert_or_apply), stream);
}

Expand Down Expand Up @@ -612,6 +612,26 @@ static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
return impl_->count(first, last, ref(op::count), stream);
}

// Host-side bulk `retrieve`: builds a ref with the `retrieve` operator enabled and forwards the
// probe range, both output iterators and the stream to `impl_->retrieve`, returning its result
// unchanged.
template <class Key,
          class T,
          class Extent,
          cuda::thread_scope Scope,
          class KeyEqual,
          class ProbingScheme,
          class Allocator,
          class Storage>
template <typename InputIt, typename OutputProbeIt, typename OutputMatchIt>
std::pair<OutputProbeIt, OutputMatchIt>
static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::retrieve(
  InputIt first,
  InputIt last,
  OutputProbeIt output_probe,
  OutputMatchIt output_match,
  cuda::stream_ref stream) const
{
  return impl_->retrieve(first, last, output_probe, output_match, ref(op::retrieve), stream);
}

template <class Key,
class T,
class Extent,
Expand Down
68 changes: 68 additions & 0 deletions include/cuco/detail/static_map/static_map_ref.inl
Original file line number Diff line number Diff line change
Expand Up @@ -1428,5 +1428,73 @@ class operator_impl<
return ref_.impl_.count(group, key);
}
};

// Mixin that adds the device-side `retrieve` operator to `static_map_ref`. The member function
// downcasts `*this` to the full ref type and forwards the call to the ref's `impl_` member.
template <typename Key,
          typename T,
          cuda::thread_scope Scope,
          typename KeyEqual,
          typename ProbingScheme,
          typename StorageRef,
          typename... Operators>
class operator_impl<
  op::retrieve_tag,
  static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>> {
  using base_type = static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef>;
  using ref_type = static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>;
  using key_type       = typename base_type::key_type;
  using value_type     = typename base_type::value_type;
  using iterator       = typename base_type::iterator;
  using const_iterator = typename base_type::const_iterator;

  static constexpr auto cg_size     = base_type::cg_size;
  static constexpr auto bucket_size = base_type::bucket_size;

 public:
  /**
   * @brief Retrieves all the slots corresponding to all keys in the range `[input_probe_begin,
   * input_probe_end)`.
   *
   * If key `k = *(input_probe_begin + i)` exists in the container, copies `k` to `output_probe`
   * and associated slot content to `output_match`, respectively. The output order is unspecified.
   *
   * Behavior is undefined if the number of retrieved slots exceeds the size of the output ranges.
   * Use `count()` to determine the required size of the output ranges.
   *
   * @tparam BlockSize Size of the thread block this operation is executed in
   * @tparam InputProbeIt Device accessible input iterator whose `value_type` is
   * convertible to the container's `key_type`
   * @tparam OutputProbeIt Device accessible output iterator whose `value_type` is
   * convertible to the container's `key_type`
   * @tparam OutputMatchIt Device accessible output iterator whose `value_type` is
   * convertible to the container's `value_type`
   * @tparam AtomicCounter Atomic counter type that follows the same semantics as
   * `cuda::atomic(_ref)`
   *
   * @param block Thread block this operation is executed in
   * @param input_probe_begin Beginning of the input sequence of keys
   * @param input_probe_end End of the input sequence of keys
   * @param output_probe Beginning of the sequence of keys corresponding to matching elements in
   * `output_match`
   * @param output_match Beginning of the sequence of matching elements
   * @param atomic_counter Counter that is used to determine the next free position in the output
   * sequences
   */
  template <int32_t BlockSize,
            class InputProbeIt,
            class OutputProbeIt,
            class OutputMatchIt,
            class AtomicCounter>
  __device__ void retrieve(cooperative_groups::thread_block const& block,
                           InputProbeIt input_probe_begin,
                           InputProbeIt input_probe_end,
                           OutputProbeIt output_probe,
                           OutputMatchIt output_match,
                           AtomicCounter* atomic_counter) const
  {
    auto const& ref_ = static_cast<ref_type const&>(*this);
    // `ref_` has a dependent type, so the `template` keyword is required for `<BlockSize>` to be
    // parsed as an explicit template argument list rather than as a less-than comparison.
    ref_.impl_.template retrieve<BlockSize>(
      block, input_probe_begin, input_probe_end, output_probe, output_match, atomic_counter);
  }
};
} // namespace detail
} // namespace cuco
20 changes: 20 additions & 0 deletions include/cuco/detail/static_multimap/static_multimap.inl
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,26 @@ static_multimap<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Stora
return impl_->count(first, last, ref(op::count), stream);
}

// Host-side bulk `retrieve`: builds a ref with the `retrieve` operator enabled and forwards the
// probe range, both output iterators and the stream to `impl_->retrieve`, returning its result
// unchanged.
template <class Key,
          class T,
          class Extent,
          cuda::thread_scope Scope,
          class KeyEqual,
          class ProbingScheme,
          class Allocator,
          class Storage>
template <typename InputIt, typename OutputProbeIt, typename OutputMatchIt>
std::pair<OutputProbeIt, OutputMatchIt>
static_multimap<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::retrieve(
  InputIt first,
  InputIt last,
  OutputProbeIt output_probe,
  OutputMatchIt output_match,
  cuda::stream_ref stream) const
{
  return impl_->retrieve(first, last, output_probe, output_match, ref(op::retrieve), stream);
}

template <class Key,
class T,
class Extent,
Expand Down
Loading

0 comments on commit 0b60143

Please sign in to comment.