[FEA]: cuda_parallel should provide DiscardIterator #4132
Labels: feature request (New feature or request)

Comments
@NaderAlAwar @shwina @gevtushenko This is related to the discussion of examples.
Example C++ code using a discard iterator, which would be nice to replicate in Python:

Using cub::DeviceSelect::UniqueByKey and Thrust iterators to count unique values of the Collatz sequence:

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_iterator.h>

#include <cub/cub.cuh>
#include <cuda/std/cstdint>
#include <cuda/std/functional>

// Number of steps for the Collatz sequence starting at n to reach 1.
__host__ __device__ std::uint32_t collatz_seq(std::uint32_t n) {
    std::uint32_t m = n;
    std::uint32_t count = 0;
    assert(n > 0);
    while (m > 1) {
        ++count;
        m = (m & 1) ? (3 * m + 1) : (m >> 1);
    }
    return count;
}

cudaError_t check_error(cudaError_t status) {
    if (status != cudaSuccess) {
        const std::string msg{"CUDA error encountered: "};
        throw std::runtime_error(msg + cudaGetErrorString(status));
    }
    return status;
}

struct op_t {
    __host__ __device__ cuda::std::uint32_t operator()(cuda::std::uint32_t n) const {
        return collatz_seq(n);
    }
};

int main(void) {
    cuda::std::uint32_t start = 3;
    std::size_t nelems = 1024 * 1024;
    std::cout << "Processing Collatz sequence with indices [" << start << ", "
              << nelems + start << ") to find unique sequence values" << std::endl;

    using CountingIt = thrust::counting_iterator<cuda::std::uint32_t>;
    using TransformIt = thrust::transform_iterator<op_t, CountingIt>;

    // Iterator over the counting sequence of Collatz arguments
    CountingIt collatz_args_it{start};
    // Iterator over the corresponding Collatz values
    TransformIt collatz_values_it{collatz_args_it, op_t{}};

    auto keys_it = collatz_values_it;
    auto values_it = collatz_args_it;

    thrust::device_vector<cuda::std::size_t> n_selected(1);

    // First invocation with nullptr only queries the temporary storage size.
    std::size_t temp_storage_bytes = 0;
    check_error(cub::DeviceSelect::UniqueByKey(
        nullptr, temp_storage_bytes, keys_it, values_it,
        thrust::make_discard_iterator(), thrust::make_discard_iterator(),
        n_selected.begin(), nelems));

    thrust::device_vector<cuda::std::uint8_t> storage(temp_storage_bytes);

    // Run with discard iterators for both outputs: nothing is stored,
    // only the number of selected items is computed.
    check_error(cub::DeviceSelect::UniqueByKey(
        thrust::raw_pointer_cast(storage.data()), temp_storage_bytes,
        keys_it, values_it,
        thrust::make_discard_iterator(), thrust::make_discard_iterator(),
        n_selected.begin(), nelems));
    cudaDeviceSynchronize();

    std::vector<std::size_t> n_selected_host(1);
    thrust::copy(n_selected.begin(), n_selected.end(),
                 n_selected_host.begin()); // should be blocking

    // Allocate output arrays of exactly the required size.
    thrust::device_vector<cuda::std::uint32_t> unique_keys(n_selected_host.front());
    thrust::device_vector<cuda::std::uint32_t> unique_values(n_selected_host.front());

    std::size_t temp_storage2_bytes = 0;
    check_error(cub::DeviceSelect::UniqueByKey(
        nullptr, temp_storage2_bytes, keys_it, values_it,
        unique_keys.begin(), unique_values.begin(),
        n_selected.begin(), nelems));

    thrust::device_vector<cuda::std::uint8_t> storage2(temp_storage2_bytes);

    // Second pass: populate the allocated output arrays.
    check_error(cub::DeviceSelect::UniqueByKey(
        thrust::raw_pointer_cast(storage2.data()), temp_storage2_bytes,
        keys_it, values_it,
        unique_keys.begin(), unique_values.begin(),
        n_selected.begin(), nelems));
    cudaDeviceSynchronize();

    std::vector<std::uint32_t> unique_keys_host(n_selected_host.front());
    thrust::copy(unique_keys.begin(), unique_keys.end(), unique_keys_host.begin());
    std::vector<std::uint32_t> unique_values_host(n_selected_host.front());
    thrust::copy(unique_values.begin(), unique_values.end(), unique_values_host.begin());
    cudaDeviceSynchronize();

    std::cout << "Number of selected values: " << n_selected_host.front() << std::endl;
    for (std::size_t i = 0; i < std::min<std::size_t>(12, n_selected_host.front()); ++i) {
        std::cout << "(" << unique_keys_host[i] << ", " << unique_values_host[i] << "), ";
    }
    if (n_selected_host.front() > 12) {
        std::cout << " ...";
    }
    std::cout << std::endl;
    return 0;
}
```
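Note the standard CUB two-phase convention at work: each `UniqueByKey` invocation is issued twice, first with a `nullptr` temporary-storage pointer to query `temp_storage_bytes`, then again with the allocated scratch buffer. The discard iterators additionally make the counting pass allocation-free for the key and value outputs.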
Compilation and output:
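(Assuming the example is saved as `collatz_unique.cu`, a hypothetical filename, it can be built with something like `nvcc -std=c++17 collatz_unique.cu -o collatz_unique`.)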
Is this a duplicate?
Area
cuda.parallel (Python)
Is your feature request related to a problem? Please describe.
To support an example that uses the `unique_by_key` algorithm while allocating only what is required to hold the unique keys and unique values, a Python user needs a `DiscardIterator`. In the first call, `unique_by_key` is invoked with a `DiscardIterator` for the output keys and output items to compute the number of selected items; the user then allocates output arrays of that size and calls the algorithm again to populate them, as sketched below.
Describe the solution you'd like
It would be awesome if `cuda.parallel` supported output iterators and, in particular, `DiscardIterator`.
Describe alternatives you've considered
No response
Additional context
No response