diff --git a/bench/00_operators/reduction.cu b/bench/00_operators/reduction.cu index 6652cb39..b75205ed 100644 --- a/bench/00_operators/reduction.cu +++ b/bench/00_operators/reduction.cu @@ -55,11 +55,11 @@ void reduce_0d_cub(nvbench::state &state, nvbench::type_list) auto xv2 = make_tensor(); xv.PrefetchDevice(0); - cub_reduce(xv2, xv, 0.0f, 0); + sum(xv2, xv, 0); state.exec( [&xv, &xv2](nvbench::launch &launch) { - cub_reduce(xv2, xv, 0.0f, (cudaStream_t)launch.get_stream()); + sum(xv2, xv, (cudaStream_t)launch.get_stream()); }); } diff --git a/examples/convolution.cu b/examples/convolution.cu index 7cf6c1d7..6abe61d8 100644 --- a/examples/convolution.cu +++ b/examples/convolution.cu @@ -66,11 +66,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) using FilterType = float; // Create data objects - auto inView = make_static_tensor(); - auto outView = make_static_tensor(); - auto solView = make_static_tensor(); + auto inView = make_tensor({batches, numSamples}); + auto outView = make_tensor({batches, numSamples + filterLen - 1}); + auto solView = make_tensor({batches, numSamples + filterLen - 1}); + auto filterView = make_tensor({filterLen}); - auto filterView = make_static_tensor(); // initialize input data for (index_t b = 0; b < batches; b++) { diff --git a/examples/spectrogram.cu b/examples/spectrogram.cu index b7def27a..35be79a0 100644 --- a/examples/spectrogram.cu +++ b/examples/spectrogram.cu @@ -80,15 +80,15 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) std::array half_win{nfft / 2 + 1}; std::array s_time_shape{(N - noverlap) / nstep}; - auto time = make_static_tensor(); - auto modulation = make_static_tensor(); - auto carrier = make_static_tensor(); - auto noise = make_static_tensor(); - auto x = make_static_tensor(); + auto time = make_tensor({N}); + auto modulation = make_tensor({N}); + auto carrier = make_tensor({N}); + auto noise = make_tensor({N}); + auto x = make_tensor({N}); - auto freqs = make_static_tensor(); + auto freqs = make_tensor({nfft / 2 + 1}); auto fftStackedMatrix = make_tensor({(N - noverlap) / nstep, nfft / 2 + 1}); - auto s_time = make_static_tensor(); + auto s_time = make_tensor({(N - noverlap) / nstep}); randomGenerator_t randData({N}, 0); auto randDataView = randData.GetTensorView<1>(num_samps, NORMAL); diff --git a/include/matx_cub.h b/include/matx_cub.h index fad8662b..452ea70b 100644 --- a/include/matx_cub.h +++ b/include/matx_cub.h @@ -35,7 +35,7 @@ #include "matx_error.h" #include "matx_tensor.h" -#include +#include #include #ifdef __CUDACC__ #include @@ -44,6 +44,8 @@ namespace matx { +using namespace std::placeholders; + /** * @brief Direction for sorting * @@ -69,9 +71,8 @@ typedef enum { struct CubParams_t { CUBOperation_t op; - index_t size; + std::array size = {0}; index_t batches; - void *A; void *a_out; MatXDataType_t dtype; cudaStream_t stream; @@ -106,10 +107,10 @@ struct UniqueParams_t { struct EmptyParams_t {}; -template +template class matxCubPlan_t { static constexpr int RANK = OutputTensor::Rank(); - using T1 = typename InputTensor::scalar_type; + using T1 = typename InputOperator::scalar_type; using T2 = typename OutputTensor::scalar_type; public: @@ -130,13 +131,13 @@ class matxCubPlan_t { * CUDA stream * */ - matxCubPlan_t(OutputTensor &a_out, const InputTensor &a, const CParams &cparams, const cudaStream_t stream = 0) : + matxCubPlan_t(OutputTensor &a_out, const InputOperator &a, const CParams &cparams, const cudaStream_t stream = 0) : cparams_(cparams) { #ifdef __CUDACC__ // Input/output 
tensors much match rank/dims if constexpr (op == CUB_OP_RADIX_SORT || op == CUB_OP_INC_SUM) { - static_assert(OutputTensor::Rank() == InputTensor::Rank(), "CUB input and output tensor ranks must match"); + static_assert(OutputTensor::Rank() == InputOperator::Rank(), "CUB input and output tensor ranks must match"); static_assert(RANK >= 1, "CUB function must have an output rank of 1 or higher"); for (int i = 0; i < a.Rank(); i++) { MATX_ASSERT(a.Size(i) == a_out.Size(i), matxInvalidSize); @@ -144,12 +145,14 @@ class matxCubPlan_t { } if constexpr ((RANK == 1 && ( op == CUB_OP_REDUCE_SUM || - op == CUB_OP_REDUCE)) || + op == CUB_OP_REDUCE || + op == CUB_OP_REDUCE_MIN || + op == CUB_OP_REDUCE_MAX)) || (RANK == 2 && op == CUB_OP_RADIX_SORT)) { matxAlloc((void **)&d_offsets, (a.Size(a.Rank() - 2) + 1) * sizeof(index_t), MATX_ASYNC_DEVICE_MEMORY, stream); for (index_t i = 0; i < a.Size(a.Rank() - 2) + 1; i++) { - offsets.push_back(i * a.Lsize()); + offsets.push_back(i * a.Size(a.Rank() - 1)); } cudaMemcpyAsync(d_offsets, offsets.data(), @@ -198,17 +201,19 @@ class matxCubPlan_t { } static CubParams_t GetCubParams(OutputTensor &a_out, - const InputTensor &a) + const InputOperator &a) { CubParams_t params; - params.size = a.Lsize(); - params.A = a.Data(); + for (int r = a.Rank() - 1; r >= RANK - 2; r--) { + params.size[1 - r] = a.Size(r); + } + params.op = op; if constexpr (op == CUB_OP_RADIX_SORT) { params.batches = (RANK == 1) ? 1 : a.Size(RANK - 2); } else if constexpr (op == CUB_OP_INC_SUM || op == CUB_OP_HIST_EVEN) { - params.batches = a.TotalSize() / a.Lsize(); + params.batches = a.TotalSize() / a.Size(a.Rank() - 1); } params.a_out = a_out.Data(); params.dtype = TypeToInt(); @@ -229,6 +234,60 @@ class matxCubPlan_t { matxFree(d_offsets); } + template + void RunBatches(OutputTensor &a_out, const InputOperator &a, const Func &f, int batch_offset) + { + using shape_type = index_t; + size_t total_iter = 1; + for (int i = 0; i < InputOperator::Rank() - batch_offset; i++) { + total_iter *= a.Size(i); + } + + // Get total indices per batch + size_t total_per_batch = 1; + index_t offset = 0; + for (int i = InputOperator::Rank() - batch_offset; i < InputOperator::Rank(); i++) { + total_per_batch *= a.Size(i); + } + + std::array idx{0}; + + if constexpr (is_tensor_view_v) { + if (a.IsContiguous()) { + for (size_t iter = 0; iter < total_iter; iter++) { + auto ap = std::apply([&a](auto... param) { return a.GetPointer(param...); }, idx); + auto aop = std::apply([&a_out](auto... param) { return a_out.GetPointer(param...); }, idx); + + f(ap, aop); + + // Update all but the last batch_offset indices + UpdateIndices(a, idx, batch_offset); + } + } + else { + const tensor_impl_t base = a; + for (size_t iter = 0; iter < total_iter; iter++) { + auto aop = std::apply([&a_out](auto... param) { return a_out.GetPointer(param...); }, idx); + + f(RandomOperatorIterator{base, offset}, aop); + offset += total_per_batch; + + UpdateIndices(a, idx, batch_offset); + } + } + } + else { + for (size_t iter = 0; iter < total_iter; iter++) { + auto aop = std::apply([&a_out](auto... 
param) { return a_out.GetPointer(param...); }, idx); + + f(RandomOperatorIterator{a, offset}, aop); + offset += total_per_batch; + + UpdateIndices(a, idx, batch_offset); + } + } + } + /** * Execute an inclusive prefix sum on a tensor * @@ -249,35 +308,38 @@ class matxCubPlan_t { * */ inline void ExecHistEven(OutputTensor &a_out, - const InputTensor &a, const T1 lower, + const InputOperator &a, const T1 lower, const T1 upper, const cudaStream_t stream) { #ifdef __CUDACC__ + const tensor_impl_t base = a; if (RANK == 1 || d_temp == nullptr) { - cub::DeviceHistogram::HistogramEven( - d_temp, temp_storage_bytes, a.Data(), a_out.Data(), - static_cast(a_out.Lsize() + 1), lower, upper, - static_cast(a.Lsize()), stream); - } - else { // Batch higher dims - using shape_type = typename InputTensor::desc_type::shape_type; - int batch_offset = 2; - std::array idx{0}; - auto a_shape = a.Shape(); - // Get total number of batches - size_t total_iter = std::accumulate(a_shape.begin(), a_shape.begin() + InputTensor::Rank() - batch_offset, 1, std::multiplies()); - for (size_t iter = 0; iter < total_iter; iter++) { - auto ap = std::apply([&a](auto... param) { return a.GetPointer(param...); }, idx); - auto aop = std::apply([&a_out](auto... param) { return a_out.GetPointer(param...); }, idx); - + if constexpr (is_tensor_view_v) { + if (a.IsContiguous()) { + cub::DeviceHistogram::HistogramEven( + d_temp, temp_storage_bytes, a.Data(), a_out.Data(), + static_cast(a_out.Lsize() + 1), lower, upper, + static_cast(a.Lsize()), stream); + } + else { + cub::DeviceHistogram::HistogramEven( + d_temp, temp_storage_bytes, RandomOperatorIterator{base}, a_out.Data(), + static_cast(a_out.Lsize() + 1), lower, upper, + static_cast(a.Lsize()), stream); + } + } + else { cub::DeviceHistogram::HistogramEven( - d_temp, temp_storage_bytes, ap, aop, + d_temp, temp_storage_bytes, RandomOperatorIterator{a}, a_out.Data(), static_cast(a_out.Lsize() + 1), lower, upper, + static_cast(a.Lsize()), stream); + } + } + else { // Batch higher dims + auto ft = [&](auto ...p){ return cub::DeviceHistogram::HistogramEven(p...); }; + auto f = std::bind(ft, d_temp, temp_storage_bytes, _1, _2, static_cast(a_out.Lsize() + 1), lower, upper, static_cast(a.Lsize()), stream); - - // Update all but the last 2 indices - UpdateIndices(a, idx, batch_offset); - } + RunBatches(a_out, a, f, 2); } #endif } @@ -298,33 +360,34 @@ class matxCubPlan_t { * */ inline void ExecPrefixScanEx(OutputTensor &a_out, - const InputTensor &a, + const InputOperator &a, const cudaStream_t stream) { #ifdef __CUDACC__ if (RANK == 1 || d_temp == nullptr) { - cub::DeviceScan::InclusiveSum(d_temp, temp_storage_bytes, a.Data(), - a_out.Data(), static_cast(a.Lsize()), - stream); + if constexpr (is_tensor_view_v) { + const tensor_impl_t base = a; + if (a.IsContiguous()) { + cub::DeviceScan::InclusiveSum(d_temp, temp_storage_bytes, a.Data(), + a_out.Data(), static_cast(a.Lsize()), + stream); + } + else { + cub::DeviceScan::InclusiveSum(d_temp, temp_storage_bytes, RandomOperatorIterator{base}, + a_out.Data(), static_cast(a.Lsize()), + stream); + } + } + else { + cub::DeviceScan::InclusiveSum(d_temp, temp_storage_bytes, RandomOperatorIterator{a}, + a_out.Data(), static_cast(a.Lsize()), + stream); + } } else { - using shape_type = typename InputTensor::desc_type::shape_type; - int batch_offset = 1; - std::array idx{}; - auto a_shape = a.Shape(); - // Get total number of batches - size_t total_iter = std::accumulate(a_shape.begin(), a_shape.begin() + InputTensor::Rank() - batch_offset, 1, 
std::multiplies()); - for (size_t iter = 0; iter < total_iter; iter++) { - auto ap = std::apply([&a](auto... param) { return a.GetPointer(param...); }, idx); - auto aop = std::apply([&a_out](auto... param) { return a_out.GetPointer(param...); }, idx); - - cub::DeviceScan::InclusiveSum(d_temp, temp_storage_bytes,ap, - aop, static_cast(a.Lsize()), - stream); - - // Update all but the last 2 indices - UpdateIndices(a, idx, batch_offset); - } + auto ft = [&](auto ...p){ cub::DeviceScan::InclusiveSum(p...); }; + auto f = std::bind(ft, d_temp, temp_storage_bytes, _1, _2, static_cast(a.Lsize()), stream); + RunBatches(a_out, a, f, 1); } #endif } @@ -347,24 +410,29 @@ class matxCubPlan_t { * */ inline void ExecSort(OutputTensor &a_out, - const InputTensor &a, + const InputOperator &a, const cudaStream_t stream, const SortDirection_t dir = SORT_DIR_ASC) { -#ifdef __CUDACC__ - if (RANK == 1) { - if (dir == SORT_DIR_ASC) { - cub::DeviceRadixSort::SortKeys( - d_temp, temp_storage_bytes, a.Data(), a_out.Data(), - static_cast(a.Lsize()), 0, sizeof(T1) * 8, stream); - } - else { - cub::DeviceRadixSort::SortKeysDescending( - d_temp, temp_storage_bytes, a.Data(), a_out.Data(), - static_cast(a.Lsize()), 0, sizeof(T1) * 8, stream); - } - } - else if (RANK == 2 || d_temp == nullptr) { +#ifdef __CUDACC__ + static_assert(is_tensor_view_v, "Sorting only accepts tensors for now (no operators)"); + MATX_ASSERT_STR(a.IsContiguous(), matxInvalidType, "Tensor must be contiguous in memory for sorting"); + + if constexpr (is_tensor_view_v) { + if (RANK == 1) { + const tensor_impl_t base = a; + if (dir == SORT_DIR_ASC) { + cub::DeviceRadixSort::SortKeys( + d_temp, temp_storage_bytes, a.Data(), a_out.Data(), + static_cast(a.Lsize()), 0, sizeof(T1) * 8, stream); + } + else { + cub::DeviceRadixSort::SortKeysDescending( + d_temp, temp_storage_bytes, a.Data(), a_out.Data(), + static_cast(a.Lsize()), 0, sizeof(T1) * 8, stream); + } + } + else if (RANK == 2 || d_temp == nullptr) { if (dir == SORT_DIR_ASC) { cub::DeviceSegmentedRadixSort::SortKeys( d_temp, temp_storage_bytes, a.Data(), a_out.Data(), @@ -377,33 +445,48 @@ class matxCubPlan_t { static_cast(a.Lsize()), static_cast(a.Size(RANK - 2)), d_offsets, d_offsets + 1, 0, sizeof(T1) * 8, stream); } - } - else { - using shape_type = typename InputTensor::desc_type::shape_type; - int batch_offset = 2; - std::array idx{0}; - auto a_shape = a.Shape(); - // Get total number of batches - size_t total_iter = std::accumulate(a_shape.begin(), a_shape.begin() + InputTensor::Rank() - batch_offset, 1, std::multiplies()); - for (size_t iter = 0; iter < total_iter; iter++) { - auto ap = std::apply([&a](auto... param) { return a.GetPointer(param...); }, idx); - auto aop = std::apply([&a_out](auto... 
param) { return a_out.GetPointer(param...); }, idx); + } + else { + constexpr int batch_offset = 2; + using shape_type = index_t; + size_t total_iter = 1; + for (int i = 0; i < InputOperator::Rank() - batch_offset; i++) { + total_iter *= a.Size(i); + } + + std::array idx{0}; if (dir == SORT_DIR_ASC) { - cub::DeviceSegmentedRadixSort::SortKeys( - d_temp, temp_storage_bytes, ap, aop, - static_cast(a.Lsize()), static_cast(a.Size(RANK - 2)), - d_offsets, d_offsets + 1, 0, sizeof(T1) * 8, stream); + auto ft = [&](auto ...p){ cub::DeviceSegmentedRadixSort::SortKeys(p...); }; + + auto f = std::bind(ft, d_temp, temp_storage_bytes, _1, _2, static_cast(a.Lsize()), static_cast(a.Size(RANK - 2)), + d_offsets, d_offsets + 1, static_cast(0), static_cast(sizeof(T1) * 8), stream, false); + + for (size_t iter = 0; iter < total_iter; iter++) { + auto ap = std::apply([&a](auto... param) { return a.GetPointer(param...); }, idx); + auto aop = std::apply([&a_out](auto... param) { return a_out.GetPointer(param...); }, idx); + + f(ap, aop); + + // Update all but the last batch_offset indices + UpdateIndices(a, idx, batch_offset); + } } else { - cub::DeviceSegmentedRadixSort::SortKeysDescending( - d_temp, temp_storage_bytes, ap, aop, - static_cast(a.Lsize()), static_cast(a.Size(RANK - 2)), - d_offsets, d_offsets + 1, 0, sizeof(T1) * 8, stream); + auto ft = [&](auto ...p){ cub::DeviceSegmentedRadixSort::SortKeysDescending(p...); }; + auto f = std::bind(ft, d_temp, temp_storage_bytes, _1, _2, static_cast(a.Lsize()), static_cast(a.Size(RANK - 2)), + d_offsets, d_offsets + 1, static_cast(0), static_cast(sizeof(T1) * 8), stream, false); + + for (size_t iter = 0; iter < total_iter; iter++) { + auto ap = std::apply([&a](auto... param) { return a.GetPointer(param...); }, idx); + auto aop = std::apply([&a_out](auto... 
param) { return a_out.GetPointer(param...); }, idx); + + f(ap, aop); + + // Update all but the last batch_offset indices + UpdateIndices(a, idx, batch_offset); + } } - - // Update all but the last 2 indices - UpdateIndices(a, idx, batch_offset); } } #endif @@ -428,49 +511,73 @@ class matxCubPlan_t { * */ inline void ExecReduce(OutputTensor &a_out, - const InputTensor &a, + const InputOperator &a, const cudaStream_t stream) { #ifdef __CUDACC__ if constexpr (RANK == 0) { - if (a.IsLinear()) { - cub::DeviceReduce::Reduce(d_temp, - temp_storage_bytes, - a.Data(), - a_out.Data(), - static_cast(a.TotalSize()), - cparams_.reduce_op, - cparams_.init, - stream); + if constexpr (is_tensor_view_v) { + const tensor_impl_t base = a; + if (a.IsContiguous()) { + cub::DeviceReduce::Reduce(d_temp, + temp_storage_bytes, + a.Data(), + a_out.Data(), + static_cast(TotalSize(a)), + cparams_.reduce_op, + cparams_.init, + stream); + } + else { + cub::DeviceReduce::Reduce(d_temp, + temp_storage_bytes, + RandomOperatorIterator{base}, + a_out.Data(), + static_cast(TotalSize(a)), + cparams_.reduce_op, + cparams_.init, + stream); + } } else { - tensor_impl_t base = a; cub::DeviceReduce::Reduce(d_temp, temp_storage_bytes, - RandomOperatorIterator{base}, + RandomOperatorIterator{a}, a_out.Data(), - static_cast(a.TotalSize()), + static_cast(TotalSize(a)), cparams_.reduce_op, cparams_.init, stream); } } else if constexpr (RANK == 1) { - if (a.IsLinear()) { - cub::DeviceSegmentedReduce::Reduce( d_temp, - temp_storage_bytes, - a.Data(), - a_out.Data(), - static_cast(a_out.Size(0)), - d_offsets, - d_offsets + 1, - stream); + if constexpr (is_tensor_view_v) { + const tensor_impl_t base = a; + if (a.IsContiguous()) { + cub::DeviceSegmentedReduce::Reduce( d_temp, + temp_storage_bytes, + a.Data(), + a_out.Data(), + static_cast(a_out.Size(0)), + d_offsets, + d_offsets + 1, + stream); + } + else { + cub::DeviceSegmentedReduce::Reduce( d_temp, + temp_storage_bytes, + RandomOperatorIterator{base}, + a_out.Data(), + static_cast(a_out.Size(0)), + d_offsets, + d_offsets + 1, + stream); + } } else { - tensor_impl_t base = a; cub::DeviceSegmentedReduce::Reduce( d_temp, temp_storage_bytes, - RandomOperatorIterator{base}, + RandomOperatorIterator{a}, a_out.Data(), static_cast(a_out.Size(0)), d_offsets, @@ -488,7 +595,7 @@ class matxCubPlan_t { * * @tparam OutputTensor * Type of output tensor - * @tparam InputTensor + * @tparam InputOperator * Type of input tensor * @param a_out * Output tensor @@ -499,45 +606,67 @@ class matxCubPlan_t { * */ inline void ExecSum(OutputTensor &a_out, - const InputTensor &a, + const InputOperator &a, const cudaStream_t stream) { -#ifdef __CUDACC__ +#ifdef __CUDACC__ if constexpr (RANK == 0) { - if (a.IsLinear()) { - cub::DeviceReduce::Sum(d_temp, - temp_storage_bytes, - a.Data(), - a_out.Data(), - static_cast(a.TotalSize()), - stream); + if constexpr (is_tensor_view_v) { + const tensor_impl_t base = a; + if (a.IsContiguous()) { + cub::DeviceReduce::Sum(d_temp, + temp_storage_bytes, + a.Data(), + a_out.Data(), + static_cast(TotalSize(a)), + stream); + } + else { + cub::DeviceReduce::Sum(d_temp, + temp_storage_bytes, + RandomOperatorIterator{base}, + a_out.Data(), + static_cast(TotalSize(a)), + stream); + } } else { - tensor_impl_t base = a; cub::DeviceReduce::Sum(d_temp, temp_storage_bytes, - RandomOperatorIterator{base}, + RandomOperatorIterator{a}, a_out.Data(), - static_cast(a.TotalSize()), + static_cast(TotalSize(a)), stream); } } else if constexpr (RANK == 1) { - if (a.IsLinear()) { - 
cub::DeviceSegmentedReduce::Sum( d_temp, - temp_storage_bytes, - a.Data(), - a_out.Data(), - static_cast(a_out.Size(0)), - d_offsets, - d_offsets + 1, - stream); + if constexpr (is_tensor_view_v) { + const tensor_impl_t base = a; + if (a.IsContiguous()) { + cub::DeviceSegmentedReduce::Sum( d_temp, + temp_storage_bytes, + a.Data(), + a_out.Data(), + static_cast(a_out.Size(0)), + d_offsets, + d_offsets + 1, + stream); + } + else { + cub::DeviceSegmentedReduce::Sum( d_temp, + temp_storage_bytes, + RandomOperatorIterator{base}, + a_out.Data(), + static_cast(a_out.Size(0)), + d_offsets, + d_offsets + 1, + stream); + } } else { - tensor_impl_t base = a; cub::DeviceSegmentedReduce::Sum( d_temp, temp_storage_bytes, - RandomOperatorIterator{base}, + RandomOperatorIterator{a}, a_out.Data(), static_cast(a_out.Size(0)), d_offsets, @@ -555,7 +684,7 @@ class matxCubPlan_t { * * @tparam OutputTensor * Type of output tensor - * @tparam InputTensor + * @tparam InputOperator * Type of input tensor * @param a_out * Output tensor @@ -566,45 +695,67 @@ class matxCubPlan_t { * */ inline void ExecMin(OutputTensor &a_out, - const InputTensor &a, + const InputOperator &a, const cudaStream_t stream) { #ifdef __CUDACC__ if constexpr (RANK == 0) { - if (a.IsLinear()) { - cub::DeviceReduce::Min(d_temp, - temp_storage_bytes, - a.Data(), - a_out.Data(), - static_cast(a.TotalSize()), - stream); + if constexpr (is_tensor_view_v) { + const tensor_impl_t base = a; + if (a.IsContiguous()) { + cub::DeviceReduce::Min(d_temp, + temp_storage_bytes, + a.Data(), + a_out.Data(), + static_cast(TotalSize(a)), + stream); + } + else { + cub::DeviceReduce::Min(d_temp, + temp_storage_bytes, + RandomOperatorIterator{base}, + a_out.Data(), + static_cast(TotalSize(a)), + stream); + } } else { - tensor_impl_t base = a; cub::DeviceReduce::Min(d_temp, temp_storage_bytes, - RandomOperatorIterator{base}, + RandomOperatorIterator{a}, a_out.Data(), - static_cast(a.TotalSize()), + static_cast(TotalSize(a)), stream); } } else if constexpr (RANK == 1) { - if (a.IsLinear()) { - cub::DeviceSegmentedReduce::Min( d_temp, - temp_storage_bytes, - a.Data(), - a_out.Data(), - static_cast(a_out.Size(0)), - d_offsets, - d_offsets + 1, - stream); + if constexpr (is_tensor_view_v) { + const tensor_impl_t base = a; + if (a.IsContiguous()) { + cub::DeviceSegmentedReduce::Min( d_temp, + temp_storage_bytes, + a.Data(), + a_out.Data(), + static_cast(a_out.Size(0)), + d_offsets, + d_offsets + 1, + stream); + } + else { + cub::DeviceSegmentedReduce::Min( d_temp, + temp_storage_bytes, + RandomOperatorIterator{base}, + a_out.Data(), + static_cast(a_out.Size(0)), + d_offsets, + d_offsets + 1, + stream); + } } else { - tensor_impl_t base = a; cub::DeviceSegmentedReduce::Min( d_temp, temp_storage_bytes, - RandomOperatorIterator{base}, + RandomOperatorIterator{a}, a_out.Data(), static_cast(a_out.Size(0)), d_offsets, @@ -622,7 +773,7 @@ class matxCubPlan_t { * * @tparam OutputTensor * Type of output tensor - * @tparam InputTensor + * @tparam InputOperator * Type of input tensor * @param a_out * Output tensor @@ -633,45 +784,67 @@ class matxCubPlan_t { * */ inline void ExecMax(OutputTensor &a_out, - const InputTensor &a, + const InputOperator &a, const cudaStream_t stream) { #ifdef __CUDACC__ if constexpr (RANK == 0) { - if (a.IsLinear()) { - cub::DeviceReduce::Max(d_temp, - temp_storage_bytes, - a.Data(), - a_out.Data(), - static_cast(a.TotalSize()), - stream); + if constexpr (is_tensor_view_v) { + const tensor_impl_t base = a; + if (a.IsContiguous()) { + 
cub::DeviceReduce::Max(d_temp, + temp_storage_bytes, + a.Data(), + a_out.Data(), + static_cast(a.TotalSize()), + stream); + } + else { + cub::DeviceReduce::Max(d_temp, + temp_storage_bytes, + RandomOperatorIterator{base}, + a_out.Data(), + static_cast(a.TotalSize()), + stream); + } } else { - tensor_impl_t base = a; cub::DeviceReduce::Max(d_temp, temp_storage_bytes, - RandomOperatorIterator{base}, + RandomOperatorIterator{a}, a_out.Data(), static_cast(a.TotalSize()), stream); } } else if constexpr (RANK == 1) { - if (a.IsLinear()) { - cub::DeviceSegmentedReduce::Max( d_temp, - temp_storage_bytes, - a.Data(), - a_out.Data(), - static_cast(a_out.Size(0)), - d_offsets, - d_offsets + 1, - stream); + if constexpr (is_tensor_view_v) { + const tensor_impl_t base = a; + if (a.IsContiguous()) { + cub::DeviceSegmentedReduce::Max( d_temp, + temp_storage_bytes, + a.Data(), + a_out.Data(), + static_cast(a_out.Size(0)), + d_offsets, + d_offsets + 1, + stream); + } + else { + cub::DeviceSegmentedReduce::Max( d_temp, + temp_storage_bytes, + RandomOperatorIterator{base}, + a_out.Data(), + static_cast(a_out.Size(0)), + d_offsets, + d_offsets + 1, + stream); + } } else { - tensor_impl_t base = a; cub::DeviceSegmentedReduce::Max( d_temp, temp_storage_bytes, - RandomOperatorIterator{base}, + RandomOperatorIterator{a}, a_out.Data(), static_cast(a_out.Size(0)), d_offsets, @@ -690,7 +863,7 @@ class matxCubPlan_t { * * @tparam OutputTensor * Type of output tensor - * @tparam InputTensor + * @tparam InputOperator * Type of input tensor * @param a_out * Output tensor @@ -701,25 +874,37 @@ class matxCubPlan_t { * */ inline void ExecSelect(OutputTensor &a_out, - const InputTensor &a, + const InputOperator &a, const cudaStream_t stream) { -#ifdef __CUDACC__ - if (a.IsLinear()) { - cub::DeviceSelect::If(d_temp, - temp_storage_bytes, - a.Data(), - a_out.Data(), - cparams_.num_found.Data(), - static_cast(a.TotalSize()), - cparams_.op, - stream); +#ifdef __CUDACC__ + if constexpr (is_tensor_view_v) { + const tensor_impl_t base = a; + if (a.IsContiguous()) { + cub::DeviceSelect::If(d_temp, + temp_storage_bytes, + a.Data(), + a_out.Data(), + cparams_.num_found.Data(), + static_cast(a.TotalSize()), + cparams_.op, + stream); + } + else { + cub::DeviceSelect::If(d_temp, + temp_storage_bytes, + RandomOperatorIterator{base}, + a_out.Data(), + cparams_.num_found.Data(), + static_cast(a.TotalSize()), + cparams_.op, + stream); + } } else { - tensor_impl_t base = a; cub::DeviceSelect::If(d_temp, temp_storage_bytes, - RandomOperatorIterator{base}, + RandomOperatorIterator{a}, a_out.Data(), cparams_.num_found.Data(), static_cast(a.TotalSize()), @@ -754,7 +939,7 @@ class matxCubPlan_t { * * @tparam OutputTensor * Type of output tensor - * @tparam InputTensor + * @tparam InputOperator * Type of input tensor * @param a_out * Output tensor @@ -765,32 +950,45 @@ class matxCubPlan_t { * */ inline void ExecSelectIndex(OutputTensor &a_out, - const InputTensor &a, + const InputOperator &a, const cudaStream_t stream) { #ifdef __CUDACC__ - if (a.IsLinear()) { - cub::DeviceSelect::If(d_temp, - temp_storage_bytes, - cub::CountingInputIterator{0}, - a_out.Data(), - cparams_.num_found.Data(), - static_cast(a.TotalSize()), - IndexToSelectOp{a.Data(), cparams_.op}, - stream); + if constexpr (is_tensor_view_v) { + if (a.IsContiguous()) { + cub::DeviceSelect::If(d_temp, + temp_storage_bytes, + cub::CountingInputIterator{0}, + a_out.Data(), + cparams_.num_found.Data(), + static_cast(a.TotalSize()), + IndexToSelectOp{a.Data(), cparams_.op}, + stream); + } 
+ else { + tensor_impl_t base = a; + cub::DeviceSelect::If(d_temp, + temp_storage_bytes, + cub::CountingInputIterator{0}, + a_out.Data(), + cparams_.num_found.Data(), + static_cast(a.TotalSize()), + IndexToSelectOp + {RandomOperatorIterator{base}, cparams_.op}, + stream); + } } else { - tensor_impl_t base = a; cub::DeviceSelect::If(d_temp, temp_storage_bytes, cub::CountingInputIterator{0}, a_out.Data(), cparams_.num_found.Data(), static_cast(a.TotalSize()), - IndexToSelectOp - {RandomOperatorIterator{base}, cparams_.op}, - stream); - } + IndexToSelectOp + {RandomOperatorIterator{a}, cparams_.op}, + stream); + } #endif } @@ -800,7 +998,7 @@ class matxCubPlan_t { * * @tparam OutputTensor * Type of output tensor - * @tparam InputTensor + * @tparam InputOperator * Type of input tensor * @param a_out * Output tensor @@ -811,24 +1009,35 @@ class matxCubPlan_t { * */ inline void ExecUnique(OutputTensor &a_out, - const InputTensor &a, + const InputOperator &a, const cudaStream_t stream) { #ifdef __CUDACC__ - if (a.IsLinear()) { - cub::DeviceSelect::Unique(d_temp, - temp_storage_bytes, - a.Data(), - a_out.Data(), - cparams_.num_found.Data(), - static_cast(a.TotalSize()), - stream); + if constexpr (is_tensor_view_v) { + const tensor_impl_t base = a; + if (a.IsContiguous()) { + cub::DeviceSelect::Unique(d_temp, + temp_storage_bytes, + a.Data(), + a_out.Data(), + cparams_.num_found.Data(), + static_cast(a.TotalSize()), + stream); + } + else { + cub::DeviceSelect::Unique(d_temp, + temp_storage_bytes, + RandomOperatorIterator{base}, + a_out.Data(), + cparams_.num_found.Data(), + static_cast(a.TotalSize()), + stream); + } } else { - tensor_impl_t base = a; cub::DeviceSelect::Unique(d_temp, temp_storage_bytes, - RandomOperatorIterator{base}, + RandomOperatorIterator{a}, a_out.Data(), cparams_.num_found.Data(), static_cast(a.TotalSize()), @@ -859,8 +1068,9 @@ class matxCubPlan_t { struct CubParamsKeyHash { std::size_t operator()(const CubParams_t &k) const noexcept { - return (std::hash()(k.size)) + (std::hash()(k.batches)) + - (std::hash()((uint64_t)k.A)) + + return (std::hash()(k.batches)) + + (std::hash()((uint64_t)k.size[0])) + + (std::hash()((uint64_t)k.size[1])) + (std::hash()((uint64_t)k.a_out)) + (std::hash()((uint64_t)k.stream)) + (std::hash()((uint64_t)k.op)); @@ -874,7 +1084,7 @@ struct CubParamsKeyHash { struct CubParamsKeyEq { bool operator()(const CubParams_t &l, const CubParams_t &t) const noexcept { - return l.size == t.size && l.A == t.A && l.a_out == t.a_out && + return l.size[0] == t.size[0] && l.size[1] == t.size[1] && l.a_out == t.a_out && l.batches == t.batches && l.dtype == t.dtype && l.stream == t.stream && l.op == t.op; } @@ -893,7 +1103,7 @@ static matxCache_t cub_cache; * * @tparam OutputTensor * Output tensor type - * @tparam InputTensor + * @tparam InputOperator * Input tensor type * @tparam ReduceOp * Reduction type @@ -906,17 +1116,17 @@ static matxCache_t cub_cache; * @param stream * CUDA stream */ -template -void cub_reduce(OutputTensor &a_out, const InputTensor &a, typename InputTensor::scalar_type init, +template +void cub_reduce(OutputTensor &a_out, const InputOperator &a, typename InputOperator::scalar_type init, const cudaStream_t stream = 0) { #ifdef __CUDACC__ // Get parameters required by these tensors - using param_type = typename detail::ReduceParams_t; + using param_type = typename detail::ReduceParams_t; auto reduce_params = param_type{ReduceOp{}, init}; auto params = detail::matxCubPlan_t::GetCubParams(a_out, a); params.stream = stream; @@ -924,14 +1134,14 @@ 
void cub_reduce(OutputTensor &a_out, const InputTensor &a, typename InputTensor: // Get cache or new Sort plan if it doesn't exist auto ret = detail::cub_cache.Lookup(params); if (ret == std::nullopt) { - auto tmp = new detail::matxCubPlan_t{ + auto tmp = new detail::matxCubPlan_t{ a_out, a, reduce_params, stream}; detail::cub_cache.Insert(params, static_cast(tmp)); tmp->ExecReduce(a_out, a, stream); } else { auto type = - static_cast *>( + static_cast *>( ret.value()); type->ExecReduce(a_out, a, stream); } @@ -945,7 +1155,7 @@ void cub_reduce(OutputTensor &a_out, const InputTensor &a, typename InputTensor: * * @tparam OutputTensor * Output tensor type - * @tparam InputTensor + * @tparam InputOperator * Input tensor type * @param a_out * Sorted tensor @@ -954,28 +1164,28 @@ void cub_reduce(OutputTensor &a_out, const InputTensor &a, typename InputTensor: * @param stream * CUDA stream */ -template -void cub_sum(OutputTensor &a_out, const InputTensor &a, +template +void cub_sum(OutputTensor &a_out, const InputOperator &a, const cudaStream_t stream = 0) { #ifdef __CUDACC__ auto params = detail::matxCubPlan_t::GetCubParams(a_out, a); params.stream = stream; // Get cache or new Sort plan if it doesn't exist auto ret = detail::cub_cache.Lookup(params); if (ret == std::nullopt) { - auto tmp = new detail::matxCubPlan_t{ + auto tmp = new detail::matxCubPlan_t{ a_out, a, {}, stream}; detail::cub_cache.Insert(params, static_cast(tmp)); tmp->ExecSum(a_out, a, stream); } else { auto type = - static_cast *>( + static_cast *>( ret.value()); type->ExecSum(a_out, a, stream); } @@ -987,7 +1197,7 @@ void cub_sum(OutputTensor &a_out, const InputTensor &a, * * @tparam OutputTensor * Output tensor type - * @tparam InputTensor + * @tparam InputOperator * Input tensor type * @param a_out * Sorted tensor @@ -996,28 +1206,28 @@ void cub_sum(OutputTensor &a_out, const InputTensor &a, * @param stream * CUDA stream */ -template -void cub_min(OutputTensor &a_out, const InputTensor &a, +template +void cub_min(OutputTensor &a_out, const InputOperator &a, const cudaStream_t stream = 0) { #ifdef __CUDACC__ auto params = detail::matxCubPlan_t::GetCubParams(a_out, a); params.stream = stream; // Get cache or new Sort plan if it doesn't exist auto ret = detail::cub_cache.Lookup(params); if (ret == std::nullopt) { - auto tmp = new detail::matxCubPlan_t{ + auto tmp = new detail::matxCubPlan_t{ a_out, a, {}, stream}; detail::cub_cache.Insert(params, static_cast(tmp)); tmp->ExecMin(a_out, a, stream); } else { auto type = - static_cast *>( + static_cast *>( ret.value()); type->ExecMin(a_out, a, stream); } @@ -1029,7 +1239,7 @@ void cub_min(OutputTensor &a_out, const InputTensor &a, * * @tparam OutputTensor * Output tensor type - * @tparam InputTensor + * @tparam InputOperator * Input tensor type * @param a_out * Sorted tensor @@ -1038,28 +1248,28 @@ void cub_min(OutputTensor &a_out, const InputTensor &a, * @param stream * CUDA stream */ -template -void cub_max(OutputTensor &a_out, const InputTensor &a, +template +void cub_max(OutputTensor &a_out, const InputOperator &a, const cudaStream_t stream = 0) { #ifdef __CUDACC__ auto params = detail::matxCubPlan_t::GetCubParams(a_out, a); params.stream = stream; // Get cache or new Sort plan if it doesn't exist auto ret = detail::cub_cache.Lookup(params); if (ret == std::nullopt) { - auto tmp = new detail::matxCubPlan_t{ + auto tmp = new detail::matxCubPlan_t{ a_out, a, {}, stream}; detail::cub_cache.Insert(params, static_cast(tmp)); tmp->ExecMax(a_out, a, stream); } else { auto type = - 
static_cast *>( + static_cast *>( ret.value()); type->ExecMax(a_out, a, stream); } @@ -1092,15 +1302,15 @@ void cub_max(OutputTensor &a_out, const InputTensor &a, * @param stream * CUDA stream */ -template -void sort(OutputTensor &a_out, const InputTensor &a, +template +void sort(OutputTensor &a_out, const InputOperator &a, const SortDirection_t dir = SORT_DIR_ASC, const cudaStream_t stream = 0) { #ifdef __CUDACC__ // Get parameters required by these tensors auto params = - detail::matxCubPlan_t::GetCubParams(a_out, a); + detail::matxCubPlan_t::GetCubParams(a_out, a); params.stream = stream; detail::SortParams_t p{dir}; @@ -1108,14 +1318,14 @@ void sort(OutputTensor &a_out, const InputTensor &a, // Get cache or new Sort plan if it doesn't exist auto ret = detail::cub_cache.Lookup(params); if (ret == std::nullopt) { - auto tmp = new detail::matxCubPlan_t{ + auto tmp = new detail::matxCubPlan_t{ a_out, a, p, stream}; detail::cub_cache.Insert(params, static_cast(tmp)); tmp->ExecSort(a_out, a, stream, dir); } else { auto sort_type = - static_cast *>( + static_cast *>( ret.value()); sort_type->ExecSort(a_out, a, stream, dir); } @@ -1139,27 +1349,27 @@ void sort(OutputTensor &a_out, const InputTensor &a, * @param stream * CUDA stream */ -template -void cumsum(OutputTensor &a_out, const InputTensor &a, +template +void cumsum(OutputTensor &a_out, const InputOperator &a, const cudaStream_t stream = 0) { #ifdef __CUDACC__ // Get parameters required by these tensors auto params = - detail::matxCubPlan_t::GetCubParams(a_out, a); + detail::matxCubPlan_t::GetCubParams(a_out, a); params.stream = stream; // Get cache or new Sort plan if it doesn't exist auto ret = detail::cub_cache.Lookup(params); if (ret == std::nullopt) { auto tmp = - new detail::matxCubPlan_t{a_out, a, {}, stream}; + new detail::matxCubPlan_t{a_out, a, {}, stream}; detail::cub_cache.Insert(params, static_cast(tmp)); tmp->ExecPrefixScanEx(a_out, a, stream); } else { auto sort_type = - static_cast *>(ret.value()); + static_cast *>(ret.value()); sort_type->ExecPrefixScanEx(a_out, a, stream); } #endif @@ -1190,34 +1400,34 @@ void cumsum(OutputTensor &a_out, const InputTensor &a, * @param stream * CUDA stream */ -template -void hist(OutputTensor &a_out, const InputTensor &a, - const typename InputTensor::scalar_type lower, - const typename InputTensor::scalar_type upper, const cudaStream_t stream = 0) +template +void hist(OutputTensor &a_out, const InputOperator &a, + const typename InputOperator::scalar_type lower, + const typename InputOperator::scalar_type upper, const cudaStream_t stream = 0) { static_assert(std::is_same_v, "Output histogram tensor must use int type"); #ifdef __CUDACC__ // Get parameters required by these tensors auto params = - detail::matxCubPlan_t::GetCubParams(a_out, a); + detail::matxCubPlan_t::GetCubParams(a_out, a); params.stream = stream; // Get cache or new Sort plan if it doesn't exist auto ret = detail::cub_cache.Lookup(params); if (ret == std::nullopt) { - detail::HistEvenParams_t hp{lower, upper}; + detail::HistEvenParams_t hp{lower, upper}; auto tmp = new detail::matxCubPlan_t< OutputTensor, - InputTensor, + InputOperator, detail::CUB_OP_HIST_EVEN, - detail::HistEvenParams_t>{ - a_out, a, detail::HistEvenParams_t{hp}, stream}; + detail::HistEvenParams_t>{ + a_out, a, detail::HistEvenParams_t{hp}, stream}; detail::cub_cache.Insert(params, static_cast(tmp)); tmp->ExecHistEven(a_out, a, lower, upper, stream); } else { auto sort_type = - static_cast> *>( + static_cast> *>( ret.value()); 
sort_type->ExecHistEven(a_out, a, lower, upper, stream); } @@ -1308,7 +1518,7 @@ struct GTE * Output items type * @tparam OutputTensor * Output type - * @tparam InputTensor + * @tparam InputOperator * Input type * @param num_found * Number of items found meeting criteria @@ -1321,15 +1531,15 @@ struct GTE * @param stream * CUDA stream */ -template -void find(OutputTensor &a_out, CountTensor &num_found, const InputTensor &a, SelectType sel, const cudaStream_t stream = 0) +template +void find(OutputTensor &a_out, CountTensor &num_found, const InputOperator &a, SelectType sel, const cudaStream_t stream = 0) { #ifdef __CUDACC__ static_assert(num_found.Rank() == 0, "Num found output tensor rank must be 0"); // Get parameters required by these tensors auto params = - detail::matxCubPlan_t::GetCubParams(a_out, a); + detail::matxCubPlan_t::GetCubParams(a_out, a); params.stream = stream; // Get cache or new Sort plan if it doesn't exist @@ -1338,7 +1548,7 @@ void find(OutputTensor &a_out, CountTensor &num_found, const InputTensor &a, Sel if (ret == std::nullopt) { auto tmp = new detail::matxCubPlan_t< OutputTensor, - InputTensor, + InputOperator, detail::CUB_OP_SELECT, decltype(cparams)>{a_out, a, cparams, stream}; detail::cub_cache.Insert(params, static_cast(tmp)); @@ -1346,7 +1556,7 @@ void find(OutputTensor &a_out, CountTensor &num_found, const InputTensor &a, Sel } else { auto sort_type = - static_cast *>( + static_cast *>( ret.value()); sort_type->ExecSelect(a_out, a, stream); } @@ -1368,7 +1578,7 @@ void find(OutputTensor &a_out, CountTensor &num_found, const InputTensor &a, Sel * Output items type * @tparam OutputTensor * Output type - * @tparam InputTensor + * @tparam InputOperator * Input type * @param num_found * Number of items found meeting criteria @@ -1381,15 +1591,15 @@ void find(OutputTensor &a_out, CountTensor &num_found, const InputTensor &a, Sel * @param stream * CUDA stream */ -template -void find_idx(OutputTensor &a_out, CountTensor &num_found, const InputTensor &a, SelectType sel, const cudaStream_t stream = 0) +template +void find_idx(OutputTensor &a_out, CountTensor &num_found, const InputOperator &a, SelectType sel, const cudaStream_t stream = 0) { #ifdef __CUDACC__ static_assert(num_found.Rank() == 0, "Num found output tensor rank must be 0"); // Get parameters required by these tensors auto params = - detail::matxCubPlan_t::GetCubParams(a_out, a); + detail::matxCubPlan_t::GetCubParams(a_out, a); params.stream = stream; // Get cache or new Sort plan if it doesn't exist @@ -1398,7 +1608,7 @@ void find_idx(OutputTensor &a_out, CountTensor &num_found, const InputTensor &a, if (ret == std::nullopt) { auto tmp = new detail::matxCubPlan_t< OutputTensor, - InputTensor, + InputOperator, detail::CUB_OP_SELECT_IDX, decltype(cparams)>{a_out, a, cparams, stream}; detail::cub_cache.Insert(params, static_cast(tmp)); @@ -1406,7 +1616,7 @@ void find_idx(OutputTensor &a_out, CountTensor &num_found, const InputTensor &a, } else { auto sort_type = - static_cast *>( + static_cast *>( ret.value()); sort_type->ExecSelectIndex(a_out, a, stream); } @@ -1425,7 +1635,7 @@ void find_idx(OutputTensor &a_out, CountTensor &num_found, const InputTensor &a, * Output items type * @tparam OutputTensor * Output type - * @tparam InputTensor + * @tparam InputOperator * Input type * @param num_found * Number of items found meeting criteria @@ -1436,23 +1646,23 @@ void find_idx(OutputTensor &a_out, CountTensor &num_found, const InputTensor &a, * @param stream * CUDA stream */ -template -void unique(OutputTensor 
&a_out, CountTensor &num_found, const InputTensor &a, const cudaStream_t stream = 0) +template +void unique(OutputTensor &a_out, CountTensor &num_found, const InputOperator &a, const cudaStream_t stream = 0) { #ifdef __CUDACC__ static_assert(num_found.Rank() == 0, "Num found output tensor rank must be 0"); // Allocate space for sorted input since CUB doesn't do unique over unsorted inputs - typename InputTensor::scalar_type *sort_ptr; + typename InputOperator::scalar_type *sort_ptr; matxAlloc((void **)&sort_ptr, a.Bytes(), MATX_ASYNC_DEVICE_MEMORY, stream); - auto sort_tensor = make_tensor(sort_ptr, a.Shape()); + auto sort_tensor = make_tensor(sort_ptr, a.Shape()); matx::sort(sort_tensor, a, SORT_DIR_ASC, stream); auto cparams = detail::UniqueParams_t{num_found}; // Get parameters required by these tensors auto params = - detail::matxCubPlan_t::GetCubParams(a_out, sort_tensor); + detail::matxCubPlan_t::GetCubParams(a_out, sort_tensor); params.stream = stream; // Get cache or new Sort plan if it doesn't exist @@ -1461,7 +1671,7 @@ void unique(OutputTensor &a_out, CountTensor &num_found, const InputTensor &a, if (ret == std::nullopt) { auto tmp = new detail::matxCubPlan_t< OutputTensor, - InputTensor, + InputOperator, detail::CUB_OP_UNIQUE, decltype(cparams)>{a_out, sort_tensor, cparams, stream}; detail::cub_cache.Insert(params, static_cast(tmp)); @@ -1469,7 +1679,7 @@ void unique(OutputTensor &a_out, CountTensor &num_found, const InputTensor &a, } else { auto sort_type = - static_cast *>( + static_cast *>( ret.value()); sort_type->ExecUnique(a_out, sort_tensor, stream); } diff --git a/include/matx_radar.h b/include/matx_radar.h index 9b94a014..57b4cdca 100644 --- a/include/matx_radar.h +++ b/include/matx_radar.h @@ -182,7 +182,7 @@ void InternalAmbgFun(AMFTensor &amf, XTensor &x, auto x_normdiv_v = make_tensor(x_normdiv, x.Shape()); auto x_norm_v = make_tensor(x_norm); - matx::reduce(x_norm_v, norm(x), reduceOpSum(), 0); + sum(x_norm_v, norm(x), stream); (x_norm_v = sqrt(x_norm_v)).run(stream); (x_normdiv_v = x / x_norm_v).run(stream); @@ -198,7 +198,7 @@ void InternalAmbgFun(AMFTensor &amf, XTensor &x, y_normdiv_v.Reset(y_normdiv, ry.Shape()); auto y_norm_v = make_tensor(y_norm); - matx::reduce(y_norm_v, norm(ry), reduceOpSum(), 0); + sum(y_norm_v, norm(ry), stream); (y_normdiv_v = ry / y_norm_v).run(stream); } diff --git a/include/matx_reduce.h b/include/matx_reduce.h index ebb2746e..326352f4 100644 --- a/include/matx_reduce.h +++ b/include/matx_reduce.h @@ -1065,9 +1065,9 @@ template void inline reduce(TensorType &dest, const InType &in, ReduceOp op, cudaStream_t stream = 0, [[maybe_unused]] bool init = true) { - constexpr bool use_cub = TensorType::Rank() == 0; + constexpr bool use_cub = TensorType::Rank() == 0 || (TensorType::Rank() == 1 && InType::Rank() == 2); // Use CUB implementation if we have a tensor on the RHS and it's not blocked from using CUB - if constexpr (!is_matx_no_cub_reduction_v && is_tensor_view_v && use_cub) { + if constexpr (!is_matx_no_cub_reduction_v && use_cub) { cub_reduce(dest, in, op.Init(), stream); } else { // Fall back to the slow path of custom implementation @@ -1214,9 +1214,9 @@ template void inline sum(TensorType &dest, const InType &in, cudaStream_t stream = 0) { #ifdef __CUDACC__ - constexpr bool use_cub = TensorType::Rank() == 0; + constexpr bool use_cub = TensorType::Rank() == 0 || (TensorType::Rank() == 1 && InType::Rank() == 2); // Use CUB implementation if we have a tensor on the RHS - if constexpr (is_tensor_view_v && use_cub) { + if constexpr 
(use_cub) { cub_sum(dest, in, stream); } else { // Fall back to the slow path of custom implementation @@ -1278,9 +1278,9 @@ template void inline rmax(TensorType &dest, const InType &in, cudaStream_t stream = 0) { #ifdef __CUDACC__ - constexpr bool use_cub = TensorType::Rank() == 0; + constexpr bool use_cub = TensorType::Rank() == 0 || (TensorType::Rank() == 1 && InType::Rank() == 2); // Use CUB implementation if we have a tensor on the RHS - if constexpr (is_tensor_view_v && use_cub) { + if constexpr (use_cub) { cub_max(dest, in, stream); } else { // Fall back to the slow path of custom implementation @@ -1344,9 +1344,9 @@ template void inline rmin(TensorType &dest, const InType &in, cudaStream_t stream = 0) { #ifdef __CUDACC__ - constexpr bool use_cub = TensorType::Rank() == 0; + constexpr bool use_cub = TensorType::Rank() == 0 || (TensorType::Rank() == 1 && InType::Rank() == 2); // Use CUB implementation if we have a tensor on the RHS - if constexpr (is_tensor_view_v && use_cub) { + if constexpr (use_cub) { cub_min(dest, in, stream); } else { // Fall back to the slow path of custom implementation diff --git a/include/matx_solver.h b/include/matx_solver.h index 8c4d7e37..391b115a 100644 --- a/include/matx_solver.h +++ b/include/matx_solver.h @@ -111,7 +111,7 @@ class matxDnSolver_t { * CUDA stream */ template - static inline TensorType + static inline auto TransposeCopy(typename TensorType::scalar_type *tp, const TensorType &a, cudaStream_t stream = 0) { auto pa = a.PermuteMatrix(); @@ -1333,7 +1333,7 @@ void svd(UTensor &u, STensor &s, auto tvt = tv.PermuteMatrix(); // Get parameters required by these tensors - auto params = detail::matxDnSVDSolverPlan_t::GetSVDParams( + auto params = detail::matxDnSVDSolverPlan_t::GetSVDParams( u, s, v, tvt, jobu, jobvt); // Get cache or new QR plan if it doesn't exist @@ -1346,7 +1346,7 @@ void svd(UTensor &u, STensor &s, } else { auto svd_type = - static_cast *>(ret.value()); + static_cast *>(ret.value()); svd_type->Exec(u, s, v, tvt, jobu, jobvt, stream); } } diff --git a/include/matx_tensor.h b/include/matx_tensor.h index a48e86cb..b6ed814f 100644 --- a/include/matx_tensor.h +++ b/include/matx_tensor.h @@ -80,18 +80,20 @@ enum { * @tparam Desc Descriptor for tensor * */ -template +template struct RandomOperatorIterator { - using self_type = RandomOperatorIterator; - using value_type = typename TensorType::scalar_type; - using stride_type = typename TensorType::desc_type::stride_type; + using self_type = RandomOperatorIterator; + using value_type = typename OperatorType::scalar_type; + // using stride_type = std::conditional_t, typename OperatorType::desc_type::stride_type, + // index_t>; + using stride_type = index_t; using pointer = value_type*; using reference = value_type; using iterator_category = std::random_access_iterator_tag; using difference_type = typename std::iterator::difference_type; - __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ RandomOperatorIterator(const TensorType &t) : t_(t), offset_(0) {} - __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ RandomOperatorIterator(const TensorType &t, stride_type offset) : t_(t), offset_(offset) {} + __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ RandomOperatorIterator(const OperatorType &t) : t_(t), offset_(0) { } + __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ RandomOperatorIterator(const OperatorType &t, stride_type offset) : t_(t), offset_(offset) {} /** * @brief Dereference value at a pre-computed offset @@ -100,8 +102,10 @@ struct RandomOperatorIterator { */ [[nodiscard]] __MATX_INLINE__ 
__MATX_HOST__ __MATX_DEVICE__ reference operator*() const { - auto arrs = t_.GetIdxFromAbs(offset_); - return t_.operator()(arrs); + auto arrs = detail::GetIdxFromAbs(t_, offset_); + return std::apply([&](auto &&...args) { + return t_.operator()(args...); + }, arrs); } [[nodiscard]] __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ self_type operator+(difference_type offset) const @@ -111,7 +115,7 @@ struct RandomOperatorIterator { [[nodiscard]] __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ reference operator[](difference_type offset) const { - return *(*this + offset); + return *self_type{t_, offset}; } __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ self_type operator++(int) @@ -145,7 +149,7 @@ struct RandomOperatorIterator { return *this; } - TensorType t_; + OperatorType t_; stride_type offset_; }; @@ -177,6 +181,10 @@ class tensor_t : public detail::tensor_impl_t { using matxop = bool; ///< Indicate this is a MatX operator using tensor_view = bool; ///< Indicate this is a MatX tensor view using storage_type = Storage; ///< Storage type trait + using shape_type = typename Desc::shape_type; + using stride_type = typename Desc::stride_type; + using shape_container = typename Desc::shape_container; + using stride_container = typename Desc::stride_container; using desc_type = Desc; ///< Descriptor type trait using self_type = tensor_t; static constexpr bool PRINT_ON_DEVICE = false; ///< Print() uses printf on device @@ -220,7 +228,7 @@ class tensor_t : public detail::tensor_impl_t { * @param rhs * Tensor to copy from */ - __MATX_HOST__ void Shallow(const tensor_t &rhs) noexcept + __MATX_HOST__ void Shallow(const self_type &rhs) noexcept { this->ldata_ = rhs.ldata_; storage_ = rhs.storage_; @@ -245,8 +253,6 @@ class tensor_t : public detail::tensor_impl_t { storage_{std::forward(s)} { this->SetLocalData(storage_.data()); - // static_assert(std::is_same_v>, - // "Must use default storage if not providing") } /** @@ -256,13 +262,12 @@ class tensor_t : public detail::tensor_impl_t { * @param desc * @param ldata */ - tensor_t(Storage s, Desc &&desc, T* ldata) : - detail::tensor_impl_t{std::forward(desc)}, + template + tensor_t(Storage s, D2 &&desc, T* ldata) : + detail::tensor_impl_t{std::forward(desc)}, storage_{std::move(s)} { this->SetLocalData(ldata); - // static_assert(std::is_same_v>, - // "Must use default storage if not providing") } @@ -274,8 +279,8 @@ class tensor_t : public detail::tensor_impl_t { */ template >> - __MATX_INLINE__ tensor_t(Desc &&desc) : - detail::tensor_impl_t{std::forward(desc)}, + __MATX_INLINE__ tensor_t(D2 &&desc) : + detail::tensor_impl_t{std::forward(desc)}, storage_{typename Storage::container{this->desc_.TotalSize()*sizeof(T)}} { this->SetLocalData(storage_.data()); @@ -307,7 +312,7 @@ class tensor_t : public detail::tensor_impl_t { * @returns set object containing the destination view and source object * */ - [[nodiscard]] __MATX_INLINE__ __MATX_HOST__ auto operator=(const tensor_t &op) + [[nodiscard]] __MATX_INLINE__ __MATX_HOST__ auto operator=(const self_type &op) { return detail::set(*this, op); } @@ -338,7 +343,7 @@ class tensor_t : public detail::tensor_impl_t { * @returns set object containing the destination view and source object * */ - [[nodiscard]] __MATX_INLINE__ __MATX_HOST__ auto operator+=(const tensor_t &op) + [[nodiscard]] __MATX_INLINE__ __MATX_HOST__ auto operator+=(const self_type &op) { return detail::set(*this, *this + op); } @@ -370,7 +375,7 @@ class tensor_t : public detail::tensor_impl_t { * @returns set object containing the destination 
view and source object * */ - [[nodiscard]] __MATX_INLINE__ __MATX_HOST__ auto operator-=(const tensor_t &op) + [[nodiscard]] __MATX_INLINE__ __MATX_HOST__ auto operator-=(const self_type &op) { return detail::set(*this, *this - op); } @@ -404,7 +409,7 @@ class tensor_t : public detail::tensor_impl_t { * @returns set object containing the destination view and source object * */ - [[nodiscard]] __MATX_INLINE__ __MATX_HOST__ auto operator*=(const tensor_t &op) + [[nodiscard]] __MATX_INLINE__ __MATX_HOST__ auto operator*=(const self_type &op) { return detail::set(*this, *this * op); } @@ -436,7 +441,7 @@ class tensor_t : public detail::tensor_impl_t { * @returns set object containing the destination view and source object * */ - [[nodiscard]] __MATX_INLINE__ __MATX_HOST__ auto operator/=(const tensor_t &op) + [[nodiscard]] __MATX_INLINE__ __MATX_HOST__ auto operator/=(const self_type &op) { return detail::set(*this, *this / op); } @@ -468,7 +473,7 @@ class tensor_t : public detail::tensor_impl_t { * @returns set object containing the destination view and source object * */ - [[nodiscard]] __MATX_INLINE__ __MATX_HOST__ auto operator<<=(const tensor_t &op) + [[nodiscard]] __MATX_INLINE__ __MATX_HOST__ auto operator<<=(const self_type &op) { return detail::set(*this, *this << op); } @@ -500,7 +505,7 @@ class tensor_t : public detail::tensor_impl_t { * @returns set object containing the destination view and source object * */ - [[nodiscard]] __MATX_INLINE__ __MATX_HOST__ auto operator>>=(const tensor_t &op) + [[nodiscard]] __MATX_INLINE__ __MATX_HOST__ auto operator>>=(const self_type &op) { return detail::set(*this, *this >> op); } @@ -532,7 +537,7 @@ class tensor_t : public detail::tensor_impl_t { * @returns set object containing the destination view and source object * */ - [[nodiscard]] __MATX_INLINE__ __MATX_HOST__ auto operator|=(const tensor_t &op) + [[nodiscard]] __MATX_INLINE__ __MATX_HOST__ auto operator|=(const self_type &op) { return detail::set(*this, *this | op); } @@ -564,7 +569,7 @@ class tensor_t : public detail::tensor_impl_t { * @returns set object containing the destination view and source object * */ - [[nodiscard]] __MATX_INLINE__ __MATX_HOST__ auto operator&=(const tensor_t &op) + [[nodiscard]] __MATX_INLINE__ __MATX_HOST__ auto operator&=(const self_type &op) { return detail::set(*this, *this & op); } @@ -596,7 +601,7 @@ class tensor_t : public detail::tensor_impl_t { * @returns set object containing the destination view and source object * */ - [[nodiscard]] __MATX_INLINE__ __MATX_HOST__ auto operator^=(const tensor_t &op) + [[nodiscard]] __MATX_INLINE__ __MATX_HOST__ auto operator^=(const self_type &op) { return detail::set(*this, *this ^ op); } @@ -628,7 +633,7 @@ class tensor_t : public detail::tensor_impl_t { * @returns set object containing the destination view and source object * */ - [[nodiscard]] __MATX_INLINE__ __MATX_HOST__ auto operator%=(const tensor_t &op) + [[nodiscard]] __MATX_INLINE__ __MATX_HOST__ auto operator%=(const self_type &op) { return detail::set(*this, *this % op); } @@ -742,7 +747,7 @@ class tensor_t : public detail::tensor_impl_t { std::array tshape; std::move(std::begin(shape), std::end(shape), tshape.begin()); - typename Desc::stride_type prod = std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies()); + stride_type prod = std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies()); MATX_ASSERT_STR( sizeof(T) * prod <= storage_.Bytes(), matxInvalidSize, "Total size of new tensor must not be larger than the 
original"); @@ -852,7 +857,7 @@ class tensor_t : public detail::tensor_impl_t { using Type = typename U::value_type; Type *data = reinterpret_cast(this->ldata_) + 1; - std::array strides; + std::array strides; #pragma unroll for (int i = 0; i < RANK; i++) { strides[i] = this->Stride(i); @@ -884,11 +889,11 @@ class tensor_t : public detail::tensor_impl_t { * @returns tensor view of only imaginary-valued components * */ - __MATX_INLINE__ tensor_t Permute(const uint32_t (&dims)[RANK]) const + __MATX_INLINE__ auto Permute(const uint32_t (&dims)[RANK]) const { static_assert(RANK >= 2, "Only tensors of rank 2 and higher can be permuted."); - std::array n; - std::array s; + std::array n; + std::array s; [[maybe_unused]] bool done[RANK] = {0}; #pragma unroll @@ -1638,7 +1643,7 @@ class tensor_t : public detail::tensor_impl_t { cudaMemcpy(tmpd.Data(), Data(), tmpd.Bytes(), cudaMemcpyDeviceToHost); - tmpd.Print(dims...); + tmpv.Print(dims...); } } #else diff --git a/include/matx_tensor_desc.h b/include/matx_tensor_desc.h index bea7137f..7f10b654 100644 --- a/include/matx_tensor_desc.h +++ b/include/matx_tensor_desc.h @@ -43,27 +43,30 @@ namespace matx { /** * @brief Type-erased generic tensor descriptor for strides and sizes * - * @tparam ShapeType type of sizes - * @tparam StrideType type of strides + * @tparam ShapeContainer type of sizes + * @tparam StrideContainer type of strides */ -template +template class tensor_desc_t { public: - using shape_type = typename ShapeType::value_type; ///< Type trait of shape type - using stride_type = typename StrideType::value_type; ///< Type trait of stride type - using shape_container = ShapeType; ///< Type trait of shape container - using stride_container = StrideType; ///< Type trait of stride container + template + using self_type = tensor_desc_t; + + using shape_container = ShapeContainer; + using stride_container = StrideContainer; + using shape_type = typename ShapeContainer::value_type; ///< Type trait of shape type + using stride_type = typename StrideContainer::value_type; ///< Type trait of stride type using matx_descriptor = bool; ///< Type trait to indicate this is a tensor descriptor /** * @brief Default copy constructor */ - __MATX_INLINE__ tensor_desc_t(const tensor_desc_t& ) = default; + __MATX_INLINE__ tensor_desc_t(const tensor_desc_t &) = default; /** * @brief Default move constructor */ - __MATX_INLINE__ tensor_desc_t(tensor_desc_t&&) = default; + __MATX_INLINE__ tensor_desc_t(tensor_desc_t &&) = default; /** * @brief Default const copy assignment constructor @@ -82,10 +85,10 @@ class tensor_desc_t { * @param shape Shape object * @param stride Stride object */ - template && !std::is_array_v, bool> = true> - __MATX_INLINE__ __MATX_HOST__ tensor_desc_t(ShapeType &&shape, StrideType &&stride) - : shape_(std::forward(shape)), - stride_(std::forward(stride)) { + template && !std::is_array_v, bool> = true> + __MATX_INLINE__ __MATX_HOST__ tensor_desc_t(ShapeContainer &&shape, StrideContainer &&stride) + : shape_(std::forward(shape)), + stride_(std::forward(stride)) { MATX_ASSERT_STR(shape.size() == stride.size(), matxInvalidDim, "Size and stride array sizes must match"); MATX_ASSERT_STR(shape.size() == RANK, matxInvalidDim, @@ -137,9 +140,9 @@ class tensor_desc_t { * @param strides * Strides of tensor */ - template , bool> = true> - __MATX_INLINE__ __MATX_HOST__ tensor_desc_t(ShapeType &&shape, const stride_type (&strides)[RANK]) : - shape_(std::forward(shape)) { + template , bool> = true> + __MATX_INLINE__ __MATX_HOST__ tensor_desc_t(S2 &&shape, 
const stride_type (&strides)[RANK]) : + shape_(std::forward(shape)) { for (int i = 0; i < RANK; i++) { MATX_ASSERT_STR(*(shape + i) > 0, matxInvalidSize, "Must specify size larger than 0 for each dimension"); @@ -155,9 +158,9 @@ class tensor_desc_t { * @param strides * Strides of tensor */ - template , bool> = true> - __MATX_INLINE__ __MATX_HOST__ tensor_desc_t(const shape_type (&shape)[RANK], StrideType &&strides) : - stride_(std::forward(strides)) { + template , bool> = true> + __MATX_INLINE__ __MATX_HOST__ tensor_desc_t(const shape_type (&shape)[RANK], StrideContainer &&strides) : + stride_(std::forward(strides)) { for (int i = 0; i < RANK; i++) { MATX_ASSERT_STR(shape[i] > 0, matxInvalidSize, "Must specify size larger than 0 for each dimension"); @@ -183,12 +186,12 @@ class tensor_desc_t { } /** - * Check if a descriptor is linear in memory for all elements in the view + * Check if a descriptor is contiguous in memory for all elements in the view * * @return - * True is descriptor is linear, or false otherwise + * True is descriptor is contiguous, or false otherwise */ - __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ bool IsLinear() const noexcept + __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ constexpr bool IsContiguous() const noexcept { stride_type ttl = 1; for (int i = RANK - 1; i >= 0; i--) { @@ -297,8 +300,8 @@ class tensor_desc_t { static auto constexpr Rank() { return RANK; } private: - ShapeType shape_; - StrideType stride_; + ShapeContainer shape_; + StrideContainer stride_; }; /** @@ -310,8 +313,8 @@ class tensor_desc_t { template class static_tensor_desc_t { public: - using shape_container = std::array; ///< Type trait of shape type - using stride_container = std::array; ///< Type trait of stride type + using ShapeContainer = std::array; ///< Type trait of shape type + using StrideContainer = std::array; ///< Type trait of stride type using shape_type = index_t; ///< Type trait of shape container using stride_type = index_t; ///< Type trait of stride container using matx_descriptor = bool; ///< Type trait to indicate this is a tensor descriptor @@ -322,7 +325,7 @@ class static_tensor_desc_t { * @return * True is descriptor is linear, or false otherwise */ - __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ constexpr bool IsLinear() const noexcept + __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ constexpr bool IsContiguous() const noexcept { return true; } @@ -382,20 +385,20 @@ class static_tensor_desc_t { return m; } - static constexpr shape_container shape_ = make_shape(); - static constexpr stride_container stride_ = make_strides(); + static constexpr ShapeContainer shape_ = make_shape(); + static constexpr StrideContainer stride_ = make_strides(); }; /** * @brief Constant rank, dynamic size, dynamic strides * - * @tparam ShapeType Type of shape - * @tparam StrideType Type of stride container + * @tparam ShapeContainer Type of shape + * @tparam StrideContainer Type of stride container * @tparam RANK Rank of shape */ -template +template using tensor_desc_cr_ds_t = - tensor_desc_t, std::array, + tensor_desc_t, std::array, RANK>; /** diff --git a/include/matx_tensor_generators.h b/include/matx_tensor_generators.h index 802dc3a0..59bf5106 100644 --- a/include/matx_tensor_generators.h +++ b/include/matx_tensor_generators.h @@ -100,7 +100,7 @@ inline auto zeros(ShapeType &&s) template inline auto zeros(const index_t (&s)[RANK]) { - return zeros(detail::to_array(s)); + return zeros(detail::to_array(s)); } /** @@ -137,7 +137,7 @@ inline auto ones(ShapeType &&s) template inline auto 
ones(const index_t (&s)[RANK]) { - return ones(detail::to_array(s)); + return ones(detail::to_array(s)); } namespace detail { diff --git a/include/matx_tensor_impl.h b/include/matx_tensor_impl.h index 5a7fea9b..938166e3 100644 --- a/include/matx_tensor_impl.h +++ b/include/matx_tensor_impl.h @@ -685,9 +685,9 @@ class tensor_impl_t { * @return * The size of the dimension */ - __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ bool IsLinear() const noexcept + __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ constexpr bool IsContiguous() const noexcept { - return desc_.IsLinear(); + return desc_.IsContiguous(); } template @@ -876,37 +876,6 @@ class tensor_impl_t { ldata_ = data; } - /** - * @brief Returns an N-D coordinate as an array corresponding to the absolute index abs - * - * @param abs Absolute index - * @return std::array of indices - */ - __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto GetIdxFromAbs(typename Desc::shape_type abs) const { - using l_stride_type = typename Desc::stride_type; - using l_shape_type = typename Desc::shape_type; - std::array indices; - std::array sh = desc_.Shape(); - - for (int idx = 0; idx < RANK; idx++) { - if (idx == RANK-1) { - indices[RANK-1] = abs; - } - else { - // no std::accumulate on the device - l_stride_type prod = 1; - for (int i = idx + 1; i < RANK; i++) { - prod *= sh[i]; - } - - indices[idx] = abs / prod; - abs -= prod * indices[idx]; - } - } - - return indices; - } - protected: T *ldata_; diff --git a/include/matx_tensor_ops.h b/include/matx_tensor_ops.h index 85408d6f..8c727670 100644 --- a/include/matx_tensor_ops.h +++ b/include/matx_tensor_ops.h @@ -286,7 +286,7 @@ inline return; } - if (!in.IsLinear()) + if (!in.IsContiguous()) { MATX_THROW(matxInvalidSize, "Must have a linear tensor view for transpose"); } @@ -550,7 +550,7 @@ inline template __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ auto operator()(index_t i) const { - auto arrs = op_.GetIdxFromAbs(idx_(i)); + auto arrs = detail::GetIdxFromAbs(op_, idx_(i)); return op_(arrs); } @@ -1930,7 +1930,8 @@ auto __MATX_INLINE__ as_uint8(T t) { \ using I1Type = extract_scalar_type_t; \ using Op = TENSOR_OP; \ - return detail::matxUnaryOp(i1, Op()); \ + const typename detail::base_type::type &base = i1; \ + return detail::matxUnaryOp(base, Op()); \ } #define DEFINE_BINARY_OP(FUNCTION, TENSOR_OP) \ @@ -1942,7 +1943,9 @@ auto __MATX_INLINE__ as_uint8(T t) using I1Type = extract_scalar_type_t; \ using I2Type = extract_scalar_type_t; \ using Op = TENSOR_OP; \ - return detail::matxBinaryOp(i1, i2, Op()); \ + const typename detail::base_type::type &base1 = i1; \ + const typename detail::base_type::type &base2 = i2; \ + return detail::matxBinaryOp(base1, base2, Op()); \ } #ifdef DOXYGEN_ONLY diff --git a/include/matx_tensor_utils.h b/include/matx_tensor_utils.h index a18f3fee..a5b3c0ac 100644 --- a/include/matx_tensor_utils.h +++ b/include/matx_tensor_utils.h @@ -37,8 +37,55 @@ namespace matx { + template + size_t TotalSize(const Op &op) { + if constexpr (is_tensor_view_v) { + return static_cast(op.TotalSize()); + } + else { + size_t total = 1; + for (int i = 0; i < op.Rank(); i++) { + total *= op.Size(i); + } + + return total; + } + } + namespace detail { + /** + * @brief Returns an N-D coordinate as an array corresponding to the absolute index abs + * + * @param abs Absolute index + * @return std::array of indices + */ + template + __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto GetIdxFromAbs(const Op &op, index_t abs) { + using l_stride_type = index_t; + using l_shape_type = index_t; + constexpr 
int RANK = op.Rank(); + + std::array indices; + + for (int idx = 0; idx < RANK; idx++) { + if (idx == RANK-1) { + indices[RANK-1] = abs; + } + else { + // no std::accumulate on the device + l_stride_type prod = 1; + for (int i = idx + 1; i < RANK; i++) { + prod *= op.Size(i); + } + + indices[idx] = abs / prod; + abs -= prod * indices[idx]; + } + } + + return indices; + } // Work around cuda::std::apply not working template @@ -274,7 +321,5 @@ namespace detail { } } - - } } \ No newline at end of file diff --git a/include/matx_type_utils.h b/include/matx_type_utils.h index 98718f84..189c3cc8 100644 --- a/include/matx_type_utils.h +++ b/include/matx_type_utils.h @@ -51,6 +51,12 @@ namespace matx { +enum class MemoryLayout { + MEMORY_LAYOUT_ROW_MAJOR, + MEMORY_LAYOUT_COL_MAJOR, +}; + + /** * @brief Removes cv and reference qualifiers on a type * @@ -62,7 +68,7 @@ struct remove_cvref { }; template class tensor_impl_t; -template class tensor_t; +template class tensor_t; namespace detail { template @@ -459,13 +465,14 @@ constexpr std::array, N> to_array(T (&a)[N]) template class tensor_t; template class tensor_impl_t; // Traits for casting down to impl tensor conditionally -template struct base_type { +template +struct base_type { using type = T; }; -template -struct base_type> { - using type = tensor_impl_t; +template +struct base_type>> { + using type = tensor_impl_t; }; // Type traits to help with the lack of short-circuit template logic. Numpy diff --git a/test/00_operators/ReductionTests.cu b/test/00_operators/ReductionTests.cu index d676f975..b69b285b 100644 --- a/test/00_operators/ReductionTests.cu +++ b/test/00_operators/ReductionTests.cu @@ -109,18 +109,33 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalf, VarianceStd) MATX_EXIT_HANDLER(); } +TEST(TestRivaDeviceTensor, Min) +{ + auto t = matx::make_tensor({2, 5}); + t.SetVals({{2, 4, 1, 3, 5}, {3, 1, 6, 2, 4}}); + auto i = matx::make_tensor({2}); + auto v = matx::make_tensor({2}); + + matx::argmin(v, i, t); + i.Print(2); + v.Print(2); + EXPECT_EQ(i(0), 2); + EXPECT_EQ(i(1), 6); + EXPECT_EQ(v(0), 1.f); + EXPECT_EQ(v(1), 1.f); +} TYPED_TEST(ReductionTestsFloatNonComplexNonHalf, Sum) { MATX_ENTER_HANDLER(); { tensor_t t0; - auto t4 = ones({30, 40, 50, 60}); - auto t3 = ones({30, 40, 50}); - auto t2 = ones({30, 40}); - auto t1 = ones({30}); + auto t4 = ones({30, 40, 50, 60}); + auto t3 = ones({30, 40, 50}); + auto t2 = ones({30, 40}); + auto t1 = ones({30}); sum(t0, t4, 0); cudaStreamSynchronize(0); @@ -144,9 +159,9 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalf, Sum) { tensor_t t1({30}); - auto t4 = ones({30, 40, 50, 60}); - auto t3 = ones({30, 40, 50}); - auto t2 = ones({30, 40}); + auto t4 = ones({30, 40, 50, 60}); + auto t3 = ones({30, 40, 50}); + auto t2 = ones({30, 40}); sum(t1, t4, 0); @@ -181,8 +196,8 @@ TYPED_TEST(ReductionTestsFloatNonComplexNonHalf, Sum) { tensor_t t2({30, 40}); - auto t4 = ones({30, 40, 50, 60}); - auto t3 = ones({30, 40, 50}); + auto t4 = ones({30, 40, 50, 60}); + auto t3 = ones({30, 40, 50}); sum(t2, t4, 0); cudaStreamSynchronize(0); @@ -364,17 +379,17 @@ TYPED_TEST(ReductionTestsNumericNonComplex, MinMax) cudaStreamSynchronize(0); // We need to convert the absolute index into relative before comparing - auto rel = t2o.GetIdxFromAbs(t1i_small(0)); + auto rel = GetIdxFromAbs(t2o, t1i_small(0)); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t2o(rel), (TypeParam)(5))); - rel = t2o.GetIdxFromAbs(t1i_small(1)); + rel = GetIdxFromAbs(t2o, t1i_small(1)); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t2o(rel), 
(TypeParam)(5))); argmin(t1o_small, t1i_small, t2o); cudaStreamSynchronize(0); - rel = t2o.GetIdxFromAbs(t1i_small(0)); + rel = GetIdxFromAbs(t2o, t1i_small(0)); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t2o(rel), (TypeParam)(1))); - rel = t2o.GetIdxFromAbs(t1i_small(1)); + rel = GetIdxFromAbs(t2o, t1i_small(1)); EXPECT_TRUE(MatXUtils::MatXTypeCompare(t2o(rel), (TypeParam)(1))); } @@ -469,9 +484,9 @@ TYPED_TEST(ReductionTestsNumericNonComplex, Prod) std::array s2{3, 4}; std::array s1{3}; + auto t1 = make_tensor(s1); + auto t2 = make_tensor(s2); - tensor_t t1{s1}; - tensor_t t2{s2}; TypeParam t1p = (TypeParam)1; for (int i = 0; i < t1.Size(0); i++) { t1(i) = static_cast>((float)rand() / diff --git a/test/00_solver/Eigen.cu b/test/00_solver/Eigen.cu index f41b4e53..1eef9c94 100644 --- a/test/00_solver/Eigen.cu +++ b/test/00_solver/Eigen.cu @@ -83,7 +83,8 @@ TYPED_TEST(EigenSolverTestNonComplexFloatTypes, EigenBasic) matx::copy(this->Wv, v, 0); // Compute lambda*v - (this->Lvv = v * this->Wov(i)).run(); + auto b = v * this->Wov(i); + (this->Lvv = b).run(); // Compute A*v matmul(this->Gtv, this->Bv, this->Wv); diff --git a/test/00_tensor/CUBTests.cu b/test/00_tensor/CUBTests.cu index 0db0e14f..e574e09b 100644 --- a/test/00_tensor/CUBTests.cu +++ b/test/00_tensor/CUBTests.cu @@ -32,6 +32,7 @@ #include "assert.h" #include "matx.h" +#include "matx_cub.h" #include "test_types.h" #include "utilities.h" #include "gtest/gtest.h" @@ -138,7 +139,7 @@ TYPED_TEST(CUBTestsNumericNonComplex, CumSum) } // 2D tests - tensor_t tmpv2(this->t2.Shape()); + auto tmpv2 = make_tensor(this->t2.Shape()); for (index_t i = 0; i < this->t2.Size(0); i++) { for (index_t j = 0; j < this->t2.Size(1); j++) { @@ -167,7 +168,7 @@ TYPED_TEST(CUBTestsNumericNonComplex, Sort) this->t1(i) = static_cast((2 * (i % 2) - 1) * i); } - tensor_t tmpv({this->t1.Lsize()}); + auto tmpv = make_tensor({this->t1.Lsize()}); // Ascending matx::sort(tmpv, this->t1, SORT_DIR_ASC); @@ -186,7 +187,7 @@ TYPED_TEST(CUBTestsNumericNonComplex, Sort) } // 2D tests - tensor_t tmpv2(this->t2.Shape()); + auto tmpv2 = make_tensor(this->t2.Shape()); for (index_t i = 0; i < this->t2.Size(0); i++) { for (index_t j = 0; j < this->t2.Size(1); j++) { diff --git a/test/00_tensor/TensorCreationTests.cu b/test/00_tensor/TensorCreationTests.cu index 046ed0c2..6e88f49f 100644 --- a/test/00_tensor/TensorCreationTests.cu +++ b/test/00_tensor/TensorCreationTests.cu @@ -115,23 +115,23 @@ TYPED_TEST(TensorCreationTestsAll, MakeShape) ASSERT_EQ(mt4.Size(3), 3); } -TYPED_TEST(TensorCreationTestsAll, MakeStaticShape) -{ - auto mt1 = make_static_tensor(); - ASSERT_EQ(mt1.Size(0), 10); +// TYPED_TEST(TensorCreationTestsAll, MakeStaticShape) +// { +// auto mt1 = make_static_tensor(); +// ASSERT_EQ(mt1.Size(0), 10); - auto mt2 = make_static_tensor(); - ASSERT_EQ(mt2.Size(0), 10); - ASSERT_EQ(mt2.Size(1), 40); +// auto mt2 = make_static_tensor(); +// ASSERT_EQ(mt2.Size(0), 10); +// ASSERT_EQ(mt2.Size(1), 40); - auto mt3 = make_static_tensor(); - ASSERT_EQ(mt3.Size(0), 10); - ASSERT_EQ(mt3.Size(1), 40); - ASSERT_EQ(mt3.Size(2), 30); +// auto mt3 = make_static_tensor(); +// ASSERT_EQ(mt3.Size(0), 10); +// ASSERT_EQ(mt3.Size(1), 40); +// ASSERT_EQ(mt3.Size(2), 30); - auto mt4 = make_static_tensor(); - ASSERT_EQ(mt4.Size(0), 10); - ASSERT_EQ(mt4.Size(1), 40); - ASSERT_EQ(mt4.Size(2), 30); - ASSERT_EQ(mt4.Size(3), 6); -} +// auto mt4 = make_static_tensor(); +// ASSERT_EQ(mt4.Size(0), 10); +// ASSERT_EQ(mt4.Size(1), 40); +// ASSERT_EQ(mt4.Size(2), 30); +// ASSERT_EQ(mt4.Size(3), 6); +// }
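Usage sketch of the API surface this patch touches (illustrative only; the shape, index value, and variable names below are assumptions, not taken from the MatX sources): dynamic-shape make_tensor in place of the commented-out make_static_tensor path, IsContiguous() as the rename of IsLinear(), and detail::GetIdxFromAbs() as a free function that now takes the operator as its first argument.

    // Hypothetical example, not part of the patch above.
    #include <cstdio>
    #include "matx.h"
    using namespace matx;

    int main() {
      // Runtime-sized tensor; replaces the removed static-shape creation path.
      auto t = make_tensor<float>({4, 8});

      if (t.IsContiguous()) {  // renamed from IsLinear()
        // GetIdxFromAbs() now lives in matx::detail and takes the operator first.
        auto idx = detail::GetIdxFromAbs(t, 11);  // {1, 3} for a 4x8 row-major view
        printf("%lld %lld\n", (long long)idx[0], (long long)idx[1]);
      }
      return 0;
    }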