Refactor to use thrust::reduce on any. #685

Status: Open — wants to merge 6 commits into base: main

Changes from 4 commits
27 changes: 26 additions & 1 deletion include/matx/core/operator_utils.h
@@ -32,6 +32,8 @@

#pragma once

#include <thrust/reduce.h>

#include "matx/core/iterator.h"
#include "matx/core/type_utils.h"
#include "matx/operators/collapse.h"
@@ -62,6 +64,30 @@ namespace matx {
return func(in, iter, bi, ei);
}

template <typename In, typename Out, typename Op, bool ConvertType = true>
__MATX_HOST__ __MATX_INLINE__ auto ReduceInputThrust(In &&in, Out &&out, Op &&op) {
typename detail::base_type_t<In> in_base = in;

auto begin = BeginOffset{in_base};
auto end = EndOffset{in_base};

if constexpr (in_base.Rank() < 2 && is_tensor_view_v<In>) {
using value_t = typename Out::value_type;
const auto &iter = matx::RandomOperatorIterator<decltype(in_base), ConvertType>{in_base};

if (in_base.IsContiguous()) {
// the conversion is already handled for us by RandomOperatorIterator
thrust::reduce(iter + *begin, iter + *end, op.Init(), op);
}
}

Author (@ZelboK):

@cliffburdick So this function is responsible for dealing with the fact that In may not necessarily be a tensor_t, but rather an operator of some sort. It gets the respective offsets and constructs an iterator for thrust to use, which, thankfully, seems to be perfectly compatible.

Consequently, the code has also become potentially simpler than its counterpart on main: Thrust is now responsible for deciding what to use in CUB, rather than MatX. I might be missing some context, though, on whether MatX needs to be responsible for choosing the CUB/Thrust function.

Author (@ZelboK):

In hindsight, I realize there might be issues with this approach. Does matxBinaryOp, for example, need to be utilized? I see that it has methods like PreRun and PostRun that make use of the Executor (which I have used here).

Collaborator (@cliffburdick):

Hi @ZelboK, since a tensor is an operator, the iterator wrapper can turn any MatX operator type into an iterator. However, we do the contiguous check to allow CUB/thrust an optimization if it's a flat pointer with contiguous strides.

Regarding your second comment, the ReduceInput function shouldn't/doesn't need to know whether it's CUB or thrust. That's something you pass in the lambda you give to the function. I'm saying this without actually trying it, but if you assume the iterator type is compatible between both libraries, then there may not be any changes to that code. This is how the example I pointed to previously worked (just search for ReduceInput in cub.h).

I'm not sure what your question about matxBinaryOp is, but that's a wrapper class for any binary type and should be completely separate from this. matxBinaryOp, like most of our types, is an operator, and you can pass it to thrust/cub and have the iterator pull from it.

Author (@ZelboK), Jul 30, 2024:

@cliffburdick I see. I should have clarified - I actually did try to follow the example you showed me (but I did it incorrectly). If you're curious, here's what I did:

auto output_ = cuda::std::get<0>(out);
using out_t = decltype(output_);
using value_t = typename out_t::value_type;
using input_t = typename detail::base_type_t<OpA>; // incorrect, this is not any.h's responsibility
using output_t = typename detail::base_type_t<out_t>; // incorrect
input_t in_base = a_;
output_t out_base = output_;
auto op = detail::reduceOpAny<value_t>();

auto fn = [&](input_t &&input,
              output_t &&output,
              BeginOffset<input_t> &&begin,
              EndOffset<input_t> &&end) {
  return thrust::reduce(input + *begin, input + *end, op.Init(), op);
};
auto rv = ReduceInput(fn, out_base, in_base);

which brought me to my question about matxBinaryOp. This led to errors like

has no member "type"
    typedef typename thrust::iterator_system<InputIterator>::type System;

where InputIterator was of type

matx::detail::matxBinaryOp<matx::detail::ConstVal<int, cuda::std::__4::array<matx::index_t, 2UL>>, matx::index_t, matx::detail::BinOp<int, matx::index_t, matx::detail::AddF<int, matx::index_t>>>

I should not be passing the base types, though. Let me push the fix. Thanks for the help.

auto collapsed = matx::lcollapse<remove_cvref_t<decltype(out)>::Rank()>(rcollapse<remove_cvref_t<decltype(in)>::Rank() - remove_cvref_t<decltype(out)>::Rank()>(in_base));
const auto &iter = matx::RandomOperatorIterator<decltype(collapsed), ConvertType>{collapsed};
return thrust::reduce(iter + *begin, iter + *end, op.Init(), op);
}

template <typename Func, typename OutputOp, typename InputOp, bool ConvertType = true>
__MATX_HOST__ __MATX_INLINE__ auto ReduceInput(Func &&func, OutputOp &&out, InputOp &&in) {
typename detail::base_type_t<InputOp> in_base = in;
@@ -83,7 +109,6 @@
}
}
}

// Collapse the right-most dimensions by the difference in ranks for the reduction dimension,
// then collapse the left size by the output rank to get the batch dimensions
auto collapsed = matx::lcollapse<remove_cvref_t<decltype(out)>::Rank()>(rcollapse<remove_cvref_t<decltype(in)>::Rank() - remove_cvref_t<decltype(out)>::Rank()>(in_base));
16 changes: 12 additions & 4 deletions include/matx/operators/any.h
@@ -32,6 +32,7 @@

#pragma once

#include <thrust/reduce.h>

#include "matx/core/type_utils.h"
#include "matx/operators/base_operator.h"
@@ -40,8 +41,6 @@

namespace matx {



namespace detail {
template<typename OpA, int ORank>
class AnyOp : public BaseOp<AnyOp<OpA, ORank>>
@@ -71,8 +70,17 @@
};

 template <typename Out, typename Executor>
-void Exec(Out &&out, Executor &&ex) const {
-  any_impl(cuda::std::get<0>(out), a_, ex);
+void Exec(Out &&out, Executor) const {
+  auto output_ = cuda::std::get<0>(out);
+  using out_t = decltype(output_);
+  using value_t = typename out_t::value_type;
+  using output_t = typename detail::base_type_t<out_t>;
+
+  output_t out_base = output_;
+  auto op = detail::reduceOpAny<value_t>();
+
+  auto rv = ReduceInputThrust(std::forward<OpA>(a_), std::forward<out_t>(output_), std::forward<decltype(op)>(op));
+  MATX_ASSERT_STR_EXP(rv, cudaSuccess, matxCudaError, "Error in any");
 }

Author (@ZelboK), on the ReduceInputThrust call:

We construct the op, then send the inputs and outputs in their original form to the new function ReduceInputThrust, which collapses them and dispatches to thrust accordingly.

static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank()
19 changes: 13 additions & 6 deletions include/matx/transforms/reduce.h
@@ -798,17 +798,24 @@ template <typename T> class reduceOpMax {
* Performs a reduction of two values of type T by returning 1 if either
* of the values are non-zero.
*/
-template <typename T> class reduceOpAny {
+template <typename T>
+class reduceOpAny {
 public:
+  using type = T; // This type is for Thrust
   using matx_reduce = bool;
   using matx_no_cub_reduce = bool; // Don't use CUB for this reduction type
-  __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ T Reduce(const T &v1, const T &v2)
-  {
+
+  __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ T operator()(const T &v1, const T &v2) const {
     return (v1 != 0) || (v2 != 0);
   }
-  __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ T operator()(T &v1, T &v2) { v1 = ((v1 != 0) || (v2 != 0)); return v1; }
-  __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ T Init() { return (T)(0); }
-  __MATX_DEVICE__ __MATX_INLINE__ void atomicReduce(T *addr, T val) { atomicAny(addr, val); }
+
+  __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ T Init() const {
+    return static_cast<T>(0);
+  }
+
+  __MATX_DEVICE__ __MATX_INLINE__ void atomicReduce(T *addr, T val) const {
+    atomicAny(addr, val);
+  }
 };

Author (@ZelboK), on operator():

needs to be const for Thrust.
