ROCm · bpickrel · Jul 17, 2024 · Jul 17, 2024 · Jul 17, 2024 · Jul 18, 2024
@@ -697,7 +697,7 @@ Operator Support Matrix
 |                          |           |                 | functions are                |
 |                          |           |                 | not enabled                  |
 +--------------------------+-----------+-----------------+------------------------------+
-| RoiAlign                 | ✅        | FP8, FP16,      |                              |
+| RoiAlign                 | ✅        | FP8, FP16,      |                               |
-| RoiAlign                 | ✅        | FP8, FP16,      |                               |
+| RoiAlign                 | ✅        | FP8, FP16,      |                              |
-| RoiAlign                 | ✅        | FP8, FP16,      |                               |
+| RoiAlign                 | ✅        | FP8, FP16,      |                              |
 |                          |           | FP32, FP64      |                              |
 +--------------------------+-----------+-----------------+------------------------------+
 | Round                    | ✅        | FP8, FP16,      |                              |

@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -74,6 +74,15 @@ struct roialign
         auto type     = inputs.at(0).type();
 
         // check input correct
+        if(shape::is_integral(type))
+            MIGRAPHX_THROW("ROIALIGN: incorrect type for input data! (should be non-integer)");
+        if(shape::is_integral(inputs.at(1).type()))
+            MIGRAPHX_THROW("ROIALIGN: incorrect data type for rois! (should be non-integer)");
+        if(!shape::is_integral(inputs.at(2).type()))
+            MIGRAPHX_THROW(
+                "ROIALIGN: incorrect datatype for roi indices! (should be an integral type)");
+        if(x_lens.size() != 4)
+            MIGRAPHX_THROW("ROIALIGN: data input must have 4 dimensions n, c, h, w");
         if(bi_lens.size() != 1)
         {
             MIGRAPHX_THROW("ROIALIGN: batch indices should be 1 dimension!");
@@ -92,8 +101,8 @@ struct roialign
 
         std::vector<std::size_t> out_lens = x_lens;
         out_lens[0]                       = roi_lens[0];
-        out_lens[2]                       = output_height;
-        out_lens[3]                       = output_width;
+        out_lens[2]                       = output_width;
+        out_lens[3]                       = output_height;
 
         return {type, out_lens};
     }
@@ -115,17 +124,22 @@ struct roialign
         std::vector<pos_weight> results(bin_grid_size[0] * bin_grid_size[1] * output_height *
                                         output_width);
         shape_for_each(comp_s, [&](const auto& idx_v, size_t index) {
-            std::array<std::size_t, 2> p = {idx_v[0], idx_v[1]};
-            std::array<std::size_t, 2> i = {idx_v[2], idx_v[3]};
+            // The p and i indexes correspond to nested looping parameters in ORT that go in y, x
+            // order.  The i[x] value is least significant and iterates the fastest.
+            std::array<std::size_t, 2> p = {idx_v[1], idx_v[0]};
+            std::array<std::size_t, 2> i = {idx_v[3], idx_v[2]}; // these are always equal
 
+            // xy is scaled coordinates of start point of ROI
             std::array<float, 2> xy{};
+            // low, high are floor and ceiling of the xy value (i.e. the bounds of the pixel it lies
+            // inside) from which we will interpolate.
             std::array<int64_t, 2> low{};
             std::array<int64_t, 2> high{};
             for(auto ii : range(p.size()))
             {
                 xy[ii] = roi_start[ii] + p[ii] * bin_size[ii] +
                          (i[ii] + .5f) * bin_size[ii] / bin_grid_size[ii];
-                xy[ii] = (coord_trans_mode == "half_pixel") ? (xy[ii] - 0.5f) : xy[ii];
+
                 if(xy[ii] < -1.0 or xy[ii] > dims[ii])
                 {
                     results[index] = pos_weight{};
@@ -140,21 +154,18 @@ struct roialign
                     xy[ii] = high[ii] = low[ii] = dims[ii] - 1;
                 }
             }
+            results[index].pos = {low[1] * dims[0] + low[0],
+                                  low[1] * dims[0] + high[0],
+                                  high[1] * dims[0] + low[0],
+                                  high[1] * dims[0] + high[0]};
 
-            results[index].pos = {low[0] * dims[1] + low[1],
-                                  low[0] * dims[1] + high[1],
-                                  high[0] * dims[1] + low[1],
-                                  high[0] * dims[1] + high[1]};
-
-            float ly = xy[0] - low[0];
-            float lx = xy[1] - low[1];
+            float lx = xy[0] - low[0];
+            float ly = xy[1] - low[1];
             float hy = 1.0f - ly;
             float hx = 1.0f - lx;
-
-            // save weights and indeces
+            // save weights and indices
             results[index].w = {hy * hx, hy * lx, ly * hx, ly * lx};
         });
-
         return results;
     }
 
@@ -176,11 +187,12 @@ struct roialign
         double final(double x, std::size_t y) { return (y == 0) ? 0.0 : (x / y); }
     };
 
+    // Calculate a pooling value for 1 block of bin_grid_size*bin_grid_size weights
     template <class T, class Op>
     std::tuple<double, int64_t> calc_pooling(const T& data,
                                              const std::array<std::size_t, 2>& bin_grid_size,
                                              const std::vector<pos_weight>& pos_weights,
-                                             int64_t index,
+                                             int64_t index, // index to c
                                              Op op) const
     {
         double output_val   = op.init();
@@ -208,36 +220,38 @@ struct roialign
         int64_t n_rois       = out_lens[0];
         std::size_t channels = out_lens[1];
         // output dims of height and width, in all 2-dim arrays, the first dim
-        // is for height and second dim is for width
+        // is for height and second dim is for width i.e. (y, x) order
         std::array<std::size_t, 2> out_dims = {out_lens[2], out_lens[3]};
         const auto& x_lens                  = args.at(0).get_shape().lens();
         // input dims of height and width
-        std::array<std::size_t, 2> in_dims = {x_lens[2], x_lens[3]};
+        std::array<std::size_t, 2> in_dims = {x_lens[3], x_lens[2]};
         auto roi_s                         = args.at(1).get_shape();
 
         visit_all(result, args.at(0), args.at(1))([&](auto output, auto x, auto roi) {
             const auto* batch_indices = args.at(2).cast<int64_t>();
             par_for(n_rois, [&](auto n) {
                 const auto bottom_data   = x.begin();
                 const auto roi_batch_ind = batch_indices[n];
-                // Do not using rounding; this implementation detail is critical
+                // Do not use rounding here even if data is a quantized type; this
+                // implementation detail is critical
+                const float offset              = (coord_trans_mode == "half_pixel") ? 0.5 : 0.0;
                 std::array<float, 2> roi_starts = {
-                    static_cast<float>(roi[roi_s.index({n, 1})] * spatial_scale),
-                    static_cast<float>(roi[roi_s.index({n, 0})] * spatial_scale)};
+                    static_cast<float>(roi[roi_s.index({n, 0})] * spatial_scale - offset),
+                    static_cast<float>(roi[roi_s.index({n, 1})] * spatial_scale - offset)};
                 std::array<float, 2> roi_ends = {
-                    static_cast<float>(roi[roi_s.index({n, 3})] * spatial_scale),
-                    static_cast<float>(roi[roi_s.index({n, 2})] * spatial_scale)};
+                    static_cast<float>(roi[roi_s.index({n, 2})] * spatial_scale - offset),
+                    static_cast<float>(roi[roi_s.index({n, 3})] * spatial_scale - offset)};
 
-                // Force malformed ROIs to be 1x1
+                // Force malformed ROIs to be 1x1, if in output_half_pixel transform mode
                 std::array<float, 2> roi_size{};
                 std::array<float, 2> bin_size{};
                 std::array<std::size_t, 2> bin_grid_size{};
 
                 for(auto ii : range(roi_size.size()))
                 {
                     roi_size[ii] = roi_ends[ii] - roi_starts[ii];
-                    roi_size[ii] = std::max(roi_size[ii], 1.0f);
-
+                    if(coord_trans_mode != "half_pixel")
+                        roi_size[ii] = std::max(roi_size[ii], 1.0f);
                     bin_size[ii]      = roi_size[ii] / out_dims[ii];
                     bin_grid_size[ii] = (sampling_ratio > 0)
                                             ? sampling_ratio
@@ -247,22 +261,24 @@ struct roialign
                 // we want to precalculate indices and weights shared by all channels,
                 // this is the key point of optimization
                 std::vector<std::size_t> comp_lens = {
-                    out_dims[0], out_dims[1], bin_grid_size[0], bin_grid_size[1]};
+                    out_dims[1], out_dims[0], bin_grid_size[1], bin_grid_size[0]};
                 shape comp_s{shape::float_type, comp_lens};
                 auto pre_calc =
                     this->calc_pos_weight(in_dims, comp_s, roi_starts, bin_size, bin_grid_size);
 
                 std::vector<std::size_t> comp_lens1 = {channels, out_dims[0], out_dims[1]};
                 shape comp_s1{migraphx::shape::float_type, comp_lens1};
                 std::vector<int64_t> vec_index(channels, 0);
+
                 shape_for_each(comp_s1, [&](const auto& idx) {
-                    auto c  = idx[0];
+                    auto c  = idx[0]; // channel count
                     auto ph = idx[1];
                     auto pw = idx[2];
 
                     const auto offset_bottom_data =
                         bottom_data + static_cast<int64_t>((roi_batch_ind * channels + c) *
                                                            in_dims[0] * in_dims[1]);
+
                     double output_val;
                     std::tie(output_val, vec_index[c]) =
                         (mode == migraphx::op::pooling_mode::average)

@@ -24,6 +24,8 @@
 #ifndef MIGRAPHX_GUARD_KERNELS_ROIALIGN_HPP
 #define MIGRAPHX_GUARD_KERNELS_ROIALIGN_HPP
 
+// #include <migraphx/kernels/debug.hpp>
+// #include <migraphx/kernels/print.hpp>
 #include <migraphx/kernels/index.hpp>
 #include <migraphx/kernels/dfor.hpp>
 #include <migraphx/kernels/ops.hpp>
@@ -87,13 +89,14 @@ MIGRAPHX_DEVICE_CONSTEXPR typename Iterator::value_type bilinear_interpolate(
             xy[ii] = high[ii] = low[ii] = dims[ii] - 1;
         }
     }
-    array<index_int, 4> locs = {low[0] * dims[1] + low[1],
-                                low[0] * dims[1] + high[1],
-                                high[0] * dims[1] + low[1],
-                                high[0] * dims[1] + high[1]};
+    array<index_int, 4> locs = {low[1] * dims[0] + low[0],
+                                low[1] * dims[0] + high[0],
+                                high[1] * dims[0] + low[0],
+                                high[1] * dims[0] + high[0]};
+
+    float lx = xy[0] - low[0];
+    float ly = xy[1] - low[1];
 
-    float ly = xy[0] - low[0];
-    float lx = xy[1] - low[1];
     float hy = 1.0f - ly;
     float hx = 1.0f - lx;
     // do calculations in floating point and convert final result to required type
@@ -104,24 +107,23 @@ MIGRAPHX_DEVICE_CONSTEXPR typename Iterator::value_type bilinear_interpolate(
     return implicit_conversion(pooling(v01, v23));
 }
 
+// Calculate a single pooled output value
 template <class Iterator, class Op>
 MIGRAPHX_DEVICE_CONSTEXPR auto calc_pooling(const Iterator& data,
                                             const array<float, 2>& roi_starts,
                                             const array<float, 2>& bin_size,
                                             const array<int, 2>& idx,
                                             const array<index_int, 2>& bin_grid_size,
                                             const array<index_int, 2>& dims,
-                                            float roi_offset,
                                             Op op)
 {
+    // for one idx (output height and width coordinates) we iterate through all bin_grid values
     using in_dtype      = typename Iterator::value_type;
     in_dtype output_val = in_dtype{op.init()};
     const int64_t count = bin_grid_size[0] * bin_grid_size[1];
     dfor(bin_grid_size[0], bin_grid_size[1])([&](auto iy, auto ix) {
         array<index_int, 2> id = {iy, ix};
-        array<float, 2> locs =
-            roi_starts + idx * bin_size + bin_size * (id + 0.5f) / bin_grid_size + roi_offset;
-
+        array<float, 2> locs = roi_starts + idx * bin_size + bin_size * (id + 0.5f) / bin_grid_size;
         auto val   = bilinear_interpolate(data, dims, locs, op);
         output_val = op(output_val, val);
     });
@@ -155,7 +157,7 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, W& y_t,
     auto channel_num = x_lens[1];
     // input dims of height and width, in all 2-dim arrays, the first dim
     // is for height and second dim is for width
-    array<index_int, 2> in_dims = {x_lens[2], x_lens[3]};
+    array<index_int, 2> in_dims = {x_lens[3], x_lens[2]};
 
     const auto stride   = index.nglobal();
     auto out_s          = y_t.get_shape();
@@ -166,6 +168,17 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, W& y_t,
     const auto& out_lens         = out_s.lens;
     array<index_int, 2> out_dims = {out_lens[2], out_lens[3]};
 
+    // Compute lens and strides vectors for use in reindexing output.
+    // Todo: look for a less indirect way to reconcile the ordering of iteration
+    // between this op. and the reference.
+    array<size_t, 4> m_lens{out_lens[0], out_lens[1], out_lens[3], out_lens[2]};
+    array<size_t, 4> m_strides;
+    m_strides[3] = 1;
+    for(int k = 2; k >= 0; k--)
+    {
+        m_strides[k] = m_strides[k + 1] * m_lens[k + 1];
+    }
+
     for(index_int i = index.global; i < out_s.elements(); i += stride)
     {
         auto idx = out_s.multi(i);
@@ -177,12 +190,17 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, W& y_t,
         const auto offset_rois = rois + (n * roi_column_num);
         const int batch_ind    = ind[n];
 
+        // Note that roi_offset in src/targets/gpu/jit/roialign.cpp uses a negative value, so we add
+        // rather than subtract it here
         array<float, 2> roi_starts = {
-            static_cast<float>(offset_rois[1]) * static_cast<float>(s.spatial_scale),
-            static_cast<float>(offset_rois[0]) * static_cast<float>(s.spatial_scale)};
+            static_cast<float>(offset_rois[0]) * static_cast<float>(s.spatial_scale) + s.roi_offset,
+            static_cast<float>(offset_rois[1]) * static_cast<float>(s.spatial_scale) +
+                s.roi_offset};
+
         array<float, 2> roi_ends = {
-            static_cast<float>(offset_rois[3]) * static_cast<float>(s.spatial_scale),
-            static_cast<float>(offset_rois[2]) * static_cast<float>(s.spatial_scale)};
+            static_cast<float>(offset_rois[2]) * static_cast<float>(s.spatial_scale) + s.roi_offset,
+            static_cast<float>(offset_rois[3]) * static_cast<float>(s.spatial_scale) +
+                s.roi_offset};
 
         array<float, 2> roi_size{};
         array<float, 2> bin_size{};
@@ -191,36 +209,37 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, W& y_t,
         for(index_int ii = 0; ii < roi_size.size(); ++ii)
         {
             roi_size[ii] = roi_ends[ii] - roi_starts[ii];
-            roi_size[ii] = migraphx::max(roi_size[ii], 1.0f);
+            if(s.roi_offset == 0.f)
+                roi_size[ii] = migraphx::max(roi_size[ii], 1.0f);
 
             bin_size[ii]      = roi_size[ii] / out_dims[ii];
             bin_grid_size[ii] = (s.sampling_ratio > 0)
                                     ? s.sampling_ratio
                                     : migraphx::ceil(roi_size[ii] / out_dims[ii]);
         }
-
         const auto offset_x = x + ((batch_ind * channel_num + c) * in_dims[0] * in_dims[1]);
+
+        //
+        //  Reindexing.  Calculations to this point did not iterate in the same order as
+        // in the reference op; we now calculate the output index corresponding to i
+        //
+        size_t pp = i;
+        size_t jj = (pp / m_strides[0]) * m_strides[0];
+        pp        = pp % m_strides[0];
+        jj += (pp / m_strides[1]) * m_strides[1];
+        pp %= m_strides[1];
+        pp = pp / m_lens[2] + (pp % m_lens[2]) * m_strides[2];
+        jj += pp;
+
         if constexpr(s.is_avg_pooling)
         {
-            y_t[i] = calc_pooling(offset_x,
-                                  roi_starts,
-                                  bin_size,
-                                  {ph, pw},
-                                  bin_grid_size,
-                                  in_dims,
-                                  s.roi_offset,
-                                  avg_pool{});
+            y_t[jj] = calc_pooling(
+                offset_x, roi_starts, bin_size, {ph, pw}, bin_grid_size, in_dims, avg_pool{});
         }
         else
         {
-            y_t[i] = calc_pooling(offset_x,
-                                  roi_starts,
-                                  bin_size,
-                                  {ph, pw},
-                                  bin_grid_size,
-                                  in_dims,
-                                  s.roi_offset,
-                                  max_pool{});
+            y_t[jj] = calc_pooling(
+                offset_x, roi_starts, bin_size, {ph, pw}, bin_grid_size, in_dims, max_pool{});
         }
     }
 }