From 735e7a6f28cec763f446fee83f4f14671a38ec9e Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Wed, 17 Jul 2024 22:23:46 +0000
Subject: [PATCH 01/56] adjust stride ordering rules for standard shape: stride
 can be anything in a dimension of size 1

---
 src/shape.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/shape.cpp b/src/shape.cpp
index f9a42361465..9b48a631b73 100644
--- a/src/shape.cpp
+++ b/src/shape.cpp
@@ -63,8 +63,14 @@ struct shape_impl
     {
         assert(t != shape::tuple_type);
         assert(m_lens.size() == m_strides.size());
+
+        std::vector<size_t> filtered_strides;
+        for(size_t ind = 0; ind < m_strides.size(); ind++)
+            if(m_lens[ind] != 1)
+                filtered_strides.push_back(m_strides[ind]);
+
         m_standard = this->elements() == this->element_space() and not skips() and
-                     std::is_sorted(m_strides.rbegin(), m_strides.rend());
+                     std::is_sorted(filtered_strides.rbegin(), filtered_strides.rend());
     }
 
     shape_impl(shape::type_t t, std::vector<shape::dynamic_dimension> dims)

From 92bac55267b171ba35d12505ccf0944ecfaee5e4 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Wed, 17 Jul 2024 23:42:34 +0000
Subject: [PATCH 02/56] add a shape test

---
 src/shape.cpp       |  5 ++++-
 test/shape_test.cpp | 11 +++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/shape.cpp b/src/shape.cpp
index 9b48a631b73..59355d36874 100644
--- a/src/shape.cpp
+++ b/src/shape.cpp
@@ -65,10 +65,13 @@ struct shape_impl
         assert(m_lens.size() == m_strides.size());
 
         std::vector<size_t> filtered_strides;
+        std::vector<size_t> ffss;
         for(size_t ind = 0; ind < m_strides.size(); ind++)
             if(m_lens[ind] != 1)
                 filtered_strides.push_back(m_strides[ind]);
-
+auto asdf = std::is_sorted(filtered_strides.begin(), filtered_strides.end());
+auto asdf2 = std::is_sorted(filtered_strides.rbegin(), filtered_strides.rend());
+auto asdf3 = skips();
         m_standard = this->elements() == this->element_space() and not skips() and
                      std::is_sorted(filtered_strides.rbegin(), filtered_strides.rend());
     }
diff --git a/test/shape_test.cpp b/test/shape_test.cpp
index 22ac7f54c0d..e1b683def27 100644
--- a/test/shape_test.cpp
+++ b/test/shape_test.cpp
@@ -86,6 +86,17 @@ TEST_CASE(test_shape_standard_singleton_dim)
     EXPECT(not s.broadcasted());
 }
 
+TEST_CASE(test_shape_standard_stray_singleton_dim)
+{
+    // A shape can be transposed (nonzero strides out of order) but still be considered
+    // standard if the only out-of-order strides are on axes with a length of 1.
+    migraphx::shape s{migraphx::shape::float_type, {5, 1, 1, 8}, {8, 3, 4, 1}};
+    EXPECT(s.standard());
+    EXPECT(s.packed());
+    EXPECT(s.transposed());
+    EXPECT(not s.broadcasted());
+}
+
 TEST_CASE(test_shape_min_max_opt)
 {
     migraphx::shape s{migraphx::shape::float_type, {2, 2, 3}, {6, 3, 1}};

From 090c767a04d7e8c42a24da96995711a25f201de9 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Wed, 17 Jul 2024 23:54:16 +0000
Subject: [PATCH 03/56] debug code removed

---
 src/shape.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/shape.cpp b/src/shape.cpp
index 59355d36874..463d12d7d3c 100644
--- a/src/shape.cpp
+++ b/src/shape.cpp
@@ -69,9 +69,6 @@ struct shape_impl
         for(size_t ind = 0; ind < m_strides.size(); ind++)
             if(m_lens[ind] != 1)
                 filtered_strides.push_back(m_strides[ind]);
-auto asdf = std::is_sorted(filtered_strides.begin(), filtered_strides.end());
-auto asdf2 = std::is_sorted(filtered_strides.rbegin(), filtered_strides.rend());
-auto asdf3 = skips();
         m_standard = this->elements() == this->element_space() and not skips() and
                      std::is_sorted(filtered_strides.rbegin(), filtered_strides.rend());
     }

From 9e8528ff87f1c9664d16e0b811d607fb485600a6 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Thu, 18 Jul 2024 15:01:54 +0000
Subject: [PATCH 04/56] fix a test

---
 src/shape.cpp       | 2 +-
 test/shape_test.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/shape.cpp b/src/shape.cpp
index 463d12d7d3c..7e4e1ef4924 100644
--- a/src/shape.cpp
+++ b/src/shape.cpp
@@ -64,8 +64,8 @@ struct shape_impl
         assert(t != shape::tuple_type);
         assert(m_lens.size() == m_strides.size());
 
+        // Calculate standard shape flag for these lens/strides
         std::vector<size_t> filtered_strides;
-        std::vector<size_t> ffss;
         for(size_t ind = 0; ind < m_strides.size(); ind++)
             if(m_lens[ind] != 1)
                 filtered_strides.push_back(m_strides[ind]);
diff --git a/test/shape_test.cpp b/test/shape_test.cpp
index e1b683def27..56425c52c61 100644
--- a/test/shape_test.cpp
+++ b/test/shape_test.cpp
@@ -537,7 +537,7 @@ TEST_CASE(test_shape_broadcasted)
 TEST_CASE(test_shape_broadcasted2)
 {
     migraphx::shape s{migraphx::shape::float_type, {1, 2}, {0, 1}};
-    EXPECT(not s.standard());
+    EXPECT(s.standard());
     EXPECT(s.packed());
     EXPECT(not s.transposed());
     EXPECT(s.broadcasted());

From 22cc5ffcd6922374bb58f048fb33df0090414e3f Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Tue, 30 Jul 2024 21:48:31 +0000
Subject: [PATCH 05/56] added shape::compatible_lens() method

---
 src/include/migraphx/shape.hpp |  5 +++++
 src/program.cpp                |  2 +-
 src/shape.cpp                  | 20 ++++++++++++++++++++
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/src/include/migraphx/shape.hpp b/src/include/migraphx/shape.hpp
index 0c1e7b269d4..822d0f33d94 100644
--- a/src/include/migraphx/shape.hpp
+++ b/src/include/migraphx/shape.hpp
@@ -296,6 +296,11 @@ struct MIGRAPHX_EXPORT shape
     /// not transposed.
     bool standard() const;
 
+    /// Returns true if the shapes are compatible.  TODO: better description
+    // Paul, How would you describe the purpose of that equality check for the shapes?  I'm trying to come up with a 
+    // function description that explains why it's ok for the strides not to match sometimes.
+    bool compatible_lens(const shape& s2) const;
+
     /// Returns true if all strides are equal to 0 (scalar tensor)
     bool scalar() const;
 
diff --git a/src/program.cpp b/src/program.cpp
index 2e34cd51505..a00f600d5dc 100644
--- a/src/program.cpp
+++ b/src/program.cpp
@@ -508,7 +508,7 @@ std::vector<argument> generic_eval(const module* mod,
         }
         assert(results.find(ins) != results.end());
         assert(ins->get_shape().any_of_dynamic() or
-               results.at(ins).get_shape() == ins->get_shape());
+               results.at(ins).get_shape().compatible_lens(ins->get_shape()));
     }
     return {results.at(std::prev(mod->end()))};
 }
diff --git a/src/shape.cpp b/src/shape.cpp
index 7e4e1ef4924..5cf19831d10 100644
--- a/src/shape.cpp
+++ b/src/shape.cpp
@@ -490,6 +490,26 @@ bool shape::scalar() const
 
 bool shape::standard() const { return impl->m_standard; }
 
+
+bool shape::compatible_lens(const shape& s2) const
+{
+    if(dynamic() or s2.dynamic()) return true;
+    if(lens() != s2.lens() or type() != s2.type()) return false;
+    
+    // Lens must be the same; strides must be same except that
+    // axes with len=1 don't matter
+    for(size_t ind = 0; ind < lens().size(); ind++)
+    {
+        size_t l_ind(lens()[ind]);
+        if(l_ind != s2.lens()[ind] or
+          (l_ind != 1 and strides()[ind] != s2.strides()[ind]))
+            return false;
+    }
+    return true;
+    // TODO:  Do these checks matter here?
+    // m_standard = this->elements() == this->element_space() and not skips() and
+}
+
 shape shape::normalize_standard() const
 {
     if(this->standard())

From c5508e6abbb2769a6657877c7829714c2e9af088 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Tue, 30 Jul 2024 22:03:46 +0000
Subject: [PATCH 06/56] format

---
 src/include/migraphx/shape.hpp |  5 +++--
 src/shape.cpp                  | 12 ++++++------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/include/migraphx/shape.hpp b/src/include/migraphx/shape.hpp
index 822d0f33d94..4d0bc406d43 100644
--- a/src/include/migraphx/shape.hpp
+++ b/src/include/migraphx/shape.hpp
@@ -297,8 +297,9 @@ struct MIGRAPHX_EXPORT shape
     bool standard() const;
 
     /// Returns true if the shapes are compatible.  TODO: better description
-    // Paul, How would you describe the purpose of that equality check for the shapes?  I'm trying to come up with a 
-    // function description that explains why it's ok for the strides not to match sometimes.
+    // Paul, How would you describe the purpose of that equality check for the shapes?  I'm trying
+    // to come up with a function description that explains why it's ok for the strides not to match
+    // sometimes.
     bool compatible_lens(const shape& s2) const;
 
     /// Returns true if all strides are equal to 0 (scalar tensor)
diff --git a/src/shape.cpp b/src/shape.cpp
index 5cf19831d10..7f0f4aa9385 100644
--- a/src/shape.cpp
+++ b/src/shape.cpp
@@ -490,19 +490,19 @@ bool shape::scalar() const
 
 bool shape::standard() const { return impl->m_standard; }
 
-
 bool shape::compatible_lens(const shape& s2) const
 {
-    if(dynamic() or s2.dynamic()) return true;
-    if(lens() != s2.lens() or type() != s2.type()) return false;
-    
+    if(dynamic() or s2.dynamic())
+        return true;
+    if(lens() != s2.lens() or type() != s2.type())
+        return false;
+
     // Lens must be the same; strides must be same except that
     // axes with len=1 don't matter
     for(size_t ind = 0; ind < lens().size(); ind++)
     {
         size_t l_ind(lens()[ind]);
-        if(l_ind != s2.lens()[ind] or
-          (l_ind != 1 and strides()[ind] != s2.strides()[ind]))
+        if(l_ind != s2.lens()[ind] or (l_ind != 1 and strides()[ind] != s2.strides()[ind]))
             return false;
     }
     return true;

From 21dddd0e47363eccb2807eee78ca5729d263630d Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Wed, 31 Jul 2024 18:24:54 +0000
Subject: [PATCH 07/56] refactor the function for testing compatible shapes to
 a non-member static.

---
 src/include/migraphx/shape.hpp |  6 ------
 src/program.cpp                | 23 ++++++++++++++++++++++-
 src/shape.cpp                  | 20 --------------------
 3 files changed, 22 insertions(+), 27 deletions(-)

diff --git a/src/include/migraphx/shape.hpp b/src/include/migraphx/shape.hpp
index 4d0bc406d43..0c1e7b269d4 100644
--- a/src/include/migraphx/shape.hpp
+++ b/src/include/migraphx/shape.hpp
@@ -296,12 +296,6 @@ struct MIGRAPHX_EXPORT shape
     /// not transposed.
     bool standard() const;
 
-    /// Returns true if the shapes are compatible.  TODO: better description
-    // Paul, How would you describe the purpose of that equality check for the shapes?  I'm trying
-    // to come up with a function description that explains why it's ok for the strides not to match
-    // sometimes.
-    bool compatible_lens(const shape& s2) const;
-
     /// Returns true if all strides are equal to 0 (scalar tensor)
     bool scalar() const;
 
diff --git a/src/program.cpp b/src/program.cpp
index a00f600d5dc..ced6e43df68 100644
--- a/src/program.cpp
+++ b/src/program.cpp
@@ -426,6 +426,25 @@ void preview_argument(std::ostream& os, const argument& a)
         });
 }
 
+static bool is_compatible_shape(const shape& actual, const shape& expected)
+{
+    // Check subshapes
+    if(expected.type() == shape::tuple_type)
+        return equal(actual.sub_shapes(), expected.sub_shapes(), &is_compatible_shape);
+    // Only the expected can be dynamic
+    if(expected.dynamic())
+        return true;
+    if(actual == expected)
+        return true;
+    if(actual.type() != expected.type())
+        return false;
+    // If both shapes are standard and lens match, they are considered compatible
+    // even if strides are different.
+    if(actual.standard() and expected.standard())
+        return actual.lens() == expected.lens();
+    return false;
+}
+
 template <class F>
 std::vector<argument> generic_eval(const module* mod,
                                    std::vector<context>& ctx,
@@ -507,8 +526,10 @@ std::vector<argument> generic_eval(const module* mod,
                 }));
         }
         assert(results.find(ins) != results.end());
+        // TODO: what order do the arguments to is_compatible_shape() come in?  One
+        // can be dynamic.
         assert(ins->get_shape().any_of_dynamic() or
-               results.at(ins).get_shape().compatible_lens(ins->get_shape()));
+               is_compatible_shape(ins->get_shape(), results.at(ins).get_shape()));
     }
     return {results.at(std::prev(mod->end()))};
 }
diff --git a/src/shape.cpp b/src/shape.cpp
index 7f0f4aa9385..7e4e1ef4924 100644
--- a/src/shape.cpp
+++ b/src/shape.cpp
@@ -490,26 +490,6 @@ bool shape::scalar() const
 
 bool shape::standard() const { return impl->m_standard; }
 
-bool shape::compatible_lens(const shape& s2) const
-{
-    if(dynamic() or s2.dynamic())
-        return true;
-    if(lens() != s2.lens() or type() != s2.type())
-        return false;
-
-    // Lens must be the same; strides must be same except that
-    // axes with len=1 don't matter
-    for(size_t ind = 0; ind < lens().size(); ind++)
-    {
-        size_t l_ind(lens()[ind]);
-        if(l_ind != s2.lens()[ind] or (l_ind != 1 and strides()[ind] != s2.strides()[ind]))
-            return false;
-    }
-    return true;
-    // TODO:  Do these checks matter here?
-    // m_standard = this->elements() == this->element_space() and not skips() and
-}
-
 shape shape::normalize_standard() const
 {
     if(this->standard())

From 31addfc49b141d0569f4772506449961e6be6f2a Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Wed, 31 Jul 2024 18:44:18 +0000
Subject: [PATCH 08/56] changing recursive equal call

---
 src/program.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/program.cpp b/src/program.cpp
index ced6e43df68..6392f179280 100644
--- a/src/program.cpp
+++ b/src/program.cpp
@@ -430,7 +430,7 @@ static bool is_compatible_shape(const shape& actual, const shape& expected)
 {
     // Check subshapes
     if(expected.type() == shape::tuple_type)
-        return equal(actual.sub_shapes(), expected.sub_shapes(), &is_compatible_shape);
+        return equal(actual.sub_shapes().begin(), actual.sub_shapes().end(), expected.sub_shapes().begin(), &is_compatible_shape);
     // Only the expected can be dynamic
     if(expected.dynamic())
         return true;

From 02633f230d848d7bdbdb2d29c2501897a6c42c45 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Wed, 31 Jul 2024 20:43:14 +0000
Subject: [PATCH 09/56] conditional compilation for is_compatible_shape()

---
 src/program.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/program.cpp b/src/program.cpp
index 6392f179280..7ea5f884e9c 100644
--- a/src/program.cpp
+++ b/src/program.cpp
@@ -426,11 +426,16 @@ void preview_argument(std::ostream& os, const argument& a)
         });
 }
 
+// This function currently used only in an Assertion.
+#ifndef NDEBUG
 static bool is_compatible_shape(const shape& actual, const shape& expected)
 {
     // Check subshapes
     if(expected.type() == shape::tuple_type)
-        return equal(actual.sub_shapes().begin(), actual.sub_shapes().end(), expected.sub_shapes().begin(), &is_compatible_shape);
+        return equal(actual.sub_shapes().begin(),
+                     actual.sub_shapes().end(),
+                     expected.sub_shapes().begin(),
+                     &is_compatible_shape);
     // Only the expected can be dynamic
     if(expected.dynamic())
         return true;
@@ -444,6 +449,7 @@ static bool is_compatible_shape(const shape& actual, const shape& expected)
         return actual.lens() == expected.lens();
     return false;
 }
+#endif
 
 template <class F>
 std::vector<argument> generic_eval(const module* mod,

From 68467b633d36adc98a98cd31486f2d9ddcc5a4b8 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Wed, 31 Jul 2024 22:18:59 +0000
Subject: [PATCH 10/56] different workaround for compile problem

---
 src/program.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/program.cpp b/src/program.cpp
index 7ea5f884e9c..0c00c8526e5 100644
--- a/src/program.cpp
+++ b/src/program.cpp
@@ -427,7 +427,6 @@ void preview_argument(std::ostream& os, const argument& a)
 }
 
 // This function currently used only in an Assertion.
-#ifndef NDEBUG
 static bool is_compatible_shape(const shape& actual, const shape& expected)
 {
     // Check subshapes
@@ -449,7 +448,6 @@ static bool is_compatible_shape(const shape& actual, const shape& expected)
         return actual.lens() == expected.lens();
     return false;
 }
-#endif
 
 template <class F>
 std::vector<argument> generic_eval(const module* mod,
@@ -534,6 +532,7 @@ std::vector<argument> generic_eval(const module* mod,
         assert(results.find(ins) != results.end());
         // TODO: what order do the arguments to is_compatible_shape() come in?  One
         // can be dynamic.
+        (void)(is_compatible_shape(shape{}, shape{}));
         assert(ins->get_shape().any_of_dynamic() or
                is_compatible_shape(ins->get_shape(), results.at(ins).get_shape()));
     }

From be7a72e67e12d338963ff3cb797ae5e3a8f0e8e7 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Thu, 1 Aug 2024 14:54:08 +0000
Subject: [PATCH 11/56] misc small fixes

---
 src/program.cpp | 10 +++++-----
 src/shape.cpp   |  3 ++-
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/program.cpp b/src/program.cpp
index 0c00c8526e5..25cb16cc950 100644
--- a/src/program.cpp
+++ b/src/program.cpp
@@ -427,6 +427,9 @@ void preview_argument(std::ostream& os, const argument& a)
 }
 
 // This function currently used only in an Assertion.
+// "Almost identical" shapes.  To support an MLIR feature, there is a limited
+// case where shapes may both be standard but have non-identical strides.
+#ifndef NDEBUG
 static bool is_compatible_shape(const shape& actual, const shape& expected)
 {
     // Check subshapes
@@ -448,6 +451,7 @@ static bool is_compatible_shape(const shape& actual, const shape& expected)
         return actual.lens() == expected.lens();
     return false;
 }
+#endif
 
 template <class F>
 std::vector<argument> generic_eval(const module* mod,
@@ -530,11 +534,7 @@ std::vector<argument> generic_eval(const module* mod,
                 }));
         }
         assert(results.find(ins) != results.end());
-        // TODO: what order do the arguments to is_compatible_shape() come in?  One
-        // can be dynamic.
-        (void)(is_compatible_shape(shape{}, shape{}));
-        assert(ins->get_shape().any_of_dynamic() or
-               is_compatible_shape(ins->get_shape(), results.at(ins).get_shape()));
+        assert(is_compatible_shape(results.at(ins).get_shape(), ins->get_shape()));
     }
     return {results.at(std::prev(mod->end()))};
 }
diff --git a/src/shape.cpp b/src/shape.cpp
index 7e4e1ef4924..cfa3a1c2b43 100644
--- a/src/shape.cpp
+++ b/src/shape.cpp
@@ -64,7 +64,8 @@ struct shape_impl
         assert(t != shape::tuple_type);
         assert(m_lens.size() == m_strides.size());
 
-        // Calculate standard shape flag for these lens/strides
+        // Calculate standard shape flag for these lens/strides.  Strides on size-1
+        // axes are ignored to support an MLIR rule.
         std::vector<size_t> filtered_strides;
         for(size_t ind = 0; ind < m_strides.size(); ind++)
             if(m_lens[ind] != 1)

From e0f169546db636c379baed121c1191baae8f37f8 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Wed, 7 Aug 2024 21:06:24 +0000
Subject: [PATCH 12/56] changes to compatible check, want to see if this passes
 jenkins

---
 src/include/migraphx/check_shapes.hpp | 27 +++++++++++++++++++++++++-
 src/include/migraphx/program.hpp      |  1 -
 src/include/migraphx/shape.hpp        | 27 ++++++++++++++++++++++++++
 src/program.cpp                       | 28 +--------------------------
 4 files changed, 54 insertions(+), 29 deletions(-)

diff --git a/src/include/migraphx/check_shapes.hpp b/src/include/migraphx/check_shapes.hpp
index 05118082ee8..a19dbe2e000 100644
--- a/src/include/migraphx/check_shapes.hpp
+++ b/src/include/migraphx/check_shapes.hpp
@@ -28,6 +28,7 @@
 #include <migraphx/shape.hpp>
 #include <migraphx/ranges.hpp>
 #include <migraphx/stringutils.hpp>
+#include <migraphx/program.hpp>
 #include <migraphx/config.hpp>
 #include <algorithm>
 
@@ -224,6 +225,16 @@ struct check_shapes
         return *this;
     }
 
+    /*!
+     * Check all shapes are compatible.
+     */
+    // const check_shapes& same_compatible() const
+    // {
+    //     if(not this->same([](const shape& s) { return is_compatible(*this, s); }))
+    //         MIGRAPHX_THROW(prefix() + "Shapes don't match");
+    //     return *this;
+    // }
+
     /*!
      * Check all shapes have the same number of dimensions.
      */
@@ -239,8 +250,13 @@ struct check_shapes
      */
     const check_shapes& same_layout() const
     {
-        if(not this->same([](const shape& s) { return find_permutation(s); }))
+
+       if(not same_compatible())
             MIGRAPHX_THROW(prefix() + "Layouts do not match");
+
+
+        // if(not this->same_compatible([](const shape& s) { return find_permutation(s); }))
+        //     MIGRAPHX_THROW(prefix() + "Layouts do not match");
         return *this;
     }
 
@@ -368,6 +384,15 @@ struct check_shapes
         return this->all_of([&](const shape& s) { return f(s) == key; });
     }
 
+
+    bool same_compatible() const
+    {
+        if(begin == end)
+            return true;
+        return this->all_of([&](const shape& s) { return migraphx::is_compatible_shape(s, *begin)
+        or   find_permutation(s) == find_permutation(*begin) ; });
+    }
+
     template <class Predicate>
     bool all_of(Predicate p) const
     {
diff --git a/src/include/migraphx/program.hpp b/src/include/migraphx/program.hpp
index e86ba628656..6e9bb41f4af 100644
--- a/src/include/migraphx/program.hpp
+++ b/src/include/migraphx/program.hpp
@@ -161,7 +161,6 @@ struct MIGRAPHX_EXPORT program
     void assign(const program& p);
     std::unique_ptr<program_impl> impl;
 };
-
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
 
diff --git a/src/include/migraphx/shape.hpp b/src/include/migraphx/shape.hpp
index 0c1e7b269d4..890d7721eb4 100644
--- a/src/include/migraphx/shape.hpp
+++ b/src/include/migraphx/shape.hpp
@@ -431,6 +431,33 @@ struct MIGRAPHX_EXPORT shape
     std::shared_ptr<const shape_impl> impl;
 };
 
+
+// "Almost identical" shapes.  To support an MLIR feature, there is a limited
+// case where shapes may both be standard but have non-identical strides.
+// #ifndef NDEBUG
+static bool inline is_compatible_shape(const shape& actual, const shape& expected)
+{
+    // Check subshapes
+    if(expected.type() == shape::tuple_type)
+        return equal(actual.sub_shapes().begin(),
+                     actual.sub_shapes().end(),
+                     expected.sub_shapes().begin(),
+                     &is_compatible_shape);
+    // Only the expected can be dynamic
+    if(expected.dynamic())
+        return true;
+    if(actual == expected)
+        return true;
+    if(actual.type() != expected.type())
+        return false;
+    // If both shapes are standard and lens match, they are considered compatible
+    // even if strides are different.
+    if(actual.standard() and expected.standard())
+        return actual.lens() == expected.lens();
+    return false;
+}
+// #endif
+
 /// Flatten subshapes to a single vector of non-tuple type of shapes
 MIGRAPHX_EXPORT std::vector<shape> flatten(const std::vector<shape>& shapes);
 
diff --git a/src/program.cpp b/src/program.cpp
index 25cb16cc950..7de4fa65054 100644
--- a/src/program.cpp
+++ b/src/program.cpp
@@ -35,6 +35,7 @@
 #include <migraphx/register_target.hpp>
 #include <migraphx/iterator_for.hpp>
 #include <migraphx/iterator.hpp>
+#include <migraphx/shape.hpp>
 #include <migraphx/algorithm.hpp>
 #include <migraphx/output_iterator.hpp>
 #include <migraphx/make_op.hpp>
@@ -426,33 +427,6 @@ void preview_argument(std::ostream& os, const argument& a)
         });
 }
 
-// This function currently used only in an Assertion.
-// "Almost identical" shapes.  To support an MLIR feature, there is a limited
-// case where shapes may both be standard but have non-identical strides.
-#ifndef NDEBUG
-static bool is_compatible_shape(const shape& actual, const shape& expected)
-{
-    // Check subshapes
-    if(expected.type() == shape::tuple_type)
-        return equal(actual.sub_shapes().begin(),
-                     actual.sub_shapes().end(),
-                     expected.sub_shapes().begin(),
-                     &is_compatible_shape);
-    // Only the expected can be dynamic
-    if(expected.dynamic())
-        return true;
-    if(actual == expected)
-        return true;
-    if(actual.type() != expected.type())
-        return false;
-    // If both shapes are standard and lens match, they are considered compatible
-    // even if strides are different.
-    if(actual.standard() and expected.standard())
-        return actual.lens() == expected.lens();
-    return false;
-}
-#endif
-
 template <class F>
 std::vector<argument> generic_eval(const module* mod,
                                    std::vector<context>& ctx,

From 95d7a2fe29a62ca09103c764c5438aaecf81855b Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Thu, 8 Aug 2024 16:02:23 +0000
Subject: [PATCH 13/56] style

---
 src/include/migraphx/check_shapes.hpp | 9 +++++----
 src/onnx/onnx_parser.cpp              | 2 +-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/include/migraphx/check_shapes.hpp b/src/include/migraphx/check_shapes.hpp
index a19dbe2e000..dbee0d2242c 100644
--- a/src/include/migraphx/check_shapes.hpp
+++ b/src/include/migraphx/check_shapes.hpp
@@ -251,10 +251,9 @@ struct check_shapes
     const check_shapes& same_layout() const
     {
 
-       if(not same_compatible())
+        if(not same_compatible())
             MIGRAPHX_THROW(prefix() + "Layouts do not match");
 
-
         // if(not this->same_compatible([](const shape& s) { return find_permutation(s); }))
         //     MIGRAPHX_THROW(prefix() + "Layouts do not match");
         return *this;
@@ -389,8 +388,10 @@ struct check_shapes
     {
         if(begin == end)
             return true;
-        return this->all_of([&](const shape& s) { return migraphx::is_compatible_shape(s, *begin)
-        or   find_permutation(s) == find_permutation(*begin) ; });
+        return this->all_of([&](const shape& s) {
+            return migraphx::is_compatible_shape(s, *begin) or
+                   find_permutation(s) == find_permutation(*begin);
+        });
     }
 
     template <class Predicate>
diff --git a/src/onnx/onnx_parser.cpp b/src/onnx/onnx_parser.cpp
index 07d7f6a52d7..0e58aabf976 100644
--- a/src/onnx/onnx_parser.cpp
+++ b/src/onnx/onnx_parser.cpp
@@ -300,7 +300,7 @@ int64_t onnx_parser::get_opset_version(const onnx::ModelProto& model)
     return version;
 }
 
-void print_added_instructions(module* mod,
+void print_added_instructions(const module* mod,
                               const std::vector<instruction_ref>& args,
                               const std::vector<instruction_ref>& result)
 {

From a3f40ddba1a62d4f30f4c149c66e2141b7aa7319 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Thu, 8 Aug 2024 17:26:21 +0000
Subject: [PATCH 14/56] cleanup method names

---
 src/include/migraphx/check_shapes.hpp         | 34 ++++---------------
 src/include/migraphx/shape.hpp                |  1 -
 .../gpu/include/migraphx/gpu/convolution.hpp  |  2 +-
 test/check_shapes_test.cpp                    |  4 +--
 4 files changed, 9 insertions(+), 32 deletions(-)

diff --git a/src/include/migraphx/check_shapes.hpp b/src/include/migraphx/check_shapes.hpp
index dbee0d2242c..afc77c0c55e 100644
--- a/src/include/migraphx/check_shapes.hpp
+++ b/src/include/migraphx/check_shapes.hpp
@@ -225,16 +225,6 @@ struct check_shapes
         return *this;
     }
 
-    /*!
-     * Check all shapes are compatible.
-     */
-    // const check_shapes& same_compatible() const
-    // {
-    //     if(not this->same([](const shape& s) { return is_compatible(*this, s); }))
-    //         MIGRAPHX_THROW(prefix() + "Shapes don't match");
-    //     return *this;
-    // }
-
     /*!
      * Check all shapes have the same number of dimensions.
      */
@@ -246,16 +236,15 @@ struct check_shapes
     }
 
     /*!
-     * Check all shapes have the same layout.
+     * Check all shapes have the same layout, with minor differences allowed.
      */
-    const check_shapes& same_layout() const
+    const check_shapes& compatible_layout() const
     {
-
-        if(not same_compatible())
+        if(begin != end and this->any_of([&](const shape& s) {
+            return not migraphx::is_compatible_shape(s, *begin) and
+                   find_permutation(s) != find_permutation(*begin);
+        }))
             MIGRAPHX_THROW(prefix() + "Layouts do not match");
-
-        // if(not this->same_compatible([](const shape& s) { return find_permutation(s); }))
-        //     MIGRAPHX_THROW(prefix() + "Layouts do not match");
         return *this;
     }
 
@@ -383,17 +372,6 @@ struct check_shapes
         return this->all_of([&](const shape& s) { return f(s) == key; });
     }
 
-
-    bool same_compatible() const
-    {
-        if(begin == end)
-            return true;
-        return this->all_of([&](const shape& s) {
-            return migraphx::is_compatible_shape(s, *begin) or
-                   find_permutation(s) == find_permutation(*begin);
-        });
-    }
-
     template <class Predicate>
     bool all_of(Predicate p) const
     {
diff --git a/src/include/migraphx/shape.hpp b/src/include/migraphx/shape.hpp
index 890d7721eb4..6a2492792f6 100644
--- a/src/include/migraphx/shape.hpp
+++ b/src/include/migraphx/shape.hpp
@@ -431,7 +431,6 @@ struct MIGRAPHX_EXPORT shape
     std::shared_ptr<const shape_impl> impl;
 };
 
-
 // "Almost identical" shapes.  To support an MLIR feature, there is a limited
 // case where shapes may both be standard but have non-identical strides.
 // #ifndef NDEBUG
diff --git a/src/targets/gpu/include/migraphx/gpu/convolution.hpp b/src/targets/gpu/include/migraphx/gpu/convolution.hpp
index 1b1c3169830..0738324af4a 100644
--- a/src/targets/gpu/include/migraphx/gpu/convolution.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/convolution.hpp
@@ -85,7 +85,7 @@ struct miopen_convolution
         check_shapes{conv_inputs, *this}
             .max_ndims(5)
             .packed_layouts({{0, 1, 2}, {0, 1, 2, 3}, {0, 2, 3, 1}, {0, 1, 2, 3, 4}})
-            .same_layout();
+            .compatible_layout();
         return migraphx::compute_shape<Op>(op, conv_inputs);
     }
 
diff --git a/test/check_shapes_test.cpp b/test/check_shapes_test.cpp
index 42b514d02f8..58241576648 100644
--- a/test/check_shapes_test.cpp
+++ b/test/check_shapes_test.cpp
@@ -53,7 +53,7 @@ TEST_CASE(same_layout_fail)
     EXPECT(test::throws([] {
         shape a{shape::float_type, {2, 3}};
         shape b{shape::float_type, {2, 3}, {1, 2}};
-        migraphx::check_shapes{{a, b}, ""}.same_layout();
+        migraphx::check_shapes{{a, b}, ""}.compatible_layout();
     }));
 }
 
@@ -62,7 +62,7 @@ TEST_CASE(same_layout_pass)
     EXPECT(not test::throws([] {
         shape a{shape::float_type, {2, 3}, {1, 2}};
         shape b{shape::float_type, {2, 3}, {1, 2}};
-        migraphx::check_shapes{{a, b}, ""}.same_layout();
+        migraphx::check_shapes{{a, b}, ""}.compatible_layout();
     }));
 }
 

From 4fffb177e1e58fae3e18782374d587ec652fa752 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Thu, 8 Aug 2024 17:26:59 +0000
Subject: [PATCH 15/56] format

---
 src/include/migraphx/check_shapes.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/include/migraphx/check_shapes.hpp b/src/include/migraphx/check_shapes.hpp
index afc77c0c55e..0d59588509f 100644
--- a/src/include/migraphx/check_shapes.hpp
+++ b/src/include/migraphx/check_shapes.hpp
@@ -241,9 +241,9 @@ struct check_shapes
     const check_shapes& compatible_layout() const
     {
         if(begin != end and this->any_of([&](const shape& s) {
-            return not migraphx::is_compatible_shape(s, *begin) and
-                   find_permutation(s) != find_permutation(*begin);
-        }))
+               return not migraphx::is_compatible_shape(s, *begin) and
+                      find_permutation(s) != find_permutation(*begin);
+           }))
             MIGRAPHX_THROW(prefix() + "Layouts do not match");
         return *this;
     }

From 20fb5bc849e931fb9655cc90edbe1cfa3e64ca98 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Thu, 8 Aug 2024 18:22:19 +0000
Subject: [PATCH 16/56] style

---
 src/include/migraphx/check_shapes.hpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/include/migraphx/check_shapes.hpp b/src/include/migraphx/check_shapes.hpp
index 0d59588509f..073b2fe31f0 100644
--- a/src/include/migraphx/check_shapes.hpp
+++ b/src/include/migraphx/check_shapes.hpp
@@ -28,7 +28,6 @@
 #include <migraphx/shape.hpp>
 #include <migraphx/ranges.hpp>
 #include <migraphx/stringutils.hpp>
-#include <migraphx/program.hpp>
 #include <migraphx/config.hpp>
 #include <algorithm>
 

From 23712315e4731e67d859827953988100b2a53567 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Thu, 8 Aug 2024 18:29:03 +0000
Subject: [PATCH 17/56] comment

---
 src/include/migraphx/shape.hpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/include/migraphx/shape.hpp b/src/include/migraphx/shape.hpp
index 6a2492792f6..e7ff55dfcf1 100644
--- a/src/include/migraphx/shape.hpp
+++ b/src/include/migraphx/shape.hpp
@@ -433,7 +433,6 @@ struct MIGRAPHX_EXPORT shape
 
 // "Almost identical" shapes.  To support an MLIR feature, there is a limited
 // case where shapes may both be standard but have non-identical strides.
-// #ifndef NDEBUG
 static bool inline is_compatible_shape(const shape& actual, const shape& expected)
 {
     // Check subshapes
@@ -455,7 +454,6 @@ static bool inline is_compatible_shape(const shape& actual, const shape& expecte
         return actual.lens() == expected.lens();
     return false;
 }
-// #endif
 
 /// Flatten subshapes to a single vector of non-tuple type of shapes
 MIGRAPHX_EXPORT std::vector<shape> flatten(const std::vector<shape>& shapes);

From 0c6bef7d29abd0c767e9c1b4b399ac9896ccbcc9 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Fri, 9 Aug 2024 16:31:06 +0000
Subject: [PATCH 18/56] add test subcases for new function

---
 src/onnx/onnx_parser.cpp |  2 +-
 test/shape_test.cpp      | 13 ++++++++-----
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/onnx/onnx_parser.cpp b/src/onnx/onnx_parser.cpp
index 0e58aabf976..07d7f6a52d7 100644
--- a/src/onnx/onnx_parser.cpp
+++ b/src/onnx/onnx_parser.cpp
@@ -300,7 +300,7 @@ int64_t onnx_parser::get_opset_version(const onnx::ModelProto& model)
     return version;
 }
 
-void print_added_instructions(const module* mod,
+void print_added_instructions(module* mod,
                               const std::vector<instruction_ref>& args,
                               const std::vector<instruction_ref>& result)
 {
diff --git a/test/shape_test.cpp b/test/shape_test.cpp
index 56425c52c61..f8f9f34074e 100644
--- a/test/shape_test.cpp
+++ b/test/shape_test.cpp
@@ -90,11 +90,13 @@ TEST_CASE(test_shape_standard_stray_singleton_dim)
 {
     // A shape can be transposed (nonzero strides out of order) but still be considered
     // standard if the only out-of-order strides are on axes with a length of 1.
-    migraphx::shape s{migraphx::shape::float_type, {5, 1, 1, 8}, {8, 3, 4, 1}};
-    EXPECT(s.standard());
-    EXPECT(s.packed());
-    EXPECT(s.transposed());
-    EXPECT(not s.broadcasted());
+    migraphx::shape s1{migraphx::shape::float_type, {5, 1, 1, 8}, {8, 3, 4, 1}};
+    migraphx::shape s2{migraphx::shape::float_type, {5, 1, 1, 8}, {8, 3, 5, 1}};
+    EXPECT(s1.standard());
+    EXPECT(s1.packed());
+    EXPECT(s1.transposed());
+    EXPECT(not s1.broadcasted());
+    EXPECT(is_compatible_shape(s1, s2));
 }
 
 TEST_CASE(test_shape_min_max_opt)
@@ -826,6 +828,7 @@ TEST_CASE(tuple_copy)
     EXPECT(s3 == s2);
     migraphx::shape s4{{migraphx::shape{migraphx::shape::int8_type},
                         migraphx::shape{migraphx::shape::float_type}}};
+    EXPECT(!is_compatible_shape(s1, s4));
     EXPECT(s4 != s1);
     EXPECT(s4 != s2);
     EXPECT(s4 != s3);

From ef1d2f6ff2726af0bc2cc833362b5e86c6a2dff6 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Fri, 9 Aug 2024 20:48:30 +0000
Subject: [PATCH 19/56] style

---
 test/shape_test.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/shape_test.cpp b/test/shape_test.cpp
index f8f9f34074e..005968c4996 100644
--- a/test/shape_test.cpp
+++ b/test/shape_test.cpp
@@ -828,7 +828,7 @@ TEST_CASE(tuple_copy)
     EXPECT(s3 == s2);
     migraphx::shape s4{{migraphx::shape{migraphx::shape::int8_type},
                         migraphx::shape{migraphx::shape::float_type}}};
-    EXPECT(!is_compatible_shape(s1, s4));
+    EXPECT(not is_compatible_shape(s1, s4));
     EXPECT(s4 != s1);
     EXPECT(s4 != s2);
     EXPECT(s4 != s3);

From 94392aac0218e95dbb7bee685c2409e0fb4f5c64 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Tue, 17 Sep 2024 22:44:38 +0000
Subject: [PATCH 20/56] bug fix work in progress.  Contains fixed source code. 
 Contains debug code.  Tests need to be completed, including updating
 generated onnx test files.

---
 src/include/migraphx/op/roialign.hpp  |  91 ++++++++++++++++++++++----
 test/onnx/gen_onnx.py                 |  15 +++--
 test/onnx/roialign_default_test.onnx  |   5 +-
 test/onnx/roialign_test.onnx          | Bin 345 -> 338 bytes
 test/onnx/verify/celu_verify_test.cpp |  20 +++++-
 5 files changed, 108 insertions(+), 23 deletions(-)

diff --git a/src/include/migraphx/op/roialign.hpp b/src/include/migraphx/op/roialign.hpp
index d66e8f0feeb..63a398fe5b0 100644
--- a/src/include/migraphx/op/roialign.hpp
+++ b/src/include/migraphx/op/roialign.hpp
@@ -114,46 +114,94 @@ struct roialign
     {
         std::vector<pos_weight> results(bin_grid_size[0] * bin_grid_size[1] * output_height *
                                         output_width);
+
         shape_for_each(comp_s, [&](const auto& idx_v, size_t index) {
-            std::array<std::size_t, 2> p = {idx_v[0], idx_v[1]};
-            std::array<std::size_t, 2> i = {idx_v[2], idx_v[3]};
+for(auto aa : comp_s.multi(index)) printf(", %lu ", aa);
+printf("index\n");
+
+            // The p and i indexes are looping parameters in ORT and go in y, x order.  The i[x] value is least significant
+            // and iterates the fastest.
+            std::array<std::size_t, 2> p = {idx_v[1], idx_v[0]};
+            std::array<std::size_t, 2> i = {idx_v[3], idx_v[2]};
+printf(" IIIII other index %lu , %lu , %lu , %lu\n", p[0], p[1], i[0], i[1]);
 
+            // xy is scaled coordinates of start point of ROI
             std::array<float, 2> xy{};
+            // low, high are floor and ceiling of the xy value (i.e. the bounds of the pixel it lies inside)
             std::array<int64_t, 2> low{};
             std::array<int64_t, 2> high{};
+    // std::cout << " GGGGG inputs to xy calculation: roi_start=" << roi_start[0] << ", " << roi_start[1] << ",  p=[0,1]: " << p[0] << ", " << p[1] << ", bin_size="
+    //                  << bin_size[0] << ", "  << bin_size[1] << " rounding factor=" << (i[0] + .5f) << ", " << (i[1] + .5f) << "   bin_grid_size=" << bin_grid_size[0] <<", " << bin_grid_size[1] <<"\n";
             for(auto ii : range(p.size()))
             {
+    // if(ii == 0)
+    // printf("QQQQQ x: " );
+    // else
+    // printf("QQQQQ y: " );
+                // for width & height dimensions,
+                // transform the roi start point to scaled coordinates
+// printf("    roi_start[ii] %f p[ii] %lu bin_size[ii] %f (i[ii] + .5f) %f    bin_size[ii] %f   bin_grid_size[ii] %lu       ",
+// roi_start[ii], p[ii], bin_size[ii], (i[ii] + .5f),    bin_size[ii],   bin_grid_size[ii] );
+
+
                 xy[ii] = roi_start[ii] + p[ii] * bin_size[ii] +
                          (i[ii] + .5f) * bin_size[ii] / bin_grid_size[ii];
-                xy[ii] = (coord_trans_mode == "half_pixel") ? (xy[ii] - 0.5f) : xy[ii];
+// printf("L137 %f ", xy[ii]);                        
+                xy[ii] = (coord_trans_mode != "half_pixel") ? (xy[ii] - 0.5f) : xy[ii];
+// printf("L139 %f ", xy[ii]);                        
                 if(xy[ii] < -1.0 or xy[ii] > dims[ii])
                 {
+// printf("L142 results = pos_weight \n ");                        
                     results[index] = pos_weight{};
                     return;
                 }
 
                 xy[ii]   = std::max(xy[ii], 0.0f);
+// printf("L148 %f ", xy[ii]);                        
                 low[ii]  = xy[ii];
                 high[ii] = low[ii] + 1;
                 if(low[ii] >= dims[ii] - 1)
                 {
                     xy[ii] = high[ii] = low[ii] = dims[ii] - 1;
+// printf("L154 %f ", xy[ii]);                        
                 }
+// printf("\n");
             }
-
+            // printf(" FFFFF  xy[0]=%f  xy[1] = %f                             dims[1]=%lu  low%ld-%ld  high %ld-%ld \n",
+            //                 xy[0], xy[1], dims[1], low[0], low[1],  high[0], high[1]);
             results[index].pos = {low[0] * dims[1] + low[1],
                                   low[0] * dims[1] + high[1],
                                   high[0] * dims[1] + low[1],
                                   high[0] * dims[1] + high[1]};
 
-            float ly = xy[0] - low[0];
-            float lx = xy[1] - low[1];
+            float lx = xy[0] - low[0];
+            float ly = xy[1] - low[1];
             float hy = 1.0f - ly;
             float hx = 1.0f - lx;
-
+            printf(" HHHHH partial pixel values, index=%lu  ly=%f, lx=%f, hy=%f, hx=%f\n", index, ly, lx, hy, hx);
             // save weights and indeces
             results[index].w = {hy * hx, hy * lx, ly * hx, ly * lx};
+
+            // printf("  DDDDD calc_pos_weight precalc ");
+            // for(int aa = 0; aa < 4; aa++)
+            // {
+            //     std::cout << results[index].pos[aa] << ", " << results[index].w[aa] << "    ";
+            // }
+
+     printf(" DDDDD index  %zu    %f  %f  %f  %f \n\n", index,
+                // results[index].pos[0], 
+                // results[index].pos[1], 
+                // results[index].pos[2], 
+                // results[index].pos[3], 
+          float(results[index].w[0]), 
+          float(results[index].w[1]), 
+          float(results[index].w[2]), 
+          float(results[index].w[3]) 
+          );
+
+
         });
+      printf("size of calc_pos_weight vector is %lu\n", results.size());
 
         return results;
     }
@@ -219,14 +267,26 @@ struct roialign
             const auto* batch_indices = args.at(2).cast<int64_t>();
             par_for(n_rois, [&](auto n) {
                 const auto bottom_data   = x.begin();
+                std::cout << "MIGraphX AAAAA x begins " <<  "\n";
                 const auto roi_batch_ind = batch_indices[n];
                 // Do not using rounding; this implementation detail is critical
+      float offset = (coord_trans_mode == "half_pixel") ? 0.5 : 0.0;
                 std::array<float, 2> roi_starts = {
-                    static_cast<float>(roi[roi_s.index({n, 1})] * spatial_scale),
-                    static_cast<float>(roi[roi_s.index({n, 0})] * spatial_scale)};
+                    static_cast<float>(roi[roi_s.index({n, 0})] * spatial_scale - offset),
+                    static_cast<float>(roi[roi_s.index({n, 1})] * spatial_scale - offset)};
                 std::array<float, 2> roi_ends = {
-                    static_cast<float>(roi[roi_s.index({n, 3})] * spatial_scale),
-                    static_cast<float>(roi[roi_s.index({n, 2})] * spatial_scale)};
+                    static_cast<float>(roi[roi_s.index({n, 2})] * spatial_scale - offset),
+                    static_cast<float>(roi[roi_s.index({n, 3})] * spatial_scale - offset)};
+
+                // std::cout << " CCCCC roialign compute(): scale ,  starts (x, x)  ends (x, x)" << ", " << spatial_scale << ",  " <<  roi_starts[0] << ", " << 
+                //  roi_starts[1] << ",  " << 
+                //     roi_ends[0] << ", " <<  roi_ends[1] << "\n";
+                // std::cout << " CCCCC roi is  x, x, x, x x" << ", " <<  roi[roi_s.index({n, 0})] << ", " <<  
+                //     roi[roi_s.index({n, 1})] << ", " <<  roi[roi_s.index({n, 2})] << ", " <<  roi[roi_s.index({n, 3})] << "\n\n";
+
+      printf("CCCCC roialign compute():  roi_start_w = %f, roi_start_h =%f, roi_end_w=%f, roi_end_h=%f \n",
+              float(roi_starts[0]), float(roi_starts[1]), float(roi_ends[0]), float(roi_ends[1]));
+
 
                 // Force malformed ROIs to be 1x1
                 std::array<float, 2> roi_size{};
@@ -236,7 +296,8 @@ struct roialign
                 for(auto ii : range(roi_size.size()))
                 {
                     roi_size[ii] = roi_ends[ii] - roi_starts[ii];
-                    roi_size[ii] = std::max(roi_size[ii], 1.0f);
+                    if(coord_trans_mode != "half_pixel")
+                        roi_size[ii] = std::max(roi_size[ii], 1.0f);
 
                     bin_size[ii]      = roi_size[ii] / out_dims[ii];
                     bin_grid_size[ii] = (sampling_ratio > 0)
@@ -277,10 +338,16 @@ struct roialign
                                                  vec_index[c],
                                                  max_pool{});
                     output(n, c, ph, pw) = output_val;
+            // int64_t index = index_n_c + ph * pooled_width + pw;
+
+                    //  printf(" GGGGG a single output is %f f   n %lu c %lu ph %lu pw %lu\n" , 
+                    // float(output_val),   n, c , ph , pw);
                 });
             });
         });
 
+        printf(" end compute\n\n\n");
+
         return result;
     }
 };
diff --git a/test/onnx/gen_onnx.py b/test/onnx/gen_onnx.py
index 3efc787c559..c31805e8294 100644
--- a/test/onnx/gen_onnx.py
+++ b/test/onnx/gen_onnx.py
@@ -751,7 +751,7 @@ def celu_default_test():
 
     return ([node], [x], [y])
 
-
+# see also def roialign_test():
 @onnx_test()
 def celu_verify_test():
     x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [2, 3])
@@ -10110,14 +10110,15 @@ def roialign_default_test():
 
     return ([node], [x, roi, bi], [y])
 
-
+# see also celu_verify_test
 @onnx_test()
 def roialign_test():
-    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [10, 5, 4, 7])
-    roi = helper.make_tensor_value_info('rois', TensorProto.FLOAT, [8, 4])
-    bi = helper.make_tensor_value_info('batch_ind', TensorProto.INT64, [8])
-    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [8, 4, 5, 5])
+    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [1, 1, 2, 3])
+    roi = helper.make_tensor_value_info('rois', TensorProto.FLOAT, [1, 4])
+    bi = helper.make_tensor_value_info('batch_ind', TensorProto.INT64, [1])
+    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [1, 4, 2, 2])
 
+    # half_pixel is the new mode we're developing for
     node = onnx.helper.make_node(
         'RoiAlign',
         inputs=['x', 'rois', 'batch_ind'],
@@ -10127,7 +10128,7 @@ def roialign_test():
         output_width=5,
         sampling_ratio=3,
         mode="avg",
-        coordinate_transformation_mode="output_half_pixel")
+        coordinate_transformation_mode="half_pixel")
 
     return ([node], [x, roi, bi], [y])
 
diff --git a/test/onnx/roialign_default_test.onnx b/test/onnx/roialign_default_test.onnx
index 4421e17be60..3f54104fdd6 100644
--- a/test/onnx/roialign_default_test.onnx
+++ b/test/onnx/roialign_default_test.onnx
@@ -1,4 +1,5 @@
-roialign_default_test:�
+
+roialign_default_test:�
 !
 x
 rois
@@ -23,4 +24,4 @@
 
 
 
-B
\ No newline at end of file
+B
\ No newline at end of file
diff --git a/test/onnx/roialign_test.onnx b/test/onnx/roialign_test.onnx
index f39485530c4758b3fadd6c7d5fe5ad180cc75a73..d5b9d5bbad1d95d7b076fb3c3e0d258aedb88a37 100644
GIT binary patch
delta 113
zcmcb~bcsofgG-3FC_ghXCo?@Sz9hA{#A+84*WQU@%8dFGtrQu#CMHh@l1v;R%ml>D
xQDR&yKn=x0d|W&nj6xhBxyjOuGD?g|(p-#{LSkGZKw$x>78a;BCMPBVQ2?7o64L+x

delta 143
zcmcb_bdyPpgI$QXC_ghXCo?@Sz9hA{#Ofdu*O7^0%8X_ctrR%~^Giz#N=xD=#!Qdr
z;$q@p1!5K;W{(o%Vgc$a7UJXL;b0Wv0E&SyOO!YlXHsHGaz=b+UWyPG7YEP+kOqz<
aX)eY}Au%oyplSh_Y9I|Vk=2PwfCm771{x6n

diff --git a/test/onnx/verify/celu_verify_test.cpp b/test/onnx/verify/celu_verify_test.cpp
index dc715255037..e71e300e665 100644
--- a/test/onnx/verify/celu_verify_test.cpp
+++ b/test/onnx/verify/celu_verify_test.cpp
@@ -28,18 +28,34 @@
 
 TEST_CASE(celu_verify_test)
 {
-    migraphx::program p = read_onnx("celu_verify_test.onnx");
+    //  ../../build/bin/test_verify_onnx celu_verify_test
+    migraphx::program p = read_onnx("roialign_test.onnx");
     p.compile(migraphx::make_target("ref"));
 
-    migraphx::shape s{migraphx::shape::float_type, {2, 3}};
+    migraphx::shape s{migraphx::shape::float_type, {1, 1, 2, 3}};
     std::vector<float> data = {-5.5, 2.0, 100., 7.0, 0., -1.};
 
     migraphx::parameter_map pp;
     pp["x"]     = migraphx::argument(s, data.data());
+    pp["y"]     = migraphx::argument(s, data.data());  // ?
+
+        // migraphx::shape sx{migraphx::shape::float_type, {10, 5, 4, 7}};
+    migraphx::shape srois{migraphx::shape::float_type, {1, 4}};
+    std::vector<float> rois_data = {0.1, 0.15, 0.6, 0.35};
+    migraphx::shape sbi{migraphx::shape::int64_type, {1}};  // batch_index
+    std::vector<float> bi_data = {0};
+
+    pp["rois"]    = migraphx::argument(srois, rois_data.data());
+    pp["batch_ind"]    = migraphx::argument(sbi, bi_data.data());
+
     auto result = p.eval(pp).back();
     std::vector<float> result_vector;
     result.visit([&](auto output) { result_vector.assign(output.begin(), output.end()); });
 
+printf(" result:  ");
+for(auto aa : result_vector) printf(" %f ", aa);
+printf("\n");
+
     std::vector<float> gold(6);
     float alpha = 0.5;
     std::transform(data.begin(), data.end(), gold.begin(), [&](auto x) {

From a43303c4612e623daee46a60a82a8e2d886f8019 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Mon, 23 Sep 2024 22:21:10 +0000
Subject: [PATCH 21/56] reordered lens for iteration shape; added some tests. 
 Passes roialign_half_pixel_verify_test for first roi but fails for second

---
 ort_roialign.py                           |  59 ++++++++++++++
 src/include/migraphx/op/roialign.hpp      |  93 ++++++----------------
 test/onnx/gen_onnx.py                     |  61 ++++++++++++--
 test/onnx/parse/roialign_test.cpp         |   4 +-
 test/onnx/roialign_half_pixel_test.onnx   | Bin 0 -> 360 bytes
 test/onnx/roialign_test.onnx              | Bin 338 -> 345 bytes
 test/onnx/verify/celu_verify_test.cpp     |  20 +----
 test/onnx/verify/roialign_verify_test.cpp |  71 +++++++++++++++++
 tools/build_and_test_onnxrt.sh            |  10 +--
 9 files changed, 218 insertions(+), 100 deletions(-)
 create mode 100644 ort_roialign.py
 create mode 100644 test/onnx/roialign_half_pixel_test.onnx
 create mode 100644 test/onnx/verify/roialign_verify_test.cpp

diff --git a/ort_roialign.py b/ort_roialign.py
new file mode 100644
index 00000000000..b7a1a770bd8
--- /dev/null
+++ b/ort_roialign.py
@@ -0,0 +1,59 @@
+
+# Not for release.  This test script is for develop/test only
+
+import onnx
+import onnxruntime as rt
+# from https://onnxruntime.ai/docs/get-started/with-python.html
+import numpy as np
+print(" version: ", onnx.__version__, rt.__version__)
+
+
+x = np.array(np.arange(2*2*4*3), dtype='f')
+x = np.reshape(x, [2, 2, 4, 3])
+
+y=np.ones([2, 2, 4, 7], dtype='f')
+
+# x = np.array([[[[2,3,4], [5,6, 7]]]], dtype='f')
+rois=np.array([[0.1, 0.15, 0.6, 0.35],
+                [0.1, 0.15, 2.6, 1.35]], dtype='f')
+sess = rt.InferenceSession('/workspace/AMDMIGraphX/test/onnx/roialign_half_pixel_test.onnx')
+res = sess.run(['y'], {'x': x,
+                    'rois': rois,
+                    'batch_ind': [0, 1]})
+print(res)
+       
+		
+# model_file = "test/onnx/roialign_test.onnx"
+# onnx_model = onnx.load(model_file)
+# onnx.checker.check_model(onnx_model)
+
+
+# #define the priority order for the execution providers
+# EP_list = ['CPUExecutionProvider']
+
+# aa = np.asarray(np.arange(3*2*4*5), dtype='f')
+# # bi = np.reshape(aa, [3, 2, 4, 5])
+
+# # initialize the model.onnx
+# sess = rt.InferenceSession(model_file, providers=EP_list)
+# x, rois, batch_ind = (np.reshape(aa, [3, 2, 4, 5]),
+#             np.array([[0.1, 0.15, 0.6, 0.35],
+#                       [2.1, 1.73, 3.8, 2.13]], dtype='f'),
+#             np.array([0, 1], dtype='int64'))
+
+# #  Use the parameter names defined in the onnx file
+# output = sess.run(None, {'x':  x,
+#                          'rois': rois,
+#                          'batch_ind': batch_ind,
+#                          })
+
+# print(' output is ', output)
+
+
+# # get the outputs metadata as a list of :class:`onnxruntime.NodeArg`
+# output_name = sess.get_outputs()[0].name
+
+# # get the inputs metadata as a list of :class:`onnxruntime.NodeArg`
+# input_name = sess.get_inputs()[0].name
+# print("Names are  ",input_name, output_name)
+
diff --git a/src/include/migraphx/op/roialign.hpp b/src/include/migraphx/op/roialign.hpp
index 63a398fe5b0..c36b7f9b501 100644
--- a/src/include/migraphx/op/roialign.hpp
+++ b/src/include/migraphx/op/roialign.hpp
@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -92,8 +92,8 @@ struct roialign
 
         std::vector<std::size_t> out_lens = x_lens;
         out_lens[0]                       = roi_lens[0];
-        out_lens[2]                       = output_height;
-        out_lens[3]                       = output_width;
+        out_lens[2]                       = output_width;
+        out_lens[3]                       = output_height;
 
         return {type, out_lens};
     }
@@ -116,59 +116,54 @@ struct roialign
                                         output_width);
 
         shape_for_each(comp_s, [&](const auto& idx_v, size_t index) {
-for(auto aa : comp_s.multi(index)) printf(", %lu ", aa);
-printf("index\n");
 
-            // The p and i indexes are looping parameters in ORT and go in y, x order.  The i[x] value is least significant
+            // The p and i indexes correspond to nested looping parameters in ORT that go in y, x order.  The i[x] value is least significant
             // and iterates the fastest.
             std::array<std::size_t, 2> p = {idx_v[1], idx_v[0]};
-            std::array<std::size_t, 2> i = {idx_v[3], idx_v[2]};
-printf(" IIIII other index %lu , %lu , %lu , %lu\n", p[0], p[1], i[0], i[1]);
-
+            std::array<std::size_t, 2> i = {idx_v[3], idx_v[2]};//  <== these are always the same
+printf(" IIIII other index %lu , %lu , %lu , %lu  i=%lu \n", p[0], p[1], i[0], i[1], index);
             // xy is scaled coordinates of start point of ROI
             std::array<float, 2> xy{};
             // low, high are floor and ceiling of the xy value (i.e. the bounds of the pixel it lies inside)
             std::array<int64_t, 2> low{};
             std::array<int64_t, 2> high{};
-    // std::cout << " GGGGG inputs to xy calculation: roi_start=" << roi_start[0] << ", " << roi_start[1] << ",  p=[0,1]: " << p[0] << ", " << p[1] << ", bin_size="
-    //                  << bin_size[0] << ", "  << bin_size[1] << " rounding factor=" << (i[0] + .5f) << ", " << (i[1] + .5f) << "   bin_grid_size=" << bin_grid_size[0] <<", " << bin_grid_size[1] <<"\n";
+
             for(auto ii : range(p.size()))
             {
     // if(ii == 0)
-    // printf("QQQQQ x: " );
+    // printf("x: " );
     // else
-    // printf("QQQQQ y: " );
+    // printf("y: " );
                 // for width & height dimensions,
                 // transform the roi start point to scaled coordinates
-// printf("    roi_start[ii] %f p[ii] %lu bin_size[ii] %f (i[ii] + .5f) %f    bin_size[ii] %f   bin_grid_size[ii] %lu       ",
-// roi_start[ii], p[ii], bin_size[ii], (i[ii] + .5f),    bin_size[ii],   bin_grid_size[ii] );
-
+printf("    roi_start[ii] %f    p[ii]  %lu   bin_size[ii] %f   (i[ii] + .5f) %f      bin_grid_size[ii] %lu       \n",
+roi_start[ii], p[ii], bin_size[ii], (i[ii] + .5f),     bin_grid_size[ii] );
 
                 xy[ii] = roi_start[ii] + p[ii] * bin_size[ii] +
                          (i[ii] + .5f) * bin_size[ii] / bin_grid_size[ii];
-// printf("L137 %f ", xy[ii]);                        
+printf(" QQQQQQ  L137 x=%f  y=%f\n", xy[0], xy[1]);                                        
                 xy[ii] = (coord_trans_mode != "half_pixel") ? (xy[ii] - 0.5f) : xy[ii];
 // printf("L139 %f ", xy[ii]);                        
                 if(xy[ii] < -1.0 or xy[ii] > dims[ii])
                 {
-// printf("L142 results = pos_weight \n ");                        
+// printf("L142 results = pos_weight \n ");                    
                     results[index] = pos_weight{};
                     return;
                 }
 
                 xy[ii]   = std::max(xy[ii], 0.0f);
-// printf("L148 %f ", xy[ii]);                        
+// printf("L148 %f ", xy[ii]);                
                 low[ii]  = xy[ii];
                 high[ii] = low[ii] + 1;
                 if(low[ii] >= dims[ii] - 1)
                 {
+// printf("L154 %f ", xy[ii]);                    
                     xy[ii] = high[ii] = low[ii] = dims[ii] - 1;
-// printf("L154 %f ", xy[ii]);                        
                 }
-// printf("\n");
+// printf("\n");                
             }
-            // printf(" FFFFF  xy[0]=%f  xy[1] = %f                             dims[1]=%lu  low%ld-%ld  high %ld-%ld \n",
-            //                 xy[0], xy[1], dims[1], low[0], low[1],  high[0], high[1]);
+            // printf(" JJJJJ  xy[0]=%f  xy[1] = %f                             dims[1]=%lu  low%ld-%ld  high %ld-%ld   i=%zu\n\n",
+            //                 xy[0], xy[1], dims[1], low[0], low[1],  high[0], high[1], index);
             results[index].pos = {low[0] * dims[1] + low[1],
                                   low[0] * dims[1] + high[1],
                                   high[0] * dims[1] + low[1],
@@ -178,30 +173,11 @@ printf(" IIIII other index %lu , %lu , %lu , %lu\n", p[0], p[1], i[0], i[1]);
             float ly = xy[1] - low[1];
             float hy = 1.0f - ly;
             float hx = 1.0f - lx;
-            printf(" HHHHH partial pixel values, index=%lu  ly=%f, lx=%f, hy=%f, hx=%f\n", index, ly, lx, hy, hx);
-            // save weights and indeces
+            // printf(" HHHHH partial pixel values, index=%lu  ly=%f, lx=%f, hy=%f, hx=%f\n\n", index, ly, lx, hy, hx);
+            // save weights and indices
             results[index].w = {hy * hx, hy * lx, ly * hx, ly * lx};
 
-            // printf("  DDDDD calc_pos_weight precalc ");
-            // for(int aa = 0; aa < 4; aa++)
-            // {
-            //     std::cout << results[index].pos[aa] << ", " << results[index].w[aa] << "    ";
-            // }
-
-     printf(" DDDDD index  %zu    %f  %f  %f  %f \n\n", index,
-                // results[index].pos[0], 
-                // results[index].pos[1], 
-                // results[index].pos[2], 
-                // results[index].pos[3], 
-          float(results[index].w[0]), 
-          float(results[index].w[1]), 
-          float(results[index].w[2]), 
-          float(results[index].w[3]) 
-          );
-
-
         });
-      printf("size of calc_pos_weight vector is %lu\n", results.size());
 
         return results;
     }
@@ -256,7 +232,7 @@ printf(" IIIII other index %lu , %lu , %lu , %lu\n", p[0], p[1], i[0], i[1]);
         int64_t n_rois       = out_lens[0];
         std::size_t channels = out_lens[1];
         // output dims of height and width, in all 2-dim arrays, the first dim
-        // is for height and second dim is for width
+        // is for height and second dim is for width i.e. (y, x) order
         std::array<std::size_t, 2> out_dims = {out_lens[2], out_lens[3]};
         const auto& x_lens                  = args.at(0).get_shape().lens();
         // input dims of height and width
@@ -267,10 +243,9 @@ printf(" IIIII other index %lu , %lu , %lu , %lu\n", p[0], p[1], i[0], i[1]);
             const auto* batch_indices = args.at(2).cast<int64_t>();
             par_for(n_rois, [&](auto n) {
                 const auto bottom_data   = x.begin();
-                std::cout << "MIGraphX AAAAA x begins " <<  "\n";
                 const auto roi_batch_ind = batch_indices[n];
-                // Do not using rounding; this implementation detail is critical
-      float offset = (coord_trans_mode == "half_pixel") ? 0.5 : 0.0;
+                // Do not use rounding; this implementation detail is critical
+                float offset = (coord_trans_mode == "half_pixel") ? 0.5 : 0.0;
                 std::array<float, 2> roi_starts = {
                     static_cast<float>(roi[roi_s.index({n, 0})] * spatial_scale - offset),
                     static_cast<float>(roi[roi_s.index({n, 1})] * spatial_scale - offset)};
@@ -278,17 +253,7 @@ printf(" IIIII other index %lu , %lu , %lu , %lu\n", p[0], p[1], i[0], i[1]);
                     static_cast<float>(roi[roi_s.index({n, 2})] * spatial_scale - offset),
                     static_cast<float>(roi[roi_s.index({n, 3})] * spatial_scale - offset)};
 
-                // std::cout << " CCCCC roialign compute(): scale ,  starts (x, x)  ends (x, x)" << ", " << spatial_scale << ",  " <<  roi_starts[0] << ", " << 
-                //  roi_starts[1] << ",  " << 
-                //     roi_ends[0] << ", " <<  roi_ends[1] << "\n";
-                // std::cout << " CCCCC roi is  x, x, x, x x" << ", " <<  roi[roi_s.index({n, 0})] << ", " <<  
-                //     roi[roi_s.index({n, 1})] << ", " <<  roi[roi_s.index({n, 2})] << ", " <<  roi[roi_s.index({n, 3})] << "\n\n";
-
-      printf("CCCCC roialign compute():  roi_start_w = %f, roi_start_h =%f, roi_end_w=%f, roi_end_h=%f \n",
-              float(roi_starts[0]), float(roi_starts[1]), float(roi_ends[0]), float(roi_ends[1]));
-
-
-                // Force malformed ROIs to be 1x1
+                // Force malformed ROIs to be 1x1, output_half_pixel transform mode
                 std::array<float, 2> roi_size{};
                 std::array<float, 2> bin_size{};
                 std::array<std::size_t, 2> bin_grid_size{};
@@ -298,7 +263,7 @@ printf(" IIIII other index %lu , %lu , %lu , %lu\n", p[0], p[1], i[0], i[1]);
                     roi_size[ii] = roi_ends[ii] - roi_starts[ii];
                     if(coord_trans_mode != "half_pixel")
                         roi_size[ii] = std::max(roi_size[ii], 1.0f);
-
+printf("\n KKKKK roi_size %f out_dims %lu     \n", roi_size[ii] , out_dims[ii]);
                     bin_size[ii]      = roi_size[ii] / out_dims[ii];
                     bin_grid_size[ii] = (sampling_ratio > 0)
                                             ? sampling_ratio
@@ -308,7 +273,7 @@ printf(" IIIII other index %lu , %lu , %lu , %lu\n", p[0], p[1], i[0], i[1]);
                 // we want to precalculate indices and weights shared by all channels,
                 // this is the key point of optimization
                 std::vector<std::size_t> comp_lens = {
-                    out_dims[0], out_dims[1], bin_grid_size[0], bin_grid_size[1]};
+                    out_dims[1], out_dims[0], bin_grid_size[1], bin_grid_size[0]};
                 shape comp_s{shape::float_type, comp_lens};
                 auto pre_calc =
                     this->calc_pos_weight(in_dims, comp_s, roi_starts, bin_size, bin_grid_size);
@@ -338,16 +303,10 @@ printf(" IIIII other index %lu , %lu , %lu , %lu\n", p[0], p[1], i[0], i[1]);
                                                  vec_index[c],
                                                  max_pool{});
                     output(n, c, ph, pw) = output_val;
-            // int64_t index = index_n_c + ph * pooled_width + pw;
-
-                    //  printf(" GGGGG a single output is %f f   n %lu c %lu ph %lu pw %lu\n" , 
-                    // float(output_val),   n, c , ph , pw);
                 });
             });
         });
 
-        printf(" end compute\n\n\n");
-
         return result;
     }
 };
diff --git a/test/onnx/gen_onnx.py b/test/onnx/gen_onnx.py
index c31805e8294..8f0e9656c51 100644
--- a/test/onnx/gen_onnx.py
+++ b/test/onnx/gen_onnx.py
@@ -751,7 +751,6 @@ def celu_default_test():
 
     return ([node], [x], [y])
 
-# see also def roialign_test():
 @onnx_test()
 def celu_verify_test():
     x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [2, 3])
@@ -10110,15 +10109,13 @@ def roialign_default_test():
 
     return ([node], [x, roi, bi], [y])
 
-# see also celu_verify_test
+
 @onnx_test()
 def roialign_test():
-    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [1, 1, 2, 3])
-    roi = helper.make_tensor_value_info('rois', TensorProto.FLOAT, [1, 4])
-    bi = helper.make_tensor_value_info('batch_ind', TensorProto.INT64, [1])
-    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [1, 4, 2, 2])
-
-    # half_pixel is the new mode we're developing for
+    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [3, 2, 4, 5])
+    roi = helper.make_tensor_value_info('rois', TensorProto.FLOAT, [2, 4])
+    bi = helper.make_tensor_value_info('batch_ind', TensorProto.INT64, [2])
+    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [3, 2, 4, 5])
     node = onnx.helper.make_node(
         'RoiAlign',
         inputs=['x', 'rois', 'batch_ind'],
@@ -10128,11 +10125,59 @@ def roialign_test():
         output_width=5,
         sampling_ratio=3,
         mode="avg",
+        coordinate_transformation_mode="output_half_pixel")
+
+    return ([node], [x, roi, bi], [y])
+
+
+@onnx_test()
+def roialign_half_pixel_test():
+    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [2, 2, 4, 3])
+    roi = helper.make_tensor_value_info('rois', TensorProto.FLOAT, [2, 4])
+    bi = helper.make_tensor_value_info('batch_ind', TensorProto.INT64, [2])
+    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [2, 2, 4, 3])
+
+    # half_pixel is the new mode we're developing for
+    node = onnx.helper.make_node(
+        'RoiAlign',
+        inputs=['x', 'rois', 'batch_ind'],
+        outputs=['y'],
+        spatial_scale=2.0,
+        output_height=7,
+        output_width=9,
+        sampling_ratio=3,
+        mode="avg",
         coordinate_transformation_mode="half_pixel")
 
     return ([node], [x, roi, bi], [y])
 
 
+
+
+
+
+# @onnx_test()
+# def roialign_half_pixel_test():
+#     x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [1, 1, 2, 3])
+#     roi = helper.make_tensor_value_info('rois', TensorProto.FLOAT, [1, 4])
+#     bi = helper.make_tensor_value_info('batch_ind', TensorProto.INT64, [1])
+#     y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [1, 4, 2, 2])
+
+#     # half_pixel is the new mode we're developing for
+#     node = onnx.helper.make_node(
+#         'RoiAlign',
+#         inputs=['x', 'rois', 'batch_ind'],
+#         outputs=['y'],
+#         spatial_scale=2.0,
+#         output_height=5,
+#         output_width=5,
+#         sampling_ratio=3,
+#         mode="avg",
+#         coordinate_transformation_mode="half_pixel")
+
+#     return ([node], [x, roi, bi], [y])
+
+
 @onnx_test()
 def round_half_test():
     x = helper.make_tensor_value_info('x', TensorProto.FLOAT16, [4, 4])
diff --git a/test/onnx/parse/roialign_test.cpp b/test/onnx/parse/roialign_test.cpp
index 05f27b6473c..52bb8681d4d 100644
--- a/test/onnx/parse/roialign_test.cpp
+++ b/test/onnx/parse/roialign_test.cpp
@@ -26,7 +26,7 @@
 
 TEST_CASE(roialign_test)
 {
-    migraphx::shape sx{migraphx::shape::float_type, {10, 5, 4, 7}};
+    migraphx::shape sx{migraphx::shape::float_type, {3, 2, 4, 5}};
     migraphx::shape srois{migraphx::shape::float_type, {8, 4}};
     migraphx::shape sbi{migraphx::shape::int64_type, {8}};
 
@@ -41,7 +41,7 @@ TEST_CASE(roialign_test)
                           {{"coordinate_transformation_mode", "output_half_pixel"},
                            {"spatial_scale", 2.0f},
                            {"output_height", 5},
-                           {"output_width", 5},
+                           {"output_width", 3},
                            {"sampling_ratio", 3}}),
         x,
         rois,
diff --git a/test/onnx/roialign_half_pixel_test.onnx b/test/onnx/roialign_half_pixel_test.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..b6ca215a9fd362ea9737b42897c1a291c8502e03
GIT binary patch
literal 360
zcmZ`#u};G<5RIKkbx&<nnWl||#LzJdI}`8;j9Dx<aT8099oa6lf0G|!<iD_?6)YW2
zr}ysOJG~=7OKT)oveT^R>cB>M5{h}@{L>ePPXy=$Sy*u}-*eyeOzM`R^Nze3`4VN-
zBRn+5*j8%p1@o3`cQAI~UK-5?(~2GZIZr69a>#z@l)rZouVHD9emr{C3)%I)oJ|N<
zcd)pyAF}m*Igexkx#PpAr0$rF7s@o!TUfd=K(3f;xDrK@B+nB<x5L%<AsF4jnjpFY
nOz@?$(9?SJ8=2;?2y=3s)!YA103c*WqX6%N+xeE;D_#}f&gENm

literal 0
HcmV?d00001

diff --git a/test/onnx/roialign_test.onnx b/test/onnx/roialign_test.onnx
index d5b9d5bbad1d95d7b076fb3c3e0d258aedb88a37..3127c15a2b79db3a13aaaf38c46390d22ba63b70 100644
GIT binary patch
delta 143
zcmcb_bdyPpgG-3FC_ghXCo?@Sz9hA{#Ofdu*O7^0%8X_ctrR%~^Giz#N=xD=#!QcA
z=3?St0%8^*W{nc#Vgc$a7UJXL;b0WvfXPLPb8#jmmLzAyXXd2{adB~QunU1SFeOQI
WF;)tRaftv`3jhUBOmkuq5Cs5voEiN9

delta 113
zcmcb~bcsofgG-3FC_ghXCo?@Sz9hA{#A+84*WQU@%8dFGtrQu#CMHh@l1v;R%ml>D
xQDR&yKn=x0d|W&nj6xhBxyjOuGD?g|(p-#{LSkGZKw$x>78a;BCMPBVQ2?7o64L+x

diff --git a/test/onnx/verify/celu_verify_test.cpp b/test/onnx/verify/celu_verify_test.cpp
index e71e300e665..dc715255037 100644
--- a/test/onnx/verify/celu_verify_test.cpp
+++ b/test/onnx/verify/celu_verify_test.cpp
@@ -28,34 +28,18 @@
 
 TEST_CASE(celu_verify_test)
 {
-    //  ../../build/bin/test_verify_onnx celu_verify_test
-    migraphx::program p = read_onnx("roialign_test.onnx");
+    migraphx::program p = read_onnx("celu_verify_test.onnx");
     p.compile(migraphx::make_target("ref"));
 
-    migraphx::shape s{migraphx::shape::float_type, {1, 1, 2, 3}};
+    migraphx::shape s{migraphx::shape::float_type, {2, 3}};
     std::vector<float> data = {-5.5, 2.0, 100., 7.0, 0., -1.};
 
     migraphx::parameter_map pp;
     pp["x"]     = migraphx::argument(s, data.data());
-    pp["y"]     = migraphx::argument(s, data.data());  // ?
-
-        // migraphx::shape sx{migraphx::shape::float_type, {10, 5, 4, 7}};
-    migraphx::shape srois{migraphx::shape::float_type, {1, 4}};
-    std::vector<float> rois_data = {0.1, 0.15, 0.6, 0.35};
-    migraphx::shape sbi{migraphx::shape::int64_type, {1}};  // batch_index
-    std::vector<float> bi_data = {0};
-
-    pp["rois"]    = migraphx::argument(srois, rois_data.data());
-    pp["batch_ind"]    = migraphx::argument(sbi, bi_data.data());
-
     auto result = p.eval(pp).back();
     std::vector<float> result_vector;
     result.visit([&](auto output) { result_vector.assign(output.begin(), output.end()); });
 
-printf(" result:  ");
-for(auto aa : result_vector) printf(" %f ", aa);
-printf("\n");
-
     std::vector<float> gold(6);
     float alpha = 0.5;
     std::transform(data.begin(), data.end(), gold.begin(), [&](auto x) {
diff --git a/test/onnx/verify/roialign_verify_test.cpp b/test/onnx/verify/roialign_verify_test.cpp
new file mode 100644
index 00000000000..b1c9b715af4
--- /dev/null
+++ b/test/onnx/verify/roialign_verify_test.cpp
@@ -0,0 +1,71 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <migraphx/register_target.hpp>
+#include <migraphx/verify.hpp>
+#include <onnx_test.hpp>
+
+TEST_CASE(roialign_verify_test)
+{
+    migraphx::program p = read_onnx("roialign_test.onnx");
+    p.compile(migraphx::make_target("ref"));
+
+    migraphx::shape s{migraphx::shape::float_type, {3, 2, 4, 5}};
+    std::vector<float> data(3*5*4*2);
+    std::iota(data.begin(), data.end(), 0);
+
+    migraphx::parameter_map pp;
+    pp["x"]     = migraphx::argument(s, data.data());
+    pp["y"]     = migraphx::argument(s, data.data());
+
+    migraphx::shape srois{migraphx::shape::float_type, {2, 4}};
+    std::vector<float> rois_data = {0.1, 0.15, 0.6, 0.35,
+                                    2.1, 1.73, 3.8, 2.13};
+    migraphx::shape sbi{migraphx::shape::int64_type, {2}};  // batch_index
+    std::vector<float> bi_data = {0, 1};
+
+    pp["rois"]    = migraphx::argument(srois, rois_data.data());
+    pp["batch_ind"]    = migraphx::argument(sbi, bi_data.data());
+
+    auto result = p.eval(pp).back();
+    std::vector<float> result_vector;
+    result.visit([&](auto output) { result_vector.assign(output.begin(), output.end()); });
+
+printf(" result:  ");
+for(auto aa : result_vector) printf(" %f ", aa);
+printf("\n");
+
+    std::vector<float> gold = {   0.000000,  0.022222,  0.200000,  0.400000,  0.600000,  0.500000,  0.522222,  0.700000,  0.900000,  1.100000,  1.500000,  1.522223,  1.700000,
+      1.900000, 2.100000, 2.500000, 2.522222, 2.700000, 2.900000, 3.100000, 3.500000, 3.522222, 3.700000, 3.900000, 4.100000, 20.000000, 20.022223, 20.200001, 20.400000, 20.600000, 20.500000, 20.522223, 
+      20.700001, 20.900000, 21.100000, 21.500000, 21.522223, 21.700001, 21.900000, 22.100000, 22.500000, 22.522223, 22.700001, 22.900000, 23.100000, 23.500000, 23.522223, 23.700001, 
+      23.900000, 24.100000, 5.888889, 0.000000, 0.000000, 0.000000, 0.000000, 6.000000, 0.000000, 0.000000, 0.000000, 0.000000, 6.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+    6.000000, 0.000000, 0.000000, 0.000000, 0.000000, 6.000000, 0.000000, 0.000000, 0.000000, 0.000000, 12.555555, 0.000000, 0.000000, 0.000000, 0.000000, 12.666667, 0.000000,
+        0.000000, 0.000000, 0.000000, 12.666667, 0.000000, 0.000000, 0.000000, 0.000000, 12.666667, 0.000000, 0.000000, 0.000000, 0.000000, 12.666667, 0.000000, 0.000000,
+        0.000000,  0.000000 };
+    float alpha = 0.5;
+    std::transform(data.begin(), data.end(), gold.begin(), [&](auto x) {
+        return std::max(0.0f, x) + std::min(0.0f, alpha * std::expm1(x / alpha));
+    });
+    EXPECT(migraphx::verify::verify_rms_range(result_vector, gold));
+}
diff --git a/tools/build_and_test_onnxrt.sh b/tools/build_and_test_onnxrt.sh
index 19147c84ddb..a3a8fdfbf61 100755
--- a/tools/build_and_test_onnxrt.sh
+++ b/tools/build_and_test_onnxrt.sh
@@ -36,8 +36,8 @@ export CXXFLAGS="-D__HIP_PLATFORM_AMD__=1 -w"
 cd build/Linux/Release
 #Add test launcher for onnxrt tests
 
-echo 'InferenceSessionTests.CheckRunProfilerWithSessionOptions' >> ../../../tools/ci_build/github/pai/migraphx-excluded-tests.txt
-echo 'InferenceSessionTests.CheckRunProfilerWithSessionOptions2' >> ../../../tools/ci_build/github/pai/migraphx-excluded-tests.txt
-echo 'InferenceSessionTests.Test3LayerNestedSubgraph' >> ../../../tools/ci_build/github/pai/migraphx-excluded-tests.txt
-echo 'InferenceSessionTests.Test2LayerNestedSubgraph' >> ../../../tools/ci_build/github/pai/migraphx-excluded-tests.txt
-../../../tools/ci_build/github/pai/pai_test_launcher.sh || (gdb ./onnxruntime_test_all core -batch -ex bt && exit 1)
+# echo 'InferenceSessionTests.CheckRunProfilerWithSessionOptions' >> ../../../tools/ci_build/github/pai/migraphx-excluded-tests.txt
+# echo 'InferenceSessionTests.CheckRunProfilerWithSessionOptions2' >> ../../../tools/ci_build/github/pai/migraphx-excluded-tests.txt
+# echo 'InferenceSessionTests.Test3LayerNestedSubgraph' >> ../../../tools/ci_build/github/pai/migraphx-excluded-tests.txt
+# echo 'InferenceSessionTests.Test2LayerNestedSubgraph' >> ../../../tools/ci_build/github/pai/migraphx-excluded-tests.txt
+# ../../../tools/ci_build/github/pai/pai_test_launcher.sh || (gdb ./onnxruntime_test_all core -batch -ex bt && exit 1)

From 69d0d444d10299550a97c0211f2bec38d1b5e100 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Tue, 24 Sep 2024 23:13:07 +0000
Subject: [PATCH 22/56] bug fixes and added roialign_half_pixel_verify_test
 which passes.  Work in progress with debug code.

---
 ort_roialign.py                               |  11 +-
 src/include/migraphx/op/roialign.hpp          |  49 ++++---
 test/onnx/gen_onnx.py                         |   4 +-
 test/onnx/roialign_half_pixel_test.onnx       | Bin 360 -> 360 bytes
 .../roialign_half_pixel_verify_test.cpp       | 120 ++++++++++++++++++
 test/onnx/verify/roialign_verify_test.cpp     |  14 +-
 6 files changed, 169 insertions(+), 29 deletions(-)
 create mode 100644 test/onnx/verify/roialign_half_pixel_verify_test.cpp

diff --git a/ort_roialign.py b/ort_roialign.py
index b7a1a770bd8..db06f24c07d 100644
--- a/ort_roialign.py
+++ b/ort_roialign.py
@@ -13,13 +13,16 @@
 
 y=np.ones([2, 2, 4, 7], dtype='f')
 
-# x = np.array([[[[2,3,4], [5,6, 7]]]], dtype='f')
-rois=np.array([[0.1, 0.15, 0.6, 0.35],
-                [0.1, 0.15, 2.6, 1.35]], dtype='f')
+# rois=np.array([[0.1, 0.15, 0.6, 0.35],
+#                 [0.1, 0.15, 2.6, 1.35]], dtype='f')
+
+rois=np.array([
+                [ 1.1, 0.73, 2.2, 1.13]], dtype='f')
 sess = rt.InferenceSession('/workspace/AMDMIGraphX/test/onnx/roialign_half_pixel_test.onnx')
 res = sess.run(['y'], {'x': x,
                     'rois': rois,
-                    'batch_ind': [0, 1]})
+                    # 'batch_ind': [0, 1]})
+                    'batch_ind': [0]})
 print(res)
        
 		
diff --git a/src/include/migraphx/op/roialign.hpp b/src/include/migraphx/op/roialign.hpp
index c36b7f9b501..06ec14c0016 100644
--- a/src/include/migraphx/op/roialign.hpp
+++ b/src/include/migraphx/op/roialign.hpp
@@ -114,20 +114,25 @@ struct roialign
     {
         std::vector<pos_weight> results(bin_grid_size[0] * bin_grid_size[1] * output_height *
                                         output_width);
-
+std::vector<std::size_t> temp_lens = comp_s.lens();                                        
+shape temp_s = {shape::float_type,{temp_lens[1], temp_lens[0], temp_lens[3], temp_lens[2] }};
         shape_for_each(comp_s, [&](const auto& idx_v, size_t index) {
 
             // The p and i indexes correspond to nested looping parameters in ORT that go in y, x order.  The i[x] value is least significant
             // and iterates the fastest.
             std::array<std::size_t, 2> p = {idx_v[1], idx_v[0]};
             std::array<std::size_t, 2> i = {idx_v[3], idx_v[2]};//  <== these are always the same
-printf(" IIIII other index %lu , %lu , %lu , %lu  i=%lu \n", p[0], p[1], i[0], i[1], index);
+printf("\n IIIII other index %lu , %lu , %lu , %lu  i=%lu   temp_index = %lu \n", p[0], p[1], i[0], i[1], index, temp_s.index({p[0], p[1], i[0], i[1]}));
+printf(" my index= %lu  reverse temp=%lu\n ", comp_s.index({p[1], p[0], i[1], i[0]}), temp_s.index({p[1], p[0], i[1], i[0]}));
+printf(" more index= %lu  reverse ...=%lu\n ", comp_s.index({p[0], p[1], i[0], i[1]}), temp_s.index({p[0], p[1], i[0], i[1]}));
             // xy is scaled coordinates of start point of ROI
             std::array<float, 2> xy{};
             // low, high are floor and ceiling of the xy value (i.e. the bounds of the pixel it lies inside)
             std::array<int64_t, 2> low{};
             std::array<int64_t, 2> high{};
 
+            // size_t adj_index = temp_s.index({p[1], p[0], i[1], i[0]});
+
             for(auto ii : range(p.size()))
             {
     // if(ii == 0)
@@ -136,48 +141,54 @@ printf(" IIIII other index %lu , %lu , %lu , %lu  i=%lu \n", p[0], p[1], i[0], i
     // printf("y: " );
                 // for width & height dimensions,
                 // transform the roi start point to scaled coordinates
-printf("    roi_start[ii] %f    p[ii]  %lu   bin_size[ii] %f   (i[ii] + .5f) %f      bin_grid_size[ii] %lu       \n",
-roi_start[ii], p[ii], bin_size[ii], (i[ii] + .5f),     bin_grid_size[ii] );
+// printf("    roi_start[ii] %f    p[ii]  %lu   bin_size[ii] %f   (i[ii] + .5f) %f      bin_grid_size[ii] %lu       \n",
+// roi_start[ii], p[ii], bin_size[ii], (i[ii] + .5f),     bin_grid_size[ii] );
 
                 xy[ii] = roi_start[ii] + p[ii] * bin_size[ii] +
                          (i[ii] + .5f) * bin_size[ii] / bin_grid_size[ii];
-printf(" QQQQQQ  L137 x=%f  y=%f\n", xy[0], xy[1]);                                        
+// printf(" QQQQQQ  L137 x=%f  y=%f  ", xy[0], xy[1]);                                        
                 xy[ii] = (coord_trans_mode != "half_pixel") ? (xy[ii] - 0.5f) : xy[ii];
-// printf("L139 %f ", xy[ii]);                        
+// printf(" L139 %f ", xy[ii]);                        
                 if(xy[ii] < -1.0 or xy[ii] > dims[ii])
                 {
-// printf("L142 results = pos_weight \n ");                    
-                    results[index] = pos_weight{};
+// printf(" L142 results = pos_weight i=%lu dims=%lu, %lu  \n ", index,  dims[0], dims[1]);                    
+                    // results[adj_index] = pos_weight{};  // all zeroes
+                    results[index] = pos_weight{};  // all zeroes
                     return;
                 }
 
                 xy[ii]   = std::max(xy[ii], 0.0f);
-// printf("L148 %f ", xy[ii]);                
                 low[ii]  = xy[ii];
                 high[ii] = low[ii] + 1;
+// printf(" L148 %f  low[ii] %lu, dims[ii] %lu", xy[ii],  low[ii], dims[ii]);                
                 if(low[ii] >= dims[ii] - 1)
                 {
-// printf("L154 %f ", xy[ii]);                    
                     xy[ii] = high[ii] = low[ii] = dims[ii] - 1;
+// printf(" L154 %f ", xy[ii]);                    
                 }
-// printf("\n");                
+// printf(" \n");                
             }
-            // printf(" JJJJJ  xy[0]=%f  xy[1] = %f                             dims[1]=%lu  low%ld-%ld  high %ld-%ld   i=%zu\n\n",
-            //                 xy[0], xy[1], dims[1], low[0], low[1],  high[0], high[1], index);
-            results[index].pos = {low[0] * dims[1] + low[1],
-                                  low[0] * dims[1] + high[1],
-                                  high[0] * dims[1] + low[1],
-                                  high[0] * dims[1] + high[1]};
+            printf(" JJJJJ  xy[0]=%f  xy[1] = %f                             dims[1]=%lu  low%ld-%ld  high %ld-%ld   i=%zu      dims[0]=%lu \n\n",
+                            xy[0], xy[1], dims[1], low[1], low[0],  high[1], high[0], index, dims[0]);
+            results[index].pos = {low[1] * dims[0] + low[0],
+                                  low[1] * dims[0] + high[0],
+                                  high[1] * dims[0] + low[0],
+                                  high[1] * dims[0] + high[0]};
 
             float lx = xy[0] - low[0];
             float ly = xy[1] - low[1];
             float hy = 1.0f - ly;
             float hx = 1.0f - lx;
-            // printf(" HHHHH partial pixel values, index=%lu  ly=%f, lx=%f, hy=%f, hx=%f\n\n", index, ly, lx, hy, hx);
+            // printf(" HHHHH partial pixel values, index=%lu pci=%lu  ly=%f, lx=%f, hy=%f, hx=%f\n\n", index, temp_s.index({p[1], p[0], i[1], i[0]}), 
+            //    ly, lx, hy, hx);
             // save weights and indices
             results[index].w = {hy * hx, hy * lx, ly * hx, ly * lx};
 
         });
+printf(" AAAAA here we are\n");
+        for(int iix = 0; iix < results.size(); iix++)
+          printf(" SSSSS %d    %lu  %lu  %lu  %lu   %f  %f  %f  %f\n", iix, results[iix].pos[0], results[iix].pos[1], results[iix].pos[2], results[iix].pos[3],
+                    results[iix].w[0], results[iix].w[1], results[iix].w[2], results[iix].w[3]);
 
         return results;
     }
@@ -236,7 +247,7 @@ printf(" QQQQQQ  L137 x=%f  y=%f\n", xy[0], xy[1]);
         std::array<std::size_t, 2> out_dims = {out_lens[2], out_lens[3]};
         const auto& x_lens                  = args.at(0).get_shape().lens();
         // input dims of height and width
-        std::array<std::size_t, 2> in_dims = {x_lens[2], x_lens[3]};
+        std::array<std::size_t, 2> in_dims = {x_lens[3], x_lens[2]};
         auto roi_s                         = args.at(1).get_shape();
 
         visit_all(result, args.at(0), args.at(1))([&](auto output, auto x, auto roi) {
diff --git a/test/onnx/gen_onnx.py b/test/onnx/gen_onnx.py
index 8f0e9656c51..f95feec6bc2 100644
--- a/test/onnx/gen_onnx.py
+++ b/test/onnx/gen_onnx.py
@@ -10133,8 +10133,8 @@ def roialign_test():
 @onnx_test()
 def roialign_half_pixel_test():
     x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [2, 2, 4, 3])
-    roi = helper.make_tensor_value_info('rois', TensorProto.FLOAT, [2, 4])
-    bi = helper.make_tensor_value_info('batch_ind', TensorProto.INT64, [2])
+    roi = helper.make_tensor_value_info('rois', TensorProto.FLOAT, [1, 4])
+    bi = helper.make_tensor_value_info('batch_ind', TensorProto.INT64, [1])
     y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [2, 2, 4, 3])
 
     # half_pixel is the new mode we're developing for
diff --git a/test/onnx/roialign_half_pixel_test.onnx b/test/onnx/roialign_half_pixel_test.onnx
index b6ca215a9fd362ea9737b42897c1a291c8502e03..cf2d236317be872ab8ba51a28bb8d8c93c6591f5 100644
GIT binary patch
delta 28
jcmaFC^nz)FCL<%`WGzM+HpV1rF2>4<|0GzPm;^)tZcqlN

delta 28
jcmaFC^nz)FCL<%$WGzM+Hl`$LF2>4<|0GzPm;^)tZgK{x

diff --git a/test/onnx/verify/roialign_half_pixel_verify_test.cpp b/test/onnx/verify/roialign_half_pixel_verify_test.cpp
new file mode 100644
index 00000000000..03b7cd48ac5
--- /dev/null
+++ b/test/onnx/verify/roialign_half_pixel_verify_test.cpp
@@ -0,0 +1,120 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <migraphx/register_target.hpp>
+#include <migraphx/verify.hpp>
+#include <onnx_test.hpp>
+
+// This passes its own test but doesn't match ort version of test
+TEST_CASE(roialign_half_pixel_verify_test)
+{
+    migraphx::program p = read_onnx("roialign_half_pixel_test.onnx");
+    p.compile(migraphx::make_target("ref"));
+
+    migraphx::shape s{migraphx::shape::float_type, {2, 2, 4, 3}};
+    std::vector<float> data(2*2*4*3);
+    std::iota(data.begin(), data.end(), 0.f);
+    migraphx::parameter_map pp;
+    pp["x"]     = migraphx::argument(s, data.data());
+    pp["y"]     = migraphx::argument(s, data.data());  // ?
+
+    // migraphx::shape srois{migraphx::shape::float_type, {2, 4}};
+    // std::vector<float> rois_data = {0.1, 0.15, 0.6, 0.35,
+    //                                 2.1, 1.73, 3.8, 2.13};
+    // migraphx::shape sbi{migraphx::shape::int64_type, {2}};  // batch_index
+    // std::vector<float> bi_data = {0, 1};
+
+    migraphx::shape srois{migraphx::shape::float_type, {1, 4}};
+    std::vector<float> rois_data = {
+                                    1.1, 0.73, 2.2, 1.13};
+    migraphx::shape sbi{migraphx::shape::int64_type, {1}};  // batch_index
+    std::vector<float> bi_data = {0};
+
+
+    pp["rois"]    = migraphx::argument(srois, rois_data.data());
+    pp["batch_ind"]    = migraphx::argument(sbi, bi_data.data());
+    pp["y"]     = migraphx::argument(s, data.data());
+
+    auto result = p.eval(pp).back();
+    std::vector<float> result_vector;
+    result.visit([&](auto output) { result_vector.assign(output.begin(), output.end()); });
+
+printf(" result:  \n");
+for(int i = 0; i < result_vector.size(); i++)
+{
+ printf(" %f ", result_vector[i]);
+ if(i % 9 == 8)
+     printf("\n");
+}
+printf("\n");
+
+    std::vector<float> gold={
+        0.000000, 0.022222, 0.200000, 0.044444, 0.066667, 0.244444, 0.400000, 0.422222, 0.600000, 0.800000,
+        0.822222, 1.000000, 1.200000, 1.222222, 1.400000, 12.000000, 12.022223, 12.200000, 12.044445, 12.066667,
+        12.244445, 12.400000, 12.422222, 12.600000, 12.800000, 12.822222, 13.000000, 13.200000, 13.222222, 13.400000,
+        0.911111, 3.200000, 6.200000, 1.911111, 4.200000, 7.200000, 2.829630, 5.022223, 8.022223, 2.000000,
+        4.000000, 7.000000, 0.000000, 0.000000, 0.000000, 12.911111, 15.200000, 18.200001, 13.911111, 16.199999,  
+        19.200001, 14.829630, 17.022223, 20.022223, 14.000000, 16.000000, 19.000000, 0.000000, 0.000000, 0.000000
+    };
+
+    EXPECT(migraphx::verify::verify_rms_range(result_vector, gold));
+}
+
+
+// TEST_CASE(roialign_half_pixel_verify_test)
+// {
+//     migraphx::program p = read_onnx("roialign_half_pixel_test.onnx");
+//     p.compile(migraphx::make_target("ref"));
+
+//     migraphx::shape s{migraphx::shape::float_type, {1, 1, 2, 3}};
+//     std::vector<float> data = {-5.5, 2.0, 100., 7.0, 0., -1.};
+
+//     migraphx::parameter_map pp;
+//     pp["x"]     = migraphx::argument(s, data.data());
+//     pp["y"]     = migraphx::argument(s, data.data());
+
+//         // migraphx::shape sx{migraphx::shape::float_type, {10, 5, 4, 7}};
+//     migraphx::shape srois{migraphx::shape::float_type, {1, 4}};
+//     std::vector<float> rois_data = {0.1, 0.15, 0.6, 0.35};
+//     migraphx::shape sbi{migraphx::shape::int64_type, {1}};  // batch_index
+//     std::vector<float> bi_data = {0};
+
+//     pp["rois"]    = migraphx::argument(srois, rois_data.data());
+//     pp["batch_ind"]    = migraphx::argument(sbi, bi_data.data());
+
+//     auto result = p.eval(pp).back();
+//     std::vector<float> result_vector;
+//     result.visit([&](auto output) { result_vector.assign(output.begin(), output.end()); });
+
+// printf(" result:  ");
+// for(auto aa : result_vector) printf(" %f ", aa);
+// printf("\n");
+
+//     std::vector<float> gold(6);
+//     float alpha = 0.5;
+//     std::transform(data.begin(), data.end(), gold.begin(), [&](auto x) {
+//         return std::max(0.0f, x) + std::min(0.0f, alpha * std::expm1(x / alpha));
+//     });
+//     EXPECT(migraphx::verify::verify_rms_range(result_vector, gold));
+// }
diff --git a/test/onnx/verify/roialign_verify_test.cpp b/test/onnx/verify/roialign_verify_test.cpp
index b1c9b715af4..f2107280df0 100644
--- a/test/onnx/verify/roialign_verify_test.cpp
+++ b/test/onnx/verify/roialign_verify_test.cpp
@@ -39,11 +39,17 @@ TEST_CASE(roialign_verify_test)
     pp["x"]     = migraphx::argument(s, data.data());
     pp["y"]     = migraphx::argument(s, data.data());
 
-    migraphx::shape srois{migraphx::shape::float_type, {2, 4}};
-    std::vector<float> rois_data = {0.1, 0.15, 0.6, 0.35,
+    // migraphx::shape srois{migraphx::shape::float_type, {2, 4}};
+    // std::vector<float> rois_data = {0.1, 0.15, 0.6, 0.35,
+    //                                 2.1, 1.73, 3.8, 2.13};
+    // migraphx::shape sbi{migraphx::shape::int64_type, {2}};  // batch_index
+    // std::vector<float> bi_data = {0, 1};
+
+    migraphx::shape srois{migraphx::shape::float_type, {1, 4}};
+    std::vector<float> rois_data = {
                                     2.1, 1.73, 3.8, 2.13};
-    migraphx::shape sbi{migraphx::shape::int64_type, {2}};  // batch_index
-    std::vector<float> bi_data = {0, 1};
+    migraphx::shape sbi{migraphx::shape::int64_type, {1}};  // batch_index
+    std::vector<float> bi_data = {0};
 
     pp["rois"]    = migraphx::argument(srois, rois_data.data());
     pp["batch_ind"]    = migraphx::argument(sbi, bi_data.data());

From dbe18b552fa9fee276e4d090394c7289e8162593 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Wed, 25 Sep 2024 16:45:47 +0000
Subject: [PATCH 23/56] test cases 2 rois, fails

---
 ort_roialign.py                               |  12 +-
 test/onnx/gen_onnx.py                         |   6 +-
 test/onnx/roialign_half_pixel_test.onnx       | Bin 360 -> 360 bytes
 .../roialign_half_pixel_verify_test.cpp       | 111 +++++++++++++++---
 4 files changed, 102 insertions(+), 27 deletions(-)

diff --git a/ort_roialign.py b/ort_roialign.py
index db06f24c07d..296d69f20ab 100644
--- a/ort_roialign.py
+++ b/ort_roialign.py
@@ -13,16 +13,16 @@
 
 y=np.ones([2, 2, 4, 7], dtype='f')
 
-# rois=np.array([[0.1, 0.15, 0.6, 0.35],
-#                 [0.1, 0.15, 2.6, 1.35]], dtype='f')
+rois=np.array([[0.1, 0.15, 0.6, 0.35],
+                [0.1, 0.15, 2.6, 1.35]], dtype='f')
 
-rois=np.array([
-                [ 1.1, 0.73, 2.2, 1.13]], dtype='f')
+# rois=np.array([
+#                 [ 1.1, 0.73, 2.2, 1.13]], dtype='f')
 sess = rt.InferenceSession('/workspace/AMDMIGraphX/test/onnx/roialign_half_pixel_test.onnx')
 res = sess.run(['y'], {'x': x,
                     'rois': rois,
-                    # 'batch_ind': [0, 1]})
-                    'batch_ind': [0]})
+                    'batch_ind': [0, 1]})
+                    # 'batch_ind': [0]})
 print(res)
        
 		
diff --git a/test/onnx/gen_onnx.py b/test/onnx/gen_onnx.py
index f95feec6bc2..997d8510c7a 100644
--- a/test/onnx/gen_onnx.py
+++ b/test/onnx/gen_onnx.py
@@ -10133,11 +10133,11 @@ def roialign_test():
 @onnx_test()
 def roialign_half_pixel_test():
     x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [2, 2, 4, 3])
-    roi = helper.make_tensor_value_info('rois', TensorProto.FLOAT, [1, 4])
-    bi = helper.make_tensor_value_info('batch_ind', TensorProto.INT64, [1])
+    roi = helper.make_tensor_value_info('rois', TensorProto.FLOAT, [2, 4])
+    bi = helper.make_tensor_value_info('batch_ind', TensorProto.INT64, [2])
     y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [2, 2, 4, 3])
 
-    # half_pixel is the new mode we're developing for
+    # half_pixel is the newer mode for ROIAlign
     node = onnx.helper.make_node(
         'RoiAlign',
         inputs=['x', 'rois', 'batch_ind'],
diff --git a/test/onnx/roialign_half_pixel_test.onnx b/test/onnx/roialign_half_pixel_test.onnx
index cf2d236317be872ab8ba51a28bb8d8c93c6591f5..b6ca215a9fd362ea9737b42897c1a291c8502e03 100644
GIT binary patch
delta 28
jcmaFC^nz)FCL<%$WGzM+Hl`$LF2>4<|0GzPm;^)tZgK{x

delta 28
jcmaFC^nz)FCL<%`WGzM+HpV1rF2>4<|0GzPm;^)tZcqlN

diff --git a/test/onnx/verify/roialign_half_pixel_verify_test.cpp b/test/onnx/verify/roialign_half_pixel_verify_test.cpp
index 03b7cd48ac5..6dcb45521c0 100644
--- a/test/onnx/verify/roialign_half_pixel_verify_test.cpp
+++ b/test/onnx/verify/roialign_half_pixel_verify_test.cpp
@@ -26,12 +26,10 @@
 #include <migraphx/verify.hpp>
 #include <onnx_test.hpp>
 
-// This passes its own test but doesn't match ort version of test
 TEST_CASE(roialign_half_pixel_verify_test)
 {
     migraphx::program p = read_onnx("roialign_half_pixel_test.onnx");
     p.compile(migraphx::make_target("ref"));
-
     migraphx::shape s{migraphx::shape::float_type, {2, 2, 4, 3}};
     std::vector<float> data(2*2*4*3);
     std::iota(data.begin(), data.end(), 0.f);
@@ -39,17 +37,12 @@ TEST_CASE(roialign_half_pixel_verify_test)
     pp["x"]     = migraphx::argument(s, data.data());
     pp["y"]     = migraphx::argument(s, data.data());  // ?
 
-    // migraphx::shape srois{migraphx::shape::float_type, {2, 4}};
-    // std::vector<float> rois_data = {0.1, 0.15, 0.6, 0.35,
-    //                                 2.1, 1.73, 3.8, 2.13};
-    // migraphx::shape sbi{migraphx::shape::int64_type, {2}};  // batch_index
-    // std::vector<float> bi_data = {0, 1};
-
-    migraphx::shape srois{migraphx::shape::float_type, {1, 4}};
+    migraphx::shape srois{migraphx::shape::float_type, {2, 4}};
     std::vector<float> rois_data = {
+                                    0.1, 0.15, 0.6, 0.35,
                                     1.1, 0.73, 2.2, 1.13};
-    migraphx::shape sbi{migraphx::shape::int64_type, {1}};  // batch_index
-    std::vector<float> bi_data = {0};
+    migraphx::shape sbi{migraphx::shape::int64_type, {2}};  // batch_index
+    std::vector<float> bi_data = {0, 1};
 
 
     pp["rois"]    = migraphx::argument(srois, rois_data.data());
@@ -70,13 +63,95 @@ for(int i = 0; i < result_vector.size(); i++)
 printf("\n");
 
     std::vector<float> gold={
-        0.000000, 0.022222, 0.200000, 0.044444, 0.066667, 0.244444, 0.400000, 0.422222, 0.600000, 0.800000,
-        0.822222, 1.000000, 1.200000, 1.222222, 1.400000, 12.000000, 12.022223, 12.200000, 12.044445, 12.066667,
-        12.244445, 12.400000, 12.422222, 12.600000, 12.800000, 12.822222, 13.000000, 13.200000, 13.222222, 13.400000,
-        0.911111, 3.200000, 6.200000, 1.911111, 4.200000, 7.200000, 2.829630, 5.022223, 8.022223, 2.000000,
-        4.000000, 7.000000, 0.000000, 0.000000, 0.000000, 12.911111, 15.200000, 18.200001, 13.911111, 16.199999,  
-        19.200001, 14.829630, 17.022223, 20.022223, 14.000000, 16.000000, 19.000000, 0.000000, 0.000000, 0.000000
-    };
+    0.00000000e+00, 0.00000000e+00, 4.93826950e-03,
+          8.88888836e-02, 2.00000003e-01, 3.11111122e-01,
+          4.22222227e-01, 5.33333302e-01, 6.44444466e-01,
+         0.00000000e+00, 0.00000000e+00, 4.93826950e-03,
+          8.88888836e-02, 2.00000003e-01, 3.11111122e-01,
+          4.22222227e-01, 5.33333302e-01, 6.44444466e-01,
+         0.00000000e+00, 0.00000000e+00, 4.93826950e-03,
+          8.88888836e-02, 2.00000003e-01, 3.11111122e-01,
+          4.22222227e-01, 5.33333302e-01, 6.44444466e-01,
+         1.90476179e-02, 1.90476179e-02, 2.39858869e-02,
+          1.07936502e-01, 2.19047621e-01, 3.30158740e-01,
+          4.41269815e-01, 5.52380979e-01, 6.63492084e-01,
+         1.71428561e-01, 1.71428561e-01, 1.76366836e-01,
+          2.60317445e-01, 3.71428549e-01, 4.82539713e-01,
+          5.93650818e-01, 7.04761863e-01, 8.15872967e-01,
+         3.42857152e-01, 3.42857152e-01, 3.47795397e-01,
+          4.31746036e-01, 5.42857111e-01, 6.53968275e-01,
+          7.65079260e-01, 8.76190484e-01, 9.87301588e-01,
+         5.14285743e-01, 5.14285743e-01, 5.19223928e-01,
+          6.03174567e-01, 7.14285672e-01, 8.25396836e-01,
+          9.36507940e-01, 1.04761910e+00, 1.15873003e+00,
+
+        1.20000000e+01, 1.20000000e+01, 1.20049391e+01,
+          1.20888891e+01, 1.21999998e+01, 1.23111115e+01,
+          1.24222221e+01, 1.25333328e+01, 1.26444445e+01,
+         1.20000000e+01, 1.20000000e+01, 1.20049391e+01,
+          1.20888891e+01, 1.21999998e+01, 1.23111115e+01,
+          1.24222221e+01, 1.25333328e+01, 1.26444445e+01,
+         1.20000000e+01, 1.20000000e+01, 1.20049391e+01,
+          1.20888891e+01, 1.21999998e+01, 1.23111115e+01,
+          1.24222221e+01, 1.25333328e+01, 1.26444445e+01,
+         1.20190477e+01, 1.20190477e+01, 1.20239868e+01,
+          1.21079369e+01, 1.22190475e+01, 1.23301582e+01,
+          1.24412699e+01, 1.25523796e+01, 1.26634922e+01,
+         1.21714277e+01, 1.21714277e+01, 1.21763659e+01,
+          1.22603178e+01, 1.23714285e+01, 1.24825401e+01,
+          1.25936518e+01, 1.27047615e+01, 1.28158722e+01,
+         1.23428583e+01, 1.23428583e+01, 1.23477964e+01,
+          1.24317465e+01, 1.25428581e+01, 1.26539688e+01,
+          1.27650795e+01, 1.28761902e+01, 1.29873009e+01,
+         1.25142860e+01, 1.25142860e+01, 1.25192232e+01,
+          1.26031752e+01, 1.27142859e+01, 1.28253975e+01,
+          1.29365072e+01, 1.30476189e+01, 1.31587305e+01,
+
+
+       2.41400356e+01, 2.46190472e+01, 2.51746025e+01,
+          2.57301579e+01, 2.60857143e+01, 2.60857143e+01,
+          0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
+         2.49971752e+01, 2.54761906e+01, 2.60317459e+01,
+          2.65873032e+01, 2.69428539e+01, 2.69428539e+01,
+          0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
+         2.60257492e+01, 2.65047607e+01, 2.70603180e+01,
+          2.76158714e+01, 2.79714279e+01, 2.79714279e+01,
+          0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
+         2.70543232e+01, 2.75333328e+01, 2.80888901e+01,
+          2.86444473e+01, 2.90000038e+01, 2.90000038e+01,
+          0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
+         2.80828934e+01, 2.85619030e+01, 2.91174583e+01,
+          2.96730137e+01, 3.00285721e+01, 3.00285721e+01,
+          0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
+         2.91114635e+01, 2.95904770e+01, 3.01460342e+01,
+          3.07015896e+01, 3.10571423e+01, 3.10571423e+01,
+          0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
+         3.01400356e+01, 3.06190453e+01, 3.11746006e+01,
+          3.17301598e+01, 3.20857124e+01, 3.20857124e+01,
+          0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
+
+        3.61400337e+01, 3.66190453e+01, 3.71746063e+01,
+          3.77301559e+01, 3.80857124e+01, 3.80857124e+01,
+          0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
+         3.69971771e+01, 3.74761848e+01, 3.80317497e+01,
+          3.85872993e+01, 3.89428558e+01, 3.89428558e+01,
+          0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
+         3.80257492e+01, 3.85047646e+01, 3.90603180e+01,
+          3.96158714e+01, 3.99714279e+01, 3.99714279e+01,
+          0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
+         3.90543251e+01, 3.95333328e+01, 4.00888863e+01,
+          4.06444435e+01, 4.10000038e+01, 4.10000038e+01,
+          0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
+         4.00828934e+01, 4.05619049e+01, 4.11174622e+01,
+          4.16730156e+01, 4.20285721e+01, 4.20285721e+01,
+          0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
+         4.11114655e+01, 4.15904732e+01, 4.21460304e+01,
+          4.27015839e+01, 4.30571404e+01, 4.30571404e+01,
+          0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
+         4.21400299e+01, 4.26190529e+01, 4.31746025e+01,
+          4.37301636e+01, 4.40857201e+01, 4.40857201e+01,
+          0.00000000e+00, 0.00000000e+00, 0.00000000e+00
+              };
 
     EXPECT(migraphx::verify::verify_rms_range(result_vector, gold));
 }

From 4cb582e2522b5b61cae8d8d8c005e4b24b05ebe2 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Wed, 25 Sep 2024 18:28:19 +0000
Subject: [PATCH 24/56] created out of bounds test for roialign.  Learned that
 existing code gives correct results only for ROI in bounds.

---
 ort_roialign.py                               |   2 +-
 .../roialign_half_pixel_verify_test.cpp       | 213 ++++++++----------
 2 files changed, 96 insertions(+), 119 deletions(-)

diff --git a/ort_roialign.py b/ort_roialign.py
index 296d69f20ab..cf02b431da4 100644
--- a/ort_roialign.py
+++ b/ort_roialign.py
@@ -14,7 +14,7 @@
 y=np.ones([2, 2, 4, 7], dtype='f')
 
 rois=np.array([[0.1, 0.15, 0.6, 0.35],
-                [0.1, 0.15, 2.6, 1.35]], dtype='f')
+                [1.1, 0.73, 1.9, 1.13]], dtype='f')
 
 # rois=np.array([
 #                 [ 1.1, 0.73, 2.2, 1.13]], dtype='f')
diff --git a/test/onnx/verify/roialign_half_pixel_verify_test.cpp b/test/onnx/verify/roialign_half_pixel_verify_test.cpp
index 6dcb45521c0..417988277b7 100644
--- a/test/onnx/verify/roialign_half_pixel_verify_test.cpp
+++ b/test/onnx/verify/roialign_half_pixel_verify_test.cpp
@@ -40,7 +40,7 @@ TEST_CASE(roialign_half_pixel_verify_test)
     migraphx::shape srois{migraphx::shape::float_type, {2, 4}};
     std::vector<float> rois_data = {
                                     0.1, 0.15, 0.6, 0.35,
-                                    1.1, 0.73, 2.2, 1.13};
+                                    1.1, 0.73, 1.9, 1.13};
     migraphx::shape sbi{migraphx::shape::int64_type, {2}};  // batch_index
     std::vector<float> bi_data = {0, 1};
 
@@ -63,133 +63,110 @@ for(int i = 0; i < result_vector.size(); i++)
 printf("\n");
 
     std::vector<float> gold={
-    0.00000000e+00, 0.00000000e+00, 4.93826950e-03,
-          8.88888836e-02, 2.00000003e-01, 3.11111122e-01,
-          4.22222227e-01, 5.33333302e-01, 6.44444466e-01,
-         0.00000000e+00, 0.00000000e+00, 4.93826950e-03,
-          8.88888836e-02, 2.00000003e-01, 3.11111122e-01,
-          4.22222227e-01, 5.33333302e-01, 6.44444466e-01,
-         0.00000000e+00, 0.00000000e+00, 4.93826950e-03,
-          8.88888836e-02, 2.00000003e-01, 3.11111122e-01,
-          4.22222227e-01, 5.33333302e-01, 6.44444466e-01,
-         1.90476179e-02, 1.90476179e-02, 2.39858869e-02,
-          1.07936502e-01, 2.19047621e-01, 3.30158740e-01,
-          4.41269815e-01, 5.52380979e-01, 6.63492084e-01,
-         1.71428561e-01, 1.71428561e-01, 1.76366836e-01,
-          2.60317445e-01, 3.71428549e-01, 4.82539713e-01,
-          5.93650818e-01, 7.04761863e-01, 8.15872967e-01,
-         3.42857152e-01, 3.42857152e-01, 3.47795397e-01,
-          4.31746036e-01, 5.42857111e-01, 6.53968275e-01,
-          7.65079260e-01, 8.76190484e-01, 9.87301588e-01,
-         5.14285743e-01, 5.14285743e-01, 5.19223928e-01,
-          6.03174567e-01, 7.14285672e-01, 8.25396836e-01,
-          9.36507940e-01, 1.04761910e+00, 1.15873003e+00,
-
-        1.20000000e+01, 1.20000000e+01, 1.20049391e+01,
-          1.20888891e+01, 1.21999998e+01, 1.23111115e+01,
-          1.24222221e+01, 1.25333328e+01, 1.26444445e+01,
-         1.20000000e+01, 1.20000000e+01, 1.20049391e+01,
-          1.20888891e+01, 1.21999998e+01, 1.23111115e+01,
-          1.24222221e+01, 1.25333328e+01, 1.26444445e+01,
-         1.20000000e+01, 1.20000000e+01, 1.20049391e+01,
-          1.20888891e+01, 1.21999998e+01, 1.23111115e+01,
-          1.24222221e+01, 1.25333328e+01, 1.26444445e+01,
-         1.20190477e+01, 1.20190477e+01, 1.20239868e+01,
-          1.21079369e+01, 1.22190475e+01, 1.23301582e+01,
-          1.24412699e+01, 1.25523796e+01, 1.26634922e+01,
-         1.21714277e+01, 1.21714277e+01, 1.21763659e+01,
-          1.22603178e+01, 1.23714285e+01, 1.24825401e+01,
-          1.25936518e+01, 1.27047615e+01, 1.28158722e+01,
-         1.23428583e+01, 1.23428583e+01, 1.23477964e+01,
-          1.24317465e+01, 1.25428581e+01, 1.26539688e+01,
-          1.27650795e+01, 1.28761902e+01, 1.29873009e+01,
-         1.25142860e+01, 1.25142860e+01, 1.25192232e+01,
-          1.26031752e+01, 1.27142859e+01, 1.28253975e+01,
-          1.29365072e+01, 1.30476189e+01, 1.31587305e+01,
-
-
-       2.41400356e+01, 2.46190472e+01, 2.51746025e+01,
-          2.57301579e+01, 2.60857143e+01, 2.60857143e+01,
-          0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
-         2.49971752e+01, 2.54761906e+01, 2.60317459e+01,
-          2.65873032e+01, 2.69428539e+01, 2.69428539e+01,
-          0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
-         2.60257492e+01, 2.65047607e+01, 2.70603180e+01,
-          2.76158714e+01, 2.79714279e+01, 2.79714279e+01,
-          0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
-         2.70543232e+01, 2.75333328e+01, 2.80888901e+01,
-          2.86444473e+01, 2.90000038e+01, 2.90000038e+01,
-          0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
-         2.80828934e+01, 2.85619030e+01, 2.91174583e+01,
-          2.96730137e+01, 3.00285721e+01, 3.00285721e+01,
-          0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
-         2.91114635e+01, 2.95904770e+01, 3.01460342e+01,
-          3.07015896e+01, 3.10571423e+01, 3.10571423e+01,
-          0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
-         3.01400356e+01, 3.06190453e+01, 3.11746006e+01,
-          3.17301598e+01, 3.20857124e+01, 3.20857124e+01,
-          0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
-
-        3.61400337e+01, 3.66190453e+01, 3.71746063e+01,
-          3.77301559e+01, 3.80857124e+01, 3.80857124e+01,
-          0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
-         3.69971771e+01, 3.74761848e+01, 3.80317497e+01,
-          3.85872993e+01, 3.89428558e+01, 3.89428558e+01,
-          0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
-         3.80257492e+01, 3.85047646e+01, 3.90603180e+01,
-          3.96158714e+01, 3.99714279e+01, 3.99714279e+01,
-          0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
-         3.90543251e+01, 3.95333328e+01, 4.00888863e+01,
-          4.06444435e+01, 4.10000038e+01, 4.10000038e+01,
-          0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
-         4.00828934e+01, 4.05619049e+01, 4.11174622e+01,
-          4.16730156e+01, 4.20285721e+01, 4.20285721e+01,
-          0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
-         4.11114655e+01, 4.15904732e+01, 4.21460304e+01,
-          4.27015839e+01, 4.30571404e+01, 4.30571404e+01,
-          0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
-         4.21400299e+01, 4.26190529e+01, 4.31746025e+01,
-          4.37301636e+01, 4.40857201e+01, 4.40857201e+01,
-          0.00000000e+00, 0.00000000e+00, 0.00000000e+00
+ 0.000000, 0.000000, 0.004938, 0.088889, 0.200000, 0.311111, 0.422222, 0.533333, 0.644444,
+ 0.000000, 0.000000, 0.004938, 0.088889, 0.200000, 0.311111, 0.422222, 0.533333, 0.644444,
+ 0.000000, 0.000000, 0.004938, 0.088889, 0.200000, 0.311111, 0.422222, 0.533333, 0.644444,
+ 0.019048, 0.019048, 0.023986, 0.107937, 0.219048, 0.330159, 0.441270, 0.552381, 0.663492,
+ 0.171429, 0.171429, 0.176367, 0.260317, 0.371429, 0.482540, 0.593651, 0.704762, 0.815873,
+ 0.342857, 0.342857, 0.347795, 0.431746, 0.542857, 0.653968, 0.765079, 0.876190, 0.987302,
+ 0.514286, 0.514286, 0.519224, 0.603175, 0.714286, 0.825397, 0.936508, 1.047619, 1.158730,
+ 12.000000, 12.000000, 12.004938, 12.088889, 12.200000, 12.311111, 12.422222, 12.533334, 12.644444,
+ 12.000000, 12.000000, 12.004938, 12.088889, 12.200000, 12.311111, 12.422222, 12.533334, 12.644444,
+ 12.000000, 12.000000, 12.004938, 12.088889, 12.200000, 12.311111, 12.422222, 12.533334, 12.644444,
+ 12.019048, 12.019048, 12.023986, 12.107937, 12.219048, 12.330158, 12.441270, 12.552382, 12.663492,
+ 12.171429, 12.171429, 12.176367, 12.260318, 12.371428, 12.482540, 12.593651, 12.704762, 12.815873,
+ 12.342857, 12.342857, 12.347795, 12.431746, 12.542857, 12.653969, 12.765079, 12.876190, 12.987302,
+ 12.514286, 12.514286, 12.519224, 12.603174, 12.714286, 12.825397, 12.936508, 13.047619, 13.158731,
+ 4.840318, 5.009453, 5.051429, 5.051429, 5.051429, 5.051429, 5.051429, 1.683810, 0.000000,
+ 5.183175, 5.352311, 5.394286, 5.394286, 5.394286, 5.394286, 5.394286, 1.798095, 0.000000,
+ 5.526032, 5.695168, 5.737143, 5.737143, 5.737143, 5.737143, 5.737143, 1.912381, 0.000000,
+ 5.868889, 6.038025, 6.080000, 6.080000, 6.080000, 6.080000, 6.080000, 2.026667, 0.000000,
+ 6.211746, 6.380882, 6.422857, 6.422857, 6.422857, 6.422857, 6.422857, 2.140952, 0.000000,
+ 6.554603, 6.723739, 6.765714, 6.765714, 6.765714, 6.765714, 6.765714, 2.255238, 0.000000,
+ 6.897460, 7.066596, 7.108572, 7.108572, 7.108572, 7.108572, 7.108572, 2.369524, 0.000000,
+ 16.840317, 17.009453, 17.051428, 17.051428, 17.051428, 17.051428, 17.051428, 5.683809, 0.000000,
+ 17.183174, 17.352310, 17.394285, 17.394285, 17.394285, 17.394285, 17.394285, 5.798095, 0.000000,
+ 17.526031, 17.695168, 17.737143, 17.737143, 17.737143, 17.737143, 17.737143, 5.912381, 0.000000,
+ 17.868889, 18.038025, 18.080000, 18.080000, 18.080000, 18.080000, 18.080000, 6.026667, 0.000000,
+ 18.211746, 18.380882, 18.422857, 18.422857, 18.422857, 18.422857, 18.422857, 6.140953, 0.000000,
+ 18.554604, 18.723740, 18.765715, 18.765715, 18.765715, 18.765715, 18.765715, 6.255238, 0.000000,
+ 18.897461, 19.066597, 19.108572, 19.108572, 19.108572, 19.108572, 19.108572, 6.369524, 0.000000
               };
 
     EXPECT(migraphx::verify::verify_rms_range(result_vector, gold));
 }
 
 
-// TEST_CASE(roialign_half_pixel_verify_test)
-// {
-//     migraphx::program p = read_onnx("roialign_half_pixel_test.onnx");
-//     p.compile(migraphx::make_target("ref"));
+TEST_CASE(roialign_half_pixel_oob_verify_test)
+{
+    // One ROI extends outside of bounds of input array,
+    // when scaled by spatial_scale
+    migraphx::program p = read_onnx("roialign_half_pixel_test.onnx");
+    p.compile(migraphx::make_target("ref"));
+    migraphx::shape s{migraphx::shape::float_type, {2, 2, 4, 3}};
+    std::vector<float> data(2*2*4*3);
+    std::iota(data.begin(), data.end(), 0.f);
+    migraphx::parameter_map pp;
+    pp["x"]     = migraphx::argument(s, data.data());
+    pp["y"]     = migraphx::argument(s, data.data());  // ?
 
-//     migraphx::shape s{migraphx::shape::float_type, {1, 1, 2, 3}};
-//     std::vector<float> data = {-5.5, 2.0, 100., 7.0, 0., -1.};
+    migraphx::shape srois{migraphx::shape::float_type, {2, 4}};
+    std::vector<float> rois_data = {
+                                    0.1, 0.15, 0.6, 0.35,
+                                    1.1, 0.73, 2.5, 1.13};
+    migraphx::shape sbi{migraphx::shape::int64_type, {2}};  // batch_index
+    std::vector<float> bi_data = {0, 1};
+
+
+    pp["rois"]    = migraphx::argument(srois, rois_data.data());
+    pp["batch_ind"]    = migraphx::argument(sbi, bi_data.data());
+    pp["y"]     = migraphx::argument(s, data.data());
 
-//     migraphx::parameter_map pp;
-//     pp["x"]     = migraphx::argument(s, data.data());
-//     pp["y"]     = migraphx::argument(s, data.data());
+    auto result = p.eval(pp).back();
+    std::vector<float> result_vector;
+    result.visit([&](auto output) { result_vector.assign(output.begin(), output.end()); });
 
-//         // migraphx::shape sx{migraphx::shape::float_type, {10, 5, 4, 7}};
-//     migraphx::shape srois{migraphx::shape::float_type, {1, 4}};
-//     std::vector<float> rois_data = {0.1, 0.15, 0.6, 0.35};
-//     migraphx::shape sbi{migraphx::shape::int64_type, {1}};  // batch_index
-//     std::vector<float> bi_data = {0};
+printf(" result:  \n");
+for(int i = 0; i < result_vector.size(); i++)
+{
+ printf(" %f ", result_vector[i]);
+ if(i % 9 == 8)
+     printf("\n");
+}
+printf("\n");
 
-//     pp["rois"]    = migraphx::argument(srois, rois_data.data());
-//     pp["batch_ind"]    = migraphx::argument(sbi, bi_data.data());
+    std::vector<float> gold={
+ 0.000000, 0.000000, 0.004938, 0.088889, 0.200000, 0.311111, 0.422222, 0.533333, 0.644444,
+ 0.000000, 0.000000, 0.004938, 0.088889, 0.200000, 0.311111, 0.422222, 0.533333, 0.644444,
+ 0.000000, 0.000000, 0.004938, 0.088889, 0.200000, 0.311111, 0.422222, 0.533333, 0.644444,
+ 0.019048, 0.019048, 0.023986, 0.107937, 0.219048, 0.330159, 0.441270, 0.552381, 0.663492,
+ 0.171429, 0.171429, 0.176367, 0.260317, 0.371429, 0.482540, 0.593651, 0.704762, 0.815873,
+ 0.342857, 0.342857, 0.347795, 0.431746, 0.542857, 0.653968, 0.765079, 0.876190, 0.987302,
+ 0.514286, 0.514286, 0.519224, 0.603175, 0.714286, 0.825397, 0.936508, 1.047619, 1.158730,
+ 12.000000, 12.000000, 12.004938, 12.088889, 12.200000, 12.311111, 12.422222, 12.533334, 12.644444,
+ 12.000000, 12.000000, 12.004938, 12.088889, 12.200000, 12.311111, 12.422222, 12.533334, 12.644444,
+ 12.000000, 12.000000, 12.004938, 12.088889, 12.200000, 12.311111, 12.422222, 12.533334, 12.644444,
+ 12.019048, 12.019048, 12.023986, 12.107937, 12.219048, 12.330158, 12.441270, 12.552382, 12.663492,
+ 12.171429, 12.171429, 12.176367, 12.260318, 12.371428, 12.482540, 12.593651, 12.704762, 12.815873,
+ 12.342857, 12.342857, 12.347795, 12.431746, 12.542857, 12.653969, 12.765079, 12.876190, 12.987302,
+ 12.514286, 12.514286, 12.519224, 12.603174, 12.714286, 12.825397, 12.936508, 13.047619, 13.158731,
+ 4.840318, 5.009453, 5.051429, 5.051429, 5.051429, 5.051429, 5.051429, 1.683810, 0.000000,
+ 5.183175, 5.352311, 5.394286, 5.394286, 5.394286, 5.394286, 5.394286, 1.798095, 0.000000,
+ 5.526032, 5.695168, 5.737143, 5.737143, 5.737143, 5.737143, 5.737143, 1.912381, 0.000000,
+ 5.868889, 6.038025, 6.080000, 6.080000, 6.080000, 6.080000, 6.080000, 2.026667, 0.000000,
+ 6.211746, 6.380882, 6.422857, 6.422857, 6.422857, 6.422857, 6.422857, 2.140952, 0.000000,
+ 6.554603, 6.723739, 6.765714, 6.765714, 6.765714, 6.765714, 6.765714, 2.255238, 0.000000,
+ 6.897460, 7.066596, 7.108572, 7.108572, 7.108572, 7.108572, 7.108572, 2.369524, 0.000000,
+ 16.840317, 17.009453, 17.051428, 17.051428, 17.051428, 17.051428, 17.051428, 5.683809, 0.000000,
+ 17.183174, 17.352310, 17.394285, 17.394285, 17.394285, 17.394285, 17.394285, 5.798095, 0.000000,
+ 17.526031, 17.695168, 17.737143, 17.737143, 17.737143, 17.737143, 17.737143, 5.912381, 0.000000,
+ 17.868889, 18.038025, 18.080000, 18.080000, 18.080000, 18.080000, 18.080000, 6.026667, 0.000000,
+ 18.211746, 18.380882, 18.422857, 18.422857, 18.422857, 18.422857, 18.422857, 6.140953, 0.000000,
+ 18.554604, 18.723740, 18.765715, 18.765715, 18.765715, 18.765715, 18.765715, 6.255238, 0.000000,
+ 18.897461, 19.066597, 19.108572, 19.108572, 19.108572, 19.108572, 19.108572, 6.369524, 0.000000
+              };
 
-//     auto result = p.eval(pp).back();
-//     std::vector<float> result_vector;
-//     result.visit([&](auto output) { result_vector.assign(output.begin(), output.end()); });
+    EXPECT(migraphx::verify::verify_rms_range(result_vector, gold));
+}
 
-// printf(" result:  ");
-// for(auto aa : result_vector) printf(" %f ", aa);
-// printf("\n");
 
-//     std::vector<float> gold(6);
-//     float alpha = 0.5;
-//     std::transform(data.begin(), data.end(), gold.begin(), [&](auto x) {
-//         return std::max(0.0f, x) + std::min(0.0f, alpha * std::expm1(x / alpha));
-//     });
-//     EXPECT(migraphx::verify::verify_rms_range(result_vector, gold));
-// }

From 0469b83ef13efec8078a4fff549aec73f3fbd9f9 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Wed, 2 Oct 2024 19:00:40 +0000
Subject: [PATCH 25/56] work in progress

---
 ort_roialign.py                               |  57 +++-----
 src/include/migraphx/op/roialign.hpp          |  62 ++++++---
 test/onnx/gen_onnx.py                         |  57 ++++----
 test/onnx/parse/roialign_test.cpp             |   4 +-
 test/onnx/roialign_half_pixel_test.onnx       | Bin 360 -> 360 bytes
 test/onnx/roialign_test.onnx                  | Bin 345 -> 345 bytes
 .../roialign_half_pixel_verify_test.cpp       | 127 +++---------------
 test/op_shape_test.cpp                        |  11 ++
 8 files changed, 121 insertions(+), 197 deletions(-)

diff --git a/ort_roialign.py b/ort_roialign.py
index cf02b431da4..6a7a3ee3dce 100644
--- a/ort_roialign.py
+++ b/ort_roialign.py
@@ -11,52 +11,27 @@
 x = np.array(np.arange(2*2*4*3), dtype='f')
 x = np.reshape(x, [2, 2, 4, 3])
 
-y=np.ones([2, 2, 4, 7], dtype='f')
-
-rois=np.array([[0.1, 0.15, 0.6, 0.35],
-                [1.1, 0.73, 1.9, 1.13]], dtype='f')
+y=np.ones([2, 2, 4, 3], dtype='f')
+
+# matches roialign_half_pixel_verify_test
+# rois=np.array([[0.1, 0.15, 0.6, 0.35],
+#                 [1.1, 0.73, 1.9, 1.13]], dtype='f')
+# matches roialign_half_pixel_oob_verify_test
+rois=np.array([
+                [1.1, 0.73, 1.7, 1.13],
+                [1.1, 0.73, 2.6, 1.13]
+                #         [1.1, 0.73, 2.6, 1.13]
+                ], dtype='f')
 
 # rois=np.array([
 #                 [ 1.1, 0.73, 2.2, 1.13]], dtype='f')
 sess = rt.InferenceSession('/workspace/AMDMIGraphX/test/onnx/roialign_half_pixel_test.onnx')
+# sess = rt.InferenceSession('/workspace/AMDMIGraphX/test/onnx/roialign_one_roi_asdf_test.onnx') 
 res = sess.run(['y'], {'x': x,
                     'rois': rois,
                     'batch_ind': [0, 1]})
-                    # 'batch_ind': [0]})
-print(res)
+                  #   'batch_ind': [0]})
+print(' ORT test model is roialign_one_roi_asdf_test.onnx, rois_data is \n',rois, 
+      ' result is \n', res)
        
-		
-# model_file = "test/onnx/roialign_test.onnx"
-# onnx_model = onnx.load(model_file)
-# onnx.checker.check_model(onnx_model)
-
-
-# #define the priority order for the execution providers
-# EP_list = ['CPUExecutionProvider']
-
-# aa = np.asarray(np.arange(3*2*4*5), dtype='f')
-# # bi = np.reshape(aa, [3, 2, 4, 5])
-
-# # initialize the model.onnx
-# sess = rt.InferenceSession(model_file, providers=EP_list)
-# x, rois, batch_ind = (np.reshape(aa, [3, 2, 4, 5]),
-#             np.array([[0.1, 0.15, 0.6, 0.35],
-#                       [2.1, 1.73, 3.8, 2.13]], dtype='f'),
-#             np.array([0, 1], dtype='int64'))
-
-# #  Use the parameter names defined in the onnx file
-# output = sess.run(None, {'x':  x,
-#                          'rois': rois,
-#                          'batch_ind': batch_ind,
-#                          })
-
-# print(' output is ', output)
-
-
-# # get the outputs metadata as a list of :class:`onnxruntime.NodeArg`
-# output_name = sess.get_outputs()[0].name
-
-# # get the inputs metadata as a list of :class:`onnxruntime.NodeArg`
-# input_name = sess.get_inputs()[0].name
-# print("Names are  ",input_name, output_name)
-
+		
\ No newline at end of file
diff --git a/src/include/migraphx/op/roialign.hpp b/src/include/migraphx/op/roialign.hpp
index 06ec14c0016..ca8a2db1e3d 100644
--- a/src/include/migraphx/op/roialign.hpp
+++ b/src/include/migraphx/op/roialign.hpp
@@ -74,6 +74,11 @@ struct roialign
         auto type     = inputs.at(0).type();
 
         // check input correct
+        if(inputs.at(0).type() != shape::float_type or inputs.at(1).type() != shape::float_type or inputs.at(2).type() != shape::int64_type)
+        {
+            MIGRAPHX_THROW("ROIALIGN: incorrect type for input 1 or 2 or 3!");
+        }
+
         if(bi_lens.size() != 1)
         {
             MIGRAPHX_THROW("ROIALIGN: batch indices should be 1 dimension!");
@@ -122,9 +127,9 @@ shape temp_s = {shape::float_type,{temp_lens[1], temp_lens[0], temp_lens[3], tem
             // and iterates the fastest.
             std::array<std::size_t, 2> p = {idx_v[1], idx_v[0]};
             std::array<std::size_t, 2> i = {idx_v[3], idx_v[2]};//  <== these are always the same
-printf("\n IIIII other index %lu , %lu , %lu , %lu  i=%lu   temp_index = %lu \n", p[0], p[1], i[0], i[1], index, temp_s.index({p[0], p[1], i[0], i[1]}));
-printf(" my index= %lu  reverse temp=%lu\n ", comp_s.index({p[1], p[0], i[1], i[0]}), temp_s.index({p[1], p[0], i[1], i[0]}));
-printf(" more index= %lu  reverse ...=%lu\n ", comp_s.index({p[0], p[1], i[0], i[1]}), temp_s.index({p[0], p[1], i[0], i[1]}));
+// printf("\n IIIII other index %lu , %lu , %lu , %lu  i=%lu   temp_index = %lu \n", p[0], p[1], i[0], i[1], index, temp_s.index({p[0], p[1], i[0], i[1]}));
+// printf(" my index= %lu  reverse temp=%lu\n ", comp_s.index({p[1], p[0], i[1], i[0]}), temp_s.index({p[1], p[0], i[1], i[0]}));
+// printf(" more index= %lu  reverse ...=%lu\n ", comp_s.index({p[0], p[1], i[0], i[1]}), temp_s.index({p[0], p[1], i[0], i[1]}));
             // xy is scaled coordinates of start point of ROI
             std::array<float, 2> xy{};
             // low, high are floor and ceiling of the xy value (i.e. the bounds of the pixel it lies inside)
@@ -168,8 +173,8 @@ printf(" more index= %lu  reverse ...=%lu\n ", comp_s.index({p[0], p[1], i[0], i
                 }
 // printf(" \n");                
             }
-            printf(" JJJJJ  xy[0]=%f  xy[1] = %f                             dims[1]=%lu  low%ld-%ld  high %ld-%ld   i=%zu      dims[0]=%lu \n\n",
-                            xy[0], xy[1], dims[1], low[1], low[0],  high[1], high[0], index, dims[0]);
+            // printf(" JJJJJ  xy[0]=%f  xy[1] = %f                             dims[1]=%lu  low%ld-%ld  high %ld-%ld   i=%zu      dims[0]=%lu \n\n",
+            //                 xy[0], xy[1], dims[1], low[1], low[0],  high[1], high[0], index, dims[0]);
             results[index].pos = {low[1] * dims[0] + low[0],
                                   low[1] * dims[0] + high[0],
                                   high[1] * dims[0] + low[0],
@@ -183,12 +188,14 @@ printf(" more index= %lu  reverse ...=%lu\n ", comp_s.index({p[0], p[1], i[0], i
             //    ly, lx, hy, hx);
             // save weights and indices
             results[index].w = {hy * hx, hy * lx, ly * hx, ly * lx};
+// printf(" DDDDD index %d    %f  %f  %f  %f \n", pre_calc_index,
+//     float(pc.w1), float(pc.w2), float(pc.w3), float(pc.w4));
 
         });
-printf(" AAAAA here we are\n");
-        for(int iix = 0; iix < results.size(); iix++)
-          printf(" SSSSS %d    %lu  %lu  %lu  %lu   %f  %f  %f  %f\n", iix, results[iix].pos[0], results[iix].pos[1], results[iix].pos[2], results[iix].pos[3],
-                    results[iix].w[0], results[iix].w[1], results[iix].w[2], results[iix].w[3]);
+// // printf(" AAAAA here we are\n");
+//         for(int iix = 0; iix < results.size(); iix++)
+//           printf(" SSSSS %d    %lu  %lu  %lu  %lu   %f  %f  %f  %f\n", iix, results[iix].pos[0], results[iix].pos[1], results[iix].pos[2], results[iix].pos[3],
+//                    results[iix].w[0], results[iix].w[1], results[iix].w[2], results[iix].w[3]);
 
         return results;
     }
@@ -211,11 +218,12 @@ printf(" AAAAA here we are\n");
         double final(double x, std::size_t y) { return (y == 0) ? 0.0 : (x / y); }
     };
 
+    // Calculate a pooling value for 1 block of bin_grid_size*bin_grid_size weights
     template <class T, class Op>
-    std::tuple<double, int64_t> calc_pooling(const T& data,
+    double calc_pooling(const T& data,
                                              const std::array<std::size_t, 2>& bin_grid_size,
                                              const std::vector<pos_weight>& pos_weights,
-                                             int64_t index,
+                                             int64_t& index,
                                              Op op) const
     {
         double output_val   = op.init();
@@ -223,17 +231,26 @@ printf(" AAAAA here we are\n");
         dfor(bin_grid_size[0], bin_grid_size[1])([&](auto, auto) {
             const auto& pc = pos_weights[index];
             std::array<double, 4> wv;
+            // printf(" WWWWW ");
             std::transform(
                 pc.w.begin(), pc.w.end(), pc.pos.begin(), wv.begin(), [&](auto w, auto pos) {
+
+
+
+// std::cout << " YYYYY data starting at " << &(*(data)) ;
+// printf("  %lu, %f->%f   \n", pos, w, *(data + pos) * w);
                     return *(data + pos) * w;
                 });
+    // for(double aa : wv)
+    //   printf(" %d   ", aa);
+            // printf("\n");
             output_val = std::accumulate(wv.begin(), wv.end(), output_val, op);
             index += 1;
         });
 
         output_val = op.final(output_val, count);
 
-        return {output_val, index};
+        return output_val;
     }
 
     argument compute(const shape& output_shape, std::vector<argument> args) const
@@ -274,7 +291,7 @@ printf(" AAAAA here we are\n");
                     roi_size[ii] = roi_ends[ii] - roi_starts[ii];
                     if(coord_trans_mode != "half_pixel")
                         roi_size[ii] = std::max(roi_size[ii], 1.0f);
-printf("\n KKKKK roi_size %f out_dims %lu     \n", roi_size[ii] , out_dims[ii]);
+// printf("\n KKKKK ii %ld  roi_size %f   roi_batch_ind %ld  out_dims %lu     \n", ii, roi_size[ii] , roi_batch_ind,  out_dims[ii]);
                     bin_size[ii]      = roi_size[ii] / out_dims[ii];
                     bin_grid_size[ii] = (sampling_ratio > 0)
                                             ? sampling_ratio
@@ -292,8 +309,14 @@ printf("\n KKKKK roi_size %f out_dims %lu     \n", roi_size[ii] , out_dims[ii]);
                 std::vector<std::size_t> comp_lens1 = {channels, out_dims[0], out_dims[1]};
                 shape comp_s1{migraphx::shape::float_type, comp_lens1};
                 std::vector<int64_t> vec_index(channels, 0);
-                shape_for_each(comp_s1, [&](const auto& idx) {
-                    auto c  = idx[0];
+// printf(" XXXXX  %lu    (bottom_data + %d * %ld + %ld) * %lu * %lu\n",// ORT does this for 2 channels, 2 ROI
+//  static_cast<int64_t>((roi_batch_ind * channels + 0) *
+//                                                            in_dims[0] * in_dims[1]),
+//      int(roi_batch_ind),  channels, (size_t)0, in_dims[0], in_dims[1]);  // offset pointer to data for this ROI (4 total)
+    
+                    // Iterate through each dimension in [channels, out_dims[1], out_dims[2]]
+                    shape_for_each(comp_s1, [&](const auto& idx) {
+                    auto c  = idx[0];  // channel count
                     auto ph = idx[1];
                     auto pw = idx[2];
 
@@ -301,7 +324,13 @@ printf("\n KKKKK roi_size %f out_dims %lu     \n", roi_size[ii] , out_dims[ii]);
                         bottom_data + static_cast<int64_t>((roi_batch_ind * channels + c) *
                                                            in_dims[0] * in_dims[1]);
                     double output_val;
-                    std::tie(output_val, vec_index[c]) =
+// printf(" UUUUU  bottom_data %d  %lu %lu pre_calc size=%lu vec_index %lu    ", int(*offset_bottom_data), 
+// bin_grid_size[0], bin_grid_size[1],
+// pre_calc.size(), vec_index[c]);
+
+// printf("cont.  c=%ld  ph  =  %ld  pw = %ld  n=%ld roi_batch_ind %ld\n", c, ph, pw, n, roi_batch_ind);
+
+                    output_val =
                         (mode == migraphx::op::pooling_mode::average)
                             ? this->calc_pooling(offset_bottom_data,
                                                  bin_grid_size,
@@ -313,6 +342,7 @@ printf("\n KKKKK roi_size %f out_dims %lu     \n", roi_size[ii] , out_dims[ii]);
                                                  pre_calc,
                                                  vec_index[c],
                                                  max_pool{});
+// printf(" TTTTT idx=%3ld  output_val=%f\n", vec_index[c] % 9 - 1, output_val);                                                 
                     output(n, c, ph, pw) = output_val;
                 });
             });
diff --git a/test/onnx/gen_onnx.py b/test/onnx/gen_onnx.py
index 997d8510c7a..3b7e4697556 100644
--- a/test/onnx/gen_onnx.py
+++ b/test/onnx/gen_onnx.py
@@ -10112,10 +10112,11 @@ def roialign_default_test():
 
 @onnx_test()
 def roialign_test():
-    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [3, 2, 4, 5])
-    roi = helper.make_tensor_value_info('rois', TensorProto.FLOAT, [2, 4])
-    bi = helper.make_tensor_value_info('batch_ind', TensorProto.INT64, [2])
-    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [3, 2, 4, 5])
+    # Roialign with output_half_pixel mode is backward-compatible.
+    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [10, 5, 4, 7])
+    roi = helper.make_tensor_value_info('rois', TensorProto.FLOAT, [8, 4])
+    bi = helper.make_tensor_value_info('batch_ind', TensorProto.INT64, [8])
+    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [8, 4, 5, 5])
     node = onnx.helper.make_node(
         'RoiAlign',
         inputs=['x', 'rois', 'batch_ind'],
@@ -10143,39 +10144,37 @@ def roialign_half_pixel_test():
         inputs=['x', 'rois', 'batch_ind'],
         outputs=['y'],
         spatial_scale=2.0,
-        output_height=7,
-        output_width=9,
-        sampling_ratio=3,
+        output_height=2,
+        output_width=3,
+        sampling_ratio=2,
         mode="avg",
         coordinate_transformation_mode="half_pixel")
 
     return ([node], [x, roi, bi], [y])
 
 
+@onnx_test()
+def roialign_half_pixel_roi_test():
+    # Same as roialign_half_pixel_test but contains more ROIs than there
+    # are batch dimensions.
+    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [1, 1, 2, 3])
+    roi = helper.make_tensor_value_info('rois', TensorProto.FLOAT, [2, 4])
+    bi = helper.make_tensor_value_info('batch_ind', TensorProto.INT64, [2])
+    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [2, 4, 2, 2])
 
+    # half_pixel is the new mode we're developing for
+    node = onnx.helper.make_node(
+        'RoiAlign',
+        inputs=['x', 'rois', 'batch_ind'],
+        outputs=['y'],
+        spatial_scale=2.0,
+        output_height=2,
+        output_width=3,
+        sampling_ratio=2,
+        mode="avg",
+        coordinate_transformation_mode="half_pixel")
 
-
-
-# @onnx_test()
-# def roialign_half_pixel_test():
-#     x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [1, 1, 2, 3])
-#     roi = helper.make_tensor_value_info('rois', TensorProto.FLOAT, [1, 4])
-#     bi = helper.make_tensor_value_info('batch_ind', TensorProto.INT64, [1])
-#     y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [1, 4, 2, 2])
-
-#     # half_pixel is the new mode we're developing for
-#     node = onnx.helper.make_node(
-#         'RoiAlign',
-#         inputs=['x', 'rois', 'batch_ind'],
-#         outputs=['y'],
-#         spatial_scale=2.0,
-#         output_height=5,
-#         output_width=5,
-#         sampling_ratio=3,
-#         mode="avg",
-#         coordinate_transformation_mode="half_pixel")
-
-#     return ([node], [x, roi, bi], [y])
+    return ([node], [x, roi, bi], [y])
 
 
 @onnx_test()
diff --git a/test/onnx/parse/roialign_test.cpp b/test/onnx/parse/roialign_test.cpp
index 52bb8681d4d..05f27b6473c 100644
--- a/test/onnx/parse/roialign_test.cpp
+++ b/test/onnx/parse/roialign_test.cpp
@@ -26,7 +26,7 @@
 
 TEST_CASE(roialign_test)
 {
-    migraphx::shape sx{migraphx::shape::float_type, {3, 2, 4, 5}};
+    migraphx::shape sx{migraphx::shape::float_type, {10, 5, 4, 7}};
     migraphx::shape srois{migraphx::shape::float_type, {8, 4}};
     migraphx::shape sbi{migraphx::shape::int64_type, {8}};
 
@@ -41,7 +41,7 @@ TEST_CASE(roialign_test)
                           {{"coordinate_transformation_mode", "output_half_pixel"},
                            {"spatial_scale", 2.0f},
                            {"output_height", 5},
-                           {"output_width", 3},
+                           {"output_width", 5},
                            {"sampling_ratio", 3}}),
         x,
         rois,
diff --git a/test/onnx/roialign_half_pixel_test.onnx b/test/onnx/roialign_half_pixel_test.onnx
index b6ca215a9fd362ea9737b42897c1a291c8502e03..76daf3d0c0df8de4fe13e5e0133da6c3bb1e4cce 100644
GIT binary patch
delta 55
zcmaFC^nz)^G+m|zj7(a>Ts--uB?YA=@#UE*B^eUTAVE<szT(8(f}G5}^!TF0lFWPw
Jriq7_0s!s35>WsE

delta 55
zcmaFC^nz)^G+p)uj7(a>Ts--uB?YA=@#UE*B^eT&AVE<szT(8(f}G5}^!TF0lFWPw
J=81=w0s!w*5?ufQ

diff --git a/test/onnx/roialign_test.onnx b/test/onnx/roialign_test.onnx
index 3127c15a2b79db3a13aaaf38c46390d22ba63b70..0a60795f561572d993de35769d4b8aef8b520e49 100644
GIT binary patch
delta 79
zcmcb~bdzbqFGVgcCJt60W&vXMC^0UUqWsKaAwDi14n`pkpcuzwX+{|(jwER=#!4YE
UE)k%x07w|B1*DbLiAg{d0Q-vxo&W#<

delta 79
zcmcb~bdzbqFGXf9CJrVbW&vW>C^0UUqWsKaAwDi14n`pkklbWxMj0ihBxx?jN+B^W
Q5umUDP#8s<6O(`_0QMURegFUf

diff --git a/test/onnx/verify/roialign_half_pixel_verify_test.cpp b/test/onnx/verify/roialign_half_pixel_verify_test.cpp
index 417988277b7..579106b7a52 100644
--- a/test/onnx/verify/roialign_half_pixel_verify_test.cpp
+++ b/test/onnx/verify/roialign_half_pixel_verify_test.cpp
@@ -26,6 +26,7 @@
 #include <migraphx/verify.hpp>
 #include <onnx_test.hpp>
 
+// The half_pixel mode for the ROIAlign op 
 TEST_CASE(roialign_half_pixel_verify_test)
 {
     migraphx::program p = read_onnx("roialign_half_pixel_test.onnx");
@@ -35,88 +36,15 @@ TEST_CASE(roialign_half_pixel_verify_test)
     std::iota(data.begin(), data.end(), 0.f);
     migraphx::parameter_map pp;
     pp["x"]     = migraphx::argument(s, data.data());
-    pp["y"]     = migraphx::argument(s, data.data());  // ?
-
-    migraphx::shape srois{migraphx::shape::float_type, {2, 4}};
-    std::vector<float> rois_data = {
-                                    0.1, 0.15, 0.6, 0.35,
-                                    1.1, 0.73, 1.9, 1.13};
-    migraphx::shape sbi{migraphx::shape::int64_type, {2}};  // batch_index
-    std::vector<float> bi_data = {0, 1};
-
-
-    pp["rois"]    = migraphx::argument(srois, rois_data.data());
-    pp["batch_ind"]    = migraphx::argument(sbi, bi_data.data());
     pp["y"]     = migraphx::argument(s, data.data());
 
-    auto result = p.eval(pp).back();
-    std::vector<float> result_vector;
-    result.visit([&](auto output) { result_vector.assign(output.begin(), output.end()); });
-
-printf(" result:  \n");
-for(int i = 0; i < result_vector.size(); i++)
-{
- printf(" %f ", result_vector[i]);
- if(i % 9 == 8)
-     printf("\n");
-}
-printf("\n");
-
-    std::vector<float> gold={
- 0.000000, 0.000000, 0.004938, 0.088889, 0.200000, 0.311111, 0.422222, 0.533333, 0.644444,
- 0.000000, 0.000000, 0.004938, 0.088889, 0.200000, 0.311111, 0.422222, 0.533333, 0.644444,
- 0.000000, 0.000000, 0.004938, 0.088889, 0.200000, 0.311111, 0.422222, 0.533333, 0.644444,
- 0.019048, 0.019048, 0.023986, 0.107937, 0.219048, 0.330159, 0.441270, 0.552381, 0.663492,
- 0.171429, 0.171429, 0.176367, 0.260317, 0.371429, 0.482540, 0.593651, 0.704762, 0.815873,
- 0.342857, 0.342857, 0.347795, 0.431746, 0.542857, 0.653968, 0.765079, 0.876190, 0.987302,
- 0.514286, 0.514286, 0.519224, 0.603175, 0.714286, 0.825397, 0.936508, 1.047619, 1.158730,
- 12.000000, 12.000000, 12.004938, 12.088889, 12.200000, 12.311111, 12.422222, 12.533334, 12.644444,
- 12.000000, 12.000000, 12.004938, 12.088889, 12.200000, 12.311111, 12.422222, 12.533334, 12.644444,
- 12.000000, 12.000000, 12.004938, 12.088889, 12.200000, 12.311111, 12.422222, 12.533334, 12.644444,
- 12.019048, 12.019048, 12.023986, 12.107937, 12.219048, 12.330158, 12.441270, 12.552382, 12.663492,
- 12.171429, 12.171429, 12.176367, 12.260318, 12.371428, 12.482540, 12.593651, 12.704762, 12.815873,
- 12.342857, 12.342857, 12.347795, 12.431746, 12.542857, 12.653969, 12.765079, 12.876190, 12.987302,
- 12.514286, 12.514286, 12.519224, 12.603174, 12.714286, 12.825397, 12.936508, 13.047619, 13.158731,
- 4.840318, 5.009453, 5.051429, 5.051429, 5.051429, 5.051429, 5.051429, 1.683810, 0.000000,
- 5.183175, 5.352311, 5.394286, 5.394286, 5.394286, 5.394286, 5.394286, 1.798095, 0.000000,
- 5.526032, 5.695168, 5.737143, 5.737143, 5.737143, 5.737143, 5.737143, 1.912381, 0.000000,
- 5.868889, 6.038025, 6.080000, 6.080000, 6.080000, 6.080000, 6.080000, 2.026667, 0.000000,
- 6.211746, 6.380882, 6.422857, 6.422857, 6.422857, 6.422857, 6.422857, 2.140952, 0.000000,
- 6.554603, 6.723739, 6.765714, 6.765714, 6.765714, 6.765714, 6.765714, 2.255238, 0.000000,
- 6.897460, 7.066596, 7.108572, 7.108572, 7.108572, 7.108572, 7.108572, 2.369524, 0.000000,
- 16.840317, 17.009453, 17.051428, 17.051428, 17.051428, 17.051428, 17.051428, 5.683809, 0.000000,
- 17.183174, 17.352310, 17.394285, 17.394285, 17.394285, 17.394285, 17.394285, 5.798095, 0.000000,
- 17.526031, 17.695168, 17.737143, 17.737143, 17.737143, 17.737143, 17.737143, 5.912381, 0.000000,
- 17.868889, 18.038025, 18.080000, 18.080000, 18.080000, 18.080000, 18.080000, 6.026667, 0.000000,
- 18.211746, 18.380882, 18.422857, 18.422857, 18.422857, 18.422857, 18.422857, 6.140953, 0.000000,
- 18.554604, 18.723740, 18.765715, 18.765715, 18.765715, 18.765715, 18.765715, 6.255238, 0.000000,
- 18.897461, 19.066597, 19.108572, 19.108572, 19.108572, 19.108572, 19.108572, 6.369524, 0.000000
-              };
-
-    EXPECT(migraphx::verify::verify_rms_range(result_vector, gold));
-}
-
-
-TEST_CASE(roialign_half_pixel_oob_verify_test)
-{
-    // One ROI extends outside of bounds of input array,
-    // when scaled by spatial_scale
-    migraphx::program p = read_onnx("roialign_half_pixel_test.onnx");
-    p.compile(migraphx::make_target("ref"));
-    migraphx::shape s{migraphx::shape::float_type, {2, 2, 4, 3}};
-    std::vector<float> data(2*2*4*3);
-    std::iota(data.begin(), data.end(), 0.f);
-    migraphx::parameter_map pp;
-    pp["x"]     = migraphx::argument(s, data.data());
-    pp["y"]     = migraphx::argument(s, data.data());  // ?
-
     migraphx::shape srois{migraphx::shape::float_type, {2, 4}};
     std::vector<float> rois_data = {
-                                    0.1, 0.15, 0.6, 0.35,
-                                    1.1, 0.73, 2.5, 1.13};
+                                    1.1, 0.73, 1.7, 1.13,
+                                    1.1, 0.73, 2.6, 1.13
+                                     };
     migraphx::shape sbi{migraphx::shape::int64_type, {2}};  // batch_index
-    std::vector<float> bi_data = {0, 1};
-
+    std::vector<int64_t> bi_data = {0, 1};
 
     pp["rois"]    = migraphx::argument(srois, rois_data.data());
     pp["batch_ind"]    = migraphx::argument(sbi, bi_data.data());
@@ -130,43 +58,24 @@ printf(" result:  \n");
 for(int i = 0; i < result_vector.size(); i++)
 {
  printf(" %f ", result_vector[i]);
- if(i % 9 == 8)
+ if(i % 6 == 5)
      printf("\n");
 }
 printf("\n");
-
-    std::vector<float> gold={
- 0.000000, 0.000000, 0.004938, 0.088889, 0.200000, 0.311111, 0.422222, 0.533333, 0.644444,
- 0.000000, 0.000000, 0.004938, 0.088889, 0.200000, 0.311111, 0.422222, 0.533333, 0.644444,
- 0.000000, 0.000000, 0.004938, 0.088889, 0.200000, 0.311111, 0.422222, 0.533333, 0.644444,
- 0.019048, 0.019048, 0.023986, 0.107937, 0.219048, 0.330159, 0.441270, 0.552381, 0.663492,
- 0.171429, 0.171429, 0.176367, 0.260317, 0.371429, 0.482540, 0.593651, 0.704762, 0.815873,
- 0.342857, 0.342857, 0.347795, 0.431746, 0.542857, 0.653968, 0.765079, 0.876190, 0.987302,
- 0.514286, 0.514286, 0.519224, 0.603175, 0.714286, 0.825397, 0.936508, 1.047619, 1.158730,
- 12.000000, 12.000000, 12.004938, 12.088889, 12.200000, 12.311111, 12.422222, 12.533334, 12.644444,
- 12.000000, 12.000000, 12.004938, 12.088889, 12.200000, 12.311111, 12.422222, 12.533334, 12.644444,
- 12.000000, 12.000000, 12.004938, 12.088889, 12.200000, 12.311111, 12.422222, 12.533334, 12.644444,
- 12.019048, 12.019048, 12.023986, 12.107937, 12.219048, 12.330158, 12.441270, 12.552382, 12.663492,
- 12.171429, 12.171429, 12.176367, 12.260318, 12.371428, 12.482540, 12.593651, 12.704762, 12.815873,
- 12.342857, 12.342857, 12.347795, 12.431746, 12.542857, 12.653969, 12.765079, 12.876190, 12.987302,
- 12.514286, 12.514286, 12.519224, 12.603174, 12.714286, 12.825397, 12.936508, 13.047619, 13.158731,
- 4.840318, 5.009453, 5.051429, 5.051429, 5.051429, 5.051429, 5.051429, 1.683810, 0.000000,
- 5.183175, 5.352311, 5.394286, 5.394286, 5.394286, 5.394286, 5.394286, 1.798095, 0.000000,
- 5.526032, 5.695168, 5.737143, 5.737143, 5.737143, 5.737143, 5.737143, 1.912381, 0.000000,
- 5.868889, 6.038025, 6.080000, 6.080000, 6.080000, 6.080000, 6.080000, 2.026667, 0.000000,
- 6.211746, 6.380882, 6.422857, 6.422857, 6.422857, 6.422857, 6.422857, 2.140952, 0.000000,
- 6.554603, 6.723739, 6.765714, 6.765714, 6.765714, 6.765714, 6.765714, 2.255238, 0.000000,
- 6.897460, 7.066596, 7.108572, 7.108572, 7.108572, 7.108572, 7.108572, 2.369524, 0.000000,
- 16.840317, 17.009453, 17.051428, 17.051428, 17.051428, 17.051428, 17.051428, 5.683809, 0.000000,
- 17.183174, 17.352310, 17.394285, 17.394285, 17.394285, 17.394285, 17.394285, 5.798095, 0.000000,
- 17.526031, 17.695168, 17.737143, 17.737143, 17.737143, 17.737143, 17.737143, 5.912381, 0.000000,
- 17.868889, 18.038025, 18.080000, 18.080000, 18.080000, 18.080000, 18.080000, 6.026667, 0.000000,
- 18.211746, 18.380882, 18.422857, 18.422857, 18.422857, 18.422857, 18.422857, 6.140953, 0.000000,
- 18.554604, 18.723740, 18.765715, 18.765715, 18.765715, 18.765715, 18.765715, 6.255238, 0.000000,
- 18.897461, 19.066597, 19.108572, 19.108572, 19.108572, 19.108572, 19.108572, 6.369524, 0.000000
-              };
+    // Gold values were generated with onnxruntime
+    std::vector<float> gold={  
+            5.38, 5.4799995, 5.4799995,
+            6.58, 6.68, 6.68,
+            17.38, 17.48, 17.48,
+            18.58, 18.68, 18.68,
+            29.454998, 14.74, 0.,
+            30.654999, 15.34, 0.,
+            41.455, 20.74, 0.,
+            42.655003, 21.34, 0. 
+    };
 
     EXPECT(migraphx::verify::verify_rms_range(result_vector, gold));
 }
 
 
+
diff --git a/test/op_shape_test.cpp b/test/op_shape_test.cpp
index 8d08455d814..2b7d38fd3db 100644
--- a/test/op_shape_test.cpp
+++ b/test/op_shape_test.cpp
@@ -5161,11 +5161,22 @@ TEST_CASE(roialign_test)
     migraphx::shape sbi2{migraphx::shape::int64_type, {3}};
     throws_shape(migraphx::make_op("roialign"), sx, srois, sbi2);
 
+    migraphx::shape sbi_float{migraphx::shape::float_type, {2}};
+    throws_shape(migraphx::make_op("roialign"), sx, srois, sbi_float);
+
     migraphx::shape srois1{migraphx::shape::float_type, {2, 4, 3}};
     throws_shape(migraphx::make_op("roialign"), sx, srois1, sbi);
 
+    // wrong data types
     migraphx::shape srois2{migraphx::shape::float_type, {2, 3}};
     throws_shape(migraphx::make_op("roialign"), sx, srois2, sbi);
+
+    migraphx::shape srois_int{migraphx::shape::int32_type, {2, 3}};
+    throws_shape(migraphx::make_op("roialign"), sx, srois_int, sbi);
+
+    migraphx::shape sx_int{migraphx::shape::int64_type, {3, 4, 5, 6}};
+    throws_shape(migraphx::make_op("roialign"), sx_int, srois, sbi);
+
 }
 
 TEST_CASE(test_concat)

From 1837f1add5254c549fa513f4c4b4a1f71f9df30b Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Wed, 2 Oct 2024 21:55:45 +0000
Subject: [PATCH 26/56] clean up debug code and tests work in progress

---
 src/include/migraphx/op/roialign.hpp | 77 +++++-----------------------
 test/op_shape_test.cpp               |  9 ++++
 2 files changed, 21 insertions(+), 65 deletions(-)

diff --git a/src/include/migraphx/op/roialign.hpp b/src/include/migraphx/op/roialign.hpp
index ca8a2db1e3d..f3f54af8028 100644
--- a/src/include/migraphx/op/roialign.hpp
+++ b/src/include/migraphx/op/roialign.hpp
@@ -74,11 +74,12 @@ struct roialign
         auto type     = inputs.at(0).type();
 
         // check input correct
-        if(inputs.at(0).type() != shape::float_type or inputs.at(1).type() != shape::float_type or inputs.at(2).type() != shape::int64_type)
-        {
-            MIGRAPHX_THROW("ROIALIGN: incorrect type for input 1 or 2 or 3!");
-        }
-
+        if(shape::is_integral(inputs.at(0).type()))
+            MIGRAPHX_THROW("ROIALIGN: incorrect type for input 1!");
+        if(shape::is_integral(inputs.at(1).type())) 
+            MIGRAPHX_THROW("ROIALIGN: incorrect type for input 2!");
+        if(shape::is_integral(inputs.at(2).type()))
+            MIGRAPHX_THROW("ROIALIGN: incorrect type for input 3!");
         if(bi_lens.size() != 1)
         {
             MIGRAPHX_THROW("ROIALIGN: batch indices should be 1 dimension!");
@@ -119,62 +120,39 @@ struct roialign
     {
         std::vector<pos_weight> results(bin_grid_size[0] * bin_grid_size[1] * output_height *
                                         output_width);
-std::vector<std::size_t> temp_lens = comp_s.lens();                                        
-shape temp_s = {shape::float_type,{temp_lens[1], temp_lens[0], temp_lens[3], temp_lens[2] }};
+
         shape_for_each(comp_s, [&](const auto& idx_v, size_t index) {
 
             // The p and i indexes correspond to nested looping parameters in ORT that go in y, x order.  The i[x] value is least significant
             // and iterates the fastest.
             std::array<std::size_t, 2> p = {idx_v[1], idx_v[0]};
             std::array<std::size_t, 2> i = {idx_v[3], idx_v[2]};//  <== these are always the same
-// printf("\n IIIII other index %lu , %lu , %lu , %lu  i=%lu   temp_index = %lu \n", p[0], p[1], i[0], i[1], index, temp_s.index({p[0], p[1], i[0], i[1]}));
-// printf(" my index= %lu  reverse temp=%lu\n ", comp_s.index({p[1], p[0], i[1], i[0]}), temp_s.index({p[1], p[0], i[1], i[0]}));
-// printf(" more index= %lu  reverse ...=%lu\n ", comp_s.index({p[0], p[1], i[0], i[1]}), temp_s.index({p[0], p[1], i[0], i[1]}));
             // xy is scaled coordinates of start point of ROI
             std::array<float, 2> xy{};
             // low, high are floor and ceiling of the xy value (i.e. the bounds of the pixel it lies inside)
+            // from which we will interpolate.
             std::array<int64_t, 2> low{};
             std::array<int64_t, 2> high{};
 
-            // size_t adj_index = temp_s.index({p[1], p[0], i[1], i[0]});
-
             for(auto ii : range(p.size()))
             {
-    // if(ii == 0)
-    // printf("x: " );
-    // else
-    // printf("y: " );
-                // for width & height dimensions,
-                // transform the roi start point to scaled coordinates
-// printf("    roi_start[ii] %f    p[ii]  %lu   bin_size[ii] %f   (i[ii] + .5f) %f      bin_grid_size[ii] %lu       \n",
-// roi_start[ii], p[ii], bin_size[ii], (i[ii] + .5f),     bin_grid_size[ii] );
-
                 xy[ii] = roi_start[ii] + p[ii] * bin_size[ii] +
                          (i[ii] + .5f) * bin_size[ii] / bin_grid_size[ii];
-// printf(" QQQQQQ  L137 x=%f  y=%f  ", xy[0], xy[1]);                                        
                 xy[ii] = (coord_trans_mode != "half_pixel") ? (xy[ii] - 0.5f) : xy[ii];
-// printf(" L139 %f ", xy[ii]);                        
                 if(xy[ii] < -1.0 or xy[ii] > dims[ii])
                 {
-// printf(" L142 results = pos_weight i=%lu dims=%lu, %lu  \n ", index,  dims[0], dims[1]);                    
-                    // results[adj_index] = pos_weight{};  // all zeroes
-                    results[index] = pos_weight{};  // all zeroes
+                    results[index] = pos_weight{};
                     return;
                 }
 
                 xy[ii]   = std::max(xy[ii], 0.0f);
                 low[ii]  = xy[ii];
                 high[ii] = low[ii] + 1;
-// printf(" L148 %f  low[ii] %lu, dims[ii] %lu", xy[ii],  low[ii], dims[ii]);                
                 if(low[ii] >= dims[ii] - 1)
                 {
                     xy[ii] = high[ii] = low[ii] = dims[ii] - 1;
-// printf(" L154 %f ", xy[ii]);                    
                 }
-// printf(" \n");                
             }
-            // printf(" JJJJJ  xy[0]=%f  xy[1] = %f                             dims[1]=%lu  low%ld-%ld  high %ld-%ld   i=%zu      dims[0]=%lu \n\n",
-            //                 xy[0], xy[1], dims[1], low[1], low[0],  high[1], high[0], index, dims[0]);
             results[index].pos = {low[1] * dims[0] + low[0],
                                   low[1] * dims[0] + high[0],
                                   high[1] * dims[0] + low[0],
@@ -184,19 +162,10 @@ shape temp_s = {shape::float_type,{temp_lens[1], temp_lens[0], temp_lens[3], tem
             float ly = xy[1] - low[1];
             float hy = 1.0f - ly;
             float hx = 1.0f - lx;
-            // printf(" HHHHH partial pixel values, index=%lu pci=%lu  ly=%f, lx=%f, hy=%f, hx=%f\n\n", index, temp_s.index({p[1], p[0], i[1], i[0]}), 
-            //    ly, lx, hy, hx);
+
             // save weights and indices
             results[index].w = {hy * hx, hy * lx, ly * hx, ly * lx};
-// printf(" DDDDD index %d    %f  %f  %f  %f \n", pre_calc_index,
-//     float(pc.w1), float(pc.w2), float(pc.w3), float(pc.w4));
-
         });
-// // printf(" AAAAA here we are\n");
-//         for(int iix = 0; iix < results.size(); iix++)
-//           printf(" SSSSS %d    %lu  %lu  %lu  %lu   %f  %f  %f  %f\n", iix, results[iix].pos[0], results[iix].pos[1], results[iix].pos[2], results[iix].pos[3],
-//                    results[iix].w[0], results[iix].w[1], results[iix].w[2], results[iix].w[3]);
-
         return results;
     }
 
@@ -231,19 +200,10 @@ shape temp_s = {shape::float_type,{temp_lens[1], temp_lens[0], temp_lens[3], tem
         dfor(bin_grid_size[0], bin_grid_size[1])([&](auto, auto) {
             const auto& pc = pos_weights[index];
             std::array<double, 4> wv;
-            // printf(" WWWWW ");
             std::transform(
                 pc.w.begin(), pc.w.end(), pc.pos.begin(), wv.begin(), [&](auto w, auto pos) {
-
-
-
-// std::cout << " YYYYY data starting at " << &(*(data)) ;
-// printf("  %lu, %f->%f   \n", pos, w, *(data + pos) * w);
                     return *(data + pos) * w;
                 });
-    // for(double aa : wv)
-    //   printf(" %d   ", aa);
-            // printf("\n");
             output_val = std::accumulate(wv.begin(), wv.end(), output_val, op);
             index += 1;
         });
@@ -291,7 +251,6 @@ shape temp_s = {shape::float_type,{temp_lens[1], temp_lens[0], temp_lens[3], tem
                     roi_size[ii] = roi_ends[ii] - roi_starts[ii];
                     if(coord_trans_mode != "half_pixel")
                         roi_size[ii] = std::max(roi_size[ii], 1.0f);
-// printf("\n KKKKK ii %ld  roi_size %f   roi_batch_ind %ld  out_dims %lu     \n", ii, roi_size[ii] , roi_batch_ind,  out_dims[ii]);
                     bin_size[ii]      = roi_size[ii] / out_dims[ii];
                     bin_grid_size[ii] = (sampling_ratio > 0)
                                             ? sampling_ratio
@@ -309,13 +268,8 @@ shape temp_s = {shape::float_type,{temp_lens[1], temp_lens[0], temp_lens[3], tem
                 std::vector<std::size_t> comp_lens1 = {channels, out_dims[0], out_dims[1]};
                 shape comp_s1{migraphx::shape::float_type, comp_lens1};
                 std::vector<int64_t> vec_index(channels, 0);
-// printf(" XXXXX  %lu    (bottom_data + %d * %ld + %ld) * %lu * %lu\n",// ORT does this for 2 channels, 2 ROI
-//  static_cast<int64_t>((roi_batch_ind * channels + 0) *
-//                                                            in_dims[0] * in_dims[1]),
-//      int(roi_batch_ind),  channels, (size_t)0, in_dims[0], in_dims[1]);  // offset pointer to data for this ROI (4 total)
-    
-                    // Iterate through each dimension in [channels, out_dims[1], out_dims[2]]
-                    shape_for_each(comp_s1, [&](const auto& idx) {
+
+                shape_for_each(comp_s1, [&](const auto& idx) {
                     auto c  = idx[0];  // channel count
                     auto ph = idx[1];
                     auto pw = idx[2];
@@ -324,12 +278,6 @@ shape temp_s = {shape::float_type,{temp_lens[1], temp_lens[0], temp_lens[3], tem
                         bottom_data + static_cast<int64_t>((roi_batch_ind * channels + c) *
                                                            in_dims[0] * in_dims[1]);
                     double output_val;
-// printf(" UUUUU  bottom_data %d  %lu %lu pre_calc size=%lu vec_index %lu    ", int(*offset_bottom_data), 
-// bin_grid_size[0], bin_grid_size[1],
-// pre_calc.size(), vec_index[c]);
-
-// printf("cont.  c=%ld  ph  =  %ld  pw = %ld  n=%ld roi_batch_ind %ld\n", c, ph, pw, n, roi_batch_ind);
-
                     output_val =
                         (mode == migraphx::op::pooling_mode::average)
                             ? this->calc_pooling(offset_bottom_data,
@@ -342,7 +290,6 @@ shape temp_s = {shape::float_type,{temp_lens[1], temp_lens[0], temp_lens[3], tem
                                                  pre_calc,
                                                  vec_index[c],
                                                  max_pool{});
-// printf(" TTTTT idx=%3ld  output_val=%f\n", vec_index[c] % 9 - 1, output_val);                                                 
                     output(n, c, ph, pw) = output_val;
                 });
             });
diff --git a/test/op_shape_test.cpp b/test/op_shape_test.cpp
index 2b7d38fd3db..6241b495ce2 100644
--- a/test/op_shape_test.cpp
+++ b/test/op_shape_test.cpp
@@ -5167,6 +5167,15 @@ TEST_CASE(roialign_test)
     migraphx::shape srois1{migraphx::shape::float_type, {2, 4, 3}};
     throws_shape(migraphx::make_op("roialign"), sx, srois1, sbi);
 
+    // alternate data types
+    migraphx::shape sx_d{migraphx::shape::double_type, {3, 4, 5, 6}};
+    migraphx::shape srois_d{migraphx::shape::double_type, {2, 4}};
+    migraphx::shape sbi_int{migraphx::shape::int32_type, {2}};
+    migraphx::shape sout_d{migraphx::shape::double_type, {2, 4, 1, 1}};
+    // to do: debug why this commented-out test failed
+    // expect_shape(sout_d, migraphx::make_op("roialign"), sx_d, srois_d, sbi_int);
+    expect_shape(sout, migraphx::make_op("roialign"), sx_d, srois, sbi);
+
     // wrong data types
     migraphx::shape srois2{migraphx::shape::float_type, {2, 3}};
     throws_shape(migraphx::make_op("roialign"), sx, srois2, sbi);

From 9196b2ea7e2afee80ff5ca6f9c407c9f1554b373 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Wed, 2 Oct 2024 22:25:34 +0000
Subject: [PATCH 27/56] fixed some tests/checks

---
 src/include/migraphx/op/roialign.hpp | 10 +++++-----
 test/op_shape_test.cpp               | 16 +++++++---------
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/src/include/migraphx/op/roialign.hpp b/src/include/migraphx/op/roialign.hpp
index f3f54af8028..1850131b52c 100644
--- a/src/include/migraphx/op/roialign.hpp
+++ b/src/include/migraphx/op/roialign.hpp
@@ -74,12 +74,12 @@ struct roialign
         auto type     = inputs.at(0).type();
 
         // check input correct
-        if(shape::is_integral(inputs.at(0).type()))
-            MIGRAPHX_THROW("ROIALIGN: incorrect type for input 1!");
+        if(shape::is_integral(type))
+            MIGRAPHX_THROW("ROIALIGN: incorrect type for input data! (should be non-integer)");
         if(shape::is_integral(inputs.at(1).type())) 
-            MIGRAPHX_THROW("ROIALIGN: incorrect type for input 2!");
-        if(shape::is_integral(inputs.at(2).type()))
-            MIGRAPHX_THROW("ROIALIGN: incorrect type for input 3!");
+            MIGRAPHX_THROW("ROIALIGN: incorrect data type for rois! (should be non-integer)");
+        if(!shape::is_integral(inputs.at(2).type()))
+            MIGRAPHX_THROW("ROIALIGN: incorrect datatype for roi indices! (should be an integral type)");
         if(bi_lens.size() != 1)
         {
             MIGRAPHX_THROW("ROIALIGN: batch indices should be 1 dimension!");
diff --git a/test/op_shape_test.cpp b/test/op_shape_test.cpp
index 6241b495ce2..ecf4efa121a 100644
--- a/test/op_shape_test.cpp
+++ b/test/op_shape_test.cpp
@@ -5161,31 +5161,29 @@ TEST_CASE(roialign_test)
     migraphx::shape sbi2{migraphx::shape::int64_type, {3}};
     throws_shape(migraphx::make_op("roialign"), sx, srois, sbi2);
 
-    migraphx::shape sbi_float{migraphx::shape::float_type, {2}};
-    throws_shape(migraphx::make_op("roialign"), sx, srois, sbi_float);
-
     migraphx::shape srois1{migraphx::shape::float_type, {2, 4, 3}};
     throws_shape(migraphx::make_op("roialign"), sx, srois1, sbi);
 
+    migraphx::shape srois2{migraphx::shape::float_type, {2, 3}};
+    throws_shape(migraphx::make_op("roialign"), sx, srois2, sbi);
+
     // alternate data types
     migraphx::shape sx_d{migraphx::shape::double_type, {3, 4, 5, 6}};
     migraphx::shape srois_d{migraphx::shape::double_type, {2, 4}};
     migraphx::shape sbi_int{migraphx::shape::int32_type, {2}};
     migraphx::shape sout_d{migraphx::shape::double_type, {2, 4, 1, 1}};
-    // to do: debug why this commented-out test failed
-    // expect_shape(sout_d, migraphx::make_op("roialign"), sx_d, srois_d, sbi_int);
-    expect_shape(sout, migraphx::make_op("roialign"), sx_d, srois, sbi);
+    expect_shape(sout_d, migraphx::make_op("roialign"), sx_d, srois_d, sbi_int);
 
     // wrong data types
-    migraphx::shape srois2{migraphx::shape::float_type, {2, 3}};
-    throws_shape(migraphx::make_op("roialign"), sx, srois2, sbi);
-
     migraphx::shape srois_int{migraphx::shape::int32_type, {2, 3}};
     throws_shape(migraphx::make_op("roialign"), sx, srois_int, sbi);
 
     migraphx::shape sx_int{migraphx::shape::int64_type, {3, 4, 5, 6}};
     throws_shape(migraphx::make_op("roialign"), sx_int, srois, sbi);
 
+    migraphx::shape sbi_float{migraphx::shape::float_type, {2}};
+    throws_shape(migraphx::make_op("roialign"), sx, srois, sbi_float);
+
 }
 
 TEST_CASE(test_concat)

From 8f348b5a3b76ffb2db56f8567763eaa4e6f89937 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Wed, 2 Oct 2024 23:05:02 +0000
Subject: [PATCH 28/56] revert accidental change (check_shapes.hpp)

---
 src/include/migraphx/check_shapes.hpp | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/src/include/migraphx/check_shapes.hpp b/src/include/migraphx/check_shapes.hpp
index 073b2fe31f0..cbffb758057 100644
--- a/src/include/migraphx/check_shapes.hpp
+++ b/src/include/migraphx/check_shapes.hpp
@@ -235,14 +235,11 @@ struct check_shapes
     }
 
     /*!
-     * Check all shapes have the same layout, with minor differences allowed.
+     * Check all shapes have the same layout.
      */
-    const check_shapes& compatible_layout() const
+    const check_shapes& same_layout() const
     {
-        if(begin != end and this->any_of([&](const shape& s) {
-               return not migraphx::is_compatible_shape(s, *begin) and
-                      find_permutation(s) != find_permutation(*begin);
-           }))
+        if(not this->same([](const shape& s) { return find_permutation(s); }))
             MIGRAPHX_THROW(prefix() + "Layouts do not match");
         return *this;
     }
@@ -335,7 +332,7 @@ struct check_shapes
      */
     const check_shapes& not_broadcasted() const
     {
-        if(not this->all_of([](const shape& s) { return not s.broadcasted(); }))
+        if(not this->all_of([](const shape& s) { return s.standard() or not s.broadcasted(); }))
             MIGRAPHX_THROW(prefix() + "Shapes are broadcasted");
         return *this;
     }

From 4920232985df4bac771a42f560417bfba0075aa9 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Wed, 2 Oct 2024 23:12:38 +0000
Subject: [PATCH 29/56] revert unwanted changes (check_shapes.hpp, shape.hpp)

---
 src/include/migraphx/check_shapes.hpp |  2 +-
 src/include/migraphx/shape.hpp        | 31 +++++----------------------
 2 files changed, 6 insertions(+), 27 deletions(-)

diff --git a/src/include/migraphx/check_shapes.hpp b/src/include/migraphx/check_shapes.hpp
index cbffb758057..05118082ee8 100644
--- a/src/include/migraphx/check_shapes.hpp
+++ b/src/include/migraphx/check_shapes.hpp
@@ -332,7 +332,7 @@ struct check_shapes
      */
     const check_shapes& not_broadcasted() const
     {
-        if(not this->all_of([](const shape& s) { return s.standard() or not s.broadcasted(); }))
+        if(not this->all_of([](const shape& s) { return not s.broadcasted(); }))
             MIGRAPHX_THROW(prefix() + "Shapes are broadcasted");
         return *this;
     }
diff --git a/src/include/migraphx/shape.hpp b/src/include/migraphx/shape.hpp
index e7ff55dfcf1..290656f003d 100644
--- a/src/include/migraphx/shape.hpp
+++ b/src/include/migraphx/shape.hpp
@@ -62,8 +62,10 @@ struct MIGRAPHX_EXPORT shape
     m(int64_type, int64_t) \
     m(uint32_type, uint32_t) \
     m(uint64_type, uint64_t) \
-    m(fp8e4m3fnuz_type, migraphx::fp8::fp8e4m3fnuz)
-    // clang-format on
+    m(fp8e4m3fnuz_type, migraphx::fp8::fp8e4m3fnuz) \
+    m(fp8e4m3fn_type, migraphx::fp8::fp8e4m3fn) \
+    m(fp8e5m2_type, migraphx::fp8::fp8e5m2)
+// clang-format on
 
 #define MIGRAPHX_SHAPE_GENERATE_ENUM_TYPES(x, t) x,
     enum type_t
@@ -147,6 +149,7 @@ struct MIGRAPHX_EXPORT shape
     static std::string cpp_type(type_t t);
 
     static bool is_integral(type_t t);
+    static bool is_compatible(const shape& actual, const shape& expected);
 
     shape();
     shape(type_t t);
@@ -431,30 +434,6 @@ struct MIGRAPHX_EXPORT shape
     std::shared_ptr<const shape_impl> impl;
 };
 
-// "Almost identical" shapes.  To support an MLIR feature, there is a limited
-// case where shapes may both be standard but have non-identical strides.
-static bool inline is_compatible_shape(const shape& actual, const shape& expected)
-{
-    // Check subshapes
-    if(expected.type() == shape::tuple_type)
-        return equal(actual.sub_shapes().begin(),
-                     actual.sub_shapes().end(),
-                     expected.sub_shapes().begin(),
-                     &is_compatible_shape);
-    // Only the expected can be dynamic
-    if(expected.dynamic())
-        return true;
-    if(actual == expected)
-        return true;
-    if(actual.type() != expected.type())
-        return false;
-    // If both shapes are standard and lens match, they are considered compatible
-    // even if strides are different.
-    if(actual.standard() and expected.standard())
-        return actual.lens() == expected.lens();
-    return false;
-}
-
 /// Flatten subshapes to a single vector of non-tuple type of shapes
 MIGRAPHX_EXPORT std::vector<shape> flatten(const std::vector<shape>& shapes);
 

From 61cc9a6d07b0e4fd25a66040df7398e72f908871 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Wed, 2 Oct 2024 23:18:02 +0000
Subject: [PATCH 30/56] revert unwanted changes (shape.cpp, convolution.hpp,
 gen_onnx.py)

---
 src/shape.cpp                                 | 24 +++++++++++++++++++
 .../gpu/include/migraphx/gpu/convolution.hpp  | 13 ++++++----
 test/onnx/gen_onnx.py                         | 24 -------------------
 3 files changed, 32 insertions(+), 29 deletions(-)

diff --git a/src/shape.cpp b/src/shape.cpp
index cfa3a1c2b43..657a131be70 100644
--- a/src/shape.cpp
+++ b/src/shape.cpp
@@ -267,6 +267,30 @@ bool shape::is_integral(shape::type_t t)
     return result;
 }
 
+bool shape::is_compatible(const shape& actual, const shape& expected)
+{
+    // Check subshapes
+    if(expected.type() == shape::tuple_type)
+        return migraphx::equal(actual.sub_shapes(), expected.sub_shapes(), &is_compatible);
+    if(actual == expected)
+        return true;
+    if(actual.type() != expected.type())
+        return false;
+    // Only the expected can be dynamic
+    if(expected.dynamic())
+        return actual.ndim() == expected.ndim();
+    if(actual.dynamic())
+        return false;
+    if(actual.lens() != expected.lens())
+        return false;
+    // Check strides from dimensions that are not 1
+    return all_of(range(actual.lens().size()), [&](auto i) {
+        if(actual.lens()[i] == 1)
+            return true;
+        return actual.strides()[i] == expected.strides()[i];
+    });
+}
+
 shape::shape() : impl(shape_impl::default_shape()) {}
 
 shape::shape(type_t t) : impl(std::make_shared<shape_impl>(t)) {}
diff --git a/src/targets/gpu/include/migraphx/gpu/convolution.hpp b/src/targets/gpu/include/migraphx/gpu/convolution.hpp
index 0738324af4a..1a6d1bc2497 100644
--- a/src/targets/gpu/include/migraphx/gpu/convolution.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/convolution.hpp
@@ -85,7 +85,7 @@ struct miopen_convolution
         check_shapes{conv_inputs, *this}
             .max_ndims(5)
             .packed_layouts({{0, 1, 2}, {0, 1, 2, 3}, {0, 2, 3, 1}, {0, 1, 2, 3, 4}})
-            .compatible_layout();
+            .same_layout();
         return migraphx::compute_shape<Op>(op, conv_inputs);
     }
 
@@ -180,6 +180,7 @@ struct miopen_convolution
         const auto& x_shape = inputs[0];
         const auto& w_shape = inputs[1];
 
+        unsigned long seed = 0;
 #ifdef MIGRAPHX_HAS_FIND_2_API
         {
             auto conv_problem = make_obj<miopen_problem>(
@@ -192,8 +193,10 @@ struct miopen_convolution
             // MIOpen has APIs to pass pre-allocated buffers starting from rocm-5.6
             preallocate = true;
 #endif
-            auto x = preallocate ? to_gpu(generate_argument(x_shape)) : argument{inputs[0]};
-            auto w = preallocate ? to_gpu(generate_argument(w_shape)) : argument{inputs[1]};
+            auto x = preallocate ? to_gpu(generate_argument(x_shape, seed++, random_mode::random))
+                                 : argument{inputs[0]};
+            auto w = preallocate ? to_gpu(generate_argument(w_shape, seed++, random_mode::random))
+                                 : argument{inputs[1]};
             auto y = preallocate ? allocate_gpu(output_shape) : argument{inputs[2]};
             auto workspace =
                 preallocate ? allocate_gpu(workspace_shape) : migraphx::argument(workspace_shape);
@@ -233,8 +236,8 @@ struct miopen_convolution
             return shape{shape::int8_type, {workspace_size}};
         }
 #else
-        auto x         = to_gpu(generate_argument(x_shape));
-        auto w         = to_gpu(generate_argument(w_shape));
+        auto x         = to_gpu(generate_argument(x_shape, seed++, random_mode::random));
+        auto w         = to_gpu(generate_argument(w_shape, seed++, random_mode::random));
         auto y         = allocate_gpu(output_shape);
         auto workspace = allocate_gpu(workspace_shape);
         int algo_count = 1;
diff --git a/test/onnx/gen_onnx.py b/test/onnx/gen_onnx.py
index 3b7e4697556..f80b59c3cb2 100644
--- a/test/onnx/gen_onnx.py
+++ b/test/onnx/gen_onnx.py
@@ -10153,30 +10153,6 @@ def roialign_half_pixel_test():
     return ([node], [x, roi, bi], [y])
 
 
-@onnx_test()
-def roialign_half_pixel_roi_test():
-    # Same as roialign_half_pixel_test but contains more ROIs than there
-    # are batch dimensions.
-    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [1, 1, 2, 3])
-    roi = helper.make_tensor_value_info('rois', TensorProto.FLOAT, [2, 4])
-    bi = helper.make_tensor_value_info('batch_ind', TensorProto.INT64, [2])
-    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [2, 4, 2, 2])
-
-    # half_pixel is the new mode we're developing for
-    node = onnx.helper.make_node(
-        'RoiAlign',
-        inputs=['x', 'rois', 'batch_ind'],
-        outputs=['y'],
-        spatial_scale=2.0,
-        output_height=2,
-        output_width=3,
-        sampling_ratio=2,
-        mode="avg",
-        coordinate_transformation_mode="half_pixel")
-
-    return ([node], [x, roi, bi], [y])
-
-
 @onnx_test()
 def round_half_test():
     x = helper.make_tensor_value_info('x', TensorProto.FLOAT16, [4, 4])

From ae12b10c52875c81af47d1caf707c191fa887c2c Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Thu, 3 Oct 2024 19:33:45 +0000
Subject: [PATCH 31/56] format

---
 ort_roialign.py                               | 33 +++++-------
 src/include/migraphx/op/roialign.hpp          | 51 +++++++++---------
 test/onnx/gen_onnx.py                         | 20 +++----
 .../roialign_half_pixel_verify_test.cpp       | 54 ++++++++-----------
 test/onnx/verify/roialign_verify_test.cpp     | 43 ++++++++-------
 test/op_shape_test.cpp                        |  1 -
 6 files changed, 95 insertions(+), 107 deletions(-)

diff --git a/ort_roialign.py b/ort_roialign.py
index 6a7a3ee3dce..70ee2b410d6 100644
--- a/ort_roialign.py
+++ b/ort_roialign.py
@@ -1,4 +1,3 @@
-
 # Not for release.  This test script is for develop/test only
 
 import onnx
@@ -7,31 +6,27 @@
 import numpy as np
 print(" version: ", onnx.__version__, rt.__version__)
 
-
-x = np.array(np.arange(2*2*4*3), dtype='f')
+x = np.array(np.arange(2 * 2 * 4 * 3), dtype='f')
 x = np.reshape(x, [2, 2, 4, 3])
 
-y=np.ones([2, 2, 4, 3], dtype='f')
+y = np.ones([2, 2, 4, 3], dtype='f')
 
 # matches roialign_half_pixel_verify_test
 # rois=np.array([[0.1, 0.15, 0.6, 0.35],
 #                 [1.1, 0.73, 1.9, 1.13]], dtype='f')
 # matches roialign_half_pixel_oob_verify_test
-rois=np.array([
-                [1.1, 0.73, 1.7, 1.13],
-                [1.1, 0.73, 2.6, 1.13]
-                #         [1.1, 0.73, 2.6, 1.13]
-                ], dtype='f')
+rois = np.array(
+    [[1.1, 0.73, 1.7, 1.13], [1.1, 0.73, 2.6, 1.13]
+     #         [1.1, 0.73, 2.6, 1.13]
+     ],
+    dtype='f')
 
 # rois=np.array([
 #                 [ 1.1, 0.73, 2.2, 1.13]], dtype='f')
-sess = rt.InferenceSession('/workspace/AMDMIGraphX/test/onnx/roialign_half_pixel_test.onnx')
-# sess = rt.InferenceSession('/workspace/AMDMIGraphX/test/onnx/roialign_one_roi_asdf_test.onnx') 
-res = sess.run(['y'], {'x': x,
-                    'rois': rois,
-                    'batch_ind': [0, 1]})
-                  #   'batch_ind': [0]})
-print(' ORT test model is roialign_one_roi_asdf_test.onnx, rois_data is \n',rois, 
-      ' result is \n', res)
-       
-		
\ No newline at end of file
+sess = rt.InferenceSession(
+    '/workspace/AMDMIGraphX/test/onnx/roialign_half_pixel_test.onnx')
+# sess = rt.InferenceSession('/workspace/AMDMIGraphX/test/onnx/roialign_one_roi_asdf_test.onnx')
+res = sess.run(['y'], {'x': x, 'rois': rois, 'batch_ind': [0, 1]})
+#   'batch_ind': [0]})
+print(' ORT test model is roialign_one_roi_asdf_test.onnx, rois_data is \n',
+      rois, ' result is \n', res)
diff --git a/src/include/migraphx/op/roialign.hpp b/src/include/migraphx/op/roialign.hpp
index 1850131b52c..98f0bdcba9e 100644
--- a/src/include/migraphx/op/roialign.hpp
+++ b/src/include/migraphx/op/roialign.hpp
@@ -76,10 +76,11 @@ struct roialign
         // check input correct
         if(shape::is_integral(type))
             MIGRAPHX_THROW("ROIALIGN: incorrect type for input data! (should be non-integer)");
-        if(shape::is_integral(inputs.at(1).type())) 
+        if(shape::is_integral(inputs.at(1).type()))
             MIGRAPHX_THROW("ROIALIGN: incorrect data type for rois! (should be non-integer)");
         if(!shape::is_integral(inputs.at(2).type()))
-            MIGRAPHX_THROW("ROIALIGN: incorrect datatype for roi indices! (should be an integral type)");
+            MIGRAPHX_THROW(
+                "ROIALIGN: incorrect datatype for roi indices! (should be an integral type)");
         if(bi_lens.size() != 1)
         {
             MIGRAPHX_THROW("ROIALIGN: batch indices should be 1 dimension!");
@@ -122,15 +123,14 @@ struct roialign
                                         output_width);
 
         shape_for_each(comp_s, [&](const auto& idx_v, size_t index) {
-
-            // The p and i indexes correspond to nested looping parameters in ORT that go in y, x order.  The i[x] value is least significant
-            // and iterates the fastest.
+            // The p and i indexes correspond to nested looping parameters in ORT that go in y, x
+            // order.  The i[x] value is least significant and iterates the fastest.
             std::array<std::size_t, 2> p = {idx_v[1], idx_v[0]};
-            std::array<std::size_t, 2> i = {idx_v[3], idx_v[2]};//  <== these are always the same
+            std::array<std::size_t, 2> i = {idx_v[3], idx_v[2]}; //  <== these are always the same
             // xy is scaled coordinates of start point of ROI
             std::array<float, 2> xy{};
-            // low, high are floor and ceiling of the xy value (i.e. the bounds of the pixel it lies inside)
-            // from which we will interpolate.
+            // low, high are floor and ceiling of the xy value (i.e. the bounds of the pixel it lies
+            // inside) from which we will interpolate.
             std::array<int64_t, 2> low{};
             std::array<int64_t, 2> high{};
 
@@ -190,10 +190,10 @@ struct roialign
     // Calculate a pooling value for 1 block of bin_grid_size*bin_grid_size weights
     template <class T, class Op>
     double calc_pooling(const T& data,
-                                             const std::array<std::size_t, 2>& bin_grid_size,
-                                             const std::vector<pos_weight>& pos_weights,
-                                             int64_t& index,
-                                             Op op) const
+                        const std::array<std::size_t, 2>& bin_grid_size,
+                        const std::vector<pos_weight>& pos_weights,
+                        int64_t& index,
+                        Op op) const
     {
         double output_val   = op.init();
         const int64_t count = bin_grid_size[0] * bin_grid_size[1];
@@ -233,7 +233,7 @@ struct roialign
                 const auto bottom_data   = x.begin();
                 const auto roi_batch_ind = batch_indices[n];
                 // Do not use rounding; this implementation detail is critical
-                float offset = (coord_trans_mode == "half_pixel") ? 0.5 : 0.0;
+                float offset                    = (coord_trans_mode == "half_pixel") ? 0.5 : 0.0;
                 std::array<float, 2> roi_starts = {
                     static_cast<float>(roi[roi_s.index({n, 0})] * spatial_scale - offset),
                     static_cast<float>(roi[roi_s.index({n, 1})] * spatial_scale - offset)};
@@ -270,7 +270,7 @@ struct roialign
                 std::vector<int64_t> vec_index(channels, 0);
 
                 shape_for_each(comp_s1, [&](const auto& idx) {
-                    auto c  = idx[0];  // channel count
+                    auto c  = idx[0]; // channel count
                     auto ph = idx[1];
                     auto pw = idx[2];
 
@@ -278,18 +278,17 @@ struct roialign
                         bottom_data + static_cast<int64_t>((roi_batch_ind * channels + c) *
                                                            in_dims[0] * in_dims[1]);
                     double output_val;
-                    output_val =
-                        (mode == migraphx::op::pooling_mode::average)
-                            ? this->calc_pooling(offset_bottom_data,
-                                                 bin_grid_size,
-                                                 pre_calc,
-                                                 vec_index[c],
-                                                 avg_pool{})
-                            : this->calc_pooling(offset_bottom_data,
-                                                 bin_grid_size,
-                                                 pre_calc,
-                                                 vec_index[c],
-                                                 max_pool{});
+                    output_val           = (mode == migraphx::op::pooling_mode::average)
+                                               ? this->calc_pooling(offset_bottom_data,
+                                                          bin_grid_size,
+                                                          pre_calc,
+                                                          vec_index[c],
+                                                          avg_pool{})
+                                               : this->calc_pooling(offset_bottom_data,
+                                                          bin_grid_size,
+                                                          pre_calc,
+                                                          vec_index[c],
+                                                          max_pool{});
                     output(n, c, ph, pw) = output_val;
                 });
             });
diff --git a/test/onnx/gen_onnx.py b/test/onnx/gen_onnx.py
index 1b041069caf..b838a8f065f 100644
--- a/test/onnx/gen_onnx.py
+++ b/test/onnx/gen_onnx.py
@@ -751,6 +751,7 @@ def celu_default_test():
 
     return ([node], [x], [y])
 
+
 @onnx_test()
 def celu_verify_test():
     x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [2, 3])
@@ -10629,16 +10630,15 @@ def roialign_half_pixel_test():
     y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [2, 2, 4, 3])
 
     # half_pixel is the newer mode for ROIAlign
-    node = onnx.helper.make_node(
-        'RoiAlign',
-        inputs=['x', 'rois', 'batch_ind'],
-        outputs=['y'],
-        spatial_scale=2.0,
-        output_height=2,
-        output_width=3,
-        sampling_ratio=2,
-        mode="avg",
-        coordinate_transformation_mode="half_pixel")
+    node = onnx.helper.make_node('RoiAlign',
+                                 inputs=['x', 'rois', 'batch_ind'],
+                                 outputs=['y'],
+                                 spatial_scale=2.0,
+                                 output_height=2,
+                                 output_width=3,
+                                 sampling_ratio=2,
+                                 mode="avg",
+                                 coordinate_transformation_mode="half_pixel")
 
     return ([node], [x, roi, bi], [y])
 
diff --git a/test/onnx/verify/roialign_half_pixel_verify_test.cpp b/test/onnx/verify/roialign_half_pixel_verify_test.cpp
index 579106b7a52..ea570792249 100644
--- a/test/onnx/verify/roialign_half_pixel_verify_test.cpp
+++ b/test/onnx/verify/roialign_half_pixel_verify_test.cpp
@@ -26,56 +26,44 @@
 #include <migraphx/verify.hpp>
 #include <onnx_test.hpp>
 
-// The half_pixel mode for the ROIAlign op 
+// The half_pixel mode for the ROIAlign op
 TEST_CASE(roialign_half_pixel_verify_test)
 {
     migraphx::program p = read_onnx("roialign_half_pixel_test.onnx");
     p.compile(migraphx::make_target("ref"));
     migraphx::shape s{migraphx::shape::float_type, {2, 2, 4, 3}};
-    std::vector<float> data(2*2*4*3);
+    std::vector<float> data(2 * 2 * 4 * 3);
     std::iota(data.begin(), data.end(), 0.f);
     migraphx::parameter_map pp;
-    pp["x"]     = migraphx::argument(s, data.data());
-    pp["y"]     = migraphx::argument(s, data.data());
+    pp["x"] = migraphx::argument(s, data.data());
+    pp["y"] = migraphx::argument(s, data.data());
 
     migraphx::shape srois{migraphx::shape::float_type, {2, 4}};
-    std::vector<float> rois_data = {
-                                    1.1, 0.73, 1.7, 1.13,
-                                    1.1, 0.73, 2.6, 1.13
-                                     };
-    migraphx::shape sbi{migraphx::shape::int64_type, {2}};  // batch_index
+    std::vector<float> rois_data = {1.1, 0.73, 1.7, 1.13, 1.1, 0.73, 2.6, 1.13};
+    migraphx::shape sbi{migraphx::shape::int64_type, {2}}; // batch_index
     std::vector<int64_t> bi_data = {0, 1};
 
-    pp["rois"]    = migraphx::argument(srois, rois_data.data());
-    pp["batch_ind"]    = migraphx::argument(sbi, bi_data.data());
-    pp["y"]     = migraphx::argument(s, data.data());
+    pp["rois"]      = migraphx::argument(srois, rois_data.data());
+    pp["batch_ind"] = migraphx::argument(sbi, bi_data.data());
+    pp["y"]         = migraphx::argument(s, data.data());
 
     auto result = p.eval(pp).back();
     std::vector<float> result_vector;
     result.visit([&](auto output) { result_vector.assign(output.begin(), output.end()); });
 
-printf(" result:  \n");
-for(int i = 0; i < result_vector.size(); i++)
-{
- printf(" %f ", result_vector[i]);
- if(i % 6 == 5)
-     printf("\n");
-}
-printf("\n");
+    printf(" result:  \n");
+    for(int i = 0; i < result_vector.size(); i++)
+    {
+        printf(" %f ", result_vector[i]);
+        if(i % 6 == 5)
+            printf("\n");
+    }
+    printf("\n");
     // Gold values were generated with onnxruntime
-    std::vector<float> gold={  
-            5.38, 5.4799995, 5.4799995,
-            6.58, 6.68, 6.68,
-            17.38, 17.48, 17.48,
-            18.58, 18.68, 18.68,
-            29.454998, 14.74, 0.,
-            30.654999, 15.34, 0.,
-            41.455, 20.74, 0.,
-            42.655003, 21.34, 0. 
-    };
+    std::vector<float> gold = {5.38,      5.4799995, 5.4799995, 6.58,      6.68,  6.68,
+                               17.38,     17.48,     17.48,     18.58,     18.68, 18.68,
+                               29.454998, 14.74,     0.,        30.654999, 15.34, 0.,
+                               41.455,    20.74,     0.,        42.655003, 21.34, 0.};
 
     EXPECT(migraphx::verify::verify_rms_range(result_vector, gold));
 }
-
-
-
diff --git a/test/onnx/verify/roialign_verify_test.cpp b/test/onnx/verify/roialign_verify_test.cpp
index f2107280df0..f99b7072c69 100644
--- a/test/onnx/verify/roialign_verify_test.cpp
+++ b/test/onnx/verify/roialign_verify_test.cpp
@@ -32,12 +32,12 @@ TEST_CASE(roialign_verify_test)
     p.compile(migraphx::make_target("ref"));
 
     migraphx::shape s{migraphx::shape::float_type, {3, 2, 4, 5}};
-    std::vector<float> data(3*5*4*2);
+    std::vector<float> data(3 * 5 * 4 * 2);
     std::iota(data.begin(), data.end(), 0);
 
     migraphx::parameter_map pp;
-    pp["x"]     = migraphx::argument(s, data.data());
-    pp["y"]     = migraphx::argument(s, data.data());
+    pp["x"] = migraphx::argument(s, data.data());
+    pp["y"] = migraphx::argument(s, data.data());
 
     // migraphx::shape srois{migraphx::shape::float_type, {2, 4}};
     // std::vector<float> rois_data = {0.1, 0.15, 0.6, 0.35,
@@ -46,29 +46,36 @@ TEST_CASE(roialign_verify_test)
     // std::vector<float> bi_data = {0, 1};
 
     migraphx::shape srois{migraphx::shape::float_type, {1, 4}};
-    std::vector<float> rois_data = {
-                                    2.1, 1.73, 3.8, 2.13};
-    migraphx::shape sbi{migraphx::shape::int64_type, {1}};  // batch_index
+    std::vector<float> rois_data = {2.1, 1.73, 3.8, 2.13};
+    migraphx::shape sbi{migraphx::shape::int64_type, {1}}; // batch_index
     std::vector<float> bi_data = {0};
 
-    pp["rois"]    = migraphx::argument(srois, rois_data.data());
-    pp["batch_ind"]    = migraphx::argument(sbi, bi_data.data());
+    pp["rois"]      = migraphx::argument(srois, rois_data.data());
+    pp["batch_ind"] = migraphx::argument(sbi, bi_data.data());
 
     auto result = p.eval(pp).back();
     std::vector<float> result_vector;
     result.visit([&](auto output) { result_vector.assign(output.begin(), output.end()); });
 
-printf(" result:  ");
-for(auto aa : result_vector) printf(" %f ", aa);
-printf("\n");
+    printf(" result:  ");
+    for(auto aa : result_vector)
+        printf(" %f ", aa);
+    printf("\n");
 
-    std::vector<float> gold = {   0.000000,  0.022222,  0.200000,  0.400000,  0.600000,  0.500000,  0.522222,  0.700000,  0.900000,  1.100000,  1.500000,  1.522223,  1.700000,
-      1.900000, 2.100000, 2.500000, 2.522222, 2.700000, 2.900000, 3.100000, 3.500000, 3.522222, 3.700000, 3.900000, 4.100000, 20.000000, 20.022223, 20.200001, 20.400000, 20.600000, 20.500000, 20.522223, 
-      20.700001, 20.900000, 21.100000, 21.500000, 21.522223, 21.700001, 21.900000, 22.100000, 22.500000, 22.522223, 22.700001, 22.900000, 23.100000, 23.500000, 23.522223, 23.700001, 
-      23.900000, 24.100000, 5.888889, 0.000000, 0.000000, 0.000000, 0.000000, 6.000000, 0.000000, 0.000000, 0.000000, 0.000000, 6.000000, 0.000000, 0.000000, 0.000000, 0.000000,
-    6.000000, 0.000000, 0.000000, 0.000000, 0.000000, 6.000000, 0.000000, 0.000000, 0.000000, 0.000000, 12.555555, 0.000000, 0.000000, 0.000000, 0.000000, 12.666667, 0.000000,
-        0.000000, 0.000000, 0.000000, 12.666667, 0.000000, 0.000000, 0.000000, 0.000000, 12.666667, 0.000000, 0.000000, 0.000000, 0.000000, 12.666667, 0.000000, 0.000000,
-        0.000000,  0.000000 };
+    std::vector<float> gold = {
+        0.000000,  0.022222,  0.200000,  0.400000,  0.600000,  0.500000,  0.522222,  0.700000,
+        0.900000,  1.100000,  1.500000,  1.522223,  1.700000,  1.900000,  2.100000,  2.500000,
+        2.522222,  2.700000,  2.900000,  3.100000,  3.500000,  3.522222,  3.700000,  3.900000,
+        4.100000,  20.000000, 20.022223, 20.200001, 20.400000, 20.600000, 20.500000, 20.522223,
+        20.700001, 20.900000, 21.100000, 21.500000, 21.522223, 21.700001, 21.900000, 22.100000,
+        22.500000, 22.522223, 22.700001, 22.900000, 23.100000, 23.500000, 23.522223, 23.700001,
+        23.900000, 24.100000, 5.888889,  0.000000,  0.000000,  0.000000,  0.000000,  6.000000,
+        0.000000,  0.000000,  0.000000,  0.000000,  6.000000,  0.000000,  0.000000,  0.000000,
+        0.000000,  6.000000,  0.000000,  0.000000,  0.000000,  0.000000,  6.000000,  0.000000,
+        0.000000,  0.000000,  0.000000,  12.555555, 0.000000,  0.000000,  0.000000,  0.000000,
+        12.666667, 0.000000,  0.000000,  0.000000,  0.000000,  12.666667, 0.000000,  0.000000,
+        0.000000,  0.000000,  12.666667, 0.000000,  0.000000,  0.000000,  0.000000,  12.666667,
+        0.000000,  0.000000,  0.000000,  0.000000};
     float alpha = 0.5;
     std::transform(data.begin(), data.end(), gold.begin(), [&](auto x) {
         return std::max(0.0f, x) + std::min(0.0f, alpha * std::expm1(x / alpha));
diff --git a/test/op_shape_test.cpp b/test/op_shape_test.cpp
index ecf4efa121a..24b9afb1377 100644
--- a/test/op_shape_test.cpp
+++ b/test/op_shape_test.cpp
@@ -5183,7 +5183,6 @@ TEST_CASE(roialign_test)
 
     migraphx::shape sbi_float{migraphx::shape::float_type, {2}};
     throws_shape(migraphx::make_op("roialign"), sx, srois, sbi_float);
-
 }
 
 TEST_CASE(test_concat)

From 717b03c279e5c860ac63f3dc729cf676dadcf9e5 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Thu, 3 Oct 2024 22:20:04 +0000
Subject: [PATCH 32/56] undo a return type change and a test error

---
 src/include/migraphx/op/roialign.hpp | 37 ++++++++++++++--------------
 test/check_shapes_test.cpp           |  4 +--
 test/shape_test.cpp                  |  1 -
 3 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/src/include/migraphx/op/roialign.hpp b/src/include/migraphx/op/roialign.hpp
index 98f0bdcba9e..5d987c9c1ca 100644
--- a/src/include/migraphx/op/roialign.hpp
+++ b/src/include/migraphx/op/roialign.hpp
@@ -126,7 +126,7 @@ struct roialign
             // The p and i indexes correspond to nested looping parameters in ORT that go in y, x
             // order.  The i[x] value is least significant and iterates the fastest.
             std::array<std::size_t, 2> p = {idx_v[1], idx_v[0]};
-            std::array<std::size_t, 2> i = {idx_v[3], idx_v[2]}; //  <== these are always the same
+            std::array<std::size_t, 2> i = {idx_v[3], idx_v[2]}; // these are always equal
             // xy is scaled coordinates of start point of ROI
             std::array<float, 2> xy{};
             // low, high are floor and ceiling of the xy value (i.e. the bounds of the pixel it lies
@@ -189,11 +189,11 @@ struct roialign
 
     // Calculate a pooling value for 1 block of bin_grid_size*bin_grid_size weights
     template <class T, class Op>
-    double calc_pooling(const T& data,
-                        const std::array<std::size_t, 2>& bin_grid_size,
-                        const std::vector<pos_weight>& pos_weights,
-                        int64_t& index,
-                        Op op) const
+    std::tuple<double, int64_t> calc_pooling(const T& data,
+                                             const std::array<std::size_t, 2>& bin_grid_size,
+                                             const std::vector<pos_weight>& pos_weights,
+                                             int64_t index,
+                                             Op op) const
     {
         double output_val   = op.init();
         const int64_t count = bin_grid_size[0] * bin_grid_size[1];
@@ -210,7 +210,7 @@ struct roialign
 
         output_val = op.final(output_val, count);
 
-        return output_val;
+        return {output_val, index};
     }
 
     argument compute(const shape& output_shape, std::vector<argument> args) const
@@ -278,17 +278,18 @@ struct roialign
                         bottom_data + static_cast<int64_t>((roi_batch_ind * channels + c) *
                                                            in_dims[0] * in_dims[1]);
                     double output_val;
-                    output_val           = (mode == migraphx::op::pooling_mode::average)
-                                               ? this->calc_pooling(offset_bottom_data,
-                                                          bin_grid_size,
-                                                          pre_calc,
-                                                          vec_index[c],
-                                                          avg_pool{})
-                                               : this->calc_pooling(offset_bottom_data,
-                                                          bin_grid_size,
-                                                          pre_calc,
-                                                          vec_index[c],
-                                                          max_pool{});
+                    std::tie(output_val, vec_index[c]) =
+                        (mode == migraphx::op::pooling_mode::average)
+                            ? this->calc_pooling(offset_bottom_data,
+                                                 bin_grid_size,
+                                                 pre_calc,
+                                                 vec_index[c],
+                                                 avg_pool{})
+                            : this->calc_pooling(offset_bottom_data,
+                                                 bin_grid_size,
+                                                 pre_calc,
+                                                 vec_index[c],
+                                                 max_pool{});
                     output(n, c, ph, pw) = output_val;
                 });
             });
diff --git a/test/check_shapes_test.cpp b/test/check_shapes_test.cpp
index 58241576648..42b514d02f8 100644
--- a/test/check_shapes_test.cpp
+++ b/test/check_shapes_test.cpp
@@ -53,7 +53,7 @@ TEST_CASE(same_layout_fail)
     EXPECT(test::throws([] {
         shape a{shape::float_type, {2, 3}};
         shape b{shape::float_type, {2, 3}, {1, 2}};
-        migraphx::check_shapes{{a, b}, ""}.compatible_layout();
+        migraphx::check_shapes{{a, b}, ""}.same_layout();
     }));
 }
 
@@ -62,7 +62,7 @@ TEST_CASE(same_layout_pass)
     EXPECT(not test::throws([] {
         shape a{shape::float_type, {2, 3}, {1, 2}};
         shape b{shape::float_type, {2, 3}, {1, 2}};
-        migraphx::check_shapes{{a, b}, ""}.compatible_layout();
+        migraphx::check_shapes{{a, b}, ""}.same_layout();
     }));
 }
 
diff --git a/test/shape_test.cpp b/test/shape_test.cpp
index fd56faad95b..8a200f2c51c 100644
--- a/test/shape_test.cpp
+++ b/test/shape_test.cpp
@@ -826,7 +826,6 @@ TEST_CASE(tuple_copy)
     EXPECT(s3 == s2);
     migraphx::shape s4{{migraphx::shape{migraphx::shape::int8_type},
                         migraphx::shape{migraphx::shape::float_type}}};
-    EXPECT(not is_compatible_shape(s1, s4));
     EXPECT(s4 != s1);
     EXPECT(s4 != s2);
     EXPECT(s4 != s3);

From 6fe841d2682ef682e30ae1b4ebf24e086fc25fac Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Thu, 3 Oct 2024 22:33:39 +0000
Subject: [PATCH 33/56] revert default test

---
 test/onnx/roialign_default_test.onnx | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/test/onnx/roialign_default_test.onnx b/test/onnx/roialign_default_test.onnx
index 3f54104fdd6..5b0165fc093 100644
--- a/test/onnx/roialign_default_test.onnx
+++ b/test/onnx/roialign_default_test.onnx
@@ -1,5 +1,4 @@
-
-roialign_default_test:�
+	roialign_default_test:�
 !
 x
 rois
@@ -24,4 +23,4 @@
 
 
 
-B
\ No newline at end of file
+B
\ No newline at end of file

From 09adc29ef8c49b30afd8ad79b68029336f91d393 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Mon, 7 Oct 2024 15:41:41 +0000
Subject: [PATCH 34/56] debugging crash

---
 ort_roialign.py                           |  27 ++-
 src/include/migraphx/op/roialign.hpp      |  20 ++-
 test/onnx/conv_relu_maxpool_test.onnx     | Bin 316 -> 329 bytes
 test/onnx/gen_onnx.py                     |   9 +-
 test/onnx/roialign_half_pixel_test.onnx   | Bin 360 -> 360 bytes
 test/onnx/roialign_test.onnx              | Bin 345 -> 345 bytes
 test/onnx/verify/roialign_verify_test.cpp | 206 ++++++++++++++++++----
 7 files changed, 209 insertions(+), 53 deletions(-)

diff --git a/ort_roialign.py b/ort_roialign.py
index 70ee2b410d6..ec6e1920b8d 100644
--- a/ort_roialign.py
+++ b/ort_roialign.py
@@ -6,27 +6,22 @@
 import numpy as np
 print(" version: ", onnx.__version__, rt.__version__)
 
-x = np.array(np.arange(2 * 2 * 4 * 3), dtype='f')
-x = np.reshape(x, [2, 2, 4, 3])
+x = np.array(np.arange(10 * 5 * 4 * 7), dtype='f')
+x = np.reshape(x, [10, 5, 4, 7])
 
-y = np.ones([2, 2, 4, 3], dtype='f')
+y = np.ones([10, 5, 4, 7], dtype='f')
 
-# matches roialign_half_pixel_verify_test
-# rois=np.array([[0.1, 0.15, 0.6, 0.35],
-#                 [1.1, 0.73, 1.9, 1.13]], dtype='f')
-# matches roialign_half_pixel_oob_verify_test
 rois = np.array(
-    [[1.1, 0.73, 1.7, 1.13], [1.1, 0.73, 2.6, 1.13]
-     #         [1.1, 0.73, 2.6, 1.13]
-     ],
+    [
+        [0.1, 0.15, 0.6, 0.35],
+        [2.1, 1.73, 3.8, 2.13]        
+    ],
     dtype='f')
 
-# rois=np.array([
-#                 [ 1.1, 0.73, 2.2, 1.13]], dtype='f')
+themodel = 'roialign_test.onnx'
 sess = rt.InferenceSession(
-    '/workspace/AMDMIGraphX/test/onnx/roialign_half_pixel_test.onnx')
-# sess = rt.InferenceSession('/workspace/AMDMIGraphX/test/onnx/roialign_one_roi_asdf_test.onnx')
+    '/workspace/AMDMIGraphX/test/onnx/' + themodel)
 res = sess.run(['y'], {'x': x, 'rois': rois, 'batch_ind': [0, 1]})
-#   'batch_ind': [0]})
-print(' ORT test model is roialign_one_roi_asdf_test.onnx, rois_data is \n',
+
+print(' ORT test model is ' + themodel + ', rois_data is \n',
       rois, ' result is \n', res)
diff --git a/src/include/migraphx/op/roialign.hpp b/src/include/migraphx/op/roialign.hpp
index 5d987c9c1ca..1d19f3e514f 100644
--- a/src/include/migraphx/op/roialign.hpp
+++ b/src/include/migraphx/op/roialign.hpp
@@ -121,7 +121,8 @@ struct roialign
     {
         std::vector<pos_weight> results(bin_grid_size[0] * bin_grid_size[1] * output_height *
                                         output_width);
-
+// printf(" bin grid %ldx%ld, height %lu width %lu\n", bin_grid_size[0], bin_grid_size[1], output_height,
+//                                         output_width);
         shape_for_each(comp_s, [&](const auto& idx_v, size_t index) {
             // The p and i indexes correspond to nested looping parameters in ORT that go in y, x
             // order.  The i[x] value is least significant and iterates the fastest.
@@ -138,9 +139,12 @@ struct roialign
             {
                 xy[ii] = roi_start[ii] + p[ii] * bin_size[ii] +
                          (i[ii] + .5f) * bin_size[ii] / bin_grid_size[ii];
-                xy[ii] = (coord_trans_mode != "half_pixel") ? (xy[ii] - 0.5f) : xy[ii];
+// printf(" QQQQQQ  L137 x=%f  y=%f  ", xy[0], xy[1]);                                        
+                // xy[ii] = (coord_trans_mode != "half_pixel") ? (xy[ii] - 0.5f) : xy[ii];
+// printf(" L139 %f ", xy[ii]);   
                 if(xy[ii] < -1.0 or xy[ii] > dims[ii])
                 {
+// printf(" L142 results = pos_weight i=%lu dims=%lu, %lu  \n ", index,  dims[0], dims[1]);                    
                     results[index] = pos_weight{};
                     return;
                 }
@@ -148,10 +152,13 @@ struct roialign
                 xy[ii]   = std::max(xy[ii], 0.0f);
                 low[ii]  = xy[ii];
                 high[ii] = low[ii] + 1;
+// printf(" L148 %f  low[ii] %lu, dims[ii] %lu", xy[ii],  low[ii], dims[ii]);                
                 if(low[ii] >= dims[ii] - 1)
                 {
                     xy[ii] = high[ii] = low[ii] = dims[ii] - 1;
+// printf(" L154 %f ", xy[ii]);                    
                 }
+// printf(" \n");            
             }
             results[index].pos = {low[1] * dims[0] + low[0],
                                   low[1] * dims[0] + high[0],
@@ -162,10 +169,15 @@ struct roialign
             float ly = xy[1] - low[1];
             float hy = 1.0f - ly;
             float hx = 1.0f - lx;
-
+printf(" !!!!! %ld\n", index);
             // save weights and indices
             results[index].w = {hy * hx, hy * lx, ly * hx, ly * lx};
         });
+// printf(" AAAAA here we are\n");
+        // for(int iix = 0; iix < results.size(); iix++)
+        //     printf(" SSSSS %ld %d\n", results.size(), iix);
+        //   printf(" SSSSS %d    %lu  %lu  %lu  %lu   %f  %f  %f  %f\n", iix, results[iix].pos[0], results[iix].pos[1], results[iix].pos[2], results[iix].pos[3],
+        //            results[iix].w[0], results[iix].w[1], results[iix].w[2], results[iix].w[3]);
         return results;
     }
 
@@ -251,10 +263,12 @@ struct roialign
                     roi_size[ii] = roi_ends[ii] - roi_starts[ii];
                     if(coord_trans_mode != "half_pixel")
                         roi_size[ii] = std::max(roi_size[ii], 1.0f);
+printf("\n KKKKK ii %ld  roi_size %f   roi_batch_ind %ld  out_dims %lu     \n", ii, roi_size[ii] , roi_batch_ind,  out_dims[ii]);
                     bin_size[ii]      = roi_size[ii] / out_dims[ii];
                     bin_grid_size[ii] = (sampling_ratio > 0)
                                             ? sampling_ratio
                                             : std::ceil(roi_size[ii] / out_dims[ii]);
+printf(" KLKLKL bin_grid_size= %ld x %ld\n", bin_grid_size[0], bin_grid_size[1]);                                            
                 }
 
                 // we want to precalculate indices and weights shared by all channels,
diff --git a/test/onnx/conv_relu_maxpool_test.onnx b/test/onnx/conv_relu_maxpool_test.onnx
index f5bfe4c1514a128bbde7d847205baffbb35763fb..4403d8de5f70722d44e60d46fb26e575d7a959ca 100644
GIT binary patch
delta 65
zcmdnPbdpJigHuQ>IX|x~z9==PG(I=6q98v%C%z=LxWsD3MDbPPI3y=t$YvA<00*)d
Awg3PC

delta 52
zcmX@fw1-KYgH=c{IX|x~z9==PR5!IEF}ENm)oQ^+nN|FJC8@<F@o>qBce5F}0M$Se
A)c^nh

diff --git a/test/onnx/gen_onnx.py b/test/onnx/gen_onnx.py
index b838a8f065f..14206f7abbc 100644
--- a/test/onnx/gen_onnx.py
+++ b/test/onnx/gen_onnx.py
@@ -10605,17 +10605,18 @@ def roialign_default_test():
 def roialign_test():
     # Roialign with output_half_pixel mode is backward-compatible.
     x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [10, 5, 4, 7])
-    roi = helper.make_tensor_value_info('rois', TensorProto.FLOAT, [8, 4])
-    bi = helper.make_tensor_value_info('batch_ind', TensorProto.INT64, [8])
-    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [8, 4, 5, 5])
+    roi = helper.make_tensor_value_info('rois', TensorProto.FLOAT, [2, 4])
+    bi = helper.make_tensor_value_info('batch_ind', TensorProto.INT64, [2])
+    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [2, 4, 5, 5])
     node = onnx.helper.make_node(
         'RoiAlign',
         inputs=['x', 'rois', 'batch_ind'],
         outputs=['y'],
         spatial_scale=2.0,
         output_height=5,
-        output_width=5,
+        output_width=3,
         sampling_ratio=3,
+        # todo:  max test
         mode="avg",
         coordinate_transformation_mode="output_half_pixel")
 
diff --git a/test/onnx/roialign_half_pixel_test.onnx b/test/onnx/roialign_half_pixel_test.onnx
index 76daf3d0c0df8de4fe13e5e0133da6c3bb1e4cce..4b4ff5dcb2f89884e01805636aec6b7122df873d 100644
GIT binary patch
delta 16
XcmaFC^n!_rgLC42sf{w;jEuqnF8>6d

delta 16
XcmaFC^n!_rgKOe_sf{w;jEtfHFAD^s

diff --git a/test/onnx/roialign_test.onnx b/test/onnx/roialign_test.onnx
index 0a60795f561572d993de35769d4b8aef8b520e49..eb6703f49d54786be995473bd87d0534717b30ee 100644
GIT binary patch
delta 56
zcmcb~bd!mRgL5L&Qby*9D~cGICQCEQC^02Tb1_y5iE)W=FbWB9F>x>fF$)m0g0K^l
GfG_|!><8Qc

delta 56
zcmcb~bd!mRgKHwwQbyK^D~cF7CQCEQC~+i7b1_y5iE)W=FbWB9F>!DJF$)m0g0K^l
GfG7YvoCos&

diff --git a/test/onnx/verify/roialign_verify_test.cpp b/test/onnx/verify/roialign_verify_test.cpp
index f99b7072c69..747171a8a7a 100644
--- a/test/onnx/verify/roialign_verify_test.cpp
+++ b/test/onnx/verify/roialign_verify_test.cpp
@@ -31,51 +31,197 @@ TEST_CASE(roialign_verify_test)
     migraphx::program p = read_onnx("roialign_test.onnx");
     p.compile(migraphx::make_target("ref"));
 
-    migraphx::shape s{migraphx::shape::float_type, {3, 2, 4, 5}};
-    std::vector<float> data(3 * 5 * 4 * 2);
+    migraphx::shape s{migraphx::shape::float_type, {10, 5, 4, 7}};
+    std::vector<float> data(10 * 5 * 4 * 7);
     std::iota(data.begin(), data.end(), 0);
 
     migraphx::parameter_map pp;
     pp["x"] = migraphx::argument(s, data.data());
     pp["y"] = migraphx::argument(s, data.data());
 
-    // migraphx::shape srois{migraphx::shape::float_type, {2, 4}};
-    // std::vector<float> rois_data = {0.1, 0.15, 0.6, 0.35,
-    //                                 2.1, 1.73, 3.8, 2.13};
-    // migraphx::shape sbi{migraphx::shape::int64_type, {2}};  // batch_index
-    // std::vector<float> bi_data = {0, 1};
-
-    migraphx::shape srois{migraphx::shape::float_type, {1, 4}};
-    std::vector<float> rois_data = {2.1, 1.73, 3.8, 2.13};
-    migraphx::shape sbi{migraphx::shape::int64_type, {1}}; // batch_index
-    std::vector<float> bi_data = {0};
-
+    migraphx::shape srois{migraphx::shape::float_type, {2, 4}};
+    std::vector<float> rois_data = {
+                                    2.1, 1.73, 3.8, 2.13,
+                                    0.1, 0.15, 0.6, 0.35
+                                    };
+    migraphx::shape sbi{migraphx::shape::int64_type, {2}};  // batch_index
+    std::vector<float> bi_data = {1, 0};
+printf("sdfkgjusdfgjk\n");
     pp["rois"]      = migraphx::argument(srois, rois_data.data());
     pp["batch_ind"] = migraphx::argument(sbi, bi_data.data());
 
     auto result = p.eval(pp).back();
+printf("  dfssdgf \n");
     std::vector<float> result_vector;
     result.visit([&](auto output) { result_vector.assign(output.begin(), output.end()); });
 
-    printf(" result:  ");
-    for(auto aa : result_vector)
-        printf(" %f ", aa);
-    printf("\n");
+//     printf(" result: ");
+// for(int aa = 0; aa < result_vector.size(); aa++) 
+//  {
+//           printf(" %f ", result_vector[aa]);
+//     if(aa % s.lens()[0] == s.lens()[0]-1)
+//       printf("\n");
+// }    printf("\n");
 
     std::vector<float> gold = {
-        0.000000,  0.022222,  0.200000,  0.400000,  0.600000,  0.500000,  0.522222,  0.700000,
-        0.900000,  1.100000,  1.500000,  1.522223,  1.700000,  1.900000,  2.100000,  2.500000,
-        2.522222,  2.700000,  2.900000,  3.100000,  3.500000,  3.522222,  3.700000,  3.900000,
-        4.100000,  20.000000, 20.022223, 20.200001, 20.400000, 20.600000, 20.500000, 20.522223,
-        20.700001, 20.900000, 21.100000, 21.500000, 21.522223, 21.700001, 21.900000, 22.100000,
-        22.500000, 22.522223, 22.700001, 22.900000, 23.100000, 23.500000, 23.522223, 23.700001,
-        23.900000, 24.100000, 5.888889,  0.000000,  0.000000,  0.000000,  0.000000,  6.000000,
-        0.000000,  0.000000,  0.000000,  0.000000,  6.000000,  0.000000,  0.000000,  0.000000,
-        0.000000,  6.000000,  0.000000,  0.000000,  0.000000,  0.000000,  6.000000,  0.000000,
-        0.000000,  0.000000,  0.000000,  12.555555, 0.000000,  0.000000,  0.000000,  0.000000,
-        12.666667, 0.000000,  0.000000,  0.000000,  0.000000,  12.666667, 0.000000,  0.000000,
-        0.000000,  0.000000,  12.666667, 0.000000,  0.000000,  0.000000,  0.000000,  12.666667,
-        0.000000,  0.000000,  0.000000,  0.000000};
+          3.1666667,   3.5000002,   3.8333333,
+           4.566667 ,   4.9      ,   5.2333336,
+           5.9666677,   6.3      ,   6.6333337,
+           7.366667 ,   7.7000003,   8.033334 ,
+           8.766666 ,   9.100001 ,   9.433333 ,
+
+        
+ 31.166666 ,  31.5      ,  31.833334 ,
+          32.566666 ,  32.9      ,  33.23333  ,
+          33.966667 ,  34.300003 ,  34.633335 ,
+          35.366665 ,  35.699997 ,  36.033337 ,
+          36.766666 ,  37.100002 ,  37.433334 ,
+
+        
+ 59.166668 ,  59.5      ,  59.833332 ,
+          60.566666 ,  60.899998 ,  61.23333  ,
+          61.966667 ,  62.299995 ,  62.633335 ,
+          63.366665 ,  63.700005 ,  64.03334  ,
+          64.76666  ,  65.100006 ,  65.433334 ,
+
+        
+ 87.166664 ,  87.5      ,  87.83334  ,
+          88.566666 ,  88.899994 ,  89.23333  ,
+          89.96666  ,  90.30001  ,  90.63333  ,
+          91.36667  ,  91.7      ,  92.033325 ,
+          92.766655 ,  93.100006 ,  93.433334 ,
+
+        
+115.166664 , 115.5      , 115.833336 ,
+         116.56668  , 116.899994 , 117.23333  ,
+         117.96666  , 118.30001  , 118.63333  ,
+         119.36667  , 119.700005 , 120.03334  ,
+         120.766655 , 121.100006 , 121.433334 ,
+
+
+       
+165.76666  , 166.80742  ,  55.666668 ,
+         165.76666  , 166.80742  ,  55.666668 ,
+         110.51111  , 111.20494  ,  37.11111  ,
+           0.       ,   0.       ,   0.       ,
+           0.       ,   0.       ,   0.       ,
+
+        
+193.76666  , 194.80742  ,  65.       ,
+         193.76666  , 194.80742  ,  65.       ,
+         129.17778  , 129.87161  ,  43.333332 ,
+           0.       ,   0.       ,   0.       ,
+           0.       ,   0.       ,   0.       ,
+
+        
+221.76668  , 222.80742  ,  74.333336 ,
+         221.76668  , 222.80742  ,  74.333336 ,
+         147.84445  , 148.53827  ,  49.555557 ,
+           0.       ,   0.       ,   0.       ,
+           0.       ,   0.       ,   0.       ,
+
+        
+249.76668  , 250.8074   ,  83.666664 ,
+         249.76668  , 250.8074   ,  83.666664 ,
+         166.51111  , 167.20494  ,  55.77778  ,
+           0.       ,   0.       ,   0.       ,
+           0.       ,   0.       ,   0.       ,
+
+        
+277.7667   , 278.8074   ,  93.       ,
+         277.7667   , 278.8074   ,  93.       ,
+         185.17778  , 185.87161  ,  62.       ,
+           0.       ,   0.       ,   0.       ,
+           0.       ,   0.       ,   0.  
+// 0.00000000e+00, 0.00000000e+00, 4.93826950e-03,
+//           8.88888836e-02, 2.00000003e-01, 3.11111122e-01,
+//           4.22222227e-01, 5.33333302e-01, 6.44444466e-01,
+//          0.00000000e+00, 0.00000000e+00, 4.93826950e-03,
+//           8.88888836e-02, 2.00000003e-01, 3.11111122e-01,
+//           4.22222227e-01, 5.33333302e-01, 6.44444466e-01,
+//          0.00000000e+00, 0.00000000e+00, 4.93826950e-03,
+//           8.88888836e-02, 2.00000003e-01, 3.11111122e-01,
+//           4.22222227e-01, 5.33333302e-01, 6.44444466e-01,
+//          1.90476179e-02, 1.90476179e-02, 2.39858869e-02,
+//           1.07936502e-01, 2.19047621e-01, 3.30158740e-01,
+//           4.41269815e-01, 5.52380979e-01, 6.63492084e-01,
+//          1.71428561e-01, 1.71428561e-01, 1.76366836e-01,
+//           2.60317445e-01, 3.71428549e-01, 4.82539713e-01,
+//           5.93650818e-01, 7.04761863e-01, 8.15872967e-01,
+//          3.42857152e-01, 3.42857152e-01, 3.47795397e-01,
+//           4.31746036e-01, 5.42857111e-01, 6.53968275e-01,
+//           7.65079260e-01, 8.76190484e-01, 9.87301588e-01,
+//          5.14285743e-01, 5.14285743e-01, 5.19223928e-01,
+//           6.03174567e-01, 7.14285672e-01, 8.25396836e-01,
+//           9.36507940e-01, 1.04761910e+00, 1.15873003e+00,
+
+//         1.20000000e+01, 1.20000000e+01, 1.20049391e+01,
+//           1.20888891e+01, 1.21999998e+01, 1.23111115e+01,
+//           1.24222221e+01, 1.25333328e+01, 1.26444445e+01,
+//          1.20000000e+01, 1.20000000e+01, 1.20049391e+01,
+//           1.20888891e+01, 1.21999998e+01, 1.23111115e+01,
+//           1.24222221e+01, 1.25333328e+01, 1.26444445e+01,
+//          1.20000000e+01, 1.20000000e+01, 1.20049391e+01,
+//           1.20888891e+01, 1.21999998e+01, 1.23111115e+01,
+//           1.24222221e+01, 1.25333328e+01, 1.26444445e+01,
+//          1.20190477e+01, 1.20190477e+01, 1.20239868e+01,
+//           1.21079369e+01, 1.22190475e+01, 1.23301582e+01,
+//           1.24412699e+01, 1.25523796e+01, 1.26634922e+01,
+//          1.21714277e+01, 1.21714277e+01, 1.21763659e+01,
+//           1.22603178e+01, 1.23714285e+01, 1.24825401e+01,
+//           1.25936518e+01, 1.27047615e+01, 1.28158722e+01,
+//          1.23428583e+01, 1.23428583e+01, 1.23477964e+01,
+//           1.24317465e+01, 1.25428581e+01, 1.26539688e+01,
+//           1.27650795e+01, 1.28761902e+01, 1.29873009e+01,
+//          1.25142860e+01, 1.25142860e+01, 1.25192232e+01,
+//           1.26031752e+01, 1.27142859e+01, 1.28253975e+01,
+//           1.29365072e+01, 1.30476189e+01, 1.31587305e+01,
+
+
+//        2.88403187e+01, 2.90094528e+01, 2.90514297e+01,
+//           2.90514297e+01, 2.90514297e+01, 2.90514297e+01,
+//           2.90514297e+01, 9.68380928e+00, 0.00000000e+00,
+//          2.91831741e+01, 2.93523083e+01, 2.93942871e+01,
+//           2.93942871e+01, 2.93942871e+01, 2.93942871e+01,
+//           2.93942871e+01, 9.79809570e+00, 0.00000000e+00,
+//          2.95260353e+01, 2.96951675e+01, 2.97371426e+01,
+//           2.97371426e+01, 2.97371426e+01, 2.97371426e+01,
+//           2.97371426e+01, 9.91238022e+00, 0.00000000e+00,
+//          2.98688869e+01, 3.00380211e+01, 3.00799999e+01,
+//           3.00799999e+01, 3.00799999e+01, 3.00799999e+01,
+//           3.00799999e+01, 1.00266676e+01, 0.00000000e+00,
+//          3.02117481e+01, 3.03808823e+01, 3.04228554e+01,
+//           3.04228554e+01, 3.04228554e+01, 3.04228554e+01,
+//           3.04228554e+01, 1.01409521e+01, 0.00000000e+00,
+//          3.05546055e+01, 3.07237377e+01, 3.07657166e+01,
+//           3.07657166e+01, 3.07657166e+01, 3.07657166e+01,
+//           3.07657166e+01, 1.02552385e+01, 0.00000000e+00,
+//          3.08974609e+01, 3.10665970e+01, 3.11085720e+01,
+//           3.11085720e+01, 3.11085720e+01, 3.11085720e+01,
+//           3.11085720e+01, 1.03695240e+01, 0.00000000e+00,
+
+//         4.08403168e+01, 4.10094528e+01, 4.10514259e+01,
+//           4.10514259e+01, 4.10514259e+01, 4.10514259e+01,
+//           4.10514259e+01, 1.36838093e+01, 0.00000000e+00,
+//          4.11831741e+01, 4.13523102e+01, 4.13942871e+01,
+//           4.13942871e+01, 4.13942871e+01, 4.13942871e+01,
+//           4.13942871e+01, 1.37980957e+01, 0.00000000e+00,
+//          4.15260315e+01, 4.16951675e+01, 4.17371483e+01,
+//           4.17371483e+01, 4.17371483e+01, 4.17371483e+01,
+//           4.17371483e+01, 1.39123802e+01, 0.00000000e+00,
+//          4.18688889e+01, 4.20380211e+01, 4.20799980e+01,
+//           4.20799980e+01, 4.20799980e+01, 4.20799980e+01,
+//           4.20799980e+01, 1.40266676e+01, 0.00000000e+00,
+//          4.22117462e+01, 4.23808823e+01, 4.24228554e+01,
+//           4.24228554e+01, 4.24228554e+01, 4.24228554e+01,
+//           4.24228554e+01, 1.41409521e+01, 0.00000000e+00,
+//          4.25546036e+01, 4.27237396e+01, 4.27657166e+01,
+//           4.27657166e+01, 4.27657166e+01, 4.27657166e+01,
+//           4.27657166e+01, 1.42552385e+01, 0.00000000e+00,
+//          4.28974609e+01, 4.30666008e+01, 4.31085777e+01,
+//           4.31085777e+01, 4.31085777e+01, 4.31085777e+01,
+//           4.31085777e+01, 1.43695240e+01, 0.00000000e+00
+};
     float alpha = 0.5;
     std::transform(data.begin(), data.end(), gold.begin(), [&](auto x) {
         return std::max(0.0f, x) + std::min(0.0f, alpha * std::expm1(x / alpha));

From fb30afbf448cea4048f93586671f55f243db2123 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Mon, 7 Oct 2024 21:42:30 +0000
Subject: [PATCH 35/56] probably fixed

---
 ort_roialign.py                           |   5 +-
 src/include/migraphx/op/roialign.hpp      |   5 +-
 test/onnx/verify/roialign_verify_test.cpp | 233 ++++++----------------
 3 files changed, 71 insertions(+), 172 deletions(-)

diff --git a/ort_roialign.py b/ort_roialign.py
index ec6e1920b8d..eb60fb05352 100644
--- a/ort_roialign.py
+++ b/ort_roialign.py
@@ -14,14 +14,15 @@
 rois = np.array(
     [
         [0.1, 0.15, 0.6, 0.35],
-        [2.1, 1.73, 3.8, 2.13]        
+        [2.1, 1.73, 3.8, 2.13]
     ],
     dtype='f')
 
 themodel = 'roialign_test.onnx'
 sess = rt.InferenceSession(
     '/workspace/AMDMIGraphX/test/onnx/' + themodel)
-res = sess.run(['y'], {'x': x, 'rois': rois, 'batch_ind': [0, 1]})
+res = sess.run(['y'], {'x': x, 'rois': rois, 'batch_ind': [1, 0]})
 
 print(' ORT test model is ' + themodel + ', rois_data is \n',
       rois, ' result is \n', res)
+ 
\ No newline at end of file
diff --git a/src/include/migraphx/op/roialign.hpp b/src/include/migraphx/op/roialign.hpp
index 1d19f3e514f..0fc68736fa8 100644
--- a/src/include/migraphx/op/roialign.hpp
+++ b/src/include/migraphx/op/roialign.hpp
@@ -169,7 +169,6 @@ struct roialign
             float ly = xy[1] - low[1];
             float hy = 1.0f - ly;
             float hx = 1.0f - lx;
-printf(" !!!!! %ld\n", index);
             // save weights and indices
             results[index].w = {hy * hx, hy * lx, ly * hx, ly * lx};
         });
@@ -263,12 +262,12 @@ printf(" !!!!! %ld\n", index);
                     roi_size[ii] = roi_ends[ii] - roi_starts[ii];
                     if(coord_trans_mode != "half_pixel")
                         roi_size[ii] = std::max(roi_size[ii], 1.0f);
-printf("\n KKKKK ii %ld  roi_size %f   roi_batch_ind %ld  out_dims %lu     \n", ii, roi_size[ii] , roi_batch_ind,  out_dims[ii]);
+// printf("\n KKKKK ii %ld  roi_size %f   roi_batch_ind %ld  out_dims %lu     \n", ii, roi_size[ii] , roi_batch_ind,  out_dims[ii]);
                     bin_size[ii]      = roi_size[ii] / out_dims[ii];
                     bin_grid_size[ii] = (sampling_ratio > 0)
                                             ? sampling_ratio
                                             : std::ceil(roi_size[ii] / out_dims[ii]);
-printf(" KLKLKL bin_grid_size= %ld x %ld\n", bin_grid_size[0], bin_grid_size[1]);                                            
+// printf(" KLKLKL bin_grid_size= %ld x %ld\n", bin_grid_size[0], bin_grid_size[1]);                                            
                 }
 
                 // we want to precalculate indices and weights shared by all channels,
diff --git a/test/onnx/verify/roialign_verify_test.cpp b/test/onnx/verify/roialign_verify_test.cpp
index 747171a8a7a..1ed1c52ccc4 100644
--- a/test/onnx/verify/roialign_verify_test.cpp
+++ b/test/onnx/verify/roialign_verify_test.cpp
@@ -41,17 +41,16 @@ TEST_CASE(roialign_verify_test)
 
     migraphx::shape srois{migraphx::shape::float_type, {2, 4}};
     std::vector<float> rois_data = {
-                                    2.1, 1.73, 3.8, 2.13,
-                                    0.1, 0.15, 0.6, 0.35
+                                    0.1, 0.15, 0.6, 0.35,
+                                    2.1, 1.73, 3.8, 2.13
                                     };
     migraphx::shape sbi{migraphx::shape::int64_type, {2}};  // batch_index
-    std::vector<float> bi_data = {1, 0};
-printf("sdfkgjusdfgjk\n");
+    std::vector<int64_t> bi_data = {1, 0};
+
     pp["rois"]      = migraphx::argument(srois, rois_data.data());
     pp["batch_ind"] = migraphx::argument(sbi, bi_data.data());
 
     auto result = p.eval(pp).back();
-printf("  dfssdgf \n");
     std::vector<float> result_vector;
     result.visit([&](auto output) { result_vector.assign(output.begin(), output.end()); });
 
@@ -59,172 +58,72 @@ printf("  dfssdgf \n");
 // for(int aa = 0; aa < result_vector.size(); aa++) 
 //  {
 //           printf(" %f ", result_vector[aa]);
-//     if(aa % s.lens()[0] == s.lens()[0]-1)
+//     if(aa % 15 == 15-1)
 //       printf("\n");
 // }    printf("\n");
 
     std::vector<float> gold = {
-          3.1666667,   3.5000002,   3.8333333,
-           4.566667 ,   4.9      ,   5.2333336,
-           5.9666677,   6.3      ,   6.6333337,
-           7.366667 ,   7.7000003,   8.033334 ,
-           8.766666 ,   9.100001 ,   9.433333 ,
-
-        
- 31.166666 ,  31.5      ,  31.833334 ,
-          32.566666 ,  32.9      ,  33.23333  ,
-          33.966667 ,  34.300003 ,  34.633335 ,
-          35.366665 ,  35.699997 ,  36.033337 ,
-          36.766666 ,  37.100002 ,  37.433334 ,
-
-        
- 59.166668 ,  59.5      ,  59.833332 ,
-          60.566666 ,  60.899998 ,  61.23333  ,
-          61.966667 ,  62.299995 ,  62.633335 ,
-          63.366665 ,  63.700005 ,  64.03334  ,
-          64.76666  ,  65.100006 ,  65.433334 ,
-
-        
- 87.166664 ,  87.5      ,  87.83334  ,
-          88.566666 ,  88.899994 ,  89.23333  ,
-          89.96666  ,  90.30001  ,  90.63333  ,
-          91.36667  ,  91.7      ,  92.033325 ,
-          92.766655 ,  93.100006 ,  93.433334 ,
-
-        
-115.166664 , 115.5      , 115.833336 ,
-         116.56668  , 116.899994 , 117.23333  ,
-         117.96666  , 118.30001  , 118.63333  ,
-         119.36667  , 119.700005 , 120.03334  ,
-         120.766655 , 121.100006 , 121.433334 ,
-
-
-       
-165.76666  , 166.80742  ,  55.666668 ,
-         165.76666  , 166.80742  ,  55.666668 ,
-         110.51111  , 111.20494  ,  37.11111  ,
-           0.       ,   0.       ,   0.       ,
-           0.       ,   0.       ,   0.       ,
-
-        
-193.76666  , 194.80742  ,  65.       ,
-         193.76666  , 194.80742  ,  65.       ,
-         129.17778  , 129.87161  ,  43.333332 ,
-           0.       ,   0.       ,   0.       ,
-           0.       ,   0.       ,   0.       ,
-
-        
-221.76668  , 222.80742  ,  74.333336 ,
-         221.76668  , 222.80742  ,  74.333336 ,
-         147.84445  , 148.53827  ,  49.555557 ,
-           0.       ,   0.       ,   0.       ,
-           0.       ,   0.       ,   0.       ,
-
-        
-249.76668  , 250.8074   ,  83.666664 ,
-         249.76668  , 250.8074   ,  83.666664 ,
-         166.51111  , 167.20494  ,  55.77778  ,
-           0.       ,   0.       ,   0.       ,
-           0.       ,   0.       ,   0.       ,
-
-        
-277.7667   , 278.8074   ,  93.       ,
-         277.7667   , 278.8074   ,  93.       ,
-         185.17778  , 185.87161  ,  62.       ,
-           0.       ,   0.       ,   0.       ,
-           0.       ,   0.       ,   0.  
-// 0.00000000e+00, 0.00000000e+00, 4.93826950e-03,
-//           8.88888836e-02, 2.00000003e-01, 3.11111122e-01,
-//           4.22222227e-01, 5.33333302e-01, 6.44444466e-01,
-//          0.00000000e+00, 0.00000000e+00, 4.93826950e-03,
-//           8.88888836e-02, 2.00000003e-01, 3.11111122e-01,
-//           4.22222227e-01, 5.33333302e-01, 6.44444466e-01,
-//          0.00000000e+00, 0.00000000e+00, 4.93826950e-03,
-//           8.88888836e-02, 2.00000003e-01, 3.11111122e-01,
-//           4.22222227e-01, 5.33333302e-01, 6.44444466e-01,
-//          1.90476179e-02, 1.90476179e-02, 2.39858869e-02,
-//           1.07936502e-01, 2.19047621e-01, 3.30158740e-01,
-//           4.41269815e-01, 5.52380979e-01, 6.63492084e-01,
-//          1.71428561e-01, 1.71428561e-01, 1.76366836e-01,
-//           2.60317445e-01, 3.71428549e-01, 4.82539713e-01,
-//           5.93650818e-01, 7.04761863e-01, 8.15872967e-01,
-//          3.42857152e-01, 3.42857152e-01, 3.47795397e-01,
-//           4.31746036e-01, 5.42857111e-01, 6.53968275e-01,
-//           7.65079260e-01, 8.76190484e-01, 9.87301588e-01,
-//          5.14285743e-01, 5.14285743e-01, 5.19223928e-01,
-//           6.03174567e-01, 7.14285672e-01, 8.25396836e-01,
-//           9.36507940e-01, 1.04761910e+00, 1.15873003e+00,
-
-//         1.20000000e+01, 1.20000000e+01, 1.20049391e+01,
-//           1.20888891e+01, 1.21999998e+01, 1.23111115e+01,
-//           1.24222221e+01, 1.25333328e+01, 1.26444445e+01,
-//          1.20000000e+01, 1.20000000e+01, 1.20049391e+01,
-//           1.20888891e+01, 1.21999998e+01, 1.23111115e+01,
-//           1.24222221e+01, 1.25333328e+01, 1.26444445e+01,
-//          1.20000000e+01, 1.20000000e+01, 1.20049391e+01,
-//           1.20888891e+01, 1.21999998e+01, 1.23111115e+01,
-//           1.24222221e+01, 1.25333328e+01, 1.26444445e+01,
-//          1.20190477e+01, 1.20190477e+01, 1.20239868e+01,
-//           1.21079369e+01, 1.22190475e+01, 1.23301582e+01,
-//           1.24412699e+01, 1.25523796e+01, 1.26634922e+01,
-//          1.21714277e+01, 1.21714277e+01, 1.21763659e+01,
-//           1.22603178e+01, 1.23714285e+01, 1.24825401e+01,
-//           1.25936518e+01, 1.27047615e+01, 1.28158722e+01,
-//          1.23428583e+01, 1.23428583e+01, 1.23477964e+01,
-//           1.24317465e+01, 1.25428581e+01, 1.26539688e+01,
-//           1.27650795e+01, 1.28761902e+01, 1.29873009e+01,
-//          1.25142860e+01, 1.25142860e+01, 1.25192232e+01,
-//           1.26031752e+01, 1.27142859e+01, 1.28253975e+01,
-//           1.29365072e+01, 1.30476189e+01, 1.31587305e+01,
-
-
-//        2.88403187e+01, 2.90094528e+01, 2.90514297e+01,
-//           2.90514297e+01, 2.90514297e+01, 2.90514297e+01,
-//           2.90514297e+01, 9.68380928e+00, 0.00000000e+00,
-//          2.91831741e+01, 2.93523083e+01, 2.93942871e+01,
-//           2.93942871e+01, 2.93942871e+01, 2.93942871e+01,
-//           2.93942871e+01, 9.79809570e+00, 0.00000000e+00,
-//          2.95260353e+01, 2.96951675e+01, 2.97371426e+01,
-//           2.97371426e+01, 2.97371426e+01, 2.97371426e+01,
-//           2.97371426e+01, 9.91238022e+00, 0.00000000e+00,
-//          2.98688869e+01, 3.00380211e+01, 3.00799999e+01,
-//           3.00799999e+01, 3.00799999e+01, 3.00799999e+01,
-//           3.00799999e+01, 1.00266676e+01, 0.00000000e+00,
-//          3.02117481e+01, 3.03808823e+01, 3.04228554e+01,
-//           3.04228554e+01, 3.04228554e+01, 3.04228554e+01,
-//           3.04228554e+01, 1.01409521e+01, 0.00000000e+00,
-//          3.05546055e+01, 3.07237377e+01, 3.07657166e+01,
-//           3.07657166e+01, 3.07657166e+01, 3.07657166e+01,
-//           3.07657166e+01, 1.02552385e+01, 0.00000000e+00,
-//          3.08974609e+01, 3.10665970e+01, 3.11085720e+01,
-//           3.11085720e+01, 3.11085720e+01, 3.11085720e+01,
-//           3.11085720e+01, 1.03695240e+01, 0.00000000e+00,
-
-//         4.08403168e+01, 4.10094528e+01, 4.10514259e+01,
-//           4.10514259e+01, 4.10514259e+01, 4.10514259e+01,
-//           4.10514259e+01, 1.36838093e+01, 0.00000000e+00,
-//          4.11831741e+01, 4.13523102e+01, 4.13942871e+01,
-//           4.13942871e+01, 4.13942871e+01, 4.13942871e+01,
-//           4.13942871e+01, 1.37980957e+01, 0.00000000e+00,
-//          4.15260315e+01, 4.16951675e+01, 4.17371483e+01,
-//           4.17371483e+01, 4.17371483e+01, 4.17371483e+01,
-//           4.17371483e+01, 1.39123802e+01, 0.00000000e+00,
-//          4.18688889e+01, 4.20380211e+01, 4.20799980e+01,
-//           4.20799980e+01, 4.20799980e+01, 4.20799980e+01,
-//           4.20799980e+01, 1.40266676e+01, 0.00000000e+00,
-//          4.22117462e+01, 4.23808823e+01, 4.24228554e+01,
-//           4.24228554e+01, 4.24228554e+01, 4.24228554e+01,
-//           4.24228554e+01, 1.41409521e+01, 0.00000000e+00,
-//          4.25546036e+01, 4.27237396e+01, 4.27657166e+01,
-//           4.27657166e+01, 4.27657166e+01, 4.27657166e+01,
-//           4.27657166e+01, 1.42552385e+01, 0.00000000e+00,
-//          4.28974609e+01, 4.30666008e+01, 4.31085777e+01,
-//           4.31085777e+01, 4.31085777e+01, 4.31085777e+01,
-//           4.31085777e+01, 1.43695240e+01, 0.00000000e+00
+        143.16667 , 143.49998 , 143.83333 ,
+         144.56667 , 144.9     , 145.23334 ,
+         145.96667 , 146.3     , 146.63333 ,
+         147.36667 , 147.70001 , 148.03334 ,
+         148.76666 , 149.09999 , 149.43333 ,
+
+        171.16667 , 171.5     , 171.83333 ,
+         172.56667 , 172.90001 , 173.23334 ,
+         173.96667 , 174.3     , 174.63333 ,
+         175.36667 , 175.70001 , 176.03333 ,
+         176.76666 , 177.09999 , 177.43335 ,
+
+        199.16667 , 199.5     , 199.83333 ,
+         200.56667 , 200.90001 , 201.23334 ,
+         201.96666 , 202.3     , 202.63333 ,
+         203.36665 , 203.70001 , 204.03333 ,
+         204.76668 , 205.09999 , 205.43333 ,
+
+        227.16667 , 227.5     , 227.83333 ,
+         228.56668 , 228.90001 , 229.23332 ,
+         229.96669 , 230.29999 , 230.63333 ,
+         231.36664 , 231.70001 , 232.03334 ,
+         232.76668 , 233.09999 , 233.43332 ,
+
+        255.16667 , 255.5     , 255.83333 ,
+         256.56668 , 256.90002 , 257.2333  ,
+         257.96667 , 258.3     , 258.63333 ,
+         259.36664 , 259.69998 , 260.03333 ,
+         260.7667  , 261.09998 , 261.43338 ,
+
+
+        25.766665,  26.807405,   9.      ,
+          25.766665,  26.807405,   9.      ,
+          17.177776,  17.871605,   6.      ,
+           0.      ,   0.      ,   0.      ,
+           0.      ,   0.      ,   0.      ,
+
+         53.766666,  54.807407,  18.333334,
+          53.766666,  54.807407,  18.333334,
+          35.844444,  36.538273,  12.222222,
+           0.      ,   0.      ,   0.      ,
+           0.      ,   0.      ,   0.      ,
+
+         81.76667 ,  82.8074  ,  27.666666,
+          81.76667 ,  82.8074  ,  27.666666,
+          54.51111 ,  55.204937,  18.444445,
+           0.      ,   0.      ,   0.      ,
+           0.      ,   0.      ,   0.      ,
+
+        109.76667 , 110.8074  ,  37.      ,
+         109.76667 , 110.8074  ,  37.      ,
+          73.17777 ,  73.871605,  24.666666,
+           0.      ,   0.      ,   0.      ,
+           0.      ,   0.      ,   0.      ,
+
+        137.76666 , 138.80742 ,  46.333332,
+         137.76666 , 138.80742 ,  46.333332,
+          91.844444,  92.53828 ,  30.88889 ,
+           0.      ,   0.      ,   0.      ,
+           0.      ,   0.      ,   0.
 };
-    float alpha = 0.5;
-    std::transform(data.begin(), data.end(), gold.begin(), [&](auto x) {
-        return std::max(0.0f, x) + std::min(0.0f, alpha * std::expm1(x / alpha));
-    });
+
     EXPECT(migraphx::verify::verify_rms_range(result_vector, gold));
 }

From 4c11f71987beb428544e3644cc6568dfb21a8544 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Mon, 7 Oct 2024 21:54:25 +0000
Subject: [PATCH 36/56] clean up debug code

---
 src/include/migraphx/op/roialign.hpp      |  16 ----
 test/onnx/verify/roialign_verify_test.cpp | 106 +++++++---------------
 2 files changed, 32 insertions(+), 90 deletions(-)

diff --git a/src/include/migraphx/op/roialign.hpp b/src/include/migraphx/op/roialign.hpp
index 0fc68736fa8..de5a58aaa81 100644
--- a/src/include/migraphx/op/roialign.hpp
+++ b/src/include/migraphx/op/roialign.hpp
@@ -121,8 +121,6 @@ struct roialign
     {
         std::vector<pos_weight> results(bin_grid_size[0] * bin_grid_size[1] * output_height *
                                         output_width);
-// printf(" bin grid %ldx%ld, height %lu width %lu\n", bin_grid_size[0], bin_grid_size[1], output_height,
-//                                         output_width);
         shape_for_each(comp_s, [&](const auto& idx_v, size_t index) {
             // The p and i indexes correspond to nested looping parameters in ORT that go in y, x
             // order.  The i[x] value is least significant and iterates the fastest.
@@ -139,12 +137,8 @@ struct roialign
             {
                 xy[ii] = roi_start[ii] + p[ii] * bin_size[ii] +
                          (i[ii] + .5f) * bin_size[ii] / bin_grid_size[ii];
-// printf(" QQQQQQ  L137 x=%f  y=%f  ", xy[0], xy[1]);                                        
-                // xy[ii] = (coord_trans_mode != "half_pixel") ? (xy[ii] - 0.5f) : xy[ii];
-// printf(" L139 %f ", xy[ii]);   
                 if(xy[ii] < -1.0 or xy[ii] > dims[ii])
                 {
-// printf(" L142 results = pos_weight i=%lu dims=%lu, %lu  \n ", index,  dims[0], dims[1]);                    
                     results[index] = pos_weight{};
                     return;
                 }
@@ -152,13 +146,10 @@ struct roialign
                 xy[ii]   = std::max(xy[ii], 0.0f);
                 low[ii]  = xy[ii];
                 high[ii] = low[ii] + 1;
-// printf(" L148 %f  low[ii] %lu, dims[ii] %lu", xy[ii],  low[ii], dims[ii]);                
                 if(low[ii] >= dims[ii] - 1)
                 {
                     xy[ii] = high[ii] = low[ii] = dims[ii] - 1;
-// printf(" L154 %f ", xy[ii]);                    
                 }
-// printf(" \n");            
             }
             results[index].pos = {low[1] * dims[0] + low[0],
                                   low[1] * dims[0] + high[0],
@@ -172,11 +163,6 @@ struct roialign
             // save weights and indices
             results[index].w = {hy * hx, hy * lx, ly * hx, ly * lx};
         });
-// printf(" AAAAA here we are\n");
-        // for(int iix = 0; iix < results.size(); iix++)
-        //     printf(" SSSSS %ld %d\n", results.size(), iix);
-        //   printf(" SSSSS %d    %lu  %lu  %lu  %lu   %f  %f  %f  %f\n", iix, results[iix].pos[0], results[iix].pos[1], results[iix].pos[2], results[iix].pos[3],
-        //            results[iix].w[0], results[iix].w[1], results[iix].w[2], results[iix].w[3]);
         return results;
     }
 
@@ -262,12 +248,10 @@ struct roialign
                     roi_size[ii] = roi_ends[ii] - roi_starts[ii];
                     if(coord_trans_mode != "half_pixel")
                         roi_size[ii] = std::max(roi_size[ii], 1.0f);
-// printf("\n KKKKK ii %ld  roi_size %f   roi_batch_ind %ld  out_dims %lu     \n", ii, roi_size[ii] , roi_batch_ind,  out_dims[ii]);
                     bin_size[ii]      = roi_size[ii] / out_dims[ii];
                     bin_grid_size[ii] = (sampling_ratio > 0)
                                             ? sampling_ratio
                                             : std::ceil(roi_size[ii] / out_dims[ii]);
-// printf(" KLKLKL bin_grid_size= %ld x %ld\n", bin_grid_size[0], bin_grid_size[1]);                                            
                 }
 
                 // we want to precalculate indices and weights shared by all channels,
diff --git a/test/onnx/verify/roialign_verify_test.cpp b/test/onnx/verify/roialign_verify_test.cpp
index 1ed1c52ccc4..051080adf25 100644
--- a/test/onnx/verify/roialign_verify_test.cpp
+++ b/test/onnx/verify/roialign_verify_test.cpp
@@ -40,11 +40,8 @@ TEST_CASE(roialign_verify_test)
     pp["y"] = migraphx::argument(s, data.data());
 
     migraphx::shape srois{migraphx::shape::float_type, {2, 4}};
-    std::vector<float> rois_data = {
-                                    0.1, 0.15, 0.6, 0.35,
-                                    2.1, 1.73, 3.8, 2.13
-                                    };
-    migraphx::shape sbi{migraphx::shape::int64_type, {2}};  // batch_index
+    std::vector<float> rois_data = {0.1, 0.15, 0.6, 0.35, 2.1, 1.73, 3.8, 2.13};
+    migraphx::shape sbi{migraphx::shape::int64_type, {2}};
     std::vector<int64_t> bi_data = {1, 0};
 
     pp["rois"]      = migraphx::argument(srois, rois_data.data());
@@ -54,76 +51,37 @@ TEST_CASE(roialign_verify_test)
     std::vector<float> result_vector;
     result.visit([&](auto output) { result_vector.assign(output.begin(), output.end()); });
 
-//     printf(" result: ");
-// for(int aa = 0; aa < result_vector.size(); aa++) 
-//  {
-//           printf(" %f ", result_vector[aa]);
-//     if(aa % 15 == 15-1)
-//       printf("\n");
-// }    printf("\n");
-
+    // gold results were generated with onnxruntime
     std::vector<float> gold = {
-        143.16667 , 143.49998 , 143.83333 ,
-         144.56667 , 144.9     , 145.23334 ,
-         145.96667 , 146.3     , 146.63333 ,
-         147.36667 , 147.70001 , 148.03334 ,
-         148.76666 , 149.09999 , 149.43333 ,
-
-        171.16667 , 171.5     , 171.83333 ,
-         172.56667 , 172.90001 , 173.23334 ,
-         173.96667 , 174.3     , 174.63333 ,
-         175.36667 , 175.70001 , 176.03333 ,
-         176.76666 , 177.09999 , 177.43335 ,
-
-        199.16667 , 199.5     , 199.83333 ,
-         200.56667 , 200.90001 , 201.23334 ,
-         201.96666 , 202.3     , 202.63333 ,
-         203.36665 , 203.70001 , 204.03333 ,
-         204.76668 , 205.09999 , 205.43333 ,
-
-        227.16667 , 227.5     , 227.83333 ,
-         228.56668 , 228.90001 , 229.23332 ,
-         229.96669 , 230.29999 , 230.63333 ,
-         231.36664 , 231.70001 , 232.03334 ,
-         232.76668 , 233.09999 , 233.43332 ,
-
-        255.16667 , 255.5     , 255.83333 ,
-         256.56668 , 256.90002 , 257.2333  ,
-         257.96667 , 258.3     , 258.63333 ,
-         259.36664 , 259.69998 , 260.03333 ,
-         260.7667  , 261.09998 , 261.43338 ,
-
-
-        25.766665,  26.807405,   9.      ,
-          25.766665,  26.807405,   9.      ,
-          17.177776,  17.871605,   6.      ,
-           0.      ,   0.      ,   0.      ,
-           0.      ,   0.      ,   0.      ,
-
-         53.766666,  54.807407,  18.333334,
-          53.766666,  54.807407,  18.333334,
-          35.844444,  36.538273,  12.222222,
-           0.      ,   0.      ,   0.      ,
-           0.      ,   0.      ,   0.      ,
-
-         81.76667 ,  82.8074  ,  27.666666,
-          81.76667 ,  82.8074  ,  27.666666,
-          54.51111 ,  55.204937,  18.444445,
-           0.      ,   0.      ,   0.      ,
-           0.      ,   0.      ,   0.      ,
-
-        109.76667 , 110.8074  ,  37.      ,
-         109.76667 , 110.8074  ,  37.      ,
-          73.17777 ,  73.871605,  24.666666,
-           0.      ,   0.      ,   0.      ,
-           0.      ,   0.      ,   0.      ,
-
-        137.76666 , 138.80742 ,  46.333332,
-         137.76666 , 138.80742 ,  46.333332,
-          91.844444,  92.53828 ,  30.88889 ,
-           0.      ,   0.      ,   0.      ,
-           0.      ,   0.      ,   0.
-};
+        143.16667, 143.49998, 143.83333, 144.56667, 144.9,     145.23334, 145.96667, 146.3,
+        146.63333, 147.36667, 147.70001, 148.03334, 148.76666, 149.09999, 149.43333,
+
+        171.16667, 171.5,     171.83333, 172.56667, 172.90001, 173.23334, 173.96667, 174.3,
+        174.63333, 175.36667, 175.70001, 176.03333, 176.76666, 177.09999, 177.43335,
+
+        199.16667, 199.5,     199.83333, 200.56667, 200.90001, 201.23334, 201.96666, 202.3,
+        202.63333, 203.36665, 203.70001, 204.03333, 204.76668, 205.09999, 205.43333,
+
+        227.16667, 227.5,     227.83333, 228.56668, 228.90001, 229.23332, 229.96669, 230.29999,
+        230.63333, 231.36664, 231.70001, 232.03334, 232.76668, 233.09999, 233.43332,
+
+        255.16667, 255.5,     255.83333, 256.56668, 256.90002, 257.2333,  257.96667, 258.3,
+        258.63333, 259.36664, 259.69998, 260.03333, 260.7667,  261.09998, 261.43338,
+
+        25.766665, 26.807405, 9.,        25.766665, 26.807405, 9.,        17.177776, 17.871605,
+        6.,        0.,        0.,        0.,        0.,        0.,        0.,
+
+        53.766666, 54.807407, 18.333334, 53.766666, 54.807407, 18.333334, 35.844444, 36.538273,
+        12.222222, 0.,        0.,        0.,        0.,        0.,        0.,
+
+        81.76667,  82.8074,   27.666666, 81.76667,  82.8074,   27.666666, 54.51111,  55.204937,
+        18.444445, 0.,        0.,        0.,        0.,        0.,        0.,
+
+        109.76667, 110.8074,  37.,       109.76667, 110.8074,  37.,       73.17777,  73.871605,
+        24.666666, 0.,        0.,        0.,        0.,        0.,        0.,
+
+        137.76666, 138.80742, 46.333332, 137.76666, 138.80742, 46.333332, 91.844444, 92.53828,
+        30.88889,  0.,        0.,        0.,        0.,        0.,        0.};
 
     EXPECT(migraphx::verify::verify_rms_range(result_vector, gold));
 }

From 0b0bcb67d8d3feeaf20d00195c98c975cd48dd66 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Mon, 7 Oct 2024 22:43:51 +0000
Subject: [PATCH 37/56] fix roialign_test onnx test to reflect changed test
 file

---
 test/onnx/parse/roialign_test.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/onnx/parse/roialign_test.cpp b/test/onnx/parse/roialign_test.cpp
index 05f27b6473c..346346727b2 100644
--- a/test/onnx/parse/roialign_test.cpp
+++ b/test/onnx/parse/roialign_test.cpp
@@ -27,8 +27,8 @@
 TEST_CASE(roialign_test)
 {
     migraphx::shape sx{migraphx::shape::float_type, {10, 5, 4, 7}};
-    migraphx::shape srois{migraphx::shape::float_type, {8, 4}};
-    migraphx::shape sbi{migraphx::shape::int64_type, {8}};
+    migraphx::shape srois{migraphx::shape::float_type, {2, 4}};
+    migraphx::shape sbi{migraphx::shape::int64_type, {2}};
 
     migraphx::program p;
     auto* mm  = p.get_main_module();
@@ -41,7 +41,7 @@ TEST_CASE(roialign_test)
                           {{"coordinate_transformation_mode", "output_half_pixel"},
                            {"spatial_scale", 2.0f},
                            {"output_height", 5},
-                           {"output_width", 5},
+                           {"output_width", 3},
                            {"sampling_ratio", 3}}),
         x,
         rois,

From 9d658a72cebb59bba89bf81a80c73e1b680c7e9d Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Mon, 7 Oct 2024 23:50:10 +0000
Subject: [PATCH 38/56] Update Onnx test models to allow specified op set; add
 roialign default parsing tests with op set; re-add a new verify test that
 wasn't committed yet

---
 test/onnx/gen_onnx.py                   |  28 +++++++++++++++++++++---
 test/onnx/roialign_default_test.onnx    | Bin 197 -> 199 bytes
 test/onnx/roialign_default_test_12.onnx | Bin 0 -> 205 bytes
 test/verify/roialign_verify_test.cpp    |   0
 4 files changed, 25 insertions(+), 3 deletions(-)
 create mode 100644 test/onnx/roialign_default_test_12.onnx
 create mode 100644 test/verify/roialign_verify_test.cpp

diff --git a/test/onnx/gen_onnx.py b/test/onnx/gen_onnx.py
index 14206f7abbc..27cc178dd96 100644
--- a/test/onnx/gen_onnx.py
+++ b/test/onnx/gen_onnx.py
@@ -31,10 +31,12 @@
 from onnx.numpy_helper import from_array
 
 
-def onnx_test(external_data=False):
+def onnx_test(external_data=False, opset_imports=None):
     def create_onnx_test(op_test):
         def run_test():
             op_info = op_test()
+            opset_id = [helper.make_operatorsetid('', opset_imports)] if opset_imports is not None else None
+
             if len(op_info) > 3:
                 graph_def = helper.make_graph(op_info[0],
                                               op_test.__name__,
@@ -45,7 +47,9 @@ def run_test():
                 graph_def = helper.make_graph(op_info[0], op_test.__name__,
                                               op_info[1], op_info[2])
             model_def = helper.make_model(graph_def,
-                                          producer_name=op_test.__name__)
+                                          producer_name=op_test.__name__,
+                                              opset_imports=opset_id
+            )
             onnx.save_model(model_def,
                             '{}.onnx'.format(op_test.__name__),
                             save_as_external_data=external_data,
@@ -10587,8 +10591,26 @@ def rnn_r_3arg_layout_test():
     return ([node], [seq, w, r], [hs, output])
 
 
-@onnx_test()
+@onnx_test(external_data=False, opset_imports=16)
 def roialign_default_test():
+    # The op. ROIAlign had an attribute coordinate_transformation_mode added
+    # as of Onnx opset 16; we make opset-specific test models which give
+    # different default values.
+    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [10, 4, 7, 8])
+    roi = helper.make_tensor_value_info('rois', TensorProto.FLOAT, [8, 4])
+    bi = helper.make_tensor_value_info('batch_ind', TensorProto.INT64, [8])
+    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [8, 4, 1, 1])
+
+    node = onnx.helper.make_node('RoiAlign',
+                                 inputs=['x', 'rois', 'batch_ind'],
+                                 outputs=['y'])
+
+    return ([node], [x, roi, bi], [y])
+
+
+@onnx_test(external_data=False, opset_imports=12)
+def roialign_default_test_12():
+    # Same model as in roialign_default_test() but with an older opset specified
     x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [10, 4, 7, 8])
     roi = helper.make_tensor_value_info('rois', TensorProto.FLOAT, [8, 4])
     bi = helper.make_tensor_value_info('batch_ind', TensorProto.INT64, [8])
diff --git a/test/onnx/roialign_default_test.onnx b/test/onnx/roialign_default_test.onnx
index 5b0165fc093f58fb94b2efcd3c55023ec430af2d..cc47b78b9dfbfa97583e635b7268cee0598a4f51 100644
GIT binary patch
delta 12
TcmX@gc${&<AyyVH1_1#88+rp2

delta 10
RcmX@kc$9I%A!a54VE`3(0|@{C

diff --git a/test/onnx/roialign_default_test_12.onnx b/test/onnx/roialign_default_test_12.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..1747f61ee128d19a5cd5443fbf1ec4f2ee96ac28
GIT binary patch
literal 205
zcmd<!6p|>)&rHn8OwWr?Nli;E%_)g5Ni8mkH#D+Z!N{e^#aO|`0#s4V#hH{?lAIBr
znU^BOSgFJjl%MGc(uG@plr&JYkQkQ;2cwVx7ZV2;5VHU=I}me3i9w7N;^X20%5#9k
mK=M)I$QE&Nad5B;fi!U>0Zpq!G7YR6D8UHAPApsu0z3dznkkt8

literal 0
HcmV?d00001

diff --git a/test/verify/roialign_verify_test.cpp b/test/verify/roialign_verify_test.cpp
new file mode 100644
index 00000000000..e69de29bb2d

From c54c1399fda508ae1060f36ef7383367d4b8929b Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Mon, 7 Oct 2024 23:53:09 +0000
Subject: [PATCH 39/56] add 1 file to previous commit

---
 test/onnx/parse/roialign_default_test.cpp | 27 ++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/test/onnx/parse/roialign_default_test.cpp b/test/onnx/parse/roialign_default_test.cpp
index b4869740a57..ceb4b12ccba 100644
--- a/test/onnx/parse/roialign_default_test.cpp
+++ b/test/onnx/parse/roialign_default_test.cpp
@@ -36,16 +36,33 @@ TEST_CASE(roialign_default_test)
     auto rois = mm->add_parameter("rois", srois);
     auto bi   = mm->add_parameter("batch_ind", sbi);
 
-    // Due to the onnx model using opset 12, the coordinate_transformation_mode should be set to
-    // output_half_pixel
+    // Depending on whether the model was built for opset 16 or earlier, the default
+    // coordinate_transformation_mode is different.  These models had opset specified 
+    // when they were created..
     auto r = mm->add_instruction(
-        migraphx::make_op("roialign", {{"coordinate_transformation_mode", "output_half_pixel"}}),
+        migraphx::make_op("roialign", {{"coordinate_transformation_mode", "half_pixel"}}),
         x,
         rois,
         bi);
     mm->add_return({r});
-
     auto prog = read_onnx("roialign_default_test.onnx");
-
     EXPECT(p == prog);
+
+
+    migraphx::program p_12;
+    auto* mm_12  = p_12.get_main_module();
+    auto x_12    = mm_12->add_parameter("x", sx);
+    auto rois_12 = mm_12->add_parameter("rois", srois);
+    auto bi_12   = mm_12->add_parameter("batch_ind", sbi);
+
+    auto r_12 = mm_12->add_instruction(
+        migraphx::make_op("roialign", {{"coordinate_transformation_mode", "output_half_pixel"}}),
+        x_12,
+        rois_12,
+        bi_12);
+    mm_12->add_return({r_12});
+    auto prog_12 = read_onnx("roialign_default_test_12.onnx");
+    EXPECT(p_12 == prog_12);
+
+
 }

From 6fae7d5716e609a0da335694f331a210c6773ff2 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Tue, 8 Oct 2024 15:23:18 +0000
Subject: [PATCH 40/56] file cleanup

---
 src/program.cpp                           |   1 -
 test/onnx/conv_relu_maxpool_test.onnx     | Bin 329 -> 316 bytes
 test/onnx/parse/roialign_default_test.cpp |  10 ++++------
 test/verify/roialign_verify_test.cpp      |   0
 4 files changed, 4 insertions(+), 7 deletions(-)
 delete mode 100644 test/verify/roialign_verify_test.cpp

diff --git a/src/program.cpp b/src/program.cpp
index f22a3322037..25cb16cc950 100644
--- a/src/program.cpp
+++ b/src/program.cpp
@@ -35,7 +35,6 @@
 #include <migraphx/register_target.hpp>
 #include <migraphx/iterator_for.hpp>
 #include <migraphx/iterator.hpp>
-#include <migraphx/shape.hpp>
 #include <migraphx/algorithm.hpp>
 #include <migraphx/output_iterator.hpp>
 #include <migraphx/make_op.hpp>
diff --git a/test/onnx/conv_relu_maxpool_test.onnx b/test/onnx/conv_relu_maxpool_test.onnx
index 4403d8de5f70722d44e60d46fb26e575d7a959ca..f5bfe4c1514a128bbde7d847205baffbb35763fb 100644
GIT binary patch
delta 52
zcmX@fw1-KYgH=c{IX|x~z9==PR5!IEF}ENm)oQ^+nN|FJC8@<F@o>qBce5F}0M$Se
A)c^nh

delta 65
zcmdnPbdpJigHuQ>IX|x~z9==PG(I=6q98v%C%z=LxWsD3MDbPPI3y=t$YvA<00*)d
Awg3PC

diff --git a/test/onnx/parse/roialign_default_test.cpp b/test/onnx/parse/roialign_default_test.cpp
index ceb4b12ccba..410d7ed62d4 100644
--- a/test/onnx/parse/roialign_default_test.cpp
+++ b/test/onnx/parse/roialign_default_test.cpp
@@ -36,9 +36,9 @@ TEST_CASE(roialign_default_test)
     auto rois = mm->add_parameter("rois", srois);
     auto bi   = mm->add_parameter("batch_ind", sbi);
 
-    // Depending on whether the model was built for opset 16 or earlier, the default
-    // coordinate_transformation_mode is different.  These models had opset specified 
-    // when they were created..
+    // Depending on whether the model was built for Onnx opset 16 or earlier, the default
+    // coordinate_transformation_mode is different.  These model files had explicit opset given
+    // when they were created.
     auto r = mm->add_instruction(
         migraphx::make_op("roialign", {{"coordinate_transformation_mode", "half_pixel"}}),
         x,
@@ -48,7 +48,7 @@ TEST_CASE(roialign_default_test)
     auto prog = read_onnx("roialign_default_test.onnx");
     EXPECT(p == prog);
 
-
+    // Opset 12 program
     migraphx::program p_12;
     auto* mm_12  = p_12.get_main_module();
     auto x_12    = mm_12->add_parameter("x", sx);
@@ -63,6 +63,4 @@ TEST_CASE(roialign_default_test)
     mm_12->add_return({r_12});
     auto prog_12 = read_onnx("roialign_default_test_12.onnx");
     EXPECT(p_12 == prog_12);
-
-
 }
diff --git a/test/verify/roialign_verify_test.cpp b/test/verify/roialign_verify_test.cpp
deleted file mode 100644
index e69de29bb2d..00000000000

From c4565bdf8da3164266dbac2144ba37a0830e3aa5 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Tue, 8 Oct 2024 19:33:06 +0000
Subject: [PATCH 41/56] first-try updates to gpu roialign plus misc. cleanup;
 WIP doesn't pass test_verify

---
 docs/dev/onnx_operators.rst                   |  2 +-
 src/include/migraphx/op/roialign.hpp          |  2 +-
 .../include/migraphx/kernels/roialign.hpp     | 28 +++++++++++--------
 test/onnx/gen_onnx.py                         |  4 +--
 test/verify/test_roialign.cpp                 | 18 ++++++------
 5 files changed, 31 insertions(+), 23 deletions(-)

diff --git a/docs/dev/onnx_operators.rst b/docs/dev/onnx_operators.rst
index fc621b4f894..a87af00e755 100644
--- a/docs/dev/onnx_operators.rst
+++ b/docs/dev/onnx_operators.rst
@@ -697,7 +697,7 @@ Operator Support Matrix
 |                          |           |                 | functions are                |
 |                          |           |                 | not enabled                  |
 +--------------------------+-----------+-----------------+------------------------------+
-| RoiAlign                 | ✅        | FP8, FP16,      |                              |
+| RoiAlign                 | ✅        | FP8, FP16,      |                               |
 |                          |           | FP32, FP64      |                              |
 +--------------------------+-----------+-----------------+------------------------------+
 | Round                    | ✅        | FP8, FP16,      |                              |
diff --git a/src/include/migraphx/op/roialign.hpp b/src/include/migraphx/op/roialign.hpp
index de5a58aaa81..4231f33621f 100644
--- a/src/include/migraphx/op/roialign.hpp
+++ b/src/include/migraphx/op/roialign.hpp
@@ -230,7 +230,7 @@ struct roialign
                 const auto bottom_data   = x.begin();
                 const auto roi_batch_ind = batch_indices[n];
                 // Do not use rounding; this implementation detail is critical
-                float offset                    = (coord_trans_mode == "half_pixel") ? 0.5 : 0.0;
+                const float offset              = (coord_trans_mode == "half_pixel") ? 0.5 : 0.0;
                 std::array<float, 2> roi_starts = {
                     static_cast<float>(roi[roi_s.index({n, 0})] * spatial_scale - offset),
                     static_cast<float>(roi[roi_s.index({n, 1})] * spatial_scale - offset)};
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
index b7d7216c690..92b40028080 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
@@ -24,6 +24,7 @@
 #ifndef MIGRAPHX_GUARD_KERNELS_ROIALIGN_HPP
 #define MIGRAPHX_GUARD_KERNELS_ROIALIGN_HPP
 
+#include <migraphx/kernels/debug.hpp>
 #include <migraphx/kernels/index.hpp>
 #include <migraphx/kernels/dfor.hpp>
 #include <migraphx/kernels/ops.hpp>
@@ -87,18 +88,19 @@ MIGRAPHX_DEVICE_CONSTEXPR typename Iterator::value_type bilinear_interpolate(
             xy[ii] = high[ii] = low[ii] = dims[ii] - 1;
         }
     }
-    array<index_int, 4> locs = {low[0] * dims[1] + low[1],
-                                low[0] * dims[1] + high[1],
-                                high[0] * dims[1] + low[1],
-                                high[0] * dims[1] + high[1]};
+    array<index_int, 4> locs = {low[1] * dims[0] + low[0],
+                                low[1] * dims[0] + high[0],
+                                high[1] * dims[0] + low[0],
+                                high[1] * dims[0] + high[0]};
 
-    float ly = xy[0] - low[0];
-    float lx = xy[1] - low[1];
+    float lx = xy[0] - low[0];
+    float ly = xy[1] - low[1];
     float hy = 1.0f - ly;
     float hx = 1.0f - lx;
     // do calculations in floating point and convert final result to required type
     array<float, 4> ws = {hy * hx, hy * lx, ly * hx, ly * lx};
 
+    // todo:  Should we change the order of these indices?
     auto v01 = pooling(data[locs[0]] * ws[0], data[locs[1]] * ws[1]);
     auto v23 = pooling(data[locs[2]] * ws[2], data[locs[3]] * ws[3]);
     return implicit_conversion(pooling(v01, v23));
@@ -177,12 +179,15 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, W& y_t,
         const auto offset_rois = rois + (n * roi_column_num);
         const int batch_ind    = ind[n];
 
+        // todo:  did roi_offset get initialized to -0.5 in src/targets/gpu/jit/roialign.cpp?
         array<float, 2> roi_starts = {
-            static_cast<float>(offset_rois[1]) * static_cast<float>(s.spatial_scale),
-            static_cast<float>(offset_rois[0]) * static_cast<float>(s.spatial_scale)};
+            static_cast<float>(offset_rois[0]) * static_cast<float>(s.spatial_scale) - s.roi_offset,
+            static_cast<float>(offset_rois[1]) * static_cast<float>(s.spatial_scale) -
+                s.roi_offset};
         array<float, 2> roi_ends = {
-            static_cast<float>(offset_rois[3]) * static_cast<float>(s.spatial_scale),
-            static_cast<float>(offset_rois[2]) * static_cast<float>(s.spatial_scale)};
+            static_cast<float>(offset_rois[2]) * static_cast<float>(s.spatial_scale) - s.roi_offset,
+            static_cast<float>(offset_rois[3]) * static_cast<float>(s.spatial_scale) -
+                s.roi_offset};
 
         array<float, 2> roi_size{};
         array<float, 2> bin_size{};
@@ -191,7 +196,8 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, W& y_t,
         for(index_int ii = 0; ii < roi_size.size(); ++ii)
         {
             roi_size[ii] = roi_ends[ii] - roi_starts[ii];
-            roi_size[ii] = migraphx::max(roi_size[ii], 1.0f);
+            if(s.roi_offset == 0.f)
+                roi_size[ii] = migraphx::max(roi_size[ii], 1.0f);
 
             bin_size[ii]      = roi_size[ii] / out_dims[ii];
             bin_grid_size[ii] = (s.sampling_ratio > 0)
diff --git a/test/onnx/gen_onnx.py b/test/onnx/gen_onnx.py
index 27cc178dd96..bc5639d0424 100644
--- a/test/onnx/gen_onnx.py
+++ b/test/onnx/gen_onnx.py
@@ -10591,7 +10591,7 @@ def rnn_r_3arg_layout_test():
     return ([node], [seq, w, r], [hs, output])
 
 
-@onnx_test(external_data=False, opset_imports=16)
+@onnx_test(opset_imports=16)
 def roialign_default_test():
     # The op. ROIAlign had an attribute coordinate_transformation_mode added
     # as of Onnx opset 16; we make opset-specific test models which give
@@ -10608,7 +10608,7 @@ def roialign_default_test():
     return ([node], [x, roi, bi], [y])
 
 
-@onnx_test(external_data=False, opset_imports=12)
+@onnx_test(opset_imports=12)
 def roialign_default_test_12():
     # Same model as in roialign_default_test() but with an older opset specified
     x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [10, 4, 7, 8])
diff --git a/test/verify/test_roialign.cpp b/test/verify/test_roialign.cpp
index 6314491e10d..9a0f706c93d 100644
--- a/test/verify/test_roialign.cpp
+++ b/test/verify/test_roialign.cpp
@@ -44,14 +44,16 @@ struct test_roialign : verify_program<test_roialign<DType>>
         auto x   = mm->add_parameter("x", x_s);
         auto roi = mm->add_parameter("roi", roi_s);
         auto ind = mm->add_literal(migraphx::literal(ind_s, ind_vec));
-        auto r   = mm->add_instruction(migraphx::make_op("roialign",
-                                                         {{"spatial_scale", 1.0},
-                                                          {"output_height", 5},
-                                                          {"output_width", 5},
-                                                          {"sampling_ratio", 2}}),
-                                     x,
-                                     roi,
-                                     ind);
+        auto r   = mm->add_instruction(
+            migraphx::make_op("roialign",
+                                {{"spatial_scale", 1.1},
+                                 {"output_height", 5},
+                                 {"output_width", 3},
+                                 {"sampling_ratio", 2},
+                                 {"coordinate_transformation_mode", "half_pixel"}}),
+            x,
+            roi,
+            ind);
         mm->add_return({r});
 
         return p;

From 3978d410f1e717c95937cc7dff76aa8bb204cd68 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Wed, 9 Oct 2024 22:28:57 +0000
Subject: [PATCH 42/56] work in progress

---
 src/include/migraphx/op/roialign.hpp          | 15 +++++
 .../include/migraphx/kernels/roialign.hpp     | 67 ++++++++++++++-----
 test/verify/test_roialign.cpp                 | 46 +++++++++++--
 3 files changed, 107 insertions(+), 21 deletions(-)

diff --git a/src/include/migraphx/op/roialign.hpp b/src/include/migraphx/op/roialign.hpp
index 4231f33621f..a201d443bc1 100644
--- a/src/include/migraphx/op/roialign.hpp
+++ b/src/include/migraphx/op/roialign.hpp
@@ -135,22 +135,30 @@ struct roialign
 
             for(auto ii : range(p.size()))
             {
+// printf(" ttttt roi_start[%d] = %f  p=%lu bin_size = %f   i[%d] + .5f = %f  bin_grid_size = %lu\n", ii, 
+// roi_start[ii], p[ii], bin_size[ii], ii, (i[ii] + .5f), bin_grid_size[ii]);
+                
                 xy[ii] = roi_start[ii] + p[ii] * bin_size[ii] +
                          (i[ii] + .5f) * bin_size[ii] / bin_grid_size[ii];
+// printf(" uuuuu    xy[%d]:   %f\n", ii, xy[ii]);
                 if(xy[ii] < -1.0 or xy[ii] > dims[ii])
                 {
                     results[index] = pos_weight{};
+// printf(" vvvvv    xy[%d]:   %f\n", ii, xy[ii]);
                     return;
                 }
 
                 xy[ii]   = std::max(xy[ii], 0.0f);
+// printf(" wwwww    xy[%d]:   %f\n", ii, xy[ii]);
                 low[ii]  = xy[ii];
                 high[ii] = low[ii] + 1;
                 if(low[ii] >= dims[ii] - 1)
                 {
                     xy[ii] = high[ii] = low[ii] = dims[ii] - 1;
+// printf(" xxxxx    xy[%d]:   %f\n", ii, xy[ii]);
                 }
             }
+printf(" fufufu   xy:   %f, %f    index %d\n",  xy[0],  xy[1], index);
             results[index].pos = {low[1] * dims[0] + low[0],
                                   low[1] * dims[0] + high[0],
                                   high[1] * dims[0] + low[0],
@@ -226,6 +234,8 @@ struct roialign
 
         visit_all(result, args.at(0), args.at(1))([&](auto output, auto x, auto roi) {
             const auto* batch_indices = args.at(2).cast<int64_t>();
+// printf(" UUUUU roi = %f, %f  roi_s.index(0, 0) = %zu   roi_s.index(0, 1) = %zu   (1, 0)=%zu    (1, 1)=%zu\n", roi[0], roi[1], 
+// roi_s.index({0, 0}), roi_s.index({0, 1}), roi_s.index({1, 0}), roi_s.index({1, 1}))   ;         
             par_for(n_rois, [&](auto n) {
                 const auto bottom_data   = x.begin();
                 const auto roi_batch_ind = batch_indices[n];
@@ -238,6 +248,7 @@ struct roialign
                     static_cast<float>(roi[roi_s.index({n, 2})] * spatial_scale - offset),
                     static_cast<float>(roi[roi_s.index({n, 3})] * spatial_scale - offset)};
 
+
                 // Force malformed ROIs to be 1x1, output_half_pixel transform mode
                 std::array<float, 2> roi_size{};
                 std::array<float, 2> bin_size{};
@@ -274,6 +285,9 @@ struct roialign
                     const auto offset_bottom_data =
                         bottom_data + static_cast<int64_t>((roi_batch_ind * channels + c) *
                                                            in_dims[0] * in_dims[1]);
+// std::cout << " VVVVV offset_bottom_data: "  << offset_bottom_data[0] << "\n" ;
+// std::cout << " WWWWW offset_bottom_data_asdf: "  << static_cast<int64_t>((roi_batch_ind * channels + c) *
+//                                                            in_dims[0] * in_dims[1]) << "\n"   ;
                     double output_val;
                     std::tie(output_val, vec_index[c]) =
                         (mode == migraphx::op::pooling_mode::average)
@@ -287,6 +301,7 @@ struct roialign
                                                  pre_calc,
                                                  vec_index[c],
                                                  max_pool{});
+// printf(" XXXXX output_val: %f  \n", output_val)                                                 ;
                     output(n, c, ph, pw) = output_val;
                 });
             });
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
index 92b40028080..5f12b74d714 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
@@ -25,6 +25,7 @@
 #define MIGRAPHX_GUARD_KERNELS_ROIALIGN_HPP
 
 #include <migraphx/kernels/debug.hpp>
+#include <migraphx/kernels/print.hpp>
 #include <migraphx/kernels/index.hpp>
 #include <migraphx/kernels/dfor.hpp>
 #include <migraphx/kernels/ops.hpp>
@@ -75,34 +76,47 @@ MIGRAPHX_DEVICE_CONSTEXPR typename Iterator::value_type bilinear_interpolate(
     array<int, 2> high{};
     for(index_int ii = 0; ii < xy.size(); ++ii)
     {
+        println_once(" fffff xy: ", xy[ii]);
         if(xy[ii] < -1.0f or xy[ii] > dims[ii])
         {
+        println_once(" ggggg xy: ", xy[ii]);
             return implicit_conversion(0);
         }
 
         xy[ii]   = migraphx::max(xy[ii], 0.0f);
+        println_once(" hhhhh xy: ", xy[ii]);
         low[ii]  = xy[ii];
         high[ii] = low[ii] + 1;
         if(low[ii] >= dims[ii] - 1)
         {
             xy[ii] = high[ii] = low[ii] = dims[ii] - 1;
+    println_once(" iiiii xy: ", xy[ii]);
         }
+        println_once(" FFFFF xy: ", xy[ii]);
     }
-    array<index_int, 4> locs = {low[1] * dims[0] + low[0],
+println(" FUFUFU xy: ", xy);    
+    array<index_int, 4> locs = {low[1] * dims[0] + low[0],  // new
                                 low[1] * dims[0] + high[0],
                                 high[1] * dims[0] + low[0],
                                 high[1] * dims[0] + high[0]};
-
-    float lx = xy[0] - low[0];
-    float ly = xy[1] - low[1];
+// array<index_int, 4> locs = {low[0] * dims[1] + low[1],  //old
+//                                 low[0] * dims[1] + high[1],
+//                                 high[0] * dims[1] + low[1],
+//                                 high[0] * dims[1] + high[1]};
+    // float lx = xy[0] - low[0];  // new
+    // float ly = xy[1] - low[1];
+    float ly = xy[0] - low[0];
+    float lx = xy[1] - low[1];
     float hy = 1.0f - ly;
     float hx = 1.0f - lx;
     // do calculations in floating point and convert final result to required type
     array<float, 4> ws = {hy * hx, hy * lx, ly * hx, ly * lx};
 
     // todo:  Should we change the order of these indices?
-    auto v01 = pooling(data[locs[0]] * ws[0], data[locs[1]] * ws[1]);
-    auto v23 = pooling(data[locs[2]] * ws[2], data[locs[3]] * ws[3]);
+    // auto v01 = pooling(data[locs[0]] * ws[0], data[locs[1]] * ws[1]);
+    // auto v23 = pooling(data[locs[2]] * ws[2], data[locs[3]] * ws[3]);
+    auto v01 = pooling(data[locs[1]] * ws[1], data[locs[0]] * ws[0]);
+    auto v23 = pooling(data[locs[3]] * ws[3], data[locs[2]] * ws[2]);
     return implicit_conversion(pooling(v01, v23));
 }
 
@@ -121,9 +135,18 @@ MIGRAPHX_DEVICE_CONSTEXPR auto calc_pooling(const Iterator& data,
     const int64_t count = bin_grid_size[0] * bin_grid_size[1];
     dfor(bin_grid_size[0], bin_grid_size[1])([&](auto iy, auto ix) {
         array<index_int, 2> id = {iy, ix};
+println_once(" eeeee roi_starts: ",  roi_starts);
+println(" eeeee idx: ",  idx);
+println_once(" eeeee bin_size: ",  bin_size);
+println_once(" eeeee (id + 0.5f): ",  (id + 0.5f));
+println_once(" eeeee bin_grid_size: ",  bin_grid_size);
+println_once(" eeeee roi_offset: ",  roi_offset);
+        // array<float, 2> locs =
+        //     roi_starts + idx * bin_size + bin_size * (id + 0.5f) / bin_grid_size + roi_offset;    // old
         array<float, 2> locs =
-            roi_starts + idx * bin_size + bin_size * (id + 0.5f) / bin_grid_size + roi_offset;
-
+            roi_starts + idx * bin_size + bin_size * (id + 0.5f) / bin_grid_size;       // new
+print(" EEEEE locs: ", locs);
+println("", "");
         auto val   = bilinear_interpolate(data, dims, locs, op);
         output_val = op(output_val, val);
     });
@@ -179,20 +202,26 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, W& y_t,
         const auto offset_rois = rois + (n * roi_column_num);
         const int batch_ind    = ind[n];
 
-        // todo:  did roi_offset get initialized to -0.5 in src/targets/gpu/jit/roialign.cpp?
+        // Note that roi_offset in src/targets/gpu/jit/roialign.cpp uses a negative value, so we add it here
+println_once(" AAAAA s.roi_offset: ", s.roi_offset);
         array<float, 2> roi_starts = {
-            static_cast<float>(offset_rois[0]) * static_cast<float>(s.spatial_scale) - s.roi_offset,
-            static_cast<float>(offset_rois[1]) * static_cast<float>(s.spatial_scale) -
+            static_cast<float>(offset_rois[0]) * static_cast<float>(s.spatial_scale) + s.roi_offset,
+            static_cast<float>(offset_rois[1]) * static_cast<float>(s.spatial_scale) +
                 s.roi_offset};
+// static_cast<float>(offset_rois[1]) * static_cast<float>(s.spatial_scale),
+//             static_cast<float>(offset_rois[0]) * static_cast<float>(s.spatial_scale)};
+
         array<float, 2> roi_ends = {
-            static_cast<float>(offset_rois[2]) * static_cast<float>(s.spatial_scale) - s.roi_offset,
-            static_cast<float>(offset_rois[3]) * static_cast<float>(s.spatial_scale) -
+            static_cast<float>(offset_rois[2]) * static_cast<float>(s.spatial_scale) + s.roi_offset,
+            static_cast<float>(offset_rois[3]) * static_cast<float>(s.spatial_scale) +
                 s.roi_offset};
-
+            // static_cast<float>(offset_rois[3]) * static_cast<float>(s.spatial_scale),
+            // static_cast<float>(offset_rois[2]) * static_cast<float>(s.spatial_scale)};
         array<float, 2> roi_size{};
         array<float, 2> bin_size{};
         array<index_int, 2> bin_grid_size{};
 
+
         for(index_int ii = 0; ii < roi_size.size(); ++ii)
         {
             roi_size[ii] = roi_ends[ii] - roi_starts[ii];
@@ -204,8 +233,9 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, W& y_t,
                                     ? s.sampling_ratio
                                     : migraphx::ceil(roi_size[ii] / out_dims[ii]);
         }
-
+// const auto offset_asdf = ((batch_ind * channel_num + c) * in_dims[0] * in_dims[1]);
         const auto offset_x = x + ((batch_ind * channel_num + c) * in_dims[0] * in_dims[1]);
+// println_once(" CCCCC offset_asdf: ", offset_asdf);
         if constexpr(s.is_avg_pooling)
         {
             y_t[i] = calc_pooling(offset_x,
@@ -216,6 +246,10 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, W& y_t,
                                   in_dims,
                                   s.roi_offset,
                                   avg_pool{});
+// println_once(" ddddd roi_starts[0]:  ", roi_starts[0]);   looks good here
+// println_once(" ddddd1 roi_starts[1]:  ", roi_starts[1]);
+// print(" DDDDD  i: ",  i)  ;
+// println("   y_t[i]: ",  y_t[i])   ;  // these are all y_t[i]:  0.500000   make sense?
         }
         else
         {
@@ -227,6 +261,9 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, W& y_t,
                                   in_dims,
                                   s.roi_offset,
                                   max_pool{});
+println(" EEEEE  i: ",  i)  ;//  EEEEE locs:  -0.805208 EEEEE locs:  -0.805208 EEEEE locs:  -0.805208 EEEEE locs:  -0.805208 EEEEE locs:  -0.805208 EEEEE locs:  -0.576042 EEEEE locs:  -0.576042 EEEEE locs:  -0.576042 EEEEE locs:  -0.576042 EEEEE locs:  -0.576042 EEEEE locs:  -0.346875 EEEEE locs:  -0.346875 EEEEE locs:  -0.346875 EEEEE locs:  -0.346875 EEEEE locs:  -0.346875,   -0.212812,   -0.364062,   -0.515312,   -0.666562,   -0.817813,   -0.212812,   -0.364062,   -0.515312,   -0.666562,   -0.817813,   -0.212812,   -0.364062,   -0.515312,   -0.666562,   -0.817813 FFFFF xy:  0.000000 
+
+print("   y_t[i]: ",  y_t[i])   ;
         }
     }
 }
diff --git a/test/verify/test_roialign.cpp b/test/verify/test_roialign.cpp
index 9a0f706c93d..d9a93a8db05 100644
--- a/test/verify/test_roialign.cpp
+++ b/test/verify/test_roialign.cpp
@@ -27,6 +27,39 @@
 #include <migraphx/generate.hpp>
 #include <migraphx/make_op.hpp>
 
+template <migraphx::shape::type_t DType>
+struct test_roialign_half_pixel : verify_program<test_roialign_half_pixel<DType>>
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program p;
+        auto* mm = p.get_main_module();
+        migraphx::shape x_s{DType, {1, 1, 2, 2}};
+
+        migraphx::shape roi_s{DType, {1, 4}};
+
+        migraphx::shape ind_s{migraphx::shape::int64_type, {1}};
+        std::vector<int64_t> ind_vec = {0};
+
+        auto x   = mm->add_parameter("x", x_s);
+        auto roi = mm->add_parameter("roi", roi_s);
+        auto ind = mm->add_literal(migraphx::literal(ind_s, ind_vec));
+        auto r   = mm->add_instruction(
+            migraphx::make_op("roialign",
+                                {{"spatial_scale", 1.1},
+                                 {"output_height", 5},
+                                 {"output_width", 3},
+                                 {"sampling_ratio", 2},
+                                 {"coordinate_transformation_mode", "half_pixel"}}),
+            x,
+            roi,
+            ind);
+        mm->add_return({r});
+
+        return p;
+    }
+};
+
 template <migraphx::shape::type_t DType>
 struct test_roialign : verify_program<test_roialign<DType>>
 {
@@ -48,9 +81,9 @@ struct test_roialign : verify_program<test_roialign<DType>>
             migraphx::make_op("roialign",
                                 {{"spatial_scale", 1.1},
                                  {"output_height", 5},
-                                 {"output_width", 3},
+                                 {"output_width", 2},
                                  {"sampling_ratio", 2},
-                                 {"coordinate_transformation_mode", "half_pixel"}}),
+                                 {"coordinate_transformation_mode", "output_half_pixel"}}),
             x,
             roi,
             ind);
@@ -60,8 +93,9 @@ struct test_roialign : verify_program<test_roialign<DType>>
     }
 };
 
+template struct test_roialign_half_pixel<migraphx::shape::float_type>;
 template struct test_roialign<migraphx::shape::float_type>;
-template struct test_roialign<migraphx::shape::half_type>;
-template struct test_roialign<migraphx::shape::fp8e4m3fnuz_type>;
-template struct test_roialign<migraphx::shape::fp8e4m3fn_type>;
-template struct test_roialign<migraphx::shape::fp8e5m2_type>;
+// template struct test_roialign<migraphx::shape::half_type>;  commented out for debug
+// template struct test_roialign<migraphx::shape::fp8e4m3fnuz_type>;
+// template struct test_roialign<migraphx::shape::fp8e4m3fn_type>;
+// template struct test_roialign<migraphx::shape::fp8e5m2_type>;

From d6dd2e1cd4fa10f3d5193bd41d33e937925ac859 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Wed, 9 Oct 2024 22:47:24 +0000
Subject: [PATCH 43/56] work in progress

---
 src/include/migraphx/op/roialign.hpp              |  4 ++--
 .../kernels/include/migraphx/kernels/roialign.hpp | 15 ++++++++++-----
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/src/include/migraphx/op/roialign.hpp b/src/include/migraphx/op/roialign.hpp
index a201d443bc1..38120c99918 100644
--- a/src/include/migraphx/op/roialign.hpp
+++ b/src/include/migraphx/op/roialign.hpp
@@ -158,7 +158,7 @@ struct roialign
 // printf(" xxxxx    xy[%d]:   %f\n", ii, xy[ii]);
                 }
             }
-printf(" fufufu   xy:   %f, %f    index %d\n",  xy[0],  xy[1], index);
+printf(" fufufu   xy:   %f, %f\n",  xy[0],  xy[1]);
             results[index].pos = {low[1] * dims[0] + low[0],
                                   low[1] * dims[0] + high[0],
                                   high[1] * dims[0] + low[0],
@@ -272,7 +272,7 @@ printf(" fufufu   xy:   %f, %f    index %d\n",  xy[0],  xy[1], index);
                 shape comp_s{shape::float_type, comp_lens};
                 auto pre_calc =
                     this->calc_pos_weight(in_dims, comp_s, roi_starts, bin_size, bin_grid_size);
-
+// The array returned here should correspond to the GGGGG and HHHHH values in the GPU
                 std::vector<std::size_t> comp_lens1 = {channels, out_dims[0], out_dims[1]};
                 shape comp_s1{migraphx::shape::float_type, comp_lens1};
                 std::vector<int64_t> vec_index(channels, 0);
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
index 5f12b74d714..6d3f5931285 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
@@ -76,23 +76,23 @@ MIGRAPHX_DEVICE_CONSTEXPR typename Iterator::value_type bilinear_interpolate(
     array<int, 2> high{};
     for(index_int ii = 0; ii < xy.size(); ++ii)
     {
-        println_once(" fffff xy: ", xy[ii]);
+        // println_once(" fffff xy: ", xy[ii]);
         if(xy[ii] < -1.0f or xy[ii] > dims[ii])
         {
-        println_once(" ggggg xy: ", xy[ii]);
+        // println_once(" ggggg xy: ", xy[ii]);
             return implicit_conversion(0);
         }
 
         xy[ii]   = migraphx::max(xy[ii], 0.0f);
-        println_once(" hhhhh xy: ", xy[ii]);
+        // println_once(" hhhhh xy: ", xy[ii]);
         low[ii]  = xy[ii];
         high[ii] = low[ii] + 1;
         if(low[ii] >= dims[ii] - 1)
         {
             xy[ii] = high[ii] = low[ii] = dims[ii] - 1;
-    println_once(" iiiii xy: ", xy[ii]);
+    // println_once(" iiiii xy: ", xy[ii]);
         }
-        println_once(" FFFFF xy: ", xy[ii]);
+        // println_once(" FFFFF xy: ", xy[ii]);
     }
 println(" FUFUFU xy: ", xy);    
     array<index_int, 4> locs = {low[1] * dims[0] + low[0],  // new
@@ -112,6 +112,11 @@ println(" FUFUFU xy: ", xy);
     // do calculations in floating point and convert final result to required type
     array<float, 4> ws = {hy * hx, hy * lx, ly * hx, ly * lx};
 
+    //debug
+     array<float, 2> pooling_input01 = {data[locs[1]] * ws[1], data[locs[0]] * ws[0]};
+     array<float, 2> pooling_input23 = {data[locs[3]] * ws[3], data[locs[2]] * ws[2]};
+println(" GGGGG pooling_input01", pooling_input01);
+println(" HHHHH pooling_input23", pooling_input23);
     // todo:  Should we change the order of these indices?
     // auto v01 = pooling(data[locs[0]] * ws[0], data[locs[1]] * ws[1]);
     // auto v23 = pooling(data[locs[2]] * ws[2], data[locs[3]] * ws[3]);

From d425d5644611e1da1e87d02d27904207b0b5f7b8 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Thu, 10 Oct 2024 21:18:14 +0000
Subject: [PATCH 44/56] work in progress; a lot of debug code

---
 src/include/migraphx/op/roialign.hpp          | 24 +++++-----
 .../include/migraphx/kernels/roialign.hpp     | 45 +++++++++++--------
 2 files changed, 37 insertions(+), 32 deletions(-)

diff --git a/src/include/migraphx/op/roialign.hpp b/src/include/migraphx/op/roialign.hpp
index 38120c99918..05ce5d9a65c 100644
--- a/src/include/migraphx/op/roialign.hpp
+++ b/src/include/migraphx/op/roialign.hpp
@@ -135,30 +135,24 @@ struct roialign
 
             for(auto ii : range(p.size()))
             {
-// printf(" ttttt roi_start[%d] = %f  p=%lu bin_size = %f   i[%d] + .5f = %f  bin_grid_size = %lu\n", ii, 
-// roi_start[ii], p[ii], bin_size[ii], ii, (i[ii] + .5f), bin_grid_size[ii]);
-                
                 xy[ii] = roi_start[ii] + p[ii] * bin_size[ii] +
                          (i[ii] + .5f) * bin_size[ii] / bin_grid_size[ii];
-// printf(" uuuuu    xy[%d]:   %f\n", ii, xy[ii]);
+printf(" FUFUFU index %lu    xy:   (%f, %f)\n", index,  xy[0],  xy[1]);
                 if(xy[ii] < -1.0 or xy[ii] > dims[ii])
                 {
                     results[index] = pos_weight{};
-// printf(" vvvvv    xy[%d]:   %f\n", ii, xy[ii]);
                     return;
                 }
 
                 xy[ii]   = std::max(xy[ii], 0.0f);
-// printf(" wwwww    xy[%d]:   %f\n", ii, xy[ii]);
                 low[ii]  = xy[ii];
                 high[ii] = low[ii] + 1;
                 if(low[ii] >= dims[ii] - 1)
                 {
                     xy[ii] = high[ii] = low[ii] = dims[ii] - 1;
-// printf(" xxxxx    xy[%d]:   %f\n", ii, xy[ii]);
                 }
             }
-printf(" fufufu   xy:   %f, %f\n",  xy[0],  xy[1]);
+printf(" FFFFF index %lu    xy:   (%f, %f)\n", index,  xy[0],  xy[1]);
             results[index].pos = {low[1] * dims[0] + low[0],
                                   low[1] * dims[0] + high[0],
                                   high[1] * dims[0] + low[0],
@@ -202,13 +196,19 @@ printf(" fufufu   xy:   %f, %f\n",  xy[0],  xy[1]);
     {
         double output_val   = op.init();
         const int64_t count = bin_grid_size[0] * bin_grid_size[1];
-        dfor(bin_grid_size[0], bin_grid_size[1])([&](auto, auto) {
+        dfor(bin_grid_size[0], bin_grid_size[1])([&](auto iy, auto ix) {
             const auto& pc = pos_weights[index];
             std::array<double, 4> wv;
+// printf(" HHHHH dfor index: (%lu, %lu)\n", iy, ix);            
+// printf(" GGGGG transform:  ");                    
+printf(" IIIII transform ws:  ");  
             std::transform(
                 pc.w.begin(), pc.w.end(), pc.pos.begin(), wv.begin(), [&](auto w, auto pos) {
+printf(" %f ", w);
+// printf("  %f ", *(data + pos) * w);
                     return *(data + pos) * w;
                 });
+printf("\n");                
             output_val = std::accumulate(wv.begin(), wv.end(), output_val, op);
             index += 1;
         });
@@ -234,8 +234,6 @@ printf(" fufufu   xy:   %f, %f\n",  xy[0],  xy[1]);
 
         visit_all(result, args.at(0), args.at(1))([&](auto output, auto x, auto roi) {
             const auto* batch_indices = args.at(2).cast<int64_t>();
-// printf(" UUUUU roi = %f, %f  roi_s.index(0, 0) = %zu   roi_s.index(0, 1) = %zu   (1, 0)=%zu    (1, 1)=%zu\n", roi[0], roi[1], 
-// roi_s.index({0, 0}), roi_s.index({0, 1}), roi_s.index({1, 0}), roi_s.index({1, 1}))   ;         
             par_for(n_rois, [&](auto n) {
                 const auto bottom_data   = x.begin();
                 const auto roi_batch_ind = batch_indices[n];
@@ -285,9 +283,7 @@ printf(" fufufu   xy:   %f, %f\n",  xy[0],  xy[1]);
                     const auto offset_bottom_data =
                         bottom_data + static_cast<int64_t>((roi_batch_ind * channels + c) *
                                                            in_dims[0] * in_dims[1]);
-// std::cout << " VVVVV offset_bottom_data: "  << offset_bottom_data[0] << "\n" ;
-// std::cout << " WWWWW offset_bottom_data_asdf: "  << static_cast<int64_t>((roi_batch_ind * channels + c) *
-//                                                            in_dims[0] * in_dims[1]) << "\n"   ;
+printf(" KKKKK n, c, ph, pw = %lu %lu %lu %lu\n",  n, c, ph, pw);
                     double output_val;
                     std::tie(output_val, vec_index[c]) =
                         (mode == migraphx::op::pooling_mode::average)
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
index 6d3f5931285..caecf6dd4bc 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
@@ -79,22 +79,20 @@ MIGRAPHX_DEVICE_CONSTEXPR typename Iterator::value_type bilinear_interpolate(
         // println_once(" fffff xy: ", xy[ii]);
         if(xy[ii] < -1.0f or xy[ii] > dims[ii])
         {
-        // println_once(" ggggg xy: ", xy[ii]);
+        // println_once(" g@gggg xy: ", xy[ii]);
             return implicit_conversion(0);
         }
 
         xy[ii]   = migraphx::max(xy[ii], 0.0f);
-        // println_once(" hhhhh xy: ", xy[ii]);
+        // println_once(" h@hhhh xy: ", xy[ii]);
         low[ii]  = xy[ii];
         high[ii] = low[ii] + 1;
         if(low[ii] >= dims[ii] - 1)
         {
             xy[ii] = high[ii] = low[ii] = dims[ii] - 1;
-    // println_once(" iiiii xy: ", xy[ii]);
         }
-        // println_once(" FFFFF xy: ", xy[ii]);
     }
-println(" FUFUFU xy: ", xy);    
+println_once(" fffff xy: ", xy);
     array<index_int, 4> locs = {low[1] * dims[0] + low[0],  // new
                                 low[1] * dims[0] + high[0],
                                 high[1] * dims[0] + low[0],
@@ -110,13 +108,16 @@ println(" FUFUFU xy: ", xy);
     float hy = 1.0f - ly;
     float hx = 1.0f - lx;
     // do calculations in floating point and convert final result to required type
-    array<float, 4> ws = {hy * hx, hy * lx, ly * hx, ly * lx};
+    // array<float, 4> ws = {hy * hx, hy * lx, ly * hx, ly * lx}; //old
+    array<float, 4> ws = {hy * hx, ly * hx, hy * lx, ly * lx};  // new
 
     //debug
-     array<float, 2> pooling_input01 = {data[locs[1]] * ws[1], data[locs[0]] * ws[0]};
-     array<float, 2> pooling_input23 = {data[locs[3]] * ws[3], data[locs[2]] * ws[2]};
-println(" GGGGG pooling_input01", pooling_input01);
-println(" HHHHH pooling_input23", pooling_input23);
+//  array<float, 2> pooling_input01 = {data[locs[1]] * ws[1], data[locs[0]] * ws[0]};
+//  array<float, 2> pooling_input23 = {data[locs[3]] * ws[3], data[locs[2]] * ws[2]};
+// println(" ggggg pooling_input01", pooling_input01);
+// println(" hhhhh pooling_input23", pooling_input23);
+println(" iiiii ws:  ", ws);
+println();
     // todo:  Should we change the order of these indices?
     // auto v01 = pooling(data[locs[0]] * ws[0], data[locs[1]] * ws[1]);
     // auto v23 = pooling(data[locs[2]] * ws[2], data[locs[3]] * ws[3]);
@@ -140,18 +141,24 @@ MIGRAPHX_DEVICE_CONSTEXPR auto calc_pooling(const Iterator& data,
     const int64_t count = bin_grid_size[0] * bin_grid_size[1];
     dfor(bin_grid_size[0], bin_grid_size[1])([&](auto iy, auto ix) {
         array<index_int, 2> id = {iy, ix};
+println_once(" hhhhh id: ", id); 
+(void) roi_offset;
+println("How does locs increment?  12 steps in idx = 1 step in ref version", "");    
 println_once(" eeeee roi_starts: ",  roi_starts);
 println(" eeeee idx: ",  idx);
 println_once(" eeeee bin_size: ",  bin_size);
 println_once(" eeeee (id + 0.5f): ",  (id + 0.5f));
 println_once(" eeeee bin_grid_size: ",  bin_grid_size);
+array<float, 2> zap = idx * bin_size;
+println("idx * bin_size: ", zap);
+array<float, 2> zap2 = bin_size * (id + 0.5f) / bin_grid_size;
+println("(id + 0.5f) / bin_grid_size: ", zap2);
 println_once(" eeeee roi_offset: ",  roi_offset);
         // array<float, 2> locs =
-        //     roi_starts + idx * bin_size + bin_size * (id + 0.5f) / bin_grid_size + roi_offset;    // old
+        //     roi_starts + idx * bin_size + bin_size * (id + 0.5f) / bin_grid_size + roi_offset;    // old leads to all 0's
         array<float, 2> locs =
             roi_starts + idx * bin_size + bin_size * (id + 0.5f) / bin_grid_size;       // new
-print(" EEEEE locs: ", locs);
-println("", "");
+println(" eeeeeEEE locs: ", locs);
         auto val   = bilinear_interpolate(data, dims, locs, op);
         output_val = op(output_val, val);
     });
@@ -208,7 +215,8 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, W& y_t,
         const int batch_ind    = ind[n];
 
         // Note that roi_offset in src/targets/gpu/jit/roialign.cpp uses a negative value, so we add it here
-println_once(" AAAAA s.roi_offset: ", s.roi_offset);
+println(" aaaaa idx: ", idx);
+// println("   out_dims ", out_lens);
         array<float, 2> roi_starts = {
             static_cast<float>(offset_rois[0]) * static_cast<float>(s.spatial_scale) + s.roi_offset,
             static_cast<float>(offset_rois[1]) * static_cast<float>(s.spatial_scale) +
@@ -238,9 +246,11 @@ println_once(" AAAAA s.roi_offset: ", s.roi_offset);
                                     ? s.sampling_ratio
                                     : migraphx::ceil(roi_size[ii] / out_dims[ii]);
         }
-// const auto offset_asdf = ((batch_ind * channel_num + c) * in_dims[0] * in_dims[1]);
+array<int, 4> zap = {n, c, ph, pw};
+
+println(" kkkkk n, c, ph, pw: ", zap);
+
         const auto offset_x = x + ((batch_ind * channel_num + c) * in_dims[0] * in_dims[1]);
-// println_once(" CCCCC offset_asdf: ", offset_asdf);
         if constexpr(s.is_avg_pooling)
         {
             y_t[i] = calc_pooling(offset_x,
@@ -266,9 +276,8 @@ println_once(" AAAAA s.roi_offset: ", s.roi_offset);
                                   in_dims,
                                   s.roi_offset,
                                   max_pool{});
-println(" EEEEE  i: ",  i)  ;//  EEEEE locs:  -0.805208 EEEEE locs:  -0.805208 EEEEE locs:  -0.805208 EEEEE locs:  -0.805208 EEEEE locs:  -0.805208 EEEEE locs:  -0.576042 EEEEE locs:  -0.576042 EEEEE locs:  -0.576042 EEEEE locs:  -0.576042 EEEEE locs:  -0.576042 EEEEE locs:  -0.346875 EEEEE locs:  -0.346875 EEEEE locs:  -0.346875 EEEEE locs:  -0.346875 EEEEE locs:  -0.346875,   -0.212812,   -0.364062,   -0.515312,   -0.666562,   -0.817813,   -0.212812,   -0.364062,   -0.515312,   -0.666562,   -0.817813,   -0.212812,   -0.364062,   -0.515312,   -0.666562,   -0.817813 FFFFF xy:  0.000000 
 
-print("   y_t[i]: ",  y_t[i])   ;
+// print("   y_t[i]: ",  y_t[i])   ;
         }
     }
 }

From 682b6532db5234d2c22aaf4370668811b835bb3a Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Fri, 11 Oct 2024 20:01:01 +0000
Subject: [PATCH 45/56] work in progress, gpu kernel closer to correct.  Gives
 correct results but in mixed up order.

---
 src/include/migraphx/op/roialign.hpp          | 43 +++++++++----
 .../include/migraphx/kernels/roialign.hpp     | 64 ++++++-------------
 test/verify/test_roialign.cpp                 |  2 +-
 3 files changed, 51 insertions(+), 58 deletions(-)

diff --git a/src/include/migraphx/op/roialign.hpp b/src/include/migraphx/op/roialign.hpp
index 05ce5d9a65c..3e6e31d9524 100644
--- a/src/include/migraphx/op/roialign.hpp
+++ b/src/include/migraphx/op/roialign.hpp
@@ -126,18 +126,28 @@ struct roialign
             // order.  The i[x] value is least significant and iterates the fastest.
             std::array<std::size_t, 2> p = {idx_v[1], idx_v[0]};
             std::array<std::size_t, 2> i = {idx_v[3], idx_v[2]}; // these are always equal
+printf(" EEEEE p, i-index   %lu  %lu  %lu %lu    ( %lu  %lu  %lu %lu)\n", p[0], p[1], i[0], i[1],
+idx_v[0], idx_v[1], idx_v[2], idx_v[3]);
+
+
+
+
+
             // xy is scaled coordinates of start point of ROI
             std::array<float, 2> xy{};
             // low, high are floor and ceiling of the xy value (i.e. the bounds of the pixel it lies
             // inside) from which we will interpolate.
             std::array<int64_t, 2> low{};
             std::array<int64_t, 2> high{};
-
+float asdf=-1.f;
             for(auto ii : range(p.size()))
             {
                 xy[ii] = roi_start[ii] + p[ii] * bin_size[ii] +
                          (i[ii] + .5f) * bin_size[ii] / bin_grid_size[ii];
-printf(" FUFUFU index %lu    xy:   (%f, %f)\n", index,  xy[0],  xy[1]);
+// initial calculated values, before adjustments
+if(ii == 0 ) asdf = xy[0];
+if(ii == 1)
+printf(" IIIII index %lu    xy:   (%f, %f)\n", index,  asdf,  xy[1]);
                 if(xy[ii] < -1.0 or xy[ii] > dims[ii])
                 {
                     results[index] = pos_weight{};
@@ -152,7 +162,7 @@ printf(" FUFUFU index %lu    xy:   (%f, %f)\n", index,  xy[0],  xy[1]);
                     xy[ii] = high[ii] = low[ii] = dims[ii] - 1;
                 }
             }
-printf(" FFFFF index %lu    xy:   (%f, %f)\n", index,  xy[0],  xy[1]);
+// printf(" FFFFF index %lu    xy:   (%f, %f)\n", index,  xy[0],  xy[1]);
             results[index].pos = {low[1] * dims[0] + low[0],
                                   low[1] * dims[0] + high[0],
                                   high[1] * dims[0] + low[0],
@@ -164,6 +174,12 @@ printf(" FFFFF index %lu    xy:   (%f, %f)\n", index,  xy[0],  xy[1]);
             float hx = 1.0f - lx;
             // save weights and indices
             results[index].w = {hy * hx, hy * lx, ly * hx, ly * lx};
+    printf(" AAAAA index %lu results.w:  %f, %f, %f, %f\n", index, 
+    results[index].w[0],
+    results[index].w[1],
+    results[index].w[2],
+    results[index].w[3]
+            );
         });
         return results;
     }
@@ -191,24 +207,19 @@ printf(" FFFFF index %lu    xy:   (%f, %f)\n", index,  xy[0],  xy[1]);
     std::tuple<double, int64_t> calc_pooling(const T& data,
                                              const std::array<std::size_t, 2>& bin_grid_size,
                                              const std::vector<pos_weight>& pos_weights,
-                                             int64_t index,
+                                             int64_t index,  // index to c
                                              Op op) const
     {
         double output_val   = op.init();
         const int64_t count = bin_grid_size[0] * bin_grid_size[1];
         dfor(bin_grid_size[0], bin_grid_size[1])([&](auto iy, auto ix) {
+printf(" IIIIIKKKKK  iy, ix, index =      %lu  %lu  %ld\n", iy, ix, index );
             const auto& pc = pos_weights[index];
-            std::array<double, 4> wv;
-// printf(" HHHHH dfor index: (%lu, %lu)\n", iy, ix);            
-// printf(" GGGGG transform:  ");                    
-printf(" IIIII transform ws:  ");  
+            std::array<double, 4> wv; 
             std::transform(
                 pc.w.begin(), pc.w.end(), pc.pos.begin(), wv.begin(), [&](auto w, auto pos) {
-printf(" %f ", w);
-// printf("  %f ", *(data + pos) * w);
                     return *(data + pos) * w;
                 });
-printf("\n");                
             output_val = std::accumulate(wv.begin(), wv.end(), output_val, op);
             index += 1;
         });
@@ -270,7 +281,7 @@ printf("\n");
                 shape comp_s{shape::float_type, comp_lens};
                 auto pre_calc =
                     this->calc_pos_weight(in_dims, comp_s, roi_starts, bin_size, bin_grid_size);
-// The array returned here should correspond to the GGGGG and HHHHH values in the GPU
+
                 std::vector<std::size_t> comp_lens1 = {channels, out_dims[0], out_dims[1]};
                 shape comp_s1{migraphx::shape::float_type, comp_lens1};
                 std::vector<int64_t> vec_index(channels, 0);
@@ -280,11 +291,15 @@ printf("\n");
                     auto ph = idx[1];
                     auto pw = idx[2];
 
+// n anc c are 0 because that's the size of the test case
+printf(" IIIII n, c, ph, pw =                       %lu %lu    %lu  %lu\n", n, c, ph, pw);
+
                     const auto offset_bottom_data =
                         bottom_data + static_cast<int64_t>((roi_batch_ind * channels + c) *
                                                            in_dims[0] * in_dims[1]);
-printf(" KKKKK n, c, ph, pw = %lu %lu %lu %lu\n",  n, c, ph, pw);
+
                     double output_val;
+printf(" IIIIIc vec_index[c] = %ld\n", vec_index[c]);
                     std::tie(output_val, vec_index[c]) =
                         (mode == migraphx::op::pooling_mode::average)
                             ? this->calc_pooling(offset_bottom_data,
@@ -297,7 +312,7 @@ printf(" KKKKK n, c, ph, pw = %lu %lu %lu %lu\n",  n, c, ph, pw);
                                                  pre_calc,
                                                  vec_index[c],
                                                  max_pool{});
-// printf(" XXXXX output_val: %f  \n", output_val)                                                 ;
+printf(" DDDDD %f\n", output_val);                                                 
                     output(n, c, ph, pw) = output_val;
                 });
             });
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
index caecf6dd4bc..9d60e705f0f 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
@@ -76,15 +76,12 @@ MIGRAPHX_DEVICE_CONSTEXPR typename Iterator::value_type bilinear_interpolate(
     array<int, 2> high{};
     for(index_int ii = 0; ii < xy.size(); ++ii)
     {
-        // println_once(" fffff xy: ", xy[ii]);
         if(xy[ii] < -1.0f or xy[ii] > dims[ii])
         {
-        // println_once(" g@gggg xy: ", xy[ii]);
             return implicit_conversion(0);
         }
 
         xy[ii]   = migraphx::max(xy[ii], 0.0f);
-        // println_once(" h@hhhh xy: ", xy[ii]);
         low[ii]  = xy[ii];
         high[ii] = low[ii] + 1;
         if(low[ii] >= dims[ii] - 1)
@@ -92,40 +89,25 @@ MIGRAPHX_DEVICE_CONSTEXPR typename Iterator::value_type bilinear_interpolate(
             xy[ii] = high[ii] = low[ii] = dims[ii] - 1;
         }
     }
-println_once(" fffff xy: ", xy);
     array<index_int, 4> locs = {low[1] * dims[0] + low[0],  // new
                                 low[1] * dims[0] + high[0],
                                 high[1] * dims[0] + low[0],
                                 high[1] * dims[0] + high[0]};
-// array<index_int, 4> locs = {low[0] * dims[1] + low[1],  //old
-//                                 low[0] * dims[1] + high[1],
-//                                 high[0] * dims[1] + low[1],
-//                                 high[0] * dims[1] + high[1]};
-    // float lx = xy[0] - low[0];  // new
-    // float ly = xy[1] - low[1];
-    float ly = xy[0] - low[0];
-    float lx = xy[1] - low[1];
+
+    float lx = xy[0] - low[0];  // new
+    float ly = xy[1] - low[1];
+\
     float hy = 1.0f - ly;
     float hx = 1.0f - lx;
     // do calculations in floating point and convert final result to required type
-    // array<float, 4> ws = {hy * hx, hy * lx, ly * hx, ly * lx}; //old
-    array<float, 4> ws = {hy * hx, ly * hx, hy * lx, ly * lx};  // new
+    array<float, 4> ws = {hy * hx, hy * lx, ly * hx, ly * lx}; //old
 
-    //debug
-//  array<float, 2> pooling_input01 = {data[locs[1]] * ws[1], data[locs[0]] * ws[0]};
-//  array<float, 2> pooling_input23 = {data[locs[3]] * ws[3], data[locs[2]] * ws[2]};
-// println(" ggggg pooling_input01", pooling_input01);
-// println(" hhhhh pooling_input23", pooling_input23);
-println(" iiiii ws:  ", ws);
-println();
-    // todo:  Should we change the order of these indices?
-    // auto v01 = pooling(data[locs[0]] * ws[0], data[locs[1]] * ws[1]);
-    // auto v23 = pooling(data[locs[2]] * ws[2], data[locs[3]] * ws[3]);
     auto v01 = pooling(data[locs[1]] * ws[1], data[locs[0]] * ws[0]);
     auto v23 = pooling(data[locs[3]] * ws[3], data[locs[2]] * ws[2]);
     return implicit_conversion(pooling(v01, v23));
 }
 
+// Calculate a single pooled output value
 template <class Iterator, class Op>
 MIGRAPHX_DEVICE_CONSTEXPR auto calc_pooling(const Iterator& data,
                                             const array<float, 2>& roi_starts,
@@ -136,29 +118,24 @@ MIGRAPHX_DEVICE_CONSTEXPR auto calc_pooling(const Iterator& data,
                                             float roi_offset,
                                             Op op)
 {
+    // for one idx (output height and width coordinates) we iterate through all bin_grid values
     using in_dtype      = typename Iterator::value_type;
     in_dtype output_val = in_dtype{op.init()};
     const int64_t count = bin_grid_size[0] * bin_grid_size[1];
     dfor(bin_grid_size[0], bin_grid_size[1])([&](auto iy, auto ix) {
         array<index_int, 2> id = {iy, ix};
-println_once(" hhhhh id: ", id); 
+println_once(" jjjjj id: ", id); 
 (void) roi_offset;
-println("How does locs increment?  12 steps in idx = 1 step in ref version", "");    
-println_once(" eeeee roi_starts: ",  roi_starts);
+println_once(" jjjjj roi_starts: ",  roi_starts);
 println(" eeeee idx: ",  idx);
-println_once(" eeeee bin_size: ",  bin_size);
-println_once(" eeeee (id + 0.5f): ",  (id + 0.5f));
-println_once(" eeeee bin_grid_size: ",  bin_grid_size);
-array<float, 2> zap = idx * bin_size;
-println("idx * bin_size: ", zap);
-array<float, 2> zap2 = bin_size * (id + 0.5f) / bin_grid_size;
-println("(id + 0.5f) / bin_grid_size: ", zap2);
-println_once(" eeeee roi_offset: ",  roi_offset);
-        // array<float, 2> locs =
-        //     roi_starts + idx * bin_size + bin_size * (id + 0.5f) / bin_grid_size + roi_offset;    // old leads to all 0's
+
         array<float, 2> locs =
             roi_starts + idx * bin_size + bin_size * (id + 0.5f) / bin_grid_size;       // new
-println(" eeeeeEEE locs: ", locs);
+// idx same as ph, pw
+ array<float, 6> asdf_idx = {float(iy),  float(ix), float(idx[0]), float(idx[1]),locs[0], locs[1]};
+// put idx, ix, iy, and locs into a single array to debug together        
+
+println(" iiiii asdf_idx/locs: ", asdf_idx);
         auto val   = bilinear_interpolate(data, dims, locs, op);
         output_val = op(output_val, val);
     });
@@ -192,7 +169,7 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, W& y_t,
     auto channel_num = x_lens[1];
     // input dims of height and width, in all 2-dim arrays, the first dim
     // is for height and second dim is for width
-    array<index_int, 2> in_dims = {x_lens[2], x_lens[3]};
+    array<index_int, 2> in_dims = {x_lens[3], x_lens[2]};
 
     const auto stride   = index.nglobal();
     auto out_s          = y_t.get_shape();
@@ -202,6 +179,7 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, W& y_t,
     // is for height and second dim is for width
     const auto& out_lens         = out_s.lens;
     array<index_int, 2> out_dims = {out_lens[2], out_lens[3]};
+println_once(" aaaaa stride: ", stride);
 
     for(index_int i = index.global; i < out_s.elements(); i += stride)
     {
@@ -215,8 +193,6 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, W& y_t,
         const int batch_ind    = ind[n];
 
         // Note that roi_offset in src/targets/gpu/jit/roialign.cpp uses a negative value, so we add it here
-println(" aaaaa idx: ", idx);
-// println("   out_dims ", out_lens);
         array<float, 2> roi_starts = {
             static_cast<float>(offset_rois[0]) * static_cast<float>(s.spatial_scale) + s.roi_offset,
             static_cast<float>(offset_rois[1]) * static_cast<float>(s.spatial_scale) +
@@ -251,6 +227,8 @@ array<int, 4> zap = {n, c, ph, pw};
 println(" kkkkk n, c, ph, pw: ", zap);
 
         const auto offset_x = x + ((batch_ind * channel_num + c) * in_dims[0] * in_dims[1]);
+array<int, 4> reindex = {n, c, pw, ph};//;;  rearrange the gpu indices to what the ref indices would be
+// and insert that location in y_t        
         if constexpr(s.is_avg_pooling)
         {
             y_t[i] = calc_pooling(offset_x,
@@ -263,8 +241,8 @@ println(" kkkkk n, c, ph, pw: ", zap);
                                   avg_pool{});
 // println_once(" ddddd roi_starts[0]:  ", roi_starts[0]);   looks good here
 // println_once(" ddddd1 roi_starts[1]:  ", roi_starts[1]);
-// print(" DDDDD  i: ",  i)  ;
-// println("   y_t[i]: ",  y_t[i])   ;  // these are all y_t[i]:  0.500000   make sense?
+print(" ddddd  i: ",  i)  ;
+println("   y_t[i]: ",  y_t[i])   ;  // these are all y_t[i]:  0.500000   make sense?
         }
         else
         {
diff --git a/test/verify/test_roialign.cpp b/test/verify/test_roialign.cpp
index d9a93a8db05..c036dbb5e23 100644
--- a/test/verify/test_roialign.cpp
+++ b/test/verify/test_roialign.cpp
@@ -49,7 +49,7 @@ struct test_roialign_half_pixel : verify_program<test_roialign_half_pixel<DType>
                                 {{"spatial_scale", 1.1},
                                  {"output_height", 5},
                                  {"output_width", 3},
-                                 {"sampling_ratio", 2},
+                                 {"sampling_ratio", 3},
                                  {"coordinate_transformation_mode", "half_pixel"}}),
             x,
             roi,

From 4e122bc03a8e4d4f2f0c19787a98c27ffa19515e Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Fri, 11 Oct 2024 23:14:49 +0000
Subject: [PATCH 46/56] work in progress

---
 .../include/migraphx/kernels/roialign.hpp     | 72 +++++++++++++++----
 test/verify/test_roialign.cpp                 |  2 +-
 2 files changed, 58 insertions(+), 16 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
index 9d60e705f0f..80d2bd7bffe 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
@@ -124,10 +124,10 @@ MIGRAPHX_DEVICE_CONSTEXPR auto calc_pooling(const Iterator& data,
     const int64_t count = bin_grid_size[0] * bin_grid_size[1];
     dfor(bin_grid_size[0], bin_grid_size[1])([&](auto iy, auto ix) {
         array<index_int, 2> id = {iy, ix};
-println_once(" jjjjj id: ", id); 
+// println_once(" jjjjj id: ", id); 
 (void) roi_offset;
-println_once(" jjjjj roi_starts: ",  roi_starts);
-println(" eeeee idx: ",  idx);
+// println_once(" jjjjj roi_starts: ",  roi_starts);
+// println(" eeeee idx: ",  idx);
 
         array<float, 2> locs =
             roi_starts + idx * bin_size + bin_size * (id + 0.5f) / bin_grid_size;       // new
@@ -135,7 +135,7 @@ println(" eeeee idx: ",  idx);
  array<float, 6> asdf_idx = {float(iy),  float(ix), float(idx[0]), float(idx[1]),locs[0], locs[1]};
 // put idx, ix, iy, and locs into a single array to debug together        
 
-println(" iiiii asdf_idx/locs: ", asdf_idx);
+// println(" iiiii asdf_idx/locs: ", asdf_idx);
         auto val   = bilinear_interpolate(data, dims, locs, op);
         output_val = op(output_val, val);
     });
@@ -197,15 +197,12 @@ println_once(" aaaaa stride: ", stride);
             static_cast<float>(offset_rois[0]) * static_cast<float>(s.spatial_scale) + s.roi_offset,
             static_cast<float>(offset_rois[1]) * static_cast<float>(s.spatial_scale) +
                 s.roi_offset};
-// static_cast<float>(offset_rois[1]) * static_cast<float>(s.spatial_scale),
-//             static_cast<float>(offset_rois[0]) * static_cast<float>(s.spatial_scale)};
 
         array<float, 2> roi_ends = {
             static_cast<float>(offset_rois[2]) * static_cast<float>(s.spatial_scale) + s.roi_offset,
             static_cast<float>(offset_rois[3]) * static_cast<float>(s.spatial_scale) +
                 s.roi_offset};
-            // static_cast<float>(offset_rois[3]) * static_cast<float>(s.spatial_scale),
-            // static_cast<float>(offset_rois[2]) * static_cast<float>(s.spatial_scale)};
+
         array<float, 2> roi_size{};
         array<float, 2> bin_size{};
         array<index_int, 2> bin_grid_size{};
@@ -227,8 +224,10 @@ array<int, 4> zap = {n, c, ph, pw};
 println(" kkkkk n, c, ph, pw: ", zap);
 
         const auto offset_x = x + ((batch_ind * channel_num + c) * in_dims[0] * in_dims[1]);
-array<int, 4> reindex = {n, c, pw, ph};//;;  rearrange the gpu indices to what the ref indices would be
-// and insert that location in y_t        
+// array<size_t, 4> reindex = {size_t(n), size_t(c), size_t(pw), size_t(ph)};//;;  rearrange the gpu indices to what the ref indices would be
+// migraphx::shape reindex_shape(reindex);
+// and insert that location in y_t    
+
         if constexpr(s.is_avg_pooling)
         {
             y_t[i] = calc_pooling(offset_x,
@@ -239,10 +238,54 @@ array<int, 4> reindex = {n, c, pw, ph};//;;  rearrange the gpu indices to what t
                                   in_dims,
                                   s.roi_offset,
                                   avg_pool{});
-// println_once(" ddddd roi_starts[0]:  ", roi_starts[0]);   looks good here
-// println_once(" ddddd1 roi_starts[1]:  ", roi_starts[1]);
-print(" ddddd  i: ",  i)  ;
-println("   y_t[i]: ",  y_t[i])   ;  // these are all y_t[i]:  0.500000   make sense?
+// what are the indices corresponding to i?
+
+        std::size_t jj = 0;
+        // std::size_t ss      = 1;
+array<size_t, 4> m_lens{out_lens[0], out_lens[1], out_lens[3], out_lens[2]};
+array<size_t, 4> m_strides;
+m_strides[3] = 1;
+    for(auto k: {2, 1, 0})
+    {
+        m_strides[k] = m_strides[k+1] * m_lens[k+1]; 
+
+    }
+println_once(" m_lens: ", m_lens);
+println_once(" m_strides: ", m_strides);
+        // for(auto k : {3, 2, 1, 0})
+        // {
+        //     std::size_t stride2 = m_strides[k];
+        //     std::size_t len    = m_lens[k];
+        //     std::size_t idxx    = (i % (ss * len)) / ss;
+        //     jj += stride2 * idxx;
+        //     ss *= len;
+        // }
+        // println(" jj2: ", jj);
+
+size_t pp = i;
+jj = (pp/m_strides[0])*m_strides[0];
+pp = pp % m_strides[1];
+jj += (pp/m_strides[1])*m_strides[1];
+pp %= m_strides[2];
+jj += (pp/m_strides[2])*m_strides[2];
+pp %= m_strides[3];
+jj += pp;
+
+
+// jj = i/m_strides[2] + (i%m_strides[2])*m_lens[2] + (i/m_strides[1])*m_strides[1] + (i/m_strides[0])*m_strides[0];
+// jj = (i % m_strides[1])
+
+array<float, 7> zapzap = {float(n), float(c), float(ph), float(pw), y_t[i], float(i), static_cast<float>(jj)};
+// array<size_t, 2> zapzap = {i, jj};
+
+/**
+ * I want to turn  0->0,
+ *                 1->5,
+ *                 2->10,
+ *                 3->1,
+ * i.e.  (i%3) * 5 + (i/3)   but accounting for n and c too.
+ */
+println(" ddddd  y_t[i]: ",  zapzap)   ;
         }
         else
         {
@@ -255,7 +298,6 @@ println("   y_t[i]: ",  y_t[i])   ;  // these are all y_t[i]:  0.500000   make s
                                   s.roi_offset,
                                   max_pool{});
 
-// print("   y_t[i]: ",  y_t[i])   ;
         }
     }
 }
diff --git a/test/verify/test_roialign.cpp b/test/verify/test_roialign.cpp
index c036dbb5e23..e8878c6c8e8 100644
--- a/test/verify/test_roialign.cpp
+++ b/test/verify/test_roialign.cpp
@@ -34,7 +34,7 @@ struct test_roialign_half_pixel : verify_program<test_roialign_half_pixel<DType>
     {
         migraphx::program p;
         auto* mm = p.get_main_module();
-        migraphx::shape x_s{DType, {1, 1, 2, 2}};
+        migraphx::shape x_s{DType, {1, 7, 2, 2}};
 
         migraphx::shape roi_s{DType, {1, 4}};
 

From dbd28a4395479a6639d47d37aed4dd5a08c41f04 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Mon, 14 Oct 2024 16:13:39 +0000
Subject: [PATCH 47/56] work in progress with GPU output indexes

---
 src/include/migraphx/op/roialign.hpp          | 32 ++++++------
 .../include/migraphx/kernels/roialign.hpp     | 51 ++++++++++---------
 test/verify/test_roialign.cpp                 |  2 +-
 3 files changed, 43 insertions(+), 42 deletions(-)

diff --git a/src/include/migraphx/op/roialign.hpp b/src/include/migraphx/op/roialign.hpp
index 3e6e31d9524..a9885cdec8b 100644
--- a/src/include/migraphx/op/roialign.hpp
+++ b/src/include/migraphx/op/roialign.hpp
@@ -126,8 +126,8 @@ struct roialign
             // order.  The i[x] value is least significant and iterates the fastest.
             std::array<std::size_t, 2> p = {idx_v[1], idx_v[0]};
             std::array<std::size_t, 2> i = {idx_v[3], idx_v[2]}; // these are always equal
-printf(" EEEEE p, i-index   %lu  %lu  %lu %lu    ( %lu  %lu  %lu %lu)\n", p[0], p[1], i[0], i[1],
-idx_v[0], idx_v[1], idx_v[2], idx_v[3]);
+// printf(" EEEEE p, i-index   %lu  %lu  %lu %lu    ( %lu  %lu  %lu %lu)\n", p[0], p[1], i[0], i[1],
+// idx_v[0], idx_v[1], idx_v[2], idx_v[3]);
 
 
 
@@ -139,15 +139,15 @@ idx_v[0], idx_v[1], idx_v[2], idx_v[3]);
             // inside) from which we will interpolate.
             std::array<int64_t, 2> low{};
             std::array<int64_t, 2> high{};
-float asdf=-1.f;
+// float asdf=-1.f;
             for(auto ii : range(p.size()))
             {
                 xy[ii] = roi_start[ii] + p[ii] * bin_size[ii] +
                          (i[ii] + .5f) * bin_size[ii] / bin_grid_size[ii];
 // initial calculated values, before adjustments
-if(ii == 0 ) asdf = xy[0];
-if(ii == 1)
-printf(" IIIII index %lu    xy:   (%f, %f)\n", index,  asdf,  xy[1]);
+// if(ii == 0 ) asdf = xy[0];
+// if(ii == 1)
+// printf(" IIIII index %lu    xy:   (%f, %f)\n", index,  asdf,  xy[1]);
                 if(xy[ii] < -1.0 or xy[ii] > dims[ii])
                 {
                     results[index] = pos_weight{};
@@ -174,12 +174,12 @@ printf(" IIIII index %lu    xy:   (%f, %f)\n", index,  asdf,  xy[1]);
             float hx = 1.0f - lx;
             // save weights and indices
             results[index].w = {hy * hx, hy * lx, ly * hx, ly * lx};
-    printf(" AAAAA index %lu results.w:  %f, %f, %f, %f\n", index, 
-    results[index].w[0],
-    results[index].w[1],
-    results[index].w[2],
-    results[index].w[3]
-            );
+    // printf(" AAAAA index %lu results.w:  %f, %f, %f, %f\n", index, 
+    // results[index].w[0],
+    // results[index].w[1],
+    // results[index].w[2],
+    // results[index].w[3]
+    //         );
         });
         return results;
     }
@@ -213,7 +213,7 @@ printf(" IIIII index %lu    xy:   (%f, %f)\n", index,  asdf,  xy[1]);
         double output_val   = op.init();
         const int64_t count = bin_grid_size[0] * bin_grid_size[1];
         dfor(bin_grid_size[0], bin_grid_size[1])([&](auto iy, auto ix) {
-printf(" IIIIIKKKKK  iy, ix, index =      %lu  %lu  %ld\n", iy, ix, index );
+// printf(" IIIIIKKKKK  iy, ix, index =      %lu  %lu  %ld\n", iy, ix, index );
             const auto& pc = pos_weights[index];
             std::array<double, 4> wv; 
             std::transform(
@@ -292,14 +292,14 @@ printf(" IIIIIKKKKK  iy, ix, index =      %lu  %lu  %ld\n", iy, ix, index );
                     auto pw = idx[2];
 
 // n anc c are 0 because that's the size of the test case
-printf(" IIIII n, c, ph, pw =                       %lu %lu    %lu  %lu\n", n, c, ph, pw);
+// printf(" IIIII n, c, ph, pw =                       %lu %lu    %lu  %lu\n", n, c, ph, pw);
 
                     const auto offset_bottom_data =
                         bottom_data + static_cast<int64_t>((roi_batch_ind * channels + c) *
                                                            in_dims[0] * in_dims[1]);
 
                     double output_val;
-printf(" IIIIIc vec_index[c] = %ld\n", vec_index[c]);
+// printf(" IIIIIc vec_index[c] = %ld\n", vec_index[c]);
                     std::tie(output_val, vec_index[c]) =
                         (mode == migraphx::op::pooling_mode::average)
                             ? this->calc_pooling(offset_bottom_data,
@@ -312,7 +312,7 @@ printf(" IIIIIc vec_index[c] = %ld\n", vec_index[c]);
                                                  pre_calc,
                                                  vec_index[c],
                                                  max_pool{});
-printf(" DDDDD %f\n", output_val);                                                 
+printf(" DDDDD  index %lu %f\n",   output_shape.index({n, c, ph, pw}), output_val);                                                 
                     output(n, c, ph, pw) = output_val;
                 });
             });
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
index 80d2bd7bffe..c275eb3e11c 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
@@ -219,9 +219,9 @@ println_once(" aaaaa stride: ", stride);
                                     ? s.sampling_ratio
                                     : migraphx::ceil(roi_size[ii] / out_dims[ii]);
         }
-array<int, 4> zap = {n, c, ph, pw};
+// array<int, 4> zap = {n, c, ph, pw};
 
-println(" kkkkk n, c, ph, pw: ", zap);
+// println(" kkkkk n, c, ph, pw: ", zap);
 
         const auto offset_x = x + ((batch_ind * channel_num + c) * in_dims[0] * in_dims[1]);
 // array<size_t, 4> reindex = {size_t(n), size_t(c), size_t(pw), size_t(ph)};//;;  rearrange the gpu indices to what the ref indices would be
@@ -230,14 +230,6 @@ println(" kkkkk n, c, ph, pw: ", zap);
 
         if constexpr(s.is_avg_pooling)
         {
-            y_t[i] = calc_pooling(offset_x,
-                                  roi_starts,
-                                  bin_size,
-                                  {ph, pw},
-                                  bin_grid_size,
-                                  in_dims,
-                                  s.roi_offset,
-                                  avg_pool{});
 // what are the indices corresponding to i?
 
         std::size_t jj = 0;
@@ -264,28 +256,37 @@ println_once(" m_strides: ", m_strides);
 
 size_t pp = i;
 jj = (pp/m_strides[0])*m_strides[0];
-pp = pp % m_strides[1];
+pp = pp % m_strides[0];
 jj += (pp/m_strides[1])*m_strides[1];
-pp %= m_strides[2];
-jj += (pp/m_strides[2])*m_strides[2];
-pp %= m_strides[3];
+pp %= m_strides[1];
+println(" i, pp: ", 10000*i + pp);
+
+println(" pp/m_strides[2], pp % m_strides[2]",1000000*(pp/m_lens[2] + (pp % m_lens[2])*m_strides[2]) 
+    + 10000*(pp/m_lens[2]) + (pp%m_lens[2]) + 100000000);
+pp = pp/m_lens[2] + (pp % m_lens[2])*m_strides[2];
+println("  jj, pp: ", jj * 10000 + pp);   // <===== may still be relevant
+// jj += (pp/m_strides[2])*m_strides[2];
+// pp %= m_strides[2];
 jj += pp;
 
 
-// jj = i/m_strides[2] + (i%m_strides[2])*m_lens[2] + (i/m_strides[1])*m_strides[1] + (i/m_strides[0])*m_strides[0];
+// jj = i/m_strides[2] + (i%m_strides[2])*m_lens[2] 
 // jj = (i % m_strides[1])
 
-array<float, 7> zapzap = {float(n), float(c), float(ph), float(pw), y_t[i], float(i), static_cast<float>(jj)};
-// array<size_t, 2> zapzap = {i, jj};
 
-/**
- * I want to turn  0->0,
- *                 1->5,
- *                 2->10,
- *                 3->1,
- * i.e.  (i%3) * 5 + (i/3)   but accounting for n and c too.
- */
-println(" ddddd  y_t[i]: ",  zapzap)   ;
+y_t[jj] = calc_pooling(offset_x,
+            // y_t[i] = calc_pooling(offset_x,
+                                  roi_starts,
+                                  bin_size,
+                                  {ph, pw},
+                                  bin_grid_size,
+                                  in_dims,
+                                  s.roi_offset,
+                                  avg_pool{});
+array<float, 7> zapzap = {float(n), float(c), float(ph), float(pw), float(i), static_cast<float>(jj), y_t[jj]};
+
+
+println(" ddddd  y_t[jj]: ",  zapzap)   ;
         }
         else
         {
diff --git a/test/verify/test_roialign.cpp b/test/verify/test_roialign.cpp
index e8878c6c8e8..8833618a079 100644
--- a/test/verify/test_roialign.cpp
+++ b/test/verify/test_roialign.cpp
@@ -34,7 +34,7 @@ struct test_roialign_half_pixel : verify_program<test_roialign_half_pixel<DType>
     {
         migraphx::program p;
         auto* mm = p.get_main_module();
-        migraphx::shape x_s{DType, {1, 7, 2, 2}};
+        migraphx::shape x_s{DType, {1, 5, 2, 2}};
 
         migraphx::shape roi_s{DType, {1, 4}};
 

From 7c9175783fdb329e5c488d14829cad341a74db6d Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Mon, 14 Oct 2024 16:58:50 +0000
Subject: [PATCH 48/56] fixed GPU kernel and cleaned up debug code.  Passes all
 test_verify test_roialign* cases. Still contains output index workaround

---
 ort_roialign.py                               |  15 +-
 src/include/migraphx/op/roialign.hpp          |  32 +----
 .../include/migraphx/kernels/roialign.hpp     | 129 +++++-------------
 test/onnx/gen_onnx.py                         |   6 +-
 test/verify/test_roialign.cpp                 |  16 +--
 test/verify/test_roialign_nondefault.cpp      |   1 -
 6 files changed, 55 insertions(+), 144 deletions(-)

diff --git a/ort_roialign.py b/ort_roialign.py
index eb60fb05352..817e5fe2e84 100644
--- a/ort_roialign.py
+++ b/ort_roialign.py
@@ -11,18 +11,11 @@
 
 y = np.ones([10, 5, 4, 7], dtype='f')
 
-rois = np.array(
-    [
-        [0.1, 0.15, 0.6, 0.35],
-        [2.1, 1.73, 3.8, 2.13]
-    ],
-    dtype='f')
+rois = np.array([[0.1, 0.15, 0.6, 0.35], [2.1, 1.73, 3.8, 2.13]], dtype='f')
 
 themodel = 'roialign_test.onnx'
-sess = rt.InferenceSession(
-    '/workspace/AMDMIGraphX/test/onnx/' + themodel)
+sess = rt.InferenceSession('/workspace/AMDMIGraphX/test/onnx/' + themodel)
 res = sess.run(['y'], {'x': x, 'rois': rois, 'batch_ind': [1, 0]})
 
-print(' ORT test model is ' + themodel + ', rois_data is \n',
-      rois, ' result is \n', res)
- 
\ No newline at end of file
+print(' ORT test model is ' + themodel + ', rois_data is \n', rois,
+      ' result is \n', res)
diff --git a/src/include/migraphx/op/roialign.hpp b/src/include/migraphx/op/roialign.hpp
index a9885cdec8b..ded80e5080f 100644
--- a/src/include/migraphx/op/roialign.hpp
+++ b/src/include/migraphx/op/roialign.hpp
@@ -126,12 +126,6 @@ struct roialign
             // order.  The i[x] value is least significant and iterates the fastest.
             std::array<std::size_t, 2> p = {idx_v[1], idx_v[0]};
             std::array<std::size_t, 2> i = {idx_v[3], idx_v[2]}; // these are always equal
-// printf(" EEEEE p, i-index   %lu  %lu  %lu %lu    ( %lu  %lu  %lu %lu)\n", p[0], p[1], i[0], i[1],
-// idx_v[0], idx_v[1], idx_v[2], idx_v[3]);
-
-
-
-
 
             // xy is scaled coordinates of start point of ROI
             std::array<float, 2> xy{};
@@ -139,15 +133,11 @@ struct roialign
             // inside) from which we will interpolate.
             std::array<int64_t, 2> low{};
             std::array<int64_t, 2> high{};
-// float asdf=-1.f;
             for(auto ii : range(p.size()))
             {
                 xy[ii] = roi_start[ii] + p[ii] * bin_size[ii] +
                          (i[ii] + .5f) * bin_size[ii] / bin_grid_size[ii];
-// initial calculated values, before adjustments
-// if(ii == 0 ) asdf = xy[0];
-// if(ii == 1)
-// printf(" IIIII index %lu    xy:   (%f, %f)\n", index,  asdf,  xy[1]);
+
                 if(xy[ii] < -1.0 or xy[ii] > dims[ii])
                 {
                     results[index] = pos_weight{};
@@ -162,7 +152,6 @@ struct roialign
                     xy[ii] = high[ii] = low[ii] = dims[ii] - 1;
                 }
             }
-// printf(" FFFFF index %lu    xy:   (%f, %f)\n", index,  xy[0],  xy[1]);
             results[index].pos = {low[1] * dims[0] + low[0],
                                   low[1] * dims[0] + high[0],
                                   high[1] * dims[0] + low[0],
@@ -174,12 +163,6 @@ struct roialign
             float hx = 1.0f - lx;
             // save weights and indices
             results[index].w = {hy * hx, hy * lx, ly * hx, ly * lx};
-    // printf(" AAAAA index %lu results.w:  %f, %f, %f, %f\n", index, 
-    // results[index].w[0],
-    // results[index].w[1],
-    // results[index].w[2],
-    // results[index].w[3]
-    //         );
         });
         return results;
     }
@@ -207,15 +190,14 @@ struct roialign
     std::tuple<double, int64_t> calc_pooling(const T& data,
                                              const std::array<std::size_t, 2>& bin_grid_size,
                                              const std::vector<pos_weight>& pos_weights,
-                                             int64_t index,  // index to c
+                                             int64_t index, // index to c
                                              Op op) const
     {
         double output_val   = op.init();
         const int64_t count = bin_grid_size[0] * bin_grid_size[1];
-        dfor(bin_grid_size[0], bin_grid_size[1])([&](auto iy, auto ix) {
-// printf(" IIIIIKKKKK  iy, ix, index =      %lu  %lu  %ld\n", iy, ix, index );
+        dfor(bin_grid_size[0], bin_grid_size[1])([&](auto, auto) {
             const auto& pc = pos_weights[index];
-            std::array<double, 4> wv; 
+            std::array<double, 4> wv;
             std::transform(
                 pc.w.begin(), pc.w.end(), pc.pos.begin(), wv.begin(), [&](auto w, auto pos) {
                     return *(data + pos) * w;
@@ -257,7 +239,6 @@ struct roialign
                     static_cast<float>(roi[roi_s.index({n, 2})] * spatial_scale - offset),
                     static_cast<float>(roi[roi_s.index({n, 3})] * spatial_scale - offset)};
 
-
                 // Force malformed ROIs to be 1x1, output_half_pixel transform mode
                 std::array<float, 2> roi_size{};
                 std::array<float, 2> bin_size{};
@@ -291,15 +272,11 @@ struct roialign
                     auto ph = idx[1];
                     auto pw = idx[2];
 
-// n anc c are 0 because that's the size of the test case
-// printf(" IIIII n, c, ph, pw =                       %lu %lu    %lu  %lu\n", n, c, ph, pw);
-
                     const auto offset_bottom_data =
                         bottom_data + static_cast<int64_t>((roi_batch_ind * channels + c) *
                                                            in_dims[0] * in_dims[1]);
 
                     double output_val;
-// printf(" IIIIIc vec_index[c] = %ld\n", vec_index[c]);
                     std::tie(output_val, vec_index[c]) =
                         (mode == migraphx::op::pooling_mode::average)
                             ? this->calc_pooling(offset_bottom_data,
@@ -312,7 +289,6 @@ struct roialign
                                                  pre_calc,
                                                  vec_index[c],
                                                  max_pool{});
-printf(" DDDDD  index %lu %f\n",   output_shape.index({n, c, ph, pw}), output_val);                                                 
                     output(n, c, ph, pw) = output_val;
                 });
             });
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
index c275eb3e11c..22721aca2d6 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
@@ -89,18 +89,18 @@ MIGRAPHX_DEVICE_CONSTEXPR typename Iterator::value_type bilinear_interpolate(
             xy[ii] = high[ii] = low[ii] = dims[ii] - 1;
         }
     }
-    array<index_int, 4> locs = {low[1] * dims[0] + low[0],  // new
+    array<index_int, 4> locs = {low[1] * dims[0] + low[0], // new
                                 low[1] * dims[0] + high[0],
                                 high[1] * dims[0] + low[0],
                                 high[1] * dims[0] + high[0]};
 
-    float lx = xy[0] - low[0];  // new
+    float lx = xy[0] - low[0]; // new
     float ly = xy[1] - low[1];
-\
+
     float hy = 1.0f - ly;
     float hx = 1.0f - lx;
     // do calculations in floating point and convert final result to required type
-    array<float, 4> ws = {hy * hx, hy * lx, ly * hx, ly * lx}; //old
+    array<float, 4> ws = {hy * hx, hy * lx, ly * hx, ly * lx}; // old
 
     auto v01 = pooling(data[locs[1]] * ws[1], data[locs[0]] * ws[0]);
     auto v23 = pooling(data[locs[3]] * ws[3], data[locs[2]] * ws[2]);
@@ -115,7 +115,6 @@ MIGRAPHX_DEVICE_CONSTEXPR auto calc_pooling(const Iterator& data,
                                             const array<int, 2>& idx,
                                             const array<index_int, 2>& bin_grid_size,
                                             const array<index_int, 2>& dims,
-                                            float roi_offset,
                                             Op op)
 {
     // for one idx (output height and width coordinates) we iterate through all bin_grid values
@@ -124,18 +123,9 @@ MIGRAPHX_DEVICE_CONSTEXPR auto calc_pooling(const Iterator& data,
     const int64_t count = bin_grid_size[0] * bin_grid_size[1];
     dfor(bin_grid_size[0], bin_grid_size[1])([&](auto iy, auto ix) {
         array<index_int, 2> id = {iy, ix};
-// println_once(" jjjjj id: ", id); 
-(void) roi_offset;
-// println_once(" jjjjj roi_starts: ",  roi_starts);
-// println(" eeeee idx: ",  idx);
-
-        array<float, 2> locs =
-            roi_starts + idx * bin_size + bin_size * (id + 0.5f) / bin_grid_size;       // new
-// idx same as ph, pw
- array<float, 6> asdf_idx = {float(iy),  float(ix), float(idx[0]), float(idx[1]),locs[0], locs[1]};
-// put idx, ix, iy, and locs into a single array to debug together        
-
-// println(" iiiii asdf_idx/locs: ", asdf_idx);
+        array<float, 2> locs = roi_starts + idx * bin_size + bin_size * (id + 0.5f) / bin_grid_size;
+        array<float, 6> asdf_idx = {
+            float(iy), float(ix), float(idx[0]), float(idx[1]), locs[0], locs[1]};
         auto val   = bilinear_interpolate(data, dims, locs, op);
         output_val = op(output_val, val);
     });
@@ -179,7 +169,17 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, W& y_t,
     // is for height and second dim is for width
     const auto& out_lens         = out_s.lens;
     array<index_int, 2> out_dims = {out_lens[2], out_lens[3]};
-println_once(" aaaaa stride: ", stride);
+
+    // Compute lens and strides vectors for use in reindexing output.
+    // Todo: look for a less indirect way to reconcile the ordering of iteration
+    // between this op. and the reference.
+    array<size_t, 4> m_lens{out_lens[0], out_lens[1], out_lens[3], out_lens[2]};
+    array<size_t, 4> m_strides;
+    m_strides[3] = 1;
+    for(auto k : {2, 1, 0})
+    {
+        m_strides[k] = m_strides[k + 1] * m_lens[k + 1];
+    }
 
     for(index_int i = index.global; i < out_s.elements(); i += stride)
     {
@@ -192,7 +192,8 @@ println_once(" aaaaa stride: ", stride);
         const auto offset_rois = rois + (n * roi_column_num);
         const int batch_ind    = ind[n];
 
-        // Note that roi_offset in src/targets/gpu/jit/roialign.cpp uses a negative value, so we add it here
+        // Note that roi_offset in src/targets/gpu/jit/roialign.cpp uses a negative value, so we add
+        // rather than subtract it here
         array<float, 2> roi_starts = {
             static_cast<float>(offset_rois[0]) * static_cast<float>(s.spatial_scale) + s.roi_offset,
             static_cast<float>(offset_rois[1]) * static_cast<float>(s.spatial_scale) +
@@ -207,7 +208,6 @@ println_once(" aaaaa stride: ", stride);
         array<float, 2> bin_size{};
         array<index_int, 2> bin_grid_size{};
 
-
         for(index_int ii = 0; ii < roi_size.size(); ++ii)
         {
             roi_size[ii] = roi_ends[ii] - roi_starts[ii];
@@ -219,86 +219,29 @@ println_once(" aaaaa stride: ", stride);
                                     ? s.sampling_ratio
                                     : migraphx::ceil(roi_size[ii] / out_dims[ii]);
         }
-// array<int, 4> zap = {n, c, ph, pw};
-
-// println(" kkkkk n, c, ph, pw: ", zap);
-
         const auto offset_x = x + ((batch_ind * channel_num + c) * in_dims[0] * in_dims[1]);
-// array<size_t, 4> reindex = {size_t(n), size_t(c), size_t(pw), size_t(ph)};//;;  rearrange the gpu indices to what the ref indices would be
-// migraphx::shape reindex_shape(reindex);
-// and insert that location in y_t    
+
+        //
+        //  Reindexing.  Calculations to this point did not iterate in the same order as
+        // in the reference op; we now calculate the output index corresponding to i
+        //
+        size_t pp = i;
+        size_t jj = (pp / m_strides[0]) * m_strides[0];
+        pp        = pp % m_strides[0];
+        jj += (pp / m_strides[1]) * m_strides[1];
+        pp %= m_strides[1];
+        pp = pp / m_lens[2] + (pp % m_lens[2]) * m_strides[2];
+        jj += pp;
 
         if constexpr(s.is_avg_pooling)
         {
-// what are the indices corresponding to i?
-
-        std::size_t jj = 0;
-        // std::size_t ss      = 1;
-array<size_t, 4> m_lens{out_lens[0], out_lens[1], out_lens[3], out_lens[2]};
-array<size_t, 4> m_strides;
-m_strides[3] = 1;
-    for(auto k: {2, 1, 0})
-    {
-        m_strides[k] = m_strides[k+1] * m_lens[k+1]; 
-
-    }
-println_once(" m_lens: ", m_lens);
-println_once(" m_strides: ", m_strides);
-        // for(auto k : {3, 2, 1, 0})
-        // {
-        //     std::size_t stride2 = m_strides[k];
-        //     std::size_t len    = m_lens[k];
-        //     std::size_t idxx    = (i % (ss * len)) / ss;
-        //     jj += stride2 * idxx;
-        //     ss *= len;
-        // }
-        // println(" jj2: ", jj);
-
-size_t pp = i;
-jj = (pp/m_strides[0])*m_strides[0];
-pp = pp % m_strides[0];
-jj += (pp/m_strides[1])*m_strides[1];
-pp %= m_strides[1];
-println(" i, pp: ", 10000*i + pp);
-
-println(" pp/m_strides[2], pp % m_strides[2]",1000000*(pp/m_lens[2] + (pp % m_lens[2])*m_strides[2]) 
-    + 10000*(pp/m_lens[2]) + (pp%m_lens[2]) + 100000000);
-pp = pp/m_lens[2] + (pp % m_lens[2])*m_strides[2];
-println("  jj, pp: ", jj * 10000 + pp);   // <===== may still be relevant
-// jj += (pp/m_strides[2])*m_strides[2];
-// pp %= m_strides[2];
-jj += pp;
-
-
-// jj = i/m_strides[2] + (i%m_strides[2])*m_lens[2] 
-// jj = (i % m_strides[1])
-
-
-y_t[jj] = calc_pooling(offset_x,
-            // y_t[i] = calc_pooling(offset_x,
-                                  roi_starts,
-                                  bin_size,
-                                  {ph, pw},
-                                  bin_grid_size,
-                                  in_dims,
-                                  s.roi_offset,
-                                  avg_pool{});
-array<float, 7> zapzap = {float(n), float(c), float(ph), float(pw), float(i), static_cast<float>(jj), y_t[jj]};
-
-
-println(" ddddd  y_t[jj]: ",  zapzap)   ;
+            y_t[jj] = calc_pooling(
+                offset_x, roi_starts, bin_size, {ph, pw}, bin_grid_size, in_dims, avg_pool{});
         }
         else
         {
-            y_t[i] = calc_pooling(offset_x,
-                                  roi_starts,
-                                  bin_size,
-                                  {ph, pw},
-                                  bin_grid_size,
-                                  in_dims,
-                                  s.roi_offset,
-                                  max_pool{});
-
+            y_t[jj] = calc_pooling(
+                offset_x, roi_starts, bin_size, {ph, pw}, bin_grid_size, in_dims, max_pool{});
         }
     }
 }
diff --git a/test/onnx/gen_onnx.py b/test/onnx/gen_onnx.py
index bc5639d0424..ca8f549f89d 100644
--- a/test/onnx/gen_onnx.py
+++ b/test/onnx/gen_onnx.py
@@ -35,7 +35,8 @@ def onnx_test(external_data=False, opset_imports=None):
     def create_onnx_test(op_test):
         def run_test():
             op_info = op_test()
-            opset_id = [helper.make_operatorsetid('', opset_imports)] if opset_imports is not None else None
+            opset_id = [helper.make_operatorsetid('', opset_imports)
+                        ] if opset_imports is not None else None
 
             if len(op_info) > 3:
                 graph_def = helper.make_graph(op_info[0],
@@ -48,8 +49,7 @@ def run_test():
                                               op_info[1], op_info[2])
             model_def = helper.make_model(graph_def,
                                           producer_name=op_test.__name__,
-                                              opset_imports=opset_id
-            )
+                                          opset_imports=opset_id)
             onnx.save_model(model_def,
                             '{}.onnx'.format(op_test.__name__),
                             save_as_external_data=external_data,
diff --git a/test/verify/test_roialign.cpp b/test/verify/test_roialign.cpp
index 8833618a079..e957920af1a 100644
--- a/test/verify/test_roialign.cpp
+++ b/test/verify/test_roialign.cpp
@@ -34,12 +34,12 @@ struct test_roialign_half_pixel : verify_program<test_roialign_half_pixel<DType>
     {
         migraphx::program p;
         auto* mm = p.get_main_module();
-        migraphx::shape x_s{DType, {1, 5, 2, 2}};
+        migraphx::shape x_s{DType, {2, 7, 2, 2}};
 
-        migraphx::shape roi_s{DType, {1, 4}};
+        migraphx::shape roi_s{DType, {2, 4}};
 
-        migraphx::shape ind_s{migraphx::shape::int64_type, {1}};
-        std::vector<int64_t> ind_vec = {0};
+        migraphx::shape ind_s{migraphx::shape::int64_type, {2}};
+        std::vector<int64_t> ind_vec = {1, 0};
 
         auto x   = mm->add_parameter("x", x_s);
         auto roi = mm->add_parameter("roi", roi_s);
@@ -95,7 +95,7 @@ struct test_roialign : verify_program<test_roialign<DType>>
 
 template struct test_roialign_half_pixel<migraphx::shape::float_type>;
 template struct test_roialign<migraphx::shape::float_type>;
-// template struct test_roialign<migraphx::shape::half_type>;  commented out for debug
-// template struct test_roialign<migraphx::shape::fp8e4m3fnuz_type>;
-// template struct test_roialign<migraphx::shape::fp8e4m3fn_type>;
-// template struct test_roialign<migraphx::shape::fp8e5m2_type>;
+template struct test_roialign<migraphx::shape::half_type>;
+template struct test_roialign<migraphx::shape::fp8e4m3fnuz_type>;
+template struct test_roialign<migraphx::shape::fp8e4m3fn_type>;
+template struct test_roialign<migraphx::shape::fp8e5m2_type>;
diff --git a/test/verify/test_roialign_nondefault.cpp b/test/verify/test_roialign_nondefault.cpp
index d4785014512..ac9be3b7281 100644
--- a/test/verify/test_roialign_nondefault.cpp
+++ b/test/verify/test_roialign_nondefault.cpp
@@ -40,7 +40,6 @@ struct test_roialign_nondefault : verify_program<test_roialign_nondefault>
 
         migraphx::shape ind_s{migraphx::shape::int64_type, {5}};
         std::vector<int64_t> ind_vec = {0, 2, 3, 4, 1};
-
         auto x   = mm->add_parameter("x", x_s);
         auto roi = mm->add_parameter("roi", roi_s);
         auto ind = mm->add_literal(migraphx::literal(ind_s, ind_vec));

From 2da60d1b6ae1b3dd75c24aa5d11112e1033a98ee Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Mon, 14 Oct 2024 17:12:06 +0000
Subject: [PATCH 49/56] removed a debug file

---
 ort_roialign.py | 21 ---------------------
 1 file changed, 21 deletions(-)
 delete mode 100644 ort_roialign.py

diff --git a/ort_roialign.py b/ort_roialign.py
deleted file mode 100644
index 817e5fe2e84..00000000000
--- a/ort_roialign.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Not for release.  This test script is for develop/test only
-
-import onnx
-import onnxruntime as rt
-# from https://onnxruntime.ai/docs/get-started/with-python.html
-import numpy as np
-print(" version: ", onnx.__version__, rt.__version__)
-
-x = np.array(np.arange(10 * 5 * 4 * 7), dtype='f')
-x = np.reshape(x, [10, 5, 4, 7])
-
-y = np.ones([10, 5, 4, 7], dtype='f')
-
-rois = np.array([[0.1, 0.15, 0.6, 0.35], [2.1, 1.73, 3.8, 2.13]], dtype='f')
-
-themodel = 'roialign_test.onnx'
-sess = rt.InferenceSession('/workspace/AMDMIGraphX/test/onnx/' + themodel)
-res = sess.run(['y'], {'x': x, 'rois': rois, 'batch_ind': [1, 0]})
-
-print(' ORT test model is ' + themodel + ', rois_data is \n', rois,
-      ' result is \n', res)

From 6f9475889510a0dea49b17c9bee4cda58139aaa5 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <95253842+bpickrel@users.noreply.github.com>
Date: Mon, 14 Oct 2024 10:14:51 -0700
Subject: [PATCH 50/56] reword comment in roialign_default_test

Co-authored-by: spolifroni-amd <Sandra.Polifroni@amd.com>
---
 test/onnx/parse/roialign_default_test.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/onnx/parse/roialign_default_test.cpp b/test/onnx/parse/roialign_default_test.cpp
index 410d7ed62d4..b5a940e7927 100644
--- a/test/onnx/parse/roialign_default_test.cpp
+++ b/test/onnx/parse/roialign_default_test.cpp
@@ -37,7 +37,7 @@ TEST_CASE(roialign_default_test)
     auto bi   = mm->add_parameter("batch_ind", sbi);
 
     // Depending on whether the model was built for Onnx opset 16 or earlier, the default
-    // coordinate_transformation_mode is different.  These model files had explicit opset given
+    // coordinate_transformation_mode will be different.  These model files had explicit opset given
     // when they were created.
     auto r = mm->add_instruction(
         migraphx::make_op("roialign", {{"coordinate_transformation_mode", "half_pixel"}}),

From c1000cfa31611a9d16566b5af19d625426df3338 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Tue, 15 Oct 2024 15:54:32 +0000
Subject: [PATCH 51/56] misc. cleanup; fixed one Jenkins-only failure; added
 one more shape check

---
 src/include/migraphx/op/roialign.hpp           |  7 +++++--
 .../include/migraphx/kernels/roialign.hpp      | 18 ++++++++----------
 test/op_shape_test.cpp                         |  5 +++++
 test/verify/test_roialign.cpp                  |  2 +-
 4 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/src/include/migraphx/op/roialign.hpp b/src/include/migraphx/op/roialign.hpp
index ded80e5080f..76b7c8b967e 100644
--- a/src/include/migraphx/op/roialign.hpp
+++ b/src/include/migraphx/op/roialign.hpp
@@ -81,6 +81,8 @@ struct roialign
         if(!shape::is_integral(inputs.at(2).type()))
             MIGRAPHX_THROW(
                 "ROIALIGN: incorrect datatype for roi indices! (should be an integral type)");
+        if(x_lens.size() != 4)
+            MIGRAPHX_THROW("ROIALIGN: data input must have 4 dimensions n, c, h, w");
         if(bi_lens.size() != 1)
         {
             MIGRAPHX_THROW("ROIALIGN: batch indices should be 1 dimension!");
@@ -230,7 +232,8 @@ struct roialign
             par_for(n_rois, [&](auto n) {
                 const auto bottom_data   = x.begin();
                 const auto roi_batch_ind = batch_indices[n];
-                // Do not use rounding; this implementation detail is critical
+                // Do not use rounding here even if data is a quantized type; this
+                // implementation detail is critical
                 const float offset              = (coord_trans_mode == "half_pixel") ? 0.5 : 0.0;
                 std::array<float, 2> roi_starts = {
                     static_cast<float>(roi[roi_s.index({n, 0})] * spatial_scale - offset),
@@ -239,7 +242,7 @@ struct roialign
                     static_cast<float>(roi[roi_s.index({n, 2})] * spatial_scale - offset),
                     static_cast<float>(roi[roi_s.index({n, 3})] * spatial_scale - offset)};
 
-                // Force malformed ROIs to be 1x1, output_half_pixel transform mode
+                // Force malformed ROIs to be 1x1, if in output_half_pixel transform mode
                 std::array<float, 2> roi_size{};
                 std::array<float, 2> bin_size{};
                 std::array<std::size_t, 2> bin_grid_size{};
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
index 22721aca2d6..769c7c978bf 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
@@ -24,8 +24,8 @@
 #ifndef MIGRAPHX_GUARD_KERNELS_ROIALIGN_HPP
 #define MIGRAPHX_GUARD_KERNELS_ROIALIGN_HPP
 
-#include <migraphx/kernels/debug.hpp>
-#include <migraphx/kernels/print.hpp>
+// #include <migraphx/kernels/debug.hpp>
+// #include <migraphx/kernels/print.hpp>
 #include <migraphx/kernels/index.hpp>
 #include <migraphx/kernels/dfor.hpp>
 #include <migraphx/kernels/ops.hpp>
@@ -89,21 +89,21 @@ MIGRAPHX_DEVICE_CONSTEXPR typename Iterator::value_type bilinear_interpolate(
             xy[ii] = high[ii] = low[ii] = dims[ii] - 1;
         }
     }
-    array<index_int, 4> locs = {low[1] * dims[0] + low[0], // new
+    array<index_int, 4> locs = {low[1] * dims[0] + low[0],
                                 low[1] * dims[0] + high[0],
                                 high[1] * dims[0] + low[0],
                                 high[1] * dims[0] + high[0]};
 
-    float lx = xy[0] - low[0]; // new
+    float lx = xy[0] - low[0];
     float ly = xy[1] - low[1];
 
     float hy = 1.0f - ly;
     float hx = 1.0f - lx;
     // do calculations in floating point and convert final result to required type
-    array<float, 4> ws = {hy * hx, hy * lx, ly * hx, ly * lx}; // old
+    array<float, 4> ws = {hy * hx, hy * lx, ly * hx, ly * lx};
 
-    auto v01 = pooling(data[locs[1]] * ws[1], data[locs[0]] * ws[0]);
-    auto v23 = pooling(data[locs[3]] * ws[3], data[locs[2]] * ws[2]);
+    auto v01 = pooling(data[locs[0]] * ws[0], data[locs[1]] * ws[1]);
+    auto v23 = pooling(data[locs[2]] * ws[2], data[locs[3]] * ws[3]);
     return implicit_conversion(pooling(v01, v23));
 }
 
@@ -124,8 +124,6 @@ MIGRAPHX_DEVICE_CONSTEXPR auto calc_pooling(const Iterator& data,
     dfor(bin_grid_size[0], bin_grid_size[1])([&](auto iy, auto ix) {
         array<index_int, 2> id = {iy, ix};
         array<float, 2> locs = roi_starts + idx * bin_size + bin_size * (id + 0.5f) / bin_grid_size;
-        array<float, 6> asdf_idx = {
-            float(iy), float(ix), float(idx[0]), float(idx[1]), locs[0], locs[1]};
         auto val   = bilinear_interpolate(data, dims, locs, op);
         output_val = op(output_val, val);
     });
@@ -176,7 +174,7 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, W& y_t,
     array<size_t, 4> m_lens{out_lens[0], out_lens[1], out_lens[3], out_lens[2]};
     array<size_t, 4> m_strides;
     m_strides[3] = 1;
-    for(auto k : {2, 1, 0})
+    for(int k = 2; k >= 0; k--)
     {
         m_strides[k] = m_strides[k + 1] * m_lens[k + 1];
     }
diff --git a/test/op_shape_test.cpp b/test/op_shape_test.cpp
index 24b9afb1377..819d05f9556 100644
--- a/test/op_shape_test.cpp
+++ b/test/op_shape_test.cpp
@@ -5155,6 +5155,11 @@ TEST_CASE(roialign_test)
 
     expect_shape(sout, migraphx::make_op("roialign"), sx, srois, sbi);
 
+    // data input must be 4 dimensions
+    migraphx::shape sx2{migraphx::shape::float_type, {2, 3, 4, 5, 6}};
+    throws_shape(migraphx::make_op("roialign"), sx2, srois, sbi);
+
+    // batch index must be 1 dimension
     migraphx::shape sbi1{migraphx::shape::int64_type, {2, 3}};
     throws_shape(migraphx::make_op("roialign"), sx, srois, sbi1);
 
diff --git a/test/verify/test_roialign.cpp b/test/verify/test_roialign.cpp
index e957920af1a..88864631e87 100644
--- a/test/verify/test_roialign.cpp
+++ b/test/verify/test_roialign.cpp
@@ -34,7 +34,7 @@ struct test_roialign_half_pixel : verify_program<test_roialign_half_pixel<DType>
     {
         migraphx::program p;
         auto* mm = p.get_main_module();
-        migraphx::shape x_s{DType, {2, 7, 2, 2}};
+        migraphx::shape x_s{DType, {5, 7, 2, 2}};
 
         migraphx::shape roi_s{DType, {2, 4}};
 

From 379dcef6bc694b89c1bce92dd1ba54f03e886d08 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Tue, 22 Oct 2024 17:14:17 +0000
Subject: [PATCH 52/56] revert debugging changes

---
 test/verify/test_roialign_nondefault.cpp |  1 +
 tools/build_and_test_onnxrt.sh           | 10 +++++-----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/test/verify/test_roialign_nondefault.cpp b/test/verify/test_roialign_nondefault.cpp
index ac9be3b7281..d4785014512 100644
--- a/test/verify/test_roialign_nondefault.cpp
+++ b/test/verify/test_roialign_nondefault.cpp
@@ -40,6 +40,7 @@ struct test_roialign_nondefault : verify_program<test_roialign_nondefault>
 
         migraphx::shape ind_s{migraphx::shape::int64_type, {5}};
         std::vector<int64_t> ind_vec = {0, 2, 3, 4, 1};
+
         auto x   = mm->add_parameter("x", x_s);
         auto roi = mm->add_parameter("roi", roi_s);
         auto ind = mm->add_literal(migraphx::literal(ind_s, ind_vec));
diff --git a/tools/build_and_test_onnxrt.sh b/tools/build_and_test_onnxrt.sh
index a3a8fdfbf61..19147c84ddb 100755
--- a/tools/build_and_test_onnxrt.sh
+++ b/tools/build_and_test_onnxrt.sh
@@ -36,8 +36,8 @@ export CXXFLAGS="-D__HIP_PLATFORM_AMD__=1 -w"
 cd build/Linux/Release
 #Add test launcher for onnxrt tests
 
-# echo 'InferenceSessionTests.CheckRunProfilerWithSessionOptions' >> ../../../tools/ci_build/github/pai/migraphx-excluded-tests.txt
-# echo 'InferenceSessionTests.CheckRunProfilerWithSessionOptions2' >> ../../../tools/ci_build/github/pai/migraphx-excluded-tests.txt
-# echo 'InferenceSessionTests.Test3LayerNestedSubgraph' >> ../../../tools/ci_build/github/pai/migraphx-excluded-tests.txt
-# echo 'InferenceSessionTests.Test2LayerNestedSubgraph' >> ../../../tools/ci_build/github/pai/migraphx-excluded-tests.txt
-# ../../../tools/ci_build/github/pai/pai_test_launcher.sh || (gdb ./onnxruntime_test_all core -batch -ex bt && exit 1)
+echo 'InferenceSessionTests.CheckRunProfilerWithSessionOptions' >> ../../../tools/ci_build/github/pai/migraphx-excluded-tests.txt
+echo 'InferenceSessionTests.CheckRunProfilerWithSessionOptions2' >> ../../../tools/ci_build/github/pai/migraphx-excluded-tests.txt
+echo 'InferenceSessionTests.Test3LayerNestedSubgraph' >> ../../../tools/ci_build/github/pai/migraphx-excluded-tests.txt
+echo 'InferenceSessionTests.Test2LayerNestedSubgraph' >> ../../../tools/ci_build/github/pai/migraphx-excluded-tests.txt
+../../../tools/ci_build/github/pai/pai_test_launcher.sh || (gdb ./onnxruntime_test_all core -batch -ex bt && exit 1)

From 4c4846edc88024b63356da74bdb9d3e7b851c549 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Tue, 22 Oct 2024 17:21:25 +0000
Subject: [PATCH 53/56] clean up debug code

---
 test/onnx/verify/roialign_half_pixel_verify_test.cpp | 8 --------
 test/onnx/verify/roialign_verify_test.cpp            | 2 +-
 2 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/test/onnx/verify/roialign_half_pixel_verify_test.cpp b/test/onnx/verify/roialign_half_pixel_verify_test.cpp
index ea570792249..62c91ee63b5 100644
--- a/test/onnx/verify/roialign_half_pixel_verify_test.cpp
+++ b/test/onnx/verify/roialign_half_pixel_verify_test.cpp
@@ -51,14 +51,6 @@ TEST_CASE(roialign_half_pixel_verify_test)
     std::vector<float> result_vector;
     result.visit([&](auto output) { result_vector.assign(output.begin(), output.end()); });
 
-    printf(" result:  \n");
-    for(int i = 0; i < result_vector.size(); i++)
-    {
-        printf(" %f ", result_vector[i]);
-        if(i % 6 == 5)
-            printf("\n");
-    }
-    printf("\n");
     // Gold values were generated with onnxruntime
     std::vector<float> gold = {5.38,      5.4799995, 5.4799995, 6.58,      6.68,  6.68,
                                17.38,     17.48,     17.48,     18.58,     18.68, 18.68,
diff --git a/test/onnx/verify/roialign_verify_test.cpp b/test/onnx/verify/roialign_verify_test.cpp
index 051080adf25..ea9d84e7e8a 100644
--- a/test/onnx/verify/roialign_verify_test.cpp
+++ b/test/onnx/verify/roialign_verify_test.cpp
@@ -51,7 +51,7 @@ TEST_CASE(roialign_verify_test)
     std::vector<float> result_vector;
     result.visit([&](auto output) { result_vector.assign(output.begin(), output.end()); });
 
-    // gold results were generated with onnxruntime
+    // gold values were generated with onnxruntime
     std::vector<float> gold = {
         143.16667, 143.49998, 143.83333, 144.56667, 144.9,     145.23334, 145.96667, 146.3,
         146.63333, 147.36667, 147.70001, 148.03334, 148.76666, 149.09999, 149.43333,

From e9fd0fa7ccba3e3a235df23f802fae7820d7ad08 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Wed, 23 Oct 2024 18:10:25 +0000
Subject: [PATCH 54/56] work in progress

---
 docs/dev/onnx_operators.rst               |  7 +++++--
 test/onnx/parse/roialign_default_test.cpp | 13 +++++++++++--
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/docs/dev/onnx_operators.rst b/docs/dev/onnx_operators.rst
index a87af00e755..7d58431ee39 100644
--- a/docs/dev/onnx_operators.rst
+++ b/docs/dev/onnx_operators.rst
@@ -697,8 +697,11 @@ Operator Support Matrix
 |                          |           |                 | functions are                |
 |                          |           |                 | not enabled                  |
 +--------------------------+-----------+-----------------+------------------------------+
-| RoiAlign                 | ✅        | FP8, FP16,      |                               |
-|                          |           | FP32, FP64      |                              |
+| RoiAlign                 | ✅        | FP8, FP16,      | ``X``,                       |
+|                          |           | FP32, FP64,     | ``ROI`` take any floating-   |
+|                          |           | UINT8, UINT16,  | point type;                  |
+|                          |           | UINT32, UINT64  | ``batch_indices``            |
+|                          |           |                 | takes any integral type      |
 +--------------------------+-----------+-----------------+------------------------------+
 | Round                    | ✅        | FP8, FP16,      |                              |
 |                          |           | FP32, FP64      |                              |
diff --git a/test/onnx/parse/roialign_default_test.cpp b/test/onnx/parse/roialign_default_test.cpp
index b5a940e7927..f2f4426d485 100644
--- a/test/onnx/parse/roialign_default_test.cpp
+++ b/test/onnx/parse/roialign_default_test.cpp
@@ -35,7 +35,7 @@ TEST_CASE(roialign_default_test)
     auto x    = mm->add_parameter("x", sx);
     auto rois = mm->add_parameter("rois", srois);
     auto bi   = mm->add_parameter("batch_ind", sbi);
-
+asdf
     // Depending on whether the model was built for Onnx opset 16 or earlier, the default
     // coordinate_transformation_mode will be different.  These model files had explicit opset given
     // when they were created.
@@ -47,6 +47,15 @@ TEST_CASE(roialign_default_test)
     mm->add_return({r});
     auto prog = read_onnx("roialign_default_test.onnx");
     EXPECT(p == prog);
+}
+
+
+TEST_CASE(roialign_default_12_test)
+{
+    // opset 12 version
+    migraphx::shape sx{migraphx::shape::float_type, {10, 4, 7, 8}};
+    migraphx::shape srois{migraphx::shape::float_type, {8, 4}};
+    migraphx::shape sbi{migraphx::shape::int64_type, {8}};
 
     // Opset 12 program
     migraphx::program p_12;
@@ -62,5 +71,5 @@ TEST_CASE(roialign_default_test)
         bi_12);
     mm_12->add_return({r_12});
     auto prog_12 = read_onnx("roialign_default_test_12.onnx");
-    EXPECT(p_12 == prog_12);
+    EXPECT(p == prog_12);
 }

From 174a5b7898a4be27d3caab928da00948aa5f87b0 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Wed, 23 Oct 2024 21:13:30 +0000
Subject: [PATCH 55/56] split test into 2 cases

---
 test/onnx/parse/roialign_default_test.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/test/onnx/parse/roialign_default_test.cpp b/test/onnx/parse/roialign_default_test.cpp
index f2f4426d485..9a14778c12c 100644
--- a/test/onnx/parse/roialign_default_test.cpp
+++ b/test/onnx/parse/roialign_default_test.cpp
@@ -35,7 +35,6 @@ TEST_CASE(roialign_default_test)
     auto x    = mm->add_parameter("x", sx);
     auto rois = mm->add_parameter("rois", srois);
     auto bi   = mm->add_parameter("batch_ind", sbi);
-asdf
     // Depending on whether the model was built for Onnx opset 16 or earlier, the default
     // coordinate_transformation_mode will be different.  These model files had explicit opset given
     // when they were created.
@@ -71,5 +70,5 @@ TEST_CASE(roialign_default_12_test)
         bi_12);
     mm_12->add_return({r_12});
     auto prog_12 = read_onnx("roialign_default_test_12.onnx");
-    EXPECT(p == prog_12);
+    EXPECT(p_12 == prog_12);
 }

From 0f25c4f222563743fc0ed97cd331c4dbedd50811 Mon Sep 17 00:00:00 2001
From: Brian Pickrell <bpickrel@amd.com>
Date: Wed, 23 Oct 2024 21:49:16 +0000
Subject: [PATCH 56/56] add roialign verify test for max pooling; doesn't pass

---
 test/onnx/gen_onnx.py                         |  21 ++++++++++
 test/onnx/roialign_half_pixel_max_test.onnx   | Bin 0 -> 368 bytes
 .../roialign_half_pixel_verify_test.cpp       |  38 ++++++++++++++++++
 3 files changed, 59 insertions(+)
 create mode 100644 test/onnx/roialign_half_pixel_max_test.onnx

diff --git a/test/onnx/gen_onnx.py b/test/onnx/gen_onnx.py
index ca8f549f89d..b8e1945e3ea 100644
--- a/test/onnx/gen_onnx.py
+++ b/test/onnx/gen_onnx.py
@@ -10666,6 +10666,27 @@ def roialign_half_pixel_test():
     return ([node], [x, roi, bi], [y])
 
 
+@onnx_test()
+def roialign_half_pixel_max_test():
+    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [2, 2, 4, 3])
+    roi = helper.make_tensor_value_info('rois', TensorProto.FLOAT, [2, 4])
+    bi = helper.make_tensor_value_info('batch_ind', TensorProto.INT64, [2])
+    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [2, 2, 4, 3])
+
+    # max-pooling variant; half_pixel is the newer coordinate mode for ROIAlign
+    node = onnx.helper.make_node('RoiAlign',
+                                 inputs=['x', 'rois', 'batch_ind'],
+                                 outputs=['y'],
+                                 spatial_scale=2.0,
+                                 output_height=2,
+                                 output_width=3,
+                                 sampling_ratio=2,
+                                 mode="max",
+                                 coordinate_transformation_mode="half_pixel")
+
+    return ([node], [x, roi, bi], [y])
+
+
 @onnx_test()
 def round_half_test():
     x = helper.make_tensor_value_info('x', TensorProto.FLOAT16, [4, 4])
diff --git a/test/onnx/roialign_half_pixel_max_test.onnx b/test/onnx/roialign_half_pixel_max_test.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..fc192568d6f8bcb0b93ac275aae3541be33e1ace
GIT binary patch
literal 368
zcmaJ-%TB{E5R9EjxT`j*+^5uvLyx&|=LCEL$6PEoabru49oeo(|0X}e2k=)ov<NP6
zSgm$ucSf^hLif%pu2o~0<od)q)eFs9-m@Tlc>KWdjsW$L#WfF;BM+5i%G4BH4&>FU
z=kc_7fcwf?S1ZGVV8L<YPu8_OC~H`2YjJ=d^#P?t4%ru+MyGLJ!pxpScMePn)yS~L
z1H#1urZ@CW)j@945nMs;dE04a8s<ia>+S|-zKbK*%vW5CRgxr61491?EO#*+UBi+f
qItNVft+LqDa(#_WbC^b+d~@aI_aguZxufF&K1Q|6UurLSzW4+v<6gu7

literal 0
HcmV?d00001

diff --git a/test/onnx/verify/roialign_half_pixel_verify_test.cpp b/test/onnx/verify/roialign_half_pixel_verify_test.cpp
index 62c91ee63b5..2653471af1b 100644
--- a/test/onnx/verify/roialign_half_pixel_verify_test.cpp
+++ b/test/onnx/verify/roialign_half_pixel_verify_test.cpp
@@ -59,3 +59,41 @@ TEST_CASE(roialign_half_pixel_verify_test)
 
     EXPECT(migraphx::verify::verify_rms_range(result_vector, gold));
 }
+
+// The half_pixel mode for the ROIAlign op, max pooling
+TEST_CASE(roialign_half_pixel_max_verify_test)
+{
+    migraphx::program p = read_onnx("roialign_half_pixel_max_test.onnx");
+    p.compile(migraphx::make_target("ref"));
+
+    // Input feature map filled with 0, 1, 2, ...
+    migraphx::shape s{migraphx::shape::float_type, {2, 2, 4, 3}};
+    std::vector<float> data(2 * 2 * 4 * 3);
+    std::iota(data.begin(), data.end(), 0.f);
+
+    // Two ROIs of four coordinates each, assigned to batch items 0 and 1
+    migraphx::shape srois{migraphx::shape::float_type, {2, 4}};
+    std::vector<float> rois_data = {1.1, 0.73, 1.7, 1.13, 1.1, 0.73, 2.6, 1.13};
+    migraphx::shape sbi{migraphx::shape::int64_type, {2}}; // batch_index
+    std::vector<int64_t> bi_data = {0, 1};
+
+    migraphx::parameter_map pp;
+    pp["x"]         = migraphx::argument(s, data.data());
+    pp["rois"]      = migraphx::argument(srois, rois_data.data());
+    pp["batch_ind"] = migraphx::argument(sbi, bi_data.data());
+    pp["y"]         = migraphx::argument(s, data.data());
+
+    auto result = p.eval(pp).back();
+    std::vector<float> result_vector;
+    result.visit([&](auto output) { result_vector.assign(output.begin(), output.end()); });
+
+    // Gold values were generated with onnxruntime; one row per (ROI, channel) pair,
+    // six values (output_height 2 x output_width 3) per row
+    std::vector<float> gold = {
+        4.7,       4.7,       4.7,       5.2799997, 5.2799997, 5.2799997,
+        15.979999, 15.979999, 15.979999, 13.199999, 13.199999, 13.199999,
+        27.477499, 27.477499, 0.,        19.440002, 19.440002, 0.,
+        38.8475,   38.8475,   0.,        26.730003, 26.730003, 0.};
+
+    EXPECT(migraphx::verify::verify_rms_range(result_vector, gold));
+}