From 2a4a88b42d18130a70b3a7b52db676fe1eecf1b9 Mon Sep 17 00:00:00 2001 From: Yi-Hong Lyu Date: Sat, 8 Feb 2025 21:23:42 -0800 Subject: [PATCH 1/6] [Draft] NHWC DepthToSpace U8 and its transformation --- .../contrib_ops/cpu/cpu_contrib_kernels.cc | 2 + .../core/graph/contrib_ops/contrib_defs.cc | 69 +++++++++++++++ .../core/optimizer/nhwc_transformer.cc | 16 ++++ .../providers/cpu/tensor/space_depth_ops.cc | 49 +++++++++++ .../providers/cpu/tensor/space_depth_ops.h | 2 + .../contrib_ops/depth_to_space_op_test.cc | 87 +++++++++++++++++++ 6 files changed, 225 insertions(+) create mode 100644 onnxruntime/test/contrib_ops/depth_to_space_op_test.cc diff --git a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc index c742cd1e95bdd..c38582b36239f 100644 --- a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc @@ -64,6 +64,7 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, Quick class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, DecoderMaskedMultiHeadAttention); // ******** Start: Quantization ******************* // +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, DepthToSpace); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, MatMulInteger16); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, QLinearGlobalAveragePool); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, QLinearConcat); @@ -216,6 +217,7 @@ Status RegisterFp16Kernels(KernelRegistry& kernel_registry) { Status RegisterQuantizationKernels(KernelRegistry& kernel_registry) { static const BuildKernelCreateInfoFn function_table[] = { BuildKernelCreateInfo, // default entry to avoid the list become empty after ops-reducing + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc index e45787299f3ad..54d9124ea0cda 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc @@ -3683,6 +3683,75 @@ GatherBlockQuantized is a Gather with data quantized. It is similar to Gather (h } }); + static const char* DepthToSpace_ver1_doc = R"DOC( +TODO +)DOC"; + + ONNX_CONTRIB_OPERATOR_SCHEMA(DepthToSpace) + .SetDomain(kMSDomain) + .SinceVersion(1) + .SetDoc(DepthToSpace_ver1_doc) + .Attr("blocksize", "Blocks of [blocksize, blocksize] are moved.", AttributeProto::INT) + .Attr("channels_last", "", AttributeProto::INT, static_cast(0)) + .Attr( + "mode", + "DCR (default) for depth-column-row order re-arrangement. 
Use CRD for column-row-depth order.", + AttributeProto::STRING, + std::string("DCR")) + .Input( + 0, + "input", + "Input tensor of [N,H,W,C], where N is the batch axis, C is the channel or depth" + ", H is the height and W is the width.", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .Output( + 0, + "output", + "Output tensor of [N, H * blocksize, W * blocksize, C/(blocksize * blocksize)].", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .TypeConstraint("T", {"tensor(uint8)"}, "") + .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { + propagateElemTypeFromInputToOutput(ctx, 0, 0); + auto blocksize = getAttribute(ctx, "blocksize", 0); + if (blocksize <= 0) { + fail_shape_inference("Blocksize must be positive"); + } + if (hasInputShape(ctx, 0)) { + auto& input_shape = getInputShape(ctx, 0); + if (input_shape.dim_size() == 4) { + // TODO: Clarify what behavior should be if C is not a multiple of + // blocksize*blocksize. + if (getAttribute(ctx, "channels_last", 0) == 0) { + updateOutputShape( + ctx, + 0, + {input_shape.dim(0), + input_shape.dim(1) / (blocksize * blocksize), + input_shape.dim(2) * blocksize, + input_shape.dim(3) * blocksize}); + } else { // channels_last + updateOutputShape( + ctx, + 0, + {input_shape.dim(0), + input_shape.dim(1) * blocksize, + input_shape.dim(2) * blocksize, + input_shape.dim(3) / (blocksize * blocksize)}); + } + } else { + fail_shape_inference("Input tensor must be 4-dimensional"); + } + } + }); + #ifdef ENABLE_ATEN ONNX_CONTRIB_OPERATOR_SCHEMA(ATen) .SetDomain(kPytorchAtenDomain) diff --git a/onnxruntime/core/optimizer/nhwc_transformer.cc b/onnxruntime/core/optimizer/nhwc_transformer.cc index cd654991c92d5..3516c600bac41 100644 --- a/onnxruntime/core/optimizer/nhwc_transformer.cc +++ b/onnxruntime/core/optimizer/nhwc_transformer.cc @@ -117,6 +117,22 @@ NhwcTransformer::NhwcTransformer(AllocatorPtr cpu_allocator, } } + { + // uint8 DepthToSpace -> uint8 nhwc DepthToSpace + OpKernelRegistryId depthtospace_uint8{ + "DepthToSpace", kMSDomain, 1, {{"T", {DataTypeImpl::GetTensorType()}}}}; + const KernelCreateInfo* kernel_create_info{}; + const auto status = cpu_kernel_registry->TryFindKernel( + kCpuExecutionProvider, depthtospace_uint8.op_type_, depthtospace_uint8.domain_, + depthtospace_uint8.version_, depthtospace_uint8.type_constraints_, logger, &kernel_create_info); + if (status.IsOK() && kernel_create_info != nullptr) { + kernel_create_info = nullptr; + conv_table_.emplace( + OpIdInfo("DepthToSpace", kOnnxDomain, api::DataType::UINT8), + OpTransformInfo{depthtospace_uint8.op_type_, depthtospace_uint8.domain_, depthtospace_uint8.version_, true}); + } + } + { // fp16 MaxPool -> fp16 nhwc MaxPool OpKernelRegistryId nhwc_maxpool_fp16{ diff --git a/onnxruntime/core/providers/cpu/tensor/space_depth_ops.cc b/onnxruntime/core/providers/cpu/tensor/space_depth_ops.cc index 7e1049c402210..d6b139a99cba2 100644 --- a/onnxruntime/core/providers/cpu/tensor/space_depth_ops.cc +++ b/onnxruntime/core/providers/cpu/tensor/space_depth_ops.cc @@ -56,6 +56,18 @@ ONNX_CPU_OPERATOR_KERNEL( DataTypeImpl::GetTensorType()}), DepthToSpace); +namespace contrib { +ONNX_OPERATOR_TYPED_KERNEL_EX( + DepthToSpace, + kMSDomain, + 1, + uint8_t, + kCpuExecutionProvider, + KernelDefBuilder() + .TypeConstraint("T", DataTypeImpl::GetTensorType()), + DepthToSpace); +} + // intermediate tensor shapes are: // (batch, blocksize, blocksize, input_depth / (blocksize * blocksize), input_height, input_width) for DepthToSpace // (batch, input_depth, 
input_height / blocksize, blocksize, input_width / blocksize, blocksize) for SpaceToDepth @@ -157,6 +169,43 @@ Status DepthToSpace::Compute(OpKernelContext* context) const { int64_t output_height = -1; int64_t output_width = -1; + if (is_bhwc_) { + ORT_RETURN_IF_ERROR(InputValidationsAndOutputDimsCalc(input, + batch, + input_depth, input_height, input_width, + output_depth, output_height, output_width, + false)); + + Tensor& output = *context->Output(0, {batch, output_height, output_width, output_depth}); + + // handle DCR and CRD format + auto dim3 = is_dcr_ ? blocksize_ : input_depth / blocksize_ / blocksize_; + auto dim5 = is_dcr_ ? input_depth / blocksize_ / blocksize_ : blocksize_; + + auto permutation = is_dcr_ ? std::array{{0, 1, 3, 2, 4, 5}} + : std::array{{0, 3, 1, 4, 2, 5}}; + + if (input.IsDataType()) { + SpaceDepthOpCpuImpl(input, output, permutation, + onnxruntime::narrow(batch), + onnxruntime::narrow(input_height), + onnxruntime::narrow(input_width), + onnxruntime::narrow(dim3), + onnxruntime::narrow(blocksize_), + onnxruntime::narrow(dim5), + onnxruntime::narrow(input_height), + onnxruntime::narrow(blocksize_), + onnxruntime::narrow(input_width), + onnxruntime::narrow(blocksize_), + onnxruntime::narrow(input_depth / blocksize_ / blocksize_)); + } else { + // user will not see this as the kernel doesn't claim support for types other than float and double + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unsupported input type in DepthToSpace (channels_last = 1) op: ", input.DataType()); + } + + return Status::OK(); + } + ORT_RETURN_IF_ERROR(InputValidationsAndOutputDimsCalc(input, batch, input_depth, input_height, input_width, diff --git a/onnxruntime/core/providers/cpu/tensor/space_depth_ops.h b/onnxruntime/core/providers/cpu/tensor/space_depth_ops.h index 3218c8952d6ec..923c75e2118e6 100644 --- a/onnxruntime/core/providers/cpu/tensor/space_depth_ops.h +++ b/onnxruntime/core/providers/cpu/tensor/space_depth_ops.h @@ -79,6 +79,7 @@ class SpaceToDepth final : public OpKernel, SpaceDepthBase { class DepthToSpace final : public OpKernel, SpaceDepthBase { public: explicit DepthToSpace(const OpKernelInfo& info) : OpKernel(info), SpaceDepthBase(info) { + is_bhwc_ = (info.GetAttrOrDefault("channels_last", static_cast(0)) != 0); std::string mode; // if mode doesn't exist, then it is the default "DCR" mode // (or) it is an opset < 11 model for which the only mode is "DCR" mode @@ -95,6 +96,7 @@ class DepthToSpace final : public OpKernel, SpaceDepthBase { private: bool is_dcr_ = true; + bool is_bhwc_ = false; }; } // namespace onnxruntime diff --git a/onnxruntime/test/contrib_ops/depth_to_space_op_test.cc b/onnxruntime/test/contrib_ops/depth_to_space_op_test.cc new file mode 100644 index 0000000000000..9562325984070 --- /dev/null +++ b/onnxruntime/test/contrib_ops/depth_to_space_op_test.cc @@ -0,0 +1,87 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include +#include +#include +#include + +#include "core/common/common.h" +#include "core/framework/execution_provider.h" +#include "gtest/gtest.h" +#include "test/providers/provider_test_utils.h" +#include "test/util/include/default_providers.h" + +namespace onnxruntime { +namespace test { + +// Combinations: types, gather_axis, quantize_axis, block_size, indices, scale shape vs data shape +template +void RunDepthToSpace(const std::vector& input, + const std::vector& input_shape, + const int64_t blocksize, + const int64_t channels_last, + const std::string mode, // type? 
+ const std::vector& output, + const std::vector& output_shape, + OpTester::ExpectResult expect_result = OpTester::ExpectResult::kExpectSuccess) { + auto run_test = [&]() { + OpTester test("DepthToSpace", 1, kMSDomain); + + test.AddAttribute("blocksize", blocksize); + test.AddAttribute("channels_last", channels_last); + test.AddAttribute("mode", mode); + + test.AddInput("input", input_shape, input); + test.AddOutput("output", output_shape, output); + + std::vector> eps; + eps.push_back(DefaultCpuExecutionProvider()); + test.Run(expect_result, "", {}, nullptr, &eps); + }; + + run_test(); +} + +TEST(DepthToSpaceOpTest, UInt8) { + + constexpr int64_t B = 2, H = 3, W = 2, C = 12; + constexpr int64_t blocksize = 2; + std::vector input = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, + 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, + 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, + 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, + 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, + 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, + 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, + 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, + 143 + }; + std::vector input_shape = {B, H, W, C}; + std::vector output = { + 0, 1, 2, 3, 4, 5, 12, 13, 14, 15, 16, 17, + 6, 7, 8, 9, 10, 11, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 36, 37, 38, 39, 40, 41, + 30, 31, 32, 33, 34, 35, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 60, 61, 62, 63, 64, 65, + 54, 55, 56, 57, 58, 59, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 84, 85, 86, 87, 88, 89, + 78, 79, 80, 81, 82, 83, 90, 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 108, 109, 110, 111, 112, 113, + 102, 103, 104, 105, 106, 107, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 132, 133, 134, 135, 136, 137, + 126, 127, 128, 129, 130, 131, 138, 139, 140, 141, 142, 143 + }; + std::vector output_shape = {B, H * blocksize, W * blocksize, C / (blocksize * blocksize)}; + + RunDepthToSpace(input, input_shape, blocksize, 1, "DCR", output, output_shape); +} + +} // namespace test +} // namespace onnxruntime From 91c11399884a4c025b3ba9b73d6778677d114860 Mon Sep 17 00:00:00 2001 From: Yi-Hong Lyu Date: Sat, 15 Feb 2025 04:15:38 -0800 Subject: [PATCH 2/6] [Draft] Use our own Transpose instead of Eigen one --- .../contrib_ops/cpu/bert/attention_utils.cc | 2 +- .../core/framework/transpose_helper.cc | 30 +++++++++++++++---- onnxruntime/core/framework/transpose_helper.h | 6 ++-- .../providers/cpu/tensor/space_depth_ops.cc | 28 +++++++++++++++-- .../core/providers/cpu/tensor/transpose.cc | 14 +++++---- .../core/providers/cpu/tensor/transpose.h | 1 + 6 files changed, 65 insertions(+), 16 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_utils.cc b/onnxruntime/contrib_ops/cpu/bert/attention_utils.cc index c8fe9c77d8ff8..a8b0bb0193240 100644 --- a/onnxruntime/contrib_ops/cpu/bert/attention_utils.cc +++ b/onnxruntime/contrib_ops/cpu/bert/attention_utils.cc @@ -27,7 +27,7 @@ inline Status Transpose_BSNH_to_BNSH(const Tensor* qkv, std::vector permutations({0, 2, 1, 3}); gsl::span permutations_span{permutations}; size_t from = 2, to = 1; - SingleAxisTranspose(permutations, *qkv, *qkv_transposed.GetMutable(), from, to, nullptr, tp); + SingleAxisTranspose(permutations, *qkv, *qkv_transposed.GetMutable(), from, 
to, nullptr, nullptr, tp); return Status::OK(); } diff --git a/onnxruntime/core/framework/transpose_helper.cc b/onnxruntime/core/framework/transpose_helper.cc index 32d15bdf9060b..e7ef2d1bf1c6c 100644 --- a/onnxruntime/core/framework/transpose_helper.cc +++ b/onnxruntime/core/framework/transpose_helper.cc @@ -59,12 +59,17 @@ typename std::enable_if::value, void>::type SimpleTranspos // `input_shape_override` overrides the shape of `input` for compute purposes. void TransposeSingleAxisOutwards(gsl::span permutations, const Tensor& input, Tensor& output, - size_t from, size_t to, const TensorShape* input_shape_override = nullptr, + size_t from, size_t to, + const TensorShape* input_shape_override = nullptr, + const TensorShape* output_shape_override = nullptr, concurrency::ThreadPool* tp = nullptr) { ORT_UNUSED_PARAMETER(permutations); + //ORT_ENFORCE(input_shape_override == nullptr); + const auto& input_shape = input_shape_override ? *input_shape_override : input.Shape(); const auto& input_dims = input_shape.GetDims(); + const auto& output_shape = output_shape_override ? *output_shape_override : output.Shape(); const auto element_size = input.DataType()->Size(); @@ -106,7 +111,7 @@ void TransposeSingleAxisOutwards(gsl::span permutations, const Ten default: { TensorPitches src_strides(input_dims); - TensorPitches contig_dst_strides(output); + TensorPitches contig_dst_strides(output_shape); const auto dims = input_dims.size(); TensorShapeVector dst_strides(dims); @@ -114,6 +119,18 @@ void TransposeSingleAxisOutwards(gsl::span permutations, const Ten dst_strides[permutations[dim]] = contig_dst_strides[dim]; } + #if 0 + for (int i = 0; i < rank; ++i) { + std::cout << "dst_strides[" << i << "] = " << dst_strides[i] << std::endl; + } + for (int i = 0; i < rank; ++i) { + std::cout << "input_shape[" << i << "] = " << input_shape[i] << std::endl; + } + for (int i = 0; i < rank; ++i) { + std::cout << "src_strides[" << i << "] = " << src_strides[i] << std::endl; + } + #endif + ORT_THROW_IF_ERROR(DispatchStridedCopy(tp, output, 0, dst_strides, input_shape, @@ -231,10 +248,13 @@ void TransposeSingleAxisInwards(gsl::span permutations, const Tens } // `input_shape_override` overrides the shape of `input` for compute purposes. 
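
// A minimal standalone sketch (helper names invented here, not part of
// onnxruntime) of the stride bookkeeping that the updated
// TransposeSingleAxisOutwards above relies on once an output shape override
// is threaded through: contiguous strides are computed from the (possibly
// overridden) output shape, then scattered through the permutation so that
// each *input* axis carries its stride in the output buffer, mirroring
// "dst_strides[permutations[dim]] = contig_dst_strides[dim]" above.

#include <cstdint>
#include <vector>

std::vector<int64_t> ContiguousStrides(const std::vector<int64_t>& dims) {
  std::vector<int64_t> strides(dims.size(), 1);
  for (size_t i = dims.size(); i-- > 1;)
    strides[i - 1] = strides[i] * dims[i];
  return strides;
}

// perm[i] names the input axis that output axis i reads from, so the result
// is indexed by input axis: walking the input in its own order, dst[axis]
// tells how far to jump in the output buffer.
std::vector<int64_t> PermutedDstStrides(const std::vector<int64_t>& out_dims,
                                        const std::vector<size_t>& perm) {
  const std::vector<int64_t> contig = ContiguousStrides(out_dims);
  std::vector<int64_t> dst(out_dims.size());
  for (size_t dim = 0; dim < out_dims.size(); ++dim)
    dst[perm[dim]] = contig[dim];
  return dst;
}
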
-void SingleAxisTranspose(gsl::span permutations, const Tensor& input, Tensor& output, size_t from, - size_t to, const TensorShape* input_shape_override, concurrency::ThreadPool* tp) { +void SingleAxisTranspose(gsl::span permutations, const Tensor& input, Tensor& output, + size_t from, size_t to, + const TensorShape* input_shape_override, const TensorShape* output_shape_override, + concurrency::ThreadPool* tp) { if (from > to) { - TransposeSingleAxisOutwards(permutations, input, output, from, to, input_shape_override, tp); + TransposeSingleAxisOutwards(permutations, input, output, from, to, + input_shape_override, output_shape_override, tp); } else { TransposeSingleAxisInwards(permutations, input, output, from, to, input_shape_override); } diff --git a/onnxruntime/core/framework/transpose_helper.h b/onnxruntime/core/framework/transpose_helper.h index e33044117f89a..16f5f8c9aa193 100644 --- a/onnxruntime/core/framework/transpose_helper.h +++ b/onnxruntime/core/framework/transpose_helper.h @@ -41,7 +41,9 @@ We fall back to the default implementation in all other cases, and if the input namespace onnxruntime { bool IsTransposeMovingSingleAxis(gsl::span permutations, size_t& from, size_t& to); -void SingleAxisTranspose(gsl::span permutations, const Tensor& input, Tensor& output, size_t from, - size_t to, const TensorShape* input_shape_override = nullptr, +void SingleAxisTranspose(gsl::span permutations, const Tensor& input, Tensor& output, + size_t from, size_t to, + const TensorShape* input_shape_override = nullptr, + const TensorShape* output_shape_override = nullptr, concurrency::ThreadPool* tp = nullptr); } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/tensor/space_depth_ops.cc b/onnxruntime/core/providers/cpu/tensor/space_depth_ops.cc index d6b139a99cba2..487cb59f3bae1 100644 --- a/onnxruntime/core/providers/cpu/tensor/space_depth_ops.cc +++ b/onnxruntime/core/providers/cpu/tensor/space_depth_ops.cc @@ -7,6 +7,7 @@ #endif #include "core/providers/cpu/tensor/space_depth_ops.h" +#include "core/providers/cpu/tensor/transpose.h" #include "core/common/eigen_common_wrapper.h" #include @@ -182,10 +183,31 @@ Status DepthToSpace::Compute(OpKernelContext* context) const { auto dim3 = is_dcr_ ? blocksize_ : input_depth / blocksize_ / blocksize_; auto dim5 = is_dcr_ ? input_depth / blocksize_ / blocksize_ : blocksize_; - auto permutation = is_dcr_ ? std::array{{0, 1, 3, 2, 4, 5}} - : std::array{{0, 3, 1, 4, 2, 5}}; + int64_t virtual_input_depth = input_depth / blocksize_ / blocksize_; + + TensorShape virtual_input_shape; + if (is_dcr_) { + virtual_input_shape = TensorShape{batch, input_height, input_width, + blocksize_, blocksize_, virtual_input_depth}; + } else { + virtual_input_shape = TensorShape{batch, input_height, input_width, + virtual_input_depth, blocksize_, blocksize_}; + } + + TensorShape virtual_output_shape = TensorShape{batch, + input_height, blocksize_, + input_width, blocksize_, + virtual_input_depth}; + + std::vector permutation = is_dcr_ ? 
std::vector{0, 1, 3, 2, 4, 5} + : std::vector{0, 3, 1, 4, 2, 5}; if (input.IsDataType()) { + + return Transpose::DoTranspose( + permutation, input, output, &virtual_input_shape, &virtual_output_shape, context->GetOperatorThreadPool()); + + #if 0 SpaceDepthOpCpuImpl(input, output, permutation, onnxruntime::narrow(batch), onnxruntime::narrow(input_height), @@ -198,6 +220,8 @@ Status DepthToSpace::Compute(OpKernelContext* context) const { onnxruntime::narrow(input_width), onnxruntime::narrow(blocksize_), onnxruntime::narrow(input_depth / blocksize_ / blocksize_)); + #endif + } else { // user will not see this as the kernel doesn't claim support for types other than float and double return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unsupported input type in DepthToSpace (channels_last = 1) op: ", input.DataType()); diff --git a/onnxruntime/core/providers/cpu/tensor/transpose.cc b/onnxruntime/core/providers/cpu/tensor/transpose.cc index 5b904e85848d0..2ec8dadc47c85 100644 --- a/onnxruntime/core/providers/cpu/tensor/transpose.cc +++ b/onnxruntime/core/providers/cpu/tensor/transpose.cc @@ -349,7 +349,8 @@ bool IsTransposeReshape(const gsl::span& perm, gsl::span& permutations, const Tensor& input, Tensor& output, - const TensorShape* input_shape_override, concurrency::ThreadPool* tp) { + const TensorShape* input_shape_override, const TensorShape* output_shape_override, + concurrency::ThreadPool* tp) { TensorShape shape = input_shape_override ? *input_shape_override : input.Shape(); if (IsTransposeReshape(permutations, shape.GetDims())) { @@ -363,7 +364,7 @@ static Status TransposeImpl(const gsl::span& permutations, const T bool moving_single_axis = IsTransposeMovingSingleAxis(permutations, from, to); if (moving_single_axis && !input.IsDataTypeString()) { - SingleAxisTranspose(permutations, input, output, from, to, input_shape_override, tp); + SingleAxisTranspose(permutations, input, output, from, to, input_shape_override, output_shape_override, tp); return Status::OK(); } @@ -400,7 +401,7 @@ static Status DoTransposeInt4(const gsl::span& permutations, const Tensor output_unpacked(DataTypeImpl::GetType(), output.Shape(), cpu_allocator); ORT_RETURN_IF_ERROR((UnpackInt4Tensor(input, input_unpacked, cpu_allocator))); - ORT_RETURN_IF_ERROR(TransposeImpl(permutations, input_unpacked, output_unpacked, input_shape_override, tp)); + ORT_RETURN_IF_ERROR(TransposeImpl(permutations, input_unpacked, output_unpacked, input_shape_override, nullptr /* FIXME */, tp)); ORT_RETURN_IF_NOT(Int4Type::Pack(output.MutableDataAsSpan(), output_unpacked.DataAsSpan()), "Failed to pack 8-bit Tensor into 4-bit Tensor"); @@ -409,7 +410,8 @@ static Status DoTransposeInt4(const gsl::span& permutations, const //`input_shape_override` overrides the shape of `input` for compute purposes. 
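
// Why DoTranspose (below) gains an output_shape_override: the NHWC
// DepthToSpace path above hands Transpose a 6-D *virtual* view of tensors
// whose real shapes are 4-D, so the shape used for output stride math must
// be overridable as well. For intuition, a plain-loop reference of the DCR
// channels_last rearrangement (illustrative sketch only, not the ORT
// implementation; the function name is invented):

#include <cstdint>
#include <vector>

// Views input [N,H,W,C] as [N,H,W,b,b,C/(b*b)] and writes
// out[n][h*b+bh][w*b+bw][c] = in[n][h][w][(bh*b + bw) * (C/(b*b)) + c].
std::vector<uint8_t> DepthToSpaceNhwcDcrRef(const std::vector<uint8_t>& in,
                                            int64_t N, int64_t H, int64_t W,
                                            int64_t C, int64_t b) {
  const int64_t c_out = C / (b * b);
  std::vector<uint8_t> out(in.size());
  for (int64_t n = 0; n < N; ++n)
    for (int64_t h = 0; h < H; ++h)
      for (int64_t w = 0; w < W; ++w)
        for (int64_t bh = 0; bh < b; ++bh)
          for (int64_t bw = 0; bw < b; ++bw)
            for (int64_t c = 0; c < c_out; ++c) {
              // flat offset into the 6-D view of the NHWC input
              const int64_t src = ((((n * H + h) * W + w) * b + bh) * b + bw) * c_out + c;
              // flat offset into the [N, H*b, W*b, c_out] output
              const int64_t dst = (((n * H + h) * b + bh) * (W * b) + (w * b + bw)) * c_out + c;
              out[dst] = in[src];
            }
  return out;
}
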
Status TransposeBase::DoTranspose(const gsl::span& permutations, const Tensor& input, Tensor& output,
- const TensorShape* input_shape_override, concurrency::ThreadPool* tp) {
+ const TensorShape* input_shape_override, const TensorShape* output_shape_override,
+ concurrency::ThreadPool* tp) {
 auto input_type = input.DataType();
 auto output_type = output.DataType();
 
@@ -425,7 +427,7 @@ Status TransposeBase::DoTranspose(const gsl::span& permutations, c
 return DoTransposeInt4(permutations, input, output, input_shape_override, tp);
 }
 
- return TransposeImpl(permutations, input, output, input_shape_override, tp);
+ return TransposeImpl(permutations, input, output, input_shape_override, output_shape_override, tp);
 }
 
 Status Transpose::Compute(OpKernelContext* ctx) const {
@@ -450,7 +452,7 @@ Status Transpose::Compute(OpKernelContext* ctx) const {
 return Status::OK();
 }
 
- return DoTranspose(*p_perm, X, Y, nullptr, ctx->GetOperatorThreadPool());
+ return DoTranspose(*p_perm, X, Y, nullptr, nullptr, ctx->GetOperatorThreadPool());
 }
 
 ONNX_CPU_OPERATOR_VERSIONED_KERNEL(
diff --git a/onnxruntime/core/providers/cpu/tensor/transpose.h b/onnxruntime/core/providers/cpu/tensor/transpose.h
index 54d3584ba0dad..f14282986a119 100644
--- a/onnxruntime/core/providers/cpu/tensor/transpose.h
+++ b/onnxruntime/core/providers/cpu/tensor/transpose.h
@@ -34,6 +34,7 @@ class TransposeBase {
 */
 static Status DoTranspose(const gsl::span& permutations, const Tensor& input, Tensor& output,
 const TensorShape* input_shape_override = nullptr,
+ const TensorShape* output_shape_override = nullptr,
 concurrency::ThreadPool* tp = nullptr);
 
 protected:

From 35029a261750a185640ab5faed78728393f39003 Mon Sep 17 00:00:00 2001
From: Yi-Hong Lyu 
Date: Tue, 18 Feb 2025 06:32:26 -0800
Subject: [PATCH 3/6] Revisit NHWC DepthToSpace U8 and its transformation

Add tests
---
 .../core/graph/contrib_ops/contrib_defs.cc | 17 +-
 .../providers/cpu/tensor/space_depth_ops.cc | 2 +-
 .../providers/cpu/tensor/space_depth_ops.h | 4 +-
 .../contrib_ops/depth_to_space_op_test.cc | 207 +++++++++++++++---
 .../test/optimizer/nhwc_transformer_test.cc | 31 +++
 5 files changed, 221 insertions(+), 40 deletions(-)

diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
index 54d9124ea0cda..853bd0abae825 100644
--- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
+++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
@@ -3684,7 +3684,9 @@ GatherBlockQuantized is a Gather with data quantized. It is similar to Gather (h
 });
 
 static const char* DepthToSpace_ver1_doc = R"DOC(
-TODO
+It is similar to DepthToSpace (https://github.com/onnx/onnx/blob/main/docs/Operators.md#DepthToSpace) with the following differences:
+ 1. It has an additional attribute channels_last.
+ 2. The input and output data type is uint8.
 )DOC";
 
 ONNX_CONTRIB_OPERATOR_SCHEMA(DepthToSpace)
@@ -3692,7 +3694,11 @@ TODO
 .SinceVersion(1)
 .SetDoc(DepthToSpace_ver1_doc)
 .Attr("blocksize", "Blocks of [blocksize, blocksize] are moved.", AttributeProto::INT)
- .Attr("channels_last", "", AttributeProto::INT, static_cast(0))
+ .Attr(
+ "channels_last",
+ "1 if the input and output are in the NHWC layout, 0 if they are in the NCHW layout. Defaults to 0.",
+ AttributeProto::INT,
+ static_cast(0))
 .Attr(
 "mode",
 "DCR (default) for depth-column-row order re-arrangement. Use CRD for column-row-depth order.",
Use CRD for column-row-depth order.", @@ -3701,8 +3707,8 @@ TODO .Input( 0, "input", - "Input tensor of [N,H,W,C], where N is the batch axis, C is the channel or depth" - ", H is the height and W is the width.", + "Input data tensor. Dimensions are [N,H,W,C] when channels_last is 1 or [N,C,H,W] otherwise, where N is the" + "batch axis, C is the channel or depth, H is the height and W is the width.", "T", OpSchema::Single, true, @@ -3711,7 +3717,8 @@ TODO .Output( 0, "output", - "Output tensor of [N, H * blocksize, W * blocksize, C/(blocksize * blocksize)].", + "Output data tensor. Dimensions are [N, H * blocksize, W * blocksize, C/(blocksize * blocksize)] when" + "channels_last is 1 or [N, C/(blocksize * blocksize), H * blocksize, W * blocksize] otherwise.", "T", OpSchema::Single, true, diff --git a/onnxruntime/core/providers/cpu/tensor/space_depth_ops.cc b/onnxruntime/core/providers/cpu/tensor/space_depth_ops.cc index 487cb59f3bae1..980a452be25d8 100644 --- a/onnxruntime/core/providers/cpu/tensor/space_depth_ops.cc +++ b/onnxruntime/core/providers/cpu/tensor/space_depth_ops.cc @@ -170,7 +170,7 @@ Status DepthToSpace::Compute(OpKernelContext* context) const { int64_t output_height = -1; int64_t output_width = -1; - if (is_bhwc_) { + if (is_nhwc_) { ORT_RETURN_IF_ERROR(InputValidationsAndOutputDimsCalc(input, batch, input_depth, input_height, input_width, diff --git a/onnxruntime/core/providers/cpu/tensor/space_depth_ops.h b/onnxruntime/core/providers/cpu/tensor/space_depth_ops.h index 923c75e2118e6..d2676c2cc4891 100644 --- a/onnxruntime/core/providers/cpu/tensor/space_depth_ops.h +++ b/onnxruntime/core/providers/cpu/tensor/space_depth_ops.h @@ -79,7 +79,7 @@ class SpaceToDepth final : public OpKernel, SpaceDepthBase { class DepthToSpace final : public OpKernel, SpaceDepthBase { public: explicit DepthToSpace(const OpKernelInfo& info) : OpKernel(info), SpaceDepthBase(info) { - is_bhwc_ = (info.GetAttrOrDefault("channels_last", static_cast(0)) != 0); + is_nhwc_ = (info.GetAttrOrDefault("channels_last", static_cast(0)) != 0); std::string mode; // if mode doesn't exist, then it is the default "DCR" mode // (or) it is an opset < 11 model for which the only mode is "DCR" mode @@ -96,7 +96,7 @@ class DepthToSpace final : public OpKernel, SpaceDepthBase { private: bool is_dcr_ = true; - bool is_bhwc_ = false; + bool is_nhwc_ = false; }; } // namespace onnxruntime diff --git a/onnxruntime/test/contrib_ops/depth_to_space_op_test.cc b/onnxruntime/test/contrib_ops/depth_to_space_op_test.cc index 9562325984070..bf247b16291ab 100644 --- a/onnxruntime/test/contrib_ops/depth_to_space_op_test.cc +++ b/onnxruntime/test/contrib_ops/depth_to_space_op_test.cc @@ -15,13 +15,12 @@ namespace onnxruntime { namespace test { -// Combinations: types, gather_axis, quantize_axis, block_size, indices, scale shape vs data shape template void RunDepthToSpace(const std::vector& input, const std::vector& input_shape, const int64_t blocksize, const int64_t channels_last, - const std::string mode, // type? 
+ const std::string mode, const std::vector& output, const std::vector& output_shape, OpTester::ExpectResult expect_result = OpTester::ExpectResult::kExpectSuccess) { @@ -43,45 +42,189 @@ void RunDepthToSpace(const std::vector& input, run_test(); } -TEST(DepthToSpaceOpTest, UInt8) { +TEST(DepthToSpaceOpTest, ContribDCR) { - constexpr int64_t B = 2, H = 3, W = 2, C = 12; + constexpr int64_t N = 2, H = 3, W = 2, C = 12; constexpr int64_t blocksize = 2; std::vector input = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, - 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, - 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, - 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, - 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, - 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, - 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, - 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, - 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, - 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, - 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, - 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, - 143 + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, + 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, + + + 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, + 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, + + 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, + 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, + 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143 }; - std::vector input_shape = {B, H, W, C}; + std::vector input_shape = {N, H, W, C}; std::vector output = { - 0, 1, 2, 3, 4, 5, 12, 13, 14, 15, 16, 17, - 6, 7, 8, 9, 10, 11, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 36, 37, 38, 39, 40, 41, - 30, 31, 32, 33, 34, 35, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 60, 61, 62, 63, 64, 65, - 54, 55, 56, 57, 58, 59, 66, 67, 68, 69, 70, 71, - 72, 73, 74, 75, 76, 77, 84, 85, 86, 87, 88, 89, - 78, 79, 80, 81, 82, 83, 90, 91, 92, 93, 94, 95, - 96, 97, 98, 99, 100, 101, 108, 109, 110, 111, 112, 113, - 102, 103, 104, 105, 106, 107, 114, 115, 116, 117, 118, 119, - 120, 121, 122, 123, 124, 125, 132, 133, 134, 135, 136, 137, - 126, 127, 128, 129, 130, 131, 138, 139, 140, 141, 142, 143 + 0, 1, 2, + 3, 4, 5, + 12, 13, 14, + 15, 16, 17, + + 6, 7, 8, + 9, 10, 11, + 18, 19, 20, + 21, 22, 23, + + 24, 25, 26, + 27, 28, 29, + 36, 37, 38, + 39, 40, 41, + + 30, 31, 32, + 33, 34, 35, + 42, 43, 44, + 45, 46, 47, + + 48, 49, 50, + 51, 52, 53, + 60, 61, 62, + 63, 64, 65, + + 54, 55, 56, + 57, 58, 59, + 66, 67, 68, + 69, 70, 71, + + + 72, 73, 74, + 75, 76, 77, + 84, 85, 86, + 87, 88, 89, + + 78, 79, 80, + 81, 82, 83, + 90, 91, 92, + 93, 94, 95, + + 96, 97, 98, + 99, 100, 101, + 108, 109, 110, + 111, 112, 113, + + 102, 103, 104, + 105, 106, 107, + 114, 115, 116, + 117, 118, 119, + + 120, 121, 122, + 123, 124, 125, + 132, 133, 134, + 135, 136, 137, + + 126, 127, 128, + 129, 130, 131, + 138, 139, 140, + 141, 142, 143 }; - std::vector output_shape = {B, H * blocksize, W * blocksize, C / (blocksize * blocksize)}; + std::vector output_shape = {N, H * blocksize, W * blocksize, C / (blocksize * blocksize)}; RunDepthToSpace(input, input_shape, blocksize, 1, "DCR", output, output_shape); } +TEST(DepthToSpaceOpTest, ContribCRD) { + + 
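 // CRD differs from DCR in how the channel dimension is split: with
 // blocksize b = 2, C = 12 is viewed as [C/(b*b), b, b] = [3, 2, 2] rather
 // than [b, b, C/(b*b)] = [2, 2, 3], i.e.
 //   out[n][h*b+bh][w*b+bw][c] = in[n][h][w][c*b*b + bh*b + bw].
 // Worked check for the first output pixel (n = h = bh = w = bw = 0):
 // c = 0, 1, 2 reads input channels 0, 4, 8 -- hence the expected output
 // below starts 0, 4, 8 where the DCR test above starts 0, 1, 2.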
constexpr int64_t N = 2, H = 3, W = 2, C = 12; + constexpr int64_t blocksize = 2; + std::vector input = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, + 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, + + + 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, + 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, + + 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, + 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, + 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143 + }; + std::vector input_shape = {N, H, W, C}; + std::vector output = { + 0, 4, 8, + 1, 5, 9, + 12, 16, 20, + 13, 17, 21, + + 2, 6, 10, + 3, 7, 11, + 14, 18, 22, + 15, 19, 23, + + 24, 28, 32, + 25, 29, 33, + 36, 40, 44, + 37, 41, 45, + + 26, 30, 34, + 27, 31, 35, + 38, 42, 46, + 39, 43, 47, + + 48, 52, 56, + 49, 53, 57, + 60, 64, 68, + 61, 65, 69, + + 50, 54, 58, + 51, 55, 59, + 62, 66, 70, + 63, 67, 71, + + + 72, 76, 80, + 73, 77, 81, + 84, 88, 92, + 85, 89, 93, + + 74, 78, 82, + 75, 79, 83, + 86, 90, 94, + 87, 91, 95, + + 96, 100, 104, + 97, 101, 105, + 108, 112, 116, + 109, 113, 117, + + 98, 102, 106, + 99, 103, 107, + 110, 114, 118, + 111, 115, 119, + + 120, 124, 128, + 121, 125, 129, + 132, 136, 140, + 133, 137, 141, + + 122, 126, 130, + 123, 127, 131, + 134, 138, 142, + 135, 139, 143 + }; + std::vector output_shape = {N, H * blocksize, W * blocksize, C / (blocksize * blocksize)}; + + RunDepthToSpace(input, input_shape, blocksize, 1, "CRD", output, output_shape); +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/optimizer/nhwc_transformer_test.cc b/onnxruntime/test/optimizer/nhwc_transformer_test.cc index a247fea7e5f53..7dc46a7a65576 100644 --- a/onnxruntime/test/optimizer/nhwc_transformer_test.cc +++ b/onnxruntime/test/optimizer/nhwc_transformer_test.cc @@ -8,6 +8,7 @@ #include "graph_transform_test_builder.h" #include "core/mlas/inc/mlas.h" #include "core/graph/graph.h" +#include "core/graph/node_attr_utils.h" namespace onnxruntime { namespace test { @@ -516,6 +517,36 @@ TEST(NhwcTransformerTests, ConvMixTensorRanks) { TransformerLevel::Level3); } +TEST(NhwcTransformerTests, DepthToSpace) { + auto test_case = [&](const std::vector& input_shape, const int64_t blocksize, const std::string mode) { + auto build_test_case = [&](ModelTestBuilder& builder) { + auto* input_arg = builder.MakeInput(input_shape, 0, 255); + auto* output_arg = builder.MakeOutput(); + NodeAttributes attrs; + utils::SetNodeAttribute(utils::MakeAttribute("blocksize", blocksize), attrs); + utils::SetNodeAttribute(utils::MakeAttribute("mode", mode), attrs); + + builder.AddNode("DepthToSpace", {input_arg}, {output_arg}, "", &attrs); + }; + + auto check_nhwc_graph = [&](InferenceSessionWrapper& session) { + auto op_to_count = CountOpsInGraph(session.GetGraph()); + EXPECT_EQ(op_to_count["com.microsoft.DepthToSpace"], 1); + EXPECT_EQ(op_to_count["Transpose"], 2); + }; + + TransformerTester(build_test_case, + check_nhwc_graph, + TransformerLevel::Level2, + TransformerLevel::Level3); + }; + + test_case({2, 12, 3, 2}, 2, "DCR"); + test_case({1, 1024, 48, 48}, 4, "DCR"); + test_case({2, 12, 3, 2}, 2, "CRD"); + test_case({1, 1024, 48, 48}, 4, "CRD"); +} + #ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED static std::vector ARangeOfFP16Values(const std::vector& shape, 
MLFloat16 min, MLFloat16 max) { From e9acabe88f05f836292a4a6f61af1bf7917a2328 Mon Sep 17 00:00:00 2001 From: Yi-Hong Lyu Date: Thu, 20 Feb 2025 11:00:48 -0800 Subject: [PATCH 4/6] Add output_shape_override for transpose --- .../core/providers/cpu/tensor/transpose.cc | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/onnxruntime/core/providers/cpu/tensor/transpose.cc b/onnxruntime/core/providers/cpu/tensor/transpose.cc index 2ec8dadc47c85..e9427716b0c01 100644 --- a/onnxruntime/core/providers/cpu/tensor/transpose.cc +++ b/onnxruntime/core/providers/cpu/tensor/transpose.cc @@ -261,9 +261,12 @@ static void DoTransposeEltWise(int64_t num_axes, gsl::span target // `input_shape_override` overrides the shape of `input` for compute purposes. static Status DoUntypedTranspose(const gsl::span& permutations, const Tensor& input, Tensor& output, - const TensorShape* input_shape_override = nullptr) { + const TensorShape* input_shape_override = nullptr, + const TensorShape* output_shape_override = nullptr) { const auto& input_shape = input_shape_override ? *input_shape_override : input.Shape(); const auto& input_dims = input_shape.GetDims(); + const auto& output_shape = output_shape_override ? *output_shape_override : output.Shape(); + const auto& output_dims = output_shape.GetDims(); auto rank = input_shape.NumDimensions(); const auto element_size = input.DataType()->Size(); @@ -307,10 +310,10 @@ static Status DoUntypedTranspose(const gsl::span& permutations, co if (1 == prefix_blocksize) { DoTransposeSingleBlock(suffix_blocksize, input_data, output_data); } else if (1 == suffix_blocksize) { - DoTransposeEltWise(num_axes_in_prefix, output.Shape().GetDims(), prefix_blocksize, stride, + DoTransposeEltWise(num_axes_in_prefix, output_dims, prefix_blocksize, stride, input_data, output_data); } else { - DoTransposeImpl(num_axes_in_prefix, output.Shape().GetDims(), prefix_blocksize, suffix_blocksize, stride, + DoTransposeImpl(num_axes_in_prefix, output_dims, prefix_blocksize, suffix_blocksize, stride, input_data, output_data); } } else { @@ -323,10 +326,10 @@ static Status DoUntypedTranspose(const gsl::span& permutations, co DoTransposeSingleBlock(suffix_blocksize, input_data, output_data, element_size); } else if (1 == suffix_blocksize) { // this may return a failed status if the data size is not supported in this build - status = DoTransposeEltWise(num_axes_in_prefix, output.Shape().GetDims(), prefix_blocksize, stride, + status = DoTransposeEltWise(num_axes_in_prefix, output_dims, prefix_blocksize, stride, input_data, output_data, element_size); } else { - DoTransposeImpl(num_axes_in_prefix, output.Shape().GetDims(), prefix_blocksize, suffix_blocksize, stride, + DoTransposeImpl(num_axes_in_prefix, output_dims, prefix_blocksize, suffix_blocksize, stride, input_data, output_data, element_size); } } @@ -369,7 +372,7 @@ static Status TransposeImpl(const gsl::span& permutations, const T } // fall back to default implementation - return DoUntypedTranspose(permutations, input, output, input_shape_override); + return DoUntypedTranspose(permutations, input, output, input_shape_override, output_shape_override); } template @@ -389,7 +392,8 @@ static Status UnpackInt4Tensor(const Tensor& src, Tensor& dst, AllocatorPtr cpu_ template static Status DoTransposeInt4(const gsl::span& permutations, const Tensor& input, Tensor& output, - const TensorShape* input_shape_override, concurrency::ThreadPool* tp) { + const TensorShape* input_shape_override, const TensorShape* output_shape_override, 
+ concurrency::ThreadPool* tp) { using Int8Type = typename Int4Type::UnpackedType; ORT_RETURN_IF_NOT(input.IsDataType() && output.IsDataType(), @@ -401,7 +405,7 @@ static Status DoTransposeInt4(const gsl::span& permutations, const Tensor output_unpacked(DataTypeImpl::GetType(), output.Shape(), cpu_allocator); ORT_RETURN_IF_ERROR((UnpackInt4Tensor(input, input_unpacked, cpu_allocator))); - ORT_RETURN_IF_ERROR(TransposeImpl(permutations, input_unpacked, output_unpacked, input_shape_override, nullptr /* FIXME */, tp)); + ORT_RETURN_IF_ERROR(TransposeImpl(permutations, input_unpacked, output_unpacked, input_shape_override, output_shape_override, tp)); ORT_RETURN_IF_NOT(Int4Type::Pack(output.MutableDataAsSpan(), output_unpacked.DataAsSpan()), "Failed to pack 8-bit Tensor into 4-bit Tensor"); @@ -420,11 +424,11 @@ Status TransposeBase::DoTranspose(const gsl::span& permutations, c input_type, " != ", output_type); } if (input.IsDataType()) { - return DoTransposeInt4(permutations, input, output, input_shape_override, tp); + return DoTransposeInt4(permutations, input, output, input_shape_override, output_shape_override, tp); } if (input.IsDataType()) { - return DoTransposeInt4(permutations, input, output, input_shape_override, tp); + return DoTransposeInt4(permutations, input, output, input_shape_override, output_shape_override, tp); } return TransposeImpl(permutations, input, output, input_shape_override, output_shape_override, tp); From cd572f71c84c9b9e40a9c46ff2c7d8703d4e6aee Mon Sep 17 00:00:00 2001 From: Yi-Hong Lyu Date: Thu, 20 Feb 2025 11:01:25 -0800 Subject: [PATCH 5/6] Use Transpose::DoTranspose in NHWC DepthToSpace --- .../core/providers/cpu/tensor/space_depth_ops.cc | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/cpu/tensor/space_depth_ops.cc b/onnxruntime/core/providers/cpu/tensor/space_depth_ops.cc index 980a452be25d8..6418ad4dc0059 100644 --- a/onnxruntime/core/providers/cpu/tensor/space_depth_ops.cc +++ b/onnxruntime/core/providers/cpu/tensor/space_depth_ops.cc @@ -199,14 +199,16 @@ Status DepthToSpace::Compute(OpKernelContext* context) const { input_width, blocksize_, virtual_input_depth}; +#if 0 + auto permutation = is_dcr_ ? std::array{{0, 1, 3, 2, 4, 5}} + : std::array{{0, 1, 4, 2, 5, 3}}; +#else std::vector permutation = is_dcr_ ? 
std::vector{0, 1, 3, 2, 4, 5} - : std::vector{0, 3, 1, 4, 2, 5}; + : std::vector{0, 1, 4, 2, 5, 3}; +#endif if (input.IsDataType()) { - return Transpose::DoTranspose( - permutation, input, output, &virtual_input_shape, &virtual_output_shape, context->GetOperatorThreadPool()); - #if 0 SpaceDepthOpCpuImpl(input, output, permutation, onnxruntime::narrow(batch), @@ -220,6 +222,9 @@ Status DepthToSpace::Compute(OpKernelContext* context) const { onnxruntime::narrow(input_width), onnxruntime::narrow(blocksize_), onnxruntime::narrow(input_depth / blocksize_ / blocksize_)); + #else + return Transpose::DoTranspose( + permutation, input, output, &virtual_input_shape, &virtual_output_shape, context->GetOperatorThreadPool()); #endif } else { From 9652b1e0c629f68ebe4da7898ab13cddded7a64c Mon Sep 17 00:00:00 2001 From: Yi-Hong Lyu Date: Thu, 20 Feb 2025 11:05:43 -0800 Subject: [PATCH 6/6] Remove unused code --- .../core/framework/transpose_helper.cc | 14 ----------- .../providers/cpu/tensor/space_depth_ops.cc | 24 ------------------- 2 files changed, 38 deletions(-) diff --git a/onnxruntime/core/framework/transpose_helper.cc b/onnxruntime/core/framework/transpose_helper.cc index e7ef2d1bf1c6c..b1a5b85fe84db 100644 --- a/onnxruntime/core/framework/transpose_helper.cc +++ b/onnxruntime/core/framework/transpose_helper.cc @@ -65,8 +65,6 @@ void TransposeSingleAxisOutwards(gsl::span permutations, const Ten concurrency::ThreadPool* tp = nullptr) { ORT_UNUSED_PARAMETER(permutations); - //ORT_ENFORCE(input_shape_override == nullptr); - const auto& input_shape = input_shape_override ? *input_shape_override : input.Shape(); const auto& input_dims = input_shape.GetDims(); const auto& output_shape = output_shape_override ? *output_shape_override : output.Shape(); @@ -119,18 +117,6 @@ void TransposeSingleAxisOutwards(gsl::span permutations, const Ten dst_strides[permutations[dim]] = contig_dst_strides[dim]; } - #if 0 - for (int i = 0; i < rank; ++i) { - std::cout << "dst_strides[" << i << "] = " << dst_strides[i] << std::endl; - } - for (int i = 0; i < rank; ++i) { - std::cout << "input_shape[" << i << "] = " << input_shape[i] << std::endl; - } - for (int i = 0; i < rank; ++i) { - std::cout << "src_strides[" << i << "] = " << src_strides[i] << std::endl; - } - #endif - ORT_THROW_IF_ERROR(DispatchStridedCopy(tp, output, 0, dst_strides, input_shape, diff --git a/onnxruntime/core/providers/cpu/tensor/space_depth_ops.cc b/onnxruntime/core/providers/cpu/tensor/space_depth_ops.cc index 6418ad4dc0059..5b6e6f4983f57 100644 --- a/onnxruntime/core/providers/cpu/tensor/space_depth_ops.cc +++ b/onnxruntime/core/providers/cpu/tensor/space_depth_ops.cc @@ -179,10 +179,6 @@ Status DepthToSpace::Compute(OpKernelContext* context) const { Tensor& output = *context->Output(0, {batch, output_height, output_width, output_depth}); - // handle DCR and CRD format - auto dim3 = is_dcr_ ? blocksize_ : input_depth / blocksize_ / blocksize_; - auto dim5 = is_dcr_ ? input_depth / blocksize_ / blocksize_ : blocksize_; - int64_t virtual_input_depth = input_depth / blocksize_ / blocksize_; TensorShape virtual_input_shape; @@ -199,33 +195,13 @@ Status DepthToSpace::Compute(OpKernelContext* context) const { input_width, blocksize_, virtual_input_depth}; -#if 0 - auto permutation = is_dcr_ ? std::array{{0, 1, 3, 2, 4, 5}} - : std::array{{0, 1, 4, 2, 5, 3}}; -#else std::vector permutation = is_dcr_ ? 
std::vector{0, 1, 3, 2, 4, 5}
 : std::vector{0, 1, 4, 2, 5, 3};
-#endif
 
 if (input.IsDataType()) {
-
- #if 0
- SpaceDepthOpCpuImpl(input, output, permutation,
- onnxruntime::narrow(batch),
- onnxruntime::narrow(input_height),
- onnxruntime::narrow(input_width),
- onnxruntime::narrow(dim3),
- onnxruntime::narrow(blocksize_),
- onnxruntime::narrow(dim5),
- onnxruntime::narrow(input_height),
- onnxruntime::narrow(blocksize_),
- onnxruntime::narrow(input_width),
- onnxruntime::narrow(blocksize_),
- onnxruntime::narrow(input_depth / blocksize_ / blocksize_));
- #else
 return Transpose::DoTranspose(
 permutation, input, output, &virtual_input_shape, &virtual_output_shape, context->GetOperatorThreadPool());
- #endif
-
 } else {
 // user will not see this as the kernel doesn't claim support for types other than uint8
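
// For reference, a standalone sketch (invented names, not ORT code) of the
// virtual-shape transpose that the final uint8 NHWC kernel above performs:
// the 4-D buffers are walked through 6-D virtual shapes, no reshape is ever
// materialized. It reproduces the DCR case and checks it against the first
// row of the ContribDCR test vectors from patch 3.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <numeric>
#include <vector>

// Generic row-major transpose of a flat buffer seen through a virtual shape:
// out_dims[i] = in_dims[perm[i]], element by element via an odometer index.
std::vector<uint8_t> TransposeView(const std::vector<uint8_t>& in,
                                   const std::vector<int64_t>& in_dims,
                                   const std::vector<size_t>& perm) {
  const size_t rank = in_dims.size();
  std::vector<int64_t> in_strides(rank, 1), out_dims(rank);
  for (size_t i = rank; i-- > 1;) in_strides[i - 1] = in_strides[i] * in_dims[i];
  for (size_t i = 0; i < rank; ++i) out_dims[i] = in_dims[perm[i]];
  std::vector<uint8_t> out(in.size());
  std::vector<int64_t> idx(rank, 0);  // current output coordinate
  for (size_t flat = 0; flat < in.size(); ++flat) {
    int64_t src = 0;
    for (size_t i = 0; i < rank; ++i) src += idx[i] * in_strides[perm[i]];
    out[flat] = in[src];
    for (size_t i = rank; i-- > 0;) {  // increment odometer
      if (++idx[i] < out_dims[i]) break;
      idx[i] = 0;
    }
  }
  return out;
}

int main() {
  // N=2, H=3, W=2, C=12, blocksize=2, values 0..143 as in the DCR test.
  std::vector<uint8_t> input(2 * 3 * 2 * 12);
  std::iota(input.begin(), input.end(), 0);
  // Virtual input view [N, H, W, b, b, C'] with DCR permutation {0,1,3,2,4,5}.
  const auto out = TransposeView(input, {2, 3, 2, 2, 2, 3}, {0, 1, 3, 2, 4, 5});
  const std::vector<uint8_t> first_row{0, 1, 2, 3, 4, 5, 12, 13, 14, 15, 16, 17};
  assert(std::equal(first_row.begin(), first_row.end(), out.begin()));
  return 0;
}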