From 19c91c98ba8b07b4ff98fb10bacc753638d22476 Mon Sep 17 00:00:00 2001 From: Thiago Crepaldi Date: Mon, 25 Jul 2022 19:34:24 -0400 Subject: [PATCH 01/30] Initial comimt for col2im cpu kernel --- .../contrib_ops/cpu/cpu_contrib_kernels.cc | 2 + .../core/graph/contrib_ops/contrib_defs.cc | 186 ++++++++++++++++++ onnxruntime/core/graph/contrib_ops/ms_opset.h | 2 + .../core/providers/cpu/tensor/col2im.cc | 31 +++ .../core/providers/cpu/tensor/col2im.h | 63 ++++++ .../tools/pytorch_export_contrib_ops.py | 3 + .../python/contrib_ops/onnx_test_col2im.py | 55 ++++++ .../kernel_def_hashes/contrib.cpu.json | 4 + 8 files changed, 346 insertions(+) create mode 100644 onnxruntime/core/providers/cpu/tensor/col2im.cc create mode 100644 onnxruntime/core/providers/cpu/tensor/col2im.h create mode 100644 onnxruntime/test/python/contrib_ops/onnx_test_col2im.py diff --git a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc index 0de091a9a4a0f..ed02d793f22d5 100644 --- a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc @@ -10,6 +10,7 @@ namespace contrib { class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, SampleOp); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, Col2Im); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, GridSample); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, Attention); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, BeamSearch); @@ -187,6 +188,7 @@ Status RegisterCpuContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, // add more kernels here + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc index 921d44716f12b..6c0a2389de1f4 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc @@ -904,6 +904,192 @@ ONNX_MS_OPERATOR_SET_SCHEMA(IsAllFinite, 1, updateOutputElemType(ctx, 0, ONNX_NAMESPACE::TensorProto::BOOL); })); +void col2imShapeInference(InferenceContext& ctx) { + propagateElemTypeFromInputToOutput(ctx, 0, 0); + + // All inputs shapes are required + if (!hasNInputShapes(ctx, 3)) { + return; + } + + // TODO: Assume image_shape has correct spatial dimensions for next validations + // An alternative is get the the number of spatial dimensions as an input + if (ctx.getInputType(1)->tensor_type().shape().dim_size() != 1) { + fail_shape_inference("image_shape tensor must have rank 1."); + } + size_t n_input_dims = ctx.getInputType(1)->tensor_type().shape().dim(0).dim_value(); + std::vector image_shape = {}; + const TensorProto* image_shape_data = ctx.getInputData(1); + if (image_shape_data) { + image_shape = ParseData(image_shape_data); + if (image_shape.size() != n_input_dims) { + fail_shape_inference("image_shape tensor must have ", n_input_dims, " spatial dimensions."); + } + } + + std::vector pads = {}; + if (getRepeatedAttribute(ctx, "pads", pads)) { + if ((pads.size() != 0) && (pads.size() != n_input_dims * 2)) { + fail_shape_inference("Attribute pads has incorrect size"); + } + } + + std::vector dilations = {}; + if (getRepeatedAttribute(ctx, "dilations", dilations)) { + if ((dilations.size() != 0) && (dilations.size() != n_input_dims)) { 
+ fail_shape_inference("Attribute dilations has incorrect size"); + } + } + + std::vector strides = {}; + if (getRepeatedAttribute(ctx, "strides", strides)) { + if ((strides.size() != 0) && (strides.size() != n_input_dims)) { + fail_shape_inference("Attribute strides has incorrect size"); + } + } + + auto input_shape = ctx.getInputType(0)->tensor_type().shape(); + if (input_shape.dim_size() != 3) { + fail_shape_inference("input must have rank 3."); + } + + std::vector block_shape = {}; + const TensorProto* block_shape_data = ctx.getInputData(2); + if (block_shape_data) { + block_shape = ParseData(block_shape_data); + if (block_shape.size() != n_input_dims) { + fail_shape_inference("block_shape tensor must have ", n_input_dims, " spatial dimensions."); + } + } + if (ctx.getInputType(2)->tensor_type().shape().dim_size() != 1) { + fail_shape_inference("block_shape tensor must have rank 1."); + } else if ( + (ctx.getInputType(2)->tensor_type().shape().dim(0).has_dim_value()) && + (ctx.getInputType(2)->tensor_type().shape().dim(0).dim_value() != static_cast(n_input_dims))) { + fail_shape_inference("block_shape tensor must have ", n_input_dims, " spatial dimensions."); + } + + int block_shape_size = 0; + if (static_cast(block_shape.size()) > 0) { + block_shape_size = 1; + for (const auto& dim : block_shape) { + block_shape_size *= dim; + } + } + + // Final shape will be (N, C, dim_1, ..., dim_N) + auto final_image_shape = ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape(); + + // Dimensions N and C are always present + Dim N, C; + if (ctx.getInputType(0)->tensor_type().shape().dim(0).has_dim_value()) { + N = input_shape.dim(0); // Otherwise, N is unknown. + } + *final_image_shape->add_dim() = N; + + if (block_shape_size > 0) { + C = input_shape.dim(1) / block_shape_size; // Otherwise, C is unknown. + } + *final_image_shape->add_dim() = C; + + // Image dimensions are dynamic + for (size_t i = 0; i < n_input_dims; ++i) { + Dim image_dim_i; + if (image_shape.size() > 0) { + image_dim_i.set_dim_value(image_shape[i]); // Otherwise, spatial dimensions are unknown + } + *final_image_shape->add_dim() = image_dim_i; + } + return; +} + +constexpr const char* Col2Im_ver1_doc = R"DOC( +The operator rearranges column blocks back into a multidimensional image + +Col2Im behaves similarly to PyTorch's fold https://pytorch.org/docs/stable/generated/torch.nn.Fold.html, +but it only supports *batched* multi-dimensional image tensors. + +NOTE: Although specifying image_shape looks redundant because it could be calculated from + convolution formulas, it is required as input for more advanced scenarios as explained + at PyTorch's implementation (https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/Col2Im.cpp#L10) + +)DOC"; + +ONNX_MS_OPERATOR_SET_SCHEMA(Col2Im, 1, + OpSchema() + .SetDoc(Col2Im_ver1_doc) + .Attr( + "dilations", + "1-dimensional tensor with dilation value along each spatial axis of the image. " + "If not present, the dilation defaults to 1 along each spatial axis of the image.", + AttributeProto::INTS, + OPTIONAL_VALUE) + .Attr( + "pads", + "1-dimensional tensor with padding value for the beginning and ending along each spatial axis, " + "it can take any value greater than or equal to 0. " + "The value represent the number of pixels added to the beginning " + "and end part of the corresponding axis. 
`pads` format should be as follow " + "[x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin is the number of pixels " + "added at the beginning of axis `i` and xi_end is the number of pixels added at the end of axis `i`. " + "If not present, the padding defaults to 0 along start and end of each spatial axis.", + AttributeProto::INTS, + OPTIONAL_VALUE) + .Attr( + "strides", + "1-dimensional tensor with stride value along each spatial axis. " + "If not present, the stride defaults to 1 along each spatial axis.", + AttributeProto::INTS, + OPTIONAL_VALUE) + .Input( + 0, + "input", + "Input data tensor to be rearranged from column blocks back into an image." + " This is a 3-dimensional tensor containing [N, C * n-ary-product(block_shape), L]," + " where N is batch dimension, C is image channel dimension and L is number of blocks.", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .Input( + 1, + "image_shape", + "The shape of the spatial dimensions of the image after rearranging the column blocks." + "This is a 1-dimensional tensor with size of at least 2, containing the value [H_img, W_img] " + " for a 2-D image or [dim_i1, dim_i2, ..., dim_iN] for a N-D image.", + "tensor(int64)", + OpSchema::Single, + true, + 1, + OpSchema::NonDifferentiable) + .Input( + 2, + "block_shape", + "The shape of the block to apply on the input." + "This is a 1-dimensional tensor of size of at least 2, containing the value [H_block, W_block] " + " for a 2-D image or [dim_b1, dim_b2, ..., dim_bN] for a N-D block.", + "tensor(int64)", + OpSchema::Single, + true, + 1, + OpSchema::NonDifferentiable) + .Output( + 0, + "output", + "Output tensor produced by rearranging blocks into an image.", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .TypeConstraint( + "T", + OpSchema::all_tensor_types_with_bfloat(), + "Constrain input and output types to all numeric tensor types.") + .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { col2imShapeInference(ctx); }) + ); + constexpr const char* GridSample_ver1_doc = R"DOC( Given an `input` and a flow-field `grid`, computes the `output` using `input` values and pixel locations from `grid`. Currently, only spatial (4-D) inputs are supported. 
For `input` with shape (N, C, H, W) and `grid` with shape (N, H_out, W_out, 2), diff --git a/onnxruntime/core/graph/contrib_ops/ms_opset.h b/onnxruntime/core/graph/contrib_ops/ms_opset.h index c6850eb8e3516..e3a774a7e9d05 100644 --- a/onnxruntime/core/graph/contrib_ops/ms_opset.h +++ b/onnxruntime/core/graph/contrib_ops/ms_opset.h @@ -56,6 +56,7 @@ class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, FusedMatMul); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, GatherND); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Gelu); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, GreedySearch); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Col2Im); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, GridSample); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Inverse); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Irfft); @@ -125,6 +126,7 @@ class OpSet_Microsoft_ver1 { fn(GetOpSchema()); fn(GetOpSchema()); fn(GetOpSchema()); + fn(GetOpSchema()); fn(GetOpSchema()); fn(GetOpSchema()); fn(GetOpSchema()); diff --git a/onnxruntime/core/providers/cpu/tensor/col2im.cc b/onnxruntime/core/providers/cpu/tensor/col2im.cc new file mode 100644 index 0000000000000..4a465d4f99826 --- /dev/null +++ b/onnxruntime/core/providers/cpu/tensor/col2im.cc @@ -0,0 +1,31 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/cpu/tensor/col2im.h" + +#include "core/framework/element_type_lists.h" +#include "core/framework/TensorSeq.h" +#include "core/providers/common.h" +#include "core/framework/copy.h" +#include "core/providers/op_kernel_type_control.h" + +namespace onnxruntime { + +#define REGISTER_KERNEL_TYPED(T) \ + ONNX_CPU_OPERATOR_TYPED_KERNEL( \ + Col2Im, \ + 1, \ + T, \ + KernelDefBuilder() \ + .TypeConstraint("T1", DataTypeImpl::GetTensorType()) \ + .TypeConstraint("T2", DataTypeImpl::GetTensorType()), \ + Col2Im); + +REGISTER_KERNEL_TYPED(float) + +template +Status Col2Im::Compute(OpKernelContext* context) const { + return Status::OK(); +} + +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/tensor/col2im.h b/onnxruntime/core/providers/cpu/tensor/col2im.h new file mode 100644 index 0000000000000..35afed4c5ed05 --- /dev/null +++ b/onnxruntime/core/providers/cpu/tensor/col2im.h @@ -0,0 +1,63 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
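+//
+// Declares the CPU kernel for the com.microsoft Col2Im contrib op, which
+// rearranges column blocks [N, C * n-ary-product(block_shape), L] back into a
+// batched image tensor [N, C, dim_1, ..., dim_N] (see the schema added in
+// contrib_defs.cc). Compute() is implemented in col2im.cc; the commented-out
+// members below are scaffolding carried over from the GridSample kernel and
+// are not used yet.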
+ +#pragma once + +#include "core/common/common.h" +#include "core/framework/op_kernel.h" +#include "core/util/math_cpuonly.h" +#include "core/framework/tensor.h" +#include "concatbase.h" + +namespace onnxruntime { + +template +class Col2Im final : public OpKernel { + public: + explicit Col2Im(const OpKernelInfo& info) : OpKernel(info) { + // std::string mode_str = info.GetAttrOrDefault("mode", "bilinear"); + // std::string padding_mode_str = info.GetAttrOrDefault("padding_mode", "zeros"); + // align_corners_ = static_cast(info.GetAttrOrDefault("align_corners", 0)); + // ORT_ENFORCE(mode_str == "bilinear" || mode_str == "nearest" || mode_str == "bicubic", + // "mode \"", mode_str, "\" not supported, expect bilinear, nearest or bicubic"); + // ORT_ENFORCE(padding_mode_str == "zeros" || padding_mode_str == "border" || padding_mode_str == "reflection", + // "padding_mode \"", padding_mode_str, "\" not supported, expect zeros, border or reflection"); + // if (mode_str == "bicubic") { + // mode_ = Bicubic; + // } else if (mode_str == "nearest") { + // mode_ = Nearest; + // } else { + // mode_ = Bilinear; + // } + // if (padding_mode_str == "reflection") { + // padding_mode_ = Reflection; + // } else if (padding_mode_str == "border") { + // padding_mode_ = Border; + // } else { + // padding_mode_ = Zeros; + // } + } + + Status Compute(OpKernelContext* context) const override; + + private: + // enum GridSampleInterpolationMode { + // Bilinear, + // Nearest, + // Bicubic + // }; + + // enum GridSamplePaddingMode { + // Zeros, + // Border, + // Reflection + // }; + + // T PixelAtGrid(const T* image, int64_t r, int64_t c, int64_t H, int64_t W, float border[/* 4 */]) const; + + // GridSampleInterpolationMode mode_{Bilinear}; + // GridSamplePaddingMode padding_mode_{Zeros}; + // bool align_corners_{0}; +}; + +} // namespace onnxruntime diff --git a/onnxruntime/python/tools/pytorch_export_contrib_ops.py b/onnxruntime/python/tools/pytorch_export_contrib_ops.py index aaca3806605a9..6d11f6ebeb6ae 100644 --- a/onnxruntime/python/tools/pytorch_export_contrib_ops.py +++ b/onnxruntime/python/tools/pytorch_export_contrib_ops.py @@ -91,6 +91,9 @@ def tril(g, self, diagonal): _reg(tril) + def col2im(g, self: torch._C.Value, image_shape, block_shape): + return g.op("com.microsoft::Col2Im", self, image_shape, block_shape) + def unregister(): """Unregister ONNX Runtime's built-in contrib ops.""" diff --git a/onnxruntime/test/python/contrib_ops/onnx_test_col2im.py b/onnxruntime/test/python/contrib_ops/onnx_test_col2im.py new file mode 100644 index 0000000000000..97269d895a125 --- /dev/null +++ b/onnxruntime/test/python/contrib_ops/onnx_test_col2im.py @@ -0,0 +1,55 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# +# Test reference implementation and model for ONNX Runtime conrtib op trilu + +import unittest + +import numpy as np +import onnx +from onnx_contrib_ops_helper import expect + + +class ONNXReferenceImplementationTest(unittest.TestCase): + def test_col2im(self) -> None: + input = np.array( + [ + [ + [1.0, 6.0, 11.0, 16.0, 21.0], # (1, 5, 5) + [2.0, 7.0, 12.0, 17.0, 22.0], + [3.0, 8.0, 13.0, 18.0, 23.0], + [4.0, 9.0, 14.0, 19.0, 24.0], + [5.0, 0.0, 15.0, 20.0, 25.0], + ] + ] + ).astype(np.float32) + image_shape = np.array([5, 5]).astype(np.int64) + block_shape = np.array([1, 5]).astype(np.int64) + node = onnx.helper.make_node( + "Col2Im", ["input", "image_shape", "block_shape"], ["col2im_reference_implementation"] + ) + + col2im_reference_implementation = np.array( + [ + [ + [ + [1.0, 2.0, 3.0, 4.0, 5.0], # (1, 1, 5, 5) + [6.0, 7.0, 8.0, 9.0, 0.0], + [11.0, 12.0, 13.0, 14.0, 15.0], + [16.0, 17.0, 18.0, 19.0, 20.0], + [21.0, 22.0, 23.0, 24.0, 25.0], + ] + ] + ] + ).astype(np.float32) + + expect( + node, + inputs=[input, image_shape, block_shape], + outputs=[col2im_reference_implementation], + name="test_col2im", + ) + + +if __name__ == "__main__": + unittest.main(module=__name__, buffer=True) diff --git a/onnxruntime/test/testdata/kernel_def_hashes/contrib.cpu.json b/onnxruntime/test/testdata/kernel_def_hashes/contrib.cpu.json index 5fb55faa14c5c..1babf97a23f73 100644 --- a/onnxruntime/test/testdata/kernel_def_hashes/contrib.cpu.json +++ b/onnxruntime/test/testdata/kernel_def_hashes/contrib.cpu.json @@ -147,6 +147,10 @@ "Gelu com.microsoft CPUExecutionProvider", 4658746266161736328 ], + [ + "Col2Im com.microsoft CPUExecutionProvider", + 11924582339825775592 + ], [ "GridSample com.microsoft CPUExecutionProvider", 11924582339825775592 From 630604249395fb16f843a63224674f8d442b400c Mon Sep 17 00:00:00 2001 From: Thiago Crepaldi Date: Tue, 26 Jul 2022 12:22:56 -0400 Subject: [PATCH 02/30] Add missing op declaration --- onnxruntime/contrib_ops/cpu/col2im.cc | 22 +++++++++++++++++++ .../core/providers/cpu/tensor/col2im.cc | 14 +++++------- 2 files changed, 28 insertions(+), 8 deletions(-) create mode 100644 onnxruntime/contrib_ops/cpu/col2im.cc diff --git a/onnxruntime/contrib_ops/cpu/col2im.cc b/onnxruntime/contrib_ops/cpu/col2im.cc new file mode 100644 index 0000000000000..50689ccb6b4ab --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/col2im.cc @@ -0,0 +1,22 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
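+//
+// Registers the typed Col2Im contrib kernel (com.microsoft domain, opset 1)
+// with the CPU execution provider; the kernel class itself is declared in
+// core/providers/cpu/tensor/col2im.h and its Compute() lives in
+// core/providers/cpu/tensor/col2im.cc.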
+#include "core/providers/cpu/tensor/col2im.h" +#include "core/providers/common.h" + +namespace onnxruntime { +namespace contrib { + +#define REGISTER_KERNEL_TYPED(T) \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + Col2Im, \ + kMSDomain, \ + 1, \ + T, \ + kCpuExecutionProvider, \ + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::AllTensorTypes()), \ + Col2Im); + +REGISTER_KERNEL_TYPED(float) + +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/tensor/col2im.cc b/onnxruntime/core/providers/cpu/tensor/col2im.cc index 4a465d4f99826..f2893cb62c854 100644 --- a/onnxruntime/core/providers/cpu/tensor/col2im.cc +++ b/onnxruntime/core/providers/cpu/tensor/col2im.cc @@ -11,14 +11,12 @@ namespace onnxruntime { -#define REGISTER_KERNEL_TYPED(T) \ - ONNX_CPU_OPERATOR_TYPED_KERNEL( \ - Col2Im, \ - 1, \ - T, \ - KernelDefBuilder() \ - .TypeConstraint("T1", DataTypeImpl::GetTensorType()) \ - .TypeConstraint("T2", DataTypeImpl::GetTensorType()), \ +#define REGISTER_KERNEL_TYPED(T) \ + ONNX_CPU_OPERATOR_TYPED_KERNEL( \ + Col2Im, \ + 1, \ + T, \ + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::AllTensorTypes()), \ Col2Im); REGISTER_KERNEL_TYPED(float) From 24f95da558a01a0b7fd793f796a6e12afa6633f0 Mon Sep 17 00:00:00 2001 From: Thiago Crepaldi Date: Tue, 26 Jul 2022 13:34:45 -0400 Subject: [PATCH 03/30] Fix hash --- .../test_col2im/test_data_set_0/input_0.pb | Bin 0 -> 117 bytes .../test_col2im/test_data_set_0/input_1.pb | Bin 0 -> 35 bytes .../test_col2im/test_data_set_0/input_2.pb | Bin 0 -> 35 bytes .../test_col2im/test_data_set_0/output_0.pb | Bin 0 -> 145 bytes .../testdata/kernel_def_hashes/contrib.cpu.json | 2 +- 5 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 onnxruntime/test/python/testdata/test_col2im/test_data_set_0/input_0.pb create mode 100644 onnxruntime/test/python/testdata/test_col2im/test_data_set_0/input_1.pb create mode 100644 onnxruntime/test/python/testdata/test_col2im/test_data_set_0/input_2.pb create mode 100644 onnxruntime/test/python/testdata/test_col2im/test_data_set_0/output_0.pb diff --git a/onnxruntime/test/python/testdata/test_col2im/test_data_set_0/input_0.pb b/onnxruntime/test/python/testdata/test_col2im/test_data_set_0/input_0.pb new file mode 100644 index 0000000000000000000000000000000000000000..164166b2c84e8c0968a316c70ceb85e9b5fea07e GIT binary patch literal 117 zcmd;J@x2I3Qr0LGRS@&Et; literal 0 HcmV?d00001 diff --git a/onnxruntime/test/python/testdata/test_col2im/test_data_set_0/input_1.pb b/onnxruntime/test/python/testdata/test_col2im/test_data_set_0/input_1.pb new file mode 100644 index 0000000000000000000000000000000000000000..e2e47c174ce48b0b6cc775ccbad84426c3925a39 GIT binary patch literal 35 fcmd;J5@2`Y&dg0rPmM3mNGwS85@2P302mDbe=`L} literal 0 HcmV?d00001 diff --git a/onnxruntime/test/python/testdata/test_col2im/test_data_set_0/input_2.pb b/onnxruntime/test/python/testdata/test_col2im/test_data_set_0/input_2.pb new file mode 100644 index 0000000000000000000000000000000000000000..c0b7595628c4bb8bd1859c490f6242ca6bdbf7cc GIT binary patch literal 35 gcmd;J5@2`YPRhwo&WST6h+bR0?8#CWCg?mft0DT0YT(PvV;vn4B(QIG9@J?WlGA7!6@83-J@rX zH8!zHYHpXNYjfwTLEk!PXtZ*^Z`ErU>(O=L{t}5-5h(}}3w9ikQY6KH8DHRmCwfFo OnDN1a6&t?T@heYoXdiw6 literal 0 HcmV?d00001 diff --git a/onnxruntime/test/testdata/kernel_def_hashes/contrib.cpu.json b/onnxruntime/test/testdata/kernel_def_hashes/contrib.cpu.json index 1babf97a23f73..181e69b61090f 100644 --- a/onnxruntime/test/testdata/kernel_def_hashes/contrib.cpu.json +++ 
b/onnxruntime/test/testdata/kernel_def_hashes/contrib.cpu.json @@ -149,7 +149,7 @@ ], [ "Col2Im com.microsoft CPUExecutionProvider", - 11924582339825775592 + 16946735406825550320 ], [ "GridSample com.microsoft CPUExecutionProvider", From 98176b94aaf08b4d831670f2c36ffc6cde1a8111 Mon Sep 17 00:00:00 2001 From: Thiago Crepaldi Date: Tue, 26 Jul 2022 14:39:55 -0400 Subject: [PATCH 04/30] Fix ci --- .../core/graph/contrib_ops/contrib_defs.cc | 17 ++++++++--------- onnxruntime/core/providers/cpu/tensor/col2im.cc | 1 + onnxruntime/core/providers/cpu/tensor/col2im.h | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc index 6c0a2389de1f4..8fae65f390e4b 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc @@ -912,7 +912,7 @@ void col2imShapeInference(InferenceContext& ctx) { return; } - // TODO: Assume image_shape has correct spatial dimensions for next validations + // Assuming image_shape has correct spatial dimensions and reused for next validation steps // An alternative is get the the number of spatial dimensions as an input if (ctx.getInputType(1)->tensor_type().shape().dim_size() != 1) { fail_shape_inference("image_shape tensor must have rank 1."); @@ -969,7 +969,7 @@ void col2imShapeInference(InferenceContext& ctx) { fail_shape_inference("block_shape tensor must have ", n_input_dims, " spatial dimensions."); } - int block_shape_size = 0; + int64_t block_shape_size = 0; if (static_cast(block_shape.size()) > 0) { block_shape_size = 1; for (const auto& dim : block_shape) { @@ -983,12 +983,12 @@ void col2imShapeInference(InferenceContext& ctx) { // Dimensions N and C are always present Dim N, C; if (ctx.getInputType(0)->tensor_type().shape().dim(0).has_dim_value()) { - N = input_shape.dim(0); // Otherwise, N is unknown. + N = input_shape.dim(0); // Otherwise, N is unknown. } *final_image_shape->add_dim() = N; if (block_shape_size > 0) { - C = input_shape.dim(1) / block_shape_size; // Otherwise, C is unknown. + C = input_shape.dim(1) / block_shape_size; // Otherwise, C is unknown. } *final_image_shape->add_dim() = C; @@ -996,7 +996,7 @@ void col2imShapeInference(InferenceContext& ctx) { for (size_t i = 0; i < n_input_dims; ++i) { Dim image_dim_i; if (image_shape.size() > 0) { - image_dim_i.set_dim_value(image_shape[i]); // Otherwise, spatial dimensions are unknown + image_dim_i.set_dim_value(image_shape[i]); // Otherwise, spatial dimensions are unknown } *final_image_shape->add_dim() = image_dim_i; } @@ -1026,8 +1026,8 @@ ONNX_MS_OPERATOR_SET_SCHEMA(Col2Im, 1, OPTIONAL_VALUE) .Attr( "pads", - "1-dimensional tensor with padding value for the beginning and ending along each spatial axis, " - "it can take any value greater than or equal to 0. " + "1-dimensional tensor with padding value for the beginning and ending along each" + " spatial axis, it can take any value greater than or equal to 0. " "The value represent the number of pixels added to the beginning " "and end part of the corresponding axis. 
`pads` format should be as follow " "[x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin is the number of pixels " @@ -1087,8 +1087,7 @@ ONNX_MS_OPERATOR_SET_SCHEMA(Col2Im, 1, "T", OpSchema::all_tensor_types_with_bfloat(), "Constrain input and output types to all numeric tensor types.") - .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { col2imShapeInference(ctx); }) - ); + .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { col2imShapeInference(ctx); })); constexpr const char* GridSample_ver1_doc = R"DOC( Given an `input` and a flow-field `grid`, computes the `output` using `input` values and pixel locations from `grid`. diff --git a/onnxruntime/core/providers/cpu/tensor/col2im.cc b/onnxruntime/core/providers/cpu/tensor/col2im.cc index f2893cb62c854..fa95dd23560a9 100644 --- a/onnxruntime/core/providers/cpu/tensor/col2im.cc +++ b/onnxruntime/core/providers/cpu/tensor/col2im.cc @@ -23,6 +23,7 @@ REGISTER_KERNEL_TYPED(float) template Status Col2Im::Compute(OpKernelContext* context) const { + (void) context; return Status::OK(); } diff --git a/onnxruntime/core/providers/cpu/tensor/col2im.h b/onnxruntime/core/providers/cpu/tensor/col2im.h index 35afed4c5ed05..03cfc3630877c 100644 --- a/onnxruntime/core/providers/cpu/tensor/col2im.h +++ b/onnxruntime/core/providers/cpu/tensor/col2im.h @@ -7,7 +7,7 @@ #include "core/framework/op_kernel.h" #include "core/util/math_cpuonly.h" #include "core/framework/tensor.h" -#include "concatbase.h" +#include "core/providers/cpu/tensor/concatbase.h" namespace onnxruntime { From 5b70c4a9379c6b79622cbe97678305c10dcc50f5 Mon Sep 17 00:00:00 2001 From: Thiago Crepaldi Date: Wed, 27 Jul 2022 17:48:49 -0400 Subject: [PATCH 05/30] Kernel impl --- .../core/providers/cpu/tensor/col2im.cc | 55 ++++++++++++++++++- .../core/providers/cpu/tensor/col2im.h | 43 ++------------- .../providers/cpu/tensor/col2im_attributes.h | 55 +++++++++++++++++++ 3 files changed, 113 insertions(+), 40 deletions(-) create mode 100644 onnxruntime/core/providers/cpu/tensor/col2im_attributes.h diff --git a/onnxruntime/core/providers/cpu/tensor/col2im.cc b/onnxruntime/core/providers/cpu/tensor/col2im.cc index fa95dd23560a9..dd95327e62935 100644 --- a/onnxruntime/core/providers/cpu/tensor/col2im.cc +++ b/onnxruntime/core/providers/cpu/tensor/col2im.cc @@ -7,6 +7,7 @@ #include "core/framework/TensorSeq.h" #include "core/providers/common.h" #include "core/framework/copy.h" +#include "core/common/safeint.h" #include "core/providers/op_kernel_type_control.h" namespace onnxruntime { @@ -23,7 +24,59 @@ REGISTER_KERNEL_TYPED(float) template Status Col2Im::Compute(OpKernelContext* context) const { - (void) context; + const auto* col_input = context->Input(0); + const auto* image_shape = context->Input(1); + const auto* kernel_shape = context->Input(2); + + TensorShape col_shape = col_input->Shape(); + const auto num_image_channels = image_shape->Shape()[1]; + const auto batch_size = col_shape[0]; + + const int64_t image_size = image_shape->Shape().Size(); + + AllocatorPtr alloc; + ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc)); + const int64_t col_buffer_size = col_input->Shape().Size(); + auto col_data = alloc->Alloc(SafeInt(sizeof(T)) * col_buffer_size); + + BufferUniquePtr col_buffer(col_data, BufferDeleter(std::move(alloc))); + T* col_buffer_data = static_cast(col_buffer.get()); + + TensorShapeVector Y_dims; + Y_dims.insert(Y_dims.begin(), {batch_size, num_image_channels}); + TensorShape Yshape(Y_dims); + Tensor* Y = context->Output(0, Yshape); + T* Ydata = Y->template 
MutableData(); + + // template + // void Col2imNd( + // const T* data_col, + // const int64_t* img_shape, + // const int64_t* output_shape, + // int64_t channels_col, + // int64_t img_size, + // const int64_t* kernel_shape, + // const int64_t* stride, + // const int64_t* dilation, + // const int64_t* pad, + // ptrdiff_t N, + // T* data_img, + // Provider* provider); + + math::Col2imNd( + col_buffer_data, + image_shape->Shape().GetDims().data(), + col_shape.GetDims().data(), + num_image_channels, + image_size, + kernel_shape->Shape().GetDims().data(), + col2im_attrs_.strides.data(), + col2im_attrs_.dilations.data(), + col2im_attrs_.pads.data(), + static_cast(kernel_shape->Shape().Size()), + Ydata, + &CPUMathUtil::Instance()); + return Status::OK(); } diff --git a/onnxruntime/core/providers/cpu/tensor/col2im.h b/onnxruntime/core/providers/cpu/tensor/col2im.h index 03cfc3630877c..8cbefd2ec668b 100644 --- a/onnxruntime/core/providers/cpu/tensor/col2im.h +++ b/onnxruntime/core/providers/cpu/tensor/col2im.h @@ -3,6 +3,8 @@ #pragma once +#include "core/providers/cpu/tensor/col2im_attributes.h" + #include "core/common/common.h" #include "core/framework/op_kernel.h" #include "core/util/math_cpuonly.h" @@ -14,50 +16,13 @@ namespace onnxruntime { template class Col2Im final : public OpKernel { public: - explicit Col2Im(const OpKernelInfo& info) : OpKernel(info) { - // std::string mode_str = info.GetAttrOrDefault("mode", "bilinear"); - // std::string padding_mode_str = info.GetAttrOrDefault("padding_mode", "zeros"); - // align_corners_ = static_cast(info.GetAttrOrDefault("align_corners", 0)); - // ORT_ENFORCE(mode_str == "bilinear" || mode_str == "nearest" || mode_str == "bicubic", - // "mode \"", mode_str, "\" not supported, expect bilinear, nearest or bicubic"); - // ORT_ENFORCE(padding_mode_str == "zeros" || padding_mode_str == "border" || padding_mode_str == "reflection", - // "padding_mode \"", padding_mode_str, "\" not supported, expect zeros, border or reflection"); - // if (mode_str == "bicubic") { - // mode_ = Bicubic; - // } else if (mode_str == "nearest") { - // mode_ = Nearest; - // } else { - // mode_ = Bilinear; - // } - // if (padding_mode_str == "reflection") { - // padding_mode_ = Reflection; - // } else if (padding_mode_str == "border") { - // padding_mode_ = Border; - // } else { - // padding_mode_ = Zeros; - // } + explicit Col2Im(const OpKernelInfo& info) : OpKernel(info), col2im_attrs_(info) { } Status Compute(OpKernelContext* context) const override; private: - // enum GridSampleInterpolationMode { - // Bilinear, - // Nearest, - // Bicubic - // }; - - // enum GridSamplePaddingMode { - // Zeros, - // Border, - // Reflection - // }; - - // T PixelAtGrid(const T* image, int64_t r, int64_t c, int64_t H, int64_t W, float border[/* 4 */]) const; - - // GridSampleInterpolationMode mode_{Bilinear}; - // GridSamplePaddingMode padding_mode_{Zeros}; - // bool align_corners_{0}; + Col2ImAttributes col2im_attrs_; }; } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/tensor/col2im_attributes.h b/onnxruntime/core/providers/cpu/tensor/col2im_attributes.h new file mode 100644 index 0000000000000..299bd533296f5 --- /dev/null +++ b/onnxruntime/core/providers/cpu/tensor/col2im_attributes.h @@ -0,0 +1,55 @@ +/** +* Copyright (c) 2016-present, Facebook, Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ +/* Modifications Copyright (c) Microsoft. */ + +#pragma once + +#ifndef SHARED_PROVIDER +#include "core/common/common.h" +#include "core/providers/common.h" +#include "core/util/math.h" +#endif + +#include "core/common/inlined_containers.h" +#include "core/framework/op_kernel.h" +#include "core/framework/op_node_proto_helper.h" + +namespace onnxruntime { + +struct Col2ImAttributes { + using Col2ImPadVector = InlinedVector; + + explicit Col2ImAttributes(const OpKernelInfo& info) { + auto status = info.GetAttrs("strides", strides); + ORT_ENFORCE(status.IsOK()); + + gsl::span pads_span; + status = info.GetAttrsAsSpan("pads", pads_span); + ORT_ENFORCE(status.IsOK()); + pads.assign(pads_span.cbegin(), pads_span.cend()); + + status = info.GetAttrs("dilations", dilations); + ORT_ENFORCE(status.IsOK()); + } + + ~Col2ImAttributes() = default; + + Col2ImPadVector pads; + TensorShapeVector dilations; + TensorShapeVector strides; +}; + +} // namespace onnxruntime From 47c085a13760ab980a0505699bf21970b48cdd43 Mon Sep 17 00:00:00 2001 From: Thiago Crepaldi Date: Mon, 1 Aug 2022 20:36:38 -0400 Subject: [PATCH 06/30] Add debug info --- .../core/providers/cpu/tensor/col2im.cc | 83 ++++++++++-------- .../providers/cpu/tensor/col2im_attributes.h | 6 +- onnxruntime/core/util/math_cpu.cc | 83 ++++++++++++++---- onnxruntime/test/contrib_ops/col2im_test.cc | 28 ++++++ .../test_col2im/test_data_set_0/input_0.pb | Bin 0 -> 117 bytes .../test_col2im/test_data_set_0/input_1.pb | Bin 0 -> 35 bytes .../test_col2im/test_data_set_0/input_2.pb | Bin 0 -> 35 bytes .../test_col2im/test_data_set_0/output_0.pb | Bin 0 -> 120 bytes .../test_col2im_5d/test_data_set_0/input_0.pb | Bin 0 -> 498 bytes .../test_col2im_5d/test_data_set_0/input_1.pb | Bin 0 -> 43 bytes .../test_col2im_5d/test_data_set_0/input_2.pb | Bin 0 -> 43 bytes .../test_data_set_0/output_0.pb | Bin 0 -> 503 bytes .../test_data_set_0/input_0.pb | Bin 0 -> 97 bytes .../test_data_set_0/input_1.pb | Bin 0 -> 35 bytes .../test_data_set_0/input_2.pb | Bin 0 -> 35 bytes .../test_data_set_0/output_0.pb | Bin 0 -> 165 bytes .../test_data_set_0/input_0.pb | Bin 0 -> 318 bytes .../test_data_set_0/input_1.pb | Bin 0 -> 35 bytes .../test_data_set_0/input_2.pb | Bin 0 -> 35 bytes .../test_data_set_0/output_0.pb | Bin 0 -> 120 bytes .../test_data_set_0/input_0.pb | Bin 0 -> 162 bytes .../test_data_set_0/input_1.pb | Bin 0 -> 35 bytes .../test_data_set_0/input_2.pb | Bin 0 -> 35 bytes .../test_data_set_0/output_0.pb | Bin 0 -> 120 bytes 24 files changed, 140 insertions(+), 60 deletions(-) create mode 100644 onnxruntime/test/contrib_ops/col2im_test.cc create mode 100644 onnxruntime/test/python/testdata/node/test_col2im/test_data_set_0/input_0.pb create mode 100644 onnxruntime/test/python/testdata/node/test_col2im/test_data_set_0/input_1.pb create mode 100644 onnxruntime/test/python/testdata/node/test_col2im/test_data_set_0/input_2.pb create mode 100644 onnxruntime/test/python/testdata/node/test_col2im/test_data_set_0/output_0.pb create mode 100644 onnxruntime/test/python/testdata/node/test_col2im_5d/test_data_set_0/input_0.pb create mode 100644 
onnxruntime/test/python/testdata/node/test_col2im_5d/test_data_set_0/input_1.pb create mode 100644 onnxruntime/test/python/testdata/node/test_col2im_5d/test_data_set_0/input_2.pb create mode 100644 onnxruntime/test/python/testdata/node/test_col2im_5d/test_data_set_0/output_0.pb create mode 100644 onnxruntime/test/python/testdata/node/test_col2im_dilations/test_data_set_0/input_0.pb create mode 100644 onnxruntime/test/python/testdata/node/test_col2im_dilations/test_data_set_0/input_1.pb create mode 100644 onnxruntime/test/python/testdata/node/test_col2im_dilations/test_data_set_0/input_2.pb create mode 100644 onnxruntime/test/python/testdata/node/test_col2im_dilations/test_data_set_0/output_0.pb create mode 100644 onnxruntime/test/python/testdata/node/test_col2im_pads/test_data_set_0/input_0.pb create mode 100644 onnxruntime/test/python/testdata/node/test_col2im_pads/test_data_set_0/input_1.pb create mode 100644 onnxruntime/test/python/testdata/node/test_col2im_pads/test_data_set_0/input_2.pb create mode 100644 onnxruntime/test/python/testdata/node/test_col2im_pads/test_data_set_0/output_0.pb create mode 100644 onnxruntime/test/python/testdata/node/test_col2im_strides/test_data_set_0/input_0.pb create mode 100644 onnxruntime/test/python/testdata/node/test_col2im_strides/test_data_set_0/input_1.pb create mode 100644 onnxruntime/test/python/testdata/node/test_col2im_strides/test_data_set_0/input_2.pb create mode 100644 onnxruntime/test/python/testdata/node/test_col2im_strides/test_data_set_0/output_0.pb diff --git a/onnxruntime/core/providers/cpu/tensor/col2im.cc b/onnxruntime/core/providers/cpu/tensor/col2im.cc index dd95327e62935..4c3f999766f5e 100644 --- a/onnxruntime/core/providers/cpu/tensor/col2im.cc +++ b/onnxruntime/core/providers/cpu/tensor/col2im.cc @@ -27,55 +27,60 @@ Status Col2Im::Compute(OpKernelContext* context) const { const auto* col_input = context->Input(0); const auto* image_shape = context->Input(1); const auto* kernel_shape = context->Input(2); + std::cout << "Status Col2Im::Compute(OpKernelContext* context)" << std::endl; - TensorShape col_shape = col_input->Shape(); - const auto num_image_channels = image_shape->Shape()[1]; - const auto batch_size = col_shape[0]; + const T* col_input_data = col_input->template Data(); + TensorShape col_input_shape = col_input->Shape(); + int64_t col_input_C = col_input_shape[1]; + const auto col_input_N = col_input_shape[0]; - const int64_t image_size = image_shape->Shape().Size(); - - AllocatorPtr alloc; - ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc)); - const int64_t col_buffer_size = col_input->Shape().Size(); - auto col_data = alloc->Alloc(SafeInt(sizeof(T)) * col_buffer_size); - - BufferUniquePtr col_buffer(col_data, BufferDeleter(std::move(alloc))); - T* col_buffer_data = static_cast(col_buffer.get()); + int64_t image_shape_size = 1; + int64_t kernel_shape_size = 1; + for (auto i=0; i < image_shape->Shape().Size(); ++i) { + image_shape_size *= image_shape->Data()[i]; + kernel_shape_size *= kernel_shape->Data()[i]; + // col_input_C computed as => (C*n-ary-prod{kernel_shape}) / n-ary-prod{kernel_shape} + col_input_C /= kernel_shape->Data()[i]; + } TensorShapeVector Y_dims; - Y_dims.insert(Y_dims.begin(), {batch_size, num_image_channels}); + Y_dims.insert(Y_dims.begin(), {col_input_N, col_input_C}); + for (auto i=0; i < image_shape->Shape()[0]; ++i) { + Y_dims.push_back(image_shape->Data()[i]); + } TensorShape Yshape(Y_dims); Tensor* Y = context->Output(0, Yshape); T* Ydata = Y->template MutableData(); - // template - 
// void Col2imNd( - // const T* data_col, - // const int64_t* img_shape, - // const int64_t* output_shape, - // int64_t channels_col, - // int64_t img_size, - // const int64_t* kernel_shape, - // const int64_t* stride, - // const int64_t* dilation, - // const int64_t* pad, - // ptrdiff_t N, - // T* data_img, - // Provider* provider); + std::cout << "\n\tInput 0: col_input = ("; for (auto i=0; i < Yshape.Size(); ++i) std::cout << col_input_data[i] << ", "; std::cout << ") with shape "<< Yshape << std::endl; + std::cout << "\tInput 1: image_shape = ("; for (auto i=0; i < image_shape->Shape().Size(); ++i) std::cout << image_shape->Data()[i] << ", "; std::cout << ")" << std::endl; + std::cout << "\tInput 2: kernel_shape = ("; for (auto i=0; i < kernel_shape->Shape().Size(); ++i) std::cout << kernel_shape->Data()[i] << ", "; std::cout << ")" << std::endl; + std::cout << "\tAttribute strides = ("; for (size_t i=0; i < col2im_attrs_.strides.size(); ++i) std::cout << col2im_attrs_.strides[i] << ", "; std::cout << ")"<< std::endl; + std::cout << "\tAttribute dilations = ("; for (size_t i=0; i < col2im_attrs_.dilations.size(); ++i) std::cout << col2im_attrs_.dilations[i] << ", "; std::cout << ")"<< std::endl; + std::cout << "\tAttribute pads = ("; for (size_t i=0; i < col2im_attrs_.pads.size(); ++i) std::cout << col2im_attrs_.pads[i] << ", "; std::cout << ")"<< std::endl; + + std::cout << "\tVariable col_input_C: " << col_input_C << std::endl; + std::cout << "\tVariable col_input_N = " << col_input_N << std::endl; + std::cout << "\tVariable image_shape_size: " << image_shape_size << std::endl; + std::cout << "\tVariable kernel_shape_size: " << kernel_shape_size << std::endl; + + std::cout << "\n\tStatus Col2Im::Compute() --> math::Col2imNd<>()" << std::endl; math::Col2imNd( - col_buffer_data, - image_shape->Shape().GetDims().data(), - col_shape.GetDims().data(), - num_image_channels, - image_size, - kernel_shape->Shape().GetDims().data(), - col2im_attrs_.strides.data(), - col2im_attrs_.dilations.data(), - col2im_attrs_.pads.data(), - static_cast(kernel_shape->Shape().Size()), - Ydata, - &CPUMathUtil::Instance()); + col_input_data, // const T* data_col, + image_shape->Data(), // const int64_t* img_shape, + Yshape.Slice(2).GetDims().data(), // const int64_t* output_shape, + col_input_C, // int64_t channels_col, --> output_num_channels * kernel_shape_size + image_shape_size, // int64_t img_size, + kernel_shape->Data(), // const int64_t* kernel_shape, + col2im_attrs_.strides.data(), // const int64_t* stride, + col2im_attrs_.dilations.data(), // const int64_t* dilation, + col2im_attrs_.pads.data(), // const int64_t* pad, + kernel_shape->Shape().Size(), // ptrdiff_t N, --> number of spatial dims for image + Ydata, // T* data_img, + &CPUMathUtil::Instance() // Provider* provider + ); + std::cout << "\n\n Return Col2Im::Compute() --> "; for (auto i=0; i < Yshape.Size(); ++i) std::cout << Ydata[i] << ", "; std::cout << ") with shape " << Yshape << std::endl << std::endl; return Status::OK(); } diff --git a/onnxruntime/core/providers/cpu/tensor/col2im_attributes.h b/onnxruntime/core/providers/cpu/tensor/col2im_attributes.h index 299bd533296f5..9639718db5ecf 100644 --- a/onnxruntime/core/providers/cpu/tensor/col2im_attributes.h +++ b/onnxruntime/core/providers/cpu/tensor/col2im_attributes.h @@ -34,15 +34,15 @@ struct Col2ImAttributes { explicit Col2ImAttributes(const OpKernelInfo& info) { auto status = info.GetAttrs("strides", strides); - ORT_ENFORCE(status.IsOK()); + // ORT_ENFORCE(status.IsOK()); gsl::span 
pads_span; status = info.GetAttrsAsSpan("pads", pads_span); - ORT_ENFORCE(status.IsOK()); + // ORT_ENFORCE(status.IsOK()); pads.assign(pads_span.cbegin(), pads_span.cend()); status = info.GetAttrs("dilations", dilations); - ORT_ENFORCE(status.IsOK()); + // ORT_ENFORCE(status.IsOK()); } ~Col2ImAttributes() = default; diff --git a/onnxruntime/core/util/math_cpu.cc b/onnxruntime/core/util/math_cpu.cc index 164e88573c4cb..05b265715e407 100644 --- a/onnxruntime/core/util/math_cpu.cc +++ b/onnxruntime/core/util/math_cpu.cc @@ -31,6 +31,7 @@ #pragma GCC diagnostic pop #endif using onnxruntime::concurrency::ThreadPool; +#include namespace onnxruntime { namespace math { @@ -370,7 +371,27 @@ void Im2col::operator()( T* data_col, bool accumulate_output, T padding_value) { - int64_t kernel_size = std::accumulate(kernel_shape, kernel_shape + rank, 1LL, std::multiplies()); + + int64_t im_shape_size = std::accumulate(im_shape, im_shape + rank, 1LL, std::multiplies()); + int64_t output_shape_size = std::accumulate(output_shape, output_shape + rank, 1LL, std::multiplies()); + int64_t kernel_shape_size = std::accumulate(kernel_shape, kernel_shape + rank, 1LL, std::multiplies()); + + std::cout << "\n\nCalled void Im2col::operator()("; + std::cout << ",\n\tconst T* data_im={"; for (auto i=0; i < im_shape_size; ++i) std::cout << data_im[i] << ", "; std::cout << "}"; + std::cout << ",\n\tconst int64_t* im_shape={"; for (auto i=0; i < rank; ++i) std::cout << im_shape[i] << ", "; std::cout << "}"; + std::cout << ",\n\tconst int64_t* output_shape={"; for (auto i=0; i < rank; ++i) std::cout << output_shape[i] << ", "; std::cout << "}"; + std::cout << ",\n\tint64_t channels_col=" << channels_col; + std::cout << ",\n\tconst int64_t* kernel_shape={"; for (auto i=0; i < rank; ++i) std::cout << kernel_shape[i] << ", "; std::cout << "}"; + std::cout << ",\n\tconst int64_t* stride={"; for (auto i=0; i < rank; ++i) std::cout << stride[i] << ", "; std::cout << "}"; + std::cout << ",\n\tconst int64_t* dilation={"; for (auto i=0; i < rank; ++i) std::cout << dilation[i] << ", "; std::cout << "}"; + std::cout << ",\n\tconst int64_t* pad={"; for (auto i=0; i < rank; ++i) std::cout << pad[i] << ", "; std::cout << "}"; + std::cout << ",\n\tptrdiff_t rank=" << rank; + std::cout << ",\n\tT* data_col= preallocated pointer to write at {"; for (auto i=0; i < output_shape_size; ++i) std::cout << data_col[i] << ", "; std::cout << "}"; + std::cout << ",\n\tbool accumulate_output=" << accumulate_output; + std::cout << ",\n\tT padding_value=" << padding_value << ")"; + + std::cout << "\n\n\tVariable im_shape_size: " << im_shape_size << "\n\tVariable output_shape_size: "< d_offset(rank, 0); std::vector d_iter(rank, 0); for (int64_t c_col = 0; c_col < channels_col; ++c_col) { @@ -386,7 +407,7 @@ void Im2col::operator()( // Loop over spatial axes in forward order to compute the indices in the // image and column, and whether the index lies in the padding. 
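+      // c_col enumerates (image channel, kernel offset) pairs, so dividing by
+      // the flattened kernel size recovers the image channel that index_im
+      // starts from.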
int64_t index_col = c_col; - int64_t index_im = c_col / kernel_size; + int64_t index_im = c_col / kernel_shape_size; bool is_padding = false; for (ptrdiff_t d_i = 0; d_i < rank; ++d_i) { int64_t d = d_iter[d_i]; @@ -408,6 +429,8 @@ void Im2col::operator()( } } while (NextPosition(rank, output_shape, d_iter.data())); } // for (int c = 0; c < channels_col; ++c) { + + std::cout << "Return void Im2col -> T* data_col={"; for (auto i=0; i < output_shape_size; ++i) std::cout << data_col[i] << ", "; std::cout << "}\n"; } template struct Im2col; @@ -780,24 +803,48 @@ void Col2im(const float* data_col, int64 } template <> -void Col2imNd(const float* data_col, const int64_t* img_shape, - const int64_t* output_shape, int64_t channels_col, int64_t img_size, - const int64_t* kernel_shape, const int64_t* stride, - const int64_t* dilation, const int64_t* pad, ptrdiff_t N, - float* data_img, CPUMathUtil* context) { +void Col2imNd(const float* data_col, + const int64_t* img_shape, + const int64_t* output_shape, + int64_t channels_col, + int64_t img_size, + const int64_t* kernel_shape, + const int64_t* stride, + const int64_t* dilation, + const int64_t* pad, + ptrdiff_t N, + float* data_img, + CPUMathUtil* context) { + std::cout << "\n\nCalled void Col2imNd("; + std::cout << ",\n\tconst float* data_col={"; for (auto i=0; i < img_size; ++i) std::cout << data_col[i] << ", "; std::cout << "}"; + std::cout << ",\n\tconst int64_t* img_shape={"; for (auto i=0; i < N; ++i) std::cout << img_shape[i] << ", "; std::cout << "}"; + std::cout << ",\n\tconst int64_t* output_shape={"; for (auto i=0; i < N; ++i) std::cout << output_shape[i] << ", "; std::cout << "}"; + std::cout << ",\n\tint64_t channels_col=" << channels_col; + std::cout << ",\n\tint64_t img_size=" << img_size; + std::cout << ",\n\tconst int64_t* kernel_shape={"; for (auto i=0; i < N; ++i) std::cout << kernel_shape[i] << ", "; std::cout << "}"; + std::cout << ",\n\tconst int64_t* stride={"; for (auto i=0; i < N; ++i) std::cout << stride[i] << ", "; std::cout << "}"; + std::cout << ",\n\tconst int64_t* dilation={"; for (auto i=0; i < N; ++i) std::cout << dilation[i] << ", "; std::cout << "}"; + std::cout << ",\n\tconst int64_t* pad={"; for (auto i=0; i < 2*N; ++i) std::cout << pad[i] << ", "; std::cout << "}"; + std::cout << ",\n\tptrdiff_t N=" << N; + std::cout << ",\n\tfloat* data_img= preallocated pointer to save at {"; for (auto i=0; i < img_size; ++i) std::cout << data_img[i] << ", "; std::cout << "}"; + std::cout << ",\n\tCPUMathUtil* context=...)" << std::endl; + Set(gsl::narrow(img_size), 0, data_img, context); Im2col()( - data_col, - img_shape, - output_shape, - channels_col, - kernel_shape, - stride, - dilation, - pad, - N, - data_img, - true); + data_col, // const T* data_im, + img_shape, // const int64_t* im_shape, + output_shape, // const int64_t* output_shape, + channels_col, // int64_t channels_col, + kernel_shape, // const int64_t* kernel_shape, + stride, // const int64_t* stride, + dilation, // const int64_t* dilation, + pad, // const int64_t* pad, + N, // ptrdiff_t rank, + data_img, // T* data_col, + true // bool accumulate_output, + ); + + std::cout << "Return void Col2imNd --> float* data_img= {"; for (auto i=0; i < img_size; ++i) std::cout << data_img[i] << ", "; std::cout << "}"; } #define SPECIALIZED_COPYVECTOR(T) \ diff --git a/onnxruntime/test/contrib_ops/col2im_test.cc b/onnxruntime/test/contrib_ops/col2im_test.cc new file mode 100644 index 0000000000000..2a1b692673fc5 --- /dev/null +++ b/onnxruntime/test/contrib_ops/col2im_test.cc 
@@ -0,0 +1,28 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "gtest/gtest.h" +#include "test/providers/provider_test_utils.h" +#include "core/util/math.h" + +namespace onnxruntime { +namespace test { + +TEST(Col2ImContribOpTest, simple) { + OpTester test("Col2Im", 1, kMSDomain); + + test.AddAttribute("strides", std::vector{1, 1}); + test.AddAttribute("dilations", std::vector{1, 1}); + test.AddAttribute("pads", std::vector{0, 0, 0, 0}); + + test.AddInput("input", {1, 5, 5}, std::vector{1.f, 6.f, 11.f, 16.f, 21.f, 2.f, 7.f, 12.f, 17.f, 22.f, 3.f, 8.f, 13.f, 18.f, 23.f, 4.f, 9.f, 14.f, 19.f, 24.f, 5.f, 0.f, 15.f, 20.f, 25.f}); + test.AddInput("image_shape", {2}, std::vector{5, 5}); + test.AddInput("block_shape", {2}, std::vector{1, 5}); + + test.AddOutput("output", {1, 1, 5, 5}, std::vector{1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f, 17.f, 18.f, 19.f, 20.f, 21.f, 22.f, 23.f, 24.f, 25.f}); + test.Run(); +} + + +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/python/testdata/node/test_col2im/test_data_set_0/input_0.pb b/onnxruntime/test/python/testdata/node/test_col2im/test_data_set_0/input_0.pb new file mode 100644 index 0000000000000000000000000000000000000000..164166b2c84e8c0968a316c70ceb85e9b5fea07e GIT binary patch literal 117 zcmd;J@x2I3Qr0LGRS@&Et; literal 0 HcmV?d00001 diff --git a/onnxruntime/test/python/testdata/node/test_col2im/test_data_set_0/input_1.pb b/onnxruntime/test/python/testdata/node/test_col2im/test_data_set_0/input_1.pb new file mode 100644 index 0000000000000000000000000000000000000000..e2e47c174ce48b0b6cc775ccbad84426c3925a39 GIT binary patch literal 35 fcmd;J5@2`Y&dg0rPmM3mNGwS85@2P302mDbe=`L} literal 0 HcmV?d00001 diff --git a/onnxruntime/test/python/testdata/node/test_col2im/test_data_set_0/input_2.pb b/onnxruntime/test/python/testdata/node/test_col2im/test_data_set_0/input_2.pb new file mode 100644 index 0000000000000000000000000000000000000000..c0b7595628c4bb8bd1859c490f6242ca6bdbf7cc GIT binary patch literal 35 gcmd;J5@2`YPRhwo&WQoO#oXPz20+Gu0V(5@+l l6vss*_afp583h$D#EV$`>v)F=GXgR;>^M+x;=+xJ$A70x6@35z literal 0 HcmV?d00001 diff --git a/onnxruntime/test/python/testdata/node/test_col2im_5d/test_data_set_0/input_0.pb b/onnxruntime/test/python/testdata/node/test_col2im_5d/test_data_set_0/input_0.pb new file mode 100644 index 0000000000000000000000000000000000000000..0b66e3fbccc21c2a88060142326527a6fd6ca537 GIT binary patch literal 498 zcmWm4F(?FJ0EXfJLR~W4U@#aA27|$1Fev9TNQN5>27|#sNjID_7%oYYBuST~OVTAt zk}gS?q)U<{U6L+Il60fdV|ePT_lriN_0fh{BvCgpm6}cuRBA%V^n_51htSb$ljoja zVu=u1>0*#kf^UVmzQ_?}g4ReI-6R~B!C){L32vFx)ItN!|DNQDqHveCFrHrryWZMNHC!cM#FHff*z z4m#wpBaS-exD!sAHsg%5W}S211s7d%*%foHy5_nYZo1{Rd3W4(&w~3Nc<7PGo_K1} nGta%SCh$jHC c1Caj!hz)@30wA6N#0@|`14s_U2I2*d02b*GVE_OC literal 0 HcmV?d00001 diff --git a/onnxruntime/test/python/testdata/node/test_col2im_dilations/test_data_set_0/input_1.pb b/onnxruntime/test/python/testdata/node/test_col2im_dilations/test_data_set_0/input_1.pb new file mode 100644 index 0000000000000000000000000000000000000000..ed056b38ede071201a58c4d489ee72565a9de9e6 GIT binary patch literal 35 fcmd;J5@2`Y&dg0rPmM3mNGwS85@2J102mDbe@g{O literal 0 HcmV?d00001 diff --git a/onnxruntime/test/python/testdata/node/test_col2im_dilations/test_data_set_0/input_2.pb b/onnxruntime/test/python/testdata/node/test_col2im_dilations/test_data_set_0/input_2.pb new file mode 100644 index 
0000000000000000000000000000000000000000..ea04f67ddf5b80dd13a9f42589cd7104b5e46f7a GIT binary patch literal 35 fcmd;J5@2`YPRhwo&Wm@T6DQ&$b=c0U)Xk;@xhu+-7W|J8_u|5#FQsW-_*9pD@(pf jbbB;7=A0W!?s;bJTeB^g^T`iex*CU^a6!Qx54_}m`4%=I literal 0 HcmV?d00001 diff --git a/onnxruntime/test/python/testdata/node/test_col2im_pads/test_data_set_0/input_1.pb b/onnxruntime/test/python/testdata/node/test_col2im_pads/test_data_set_0/input_1.pb new file mode 100644 index 0000000000000000000000000000000000000000..e2e47c174ce48b0b6cc775ccbad84426c3925a39 GIT binary patch literal 35 fcmd;J5@2`Y&dg0rPmM3mNGwS85@2P302mDbe=`L} literal 0 HcmV?d00001 diff --git a/onnxruntime/test/python/testdata/node/test_col2im_pads/test_data_set_0/input_2.pb b/onnxruntime/test/python/testdata/node/test_col2im_pads/test_data_set_0/input_2.pb new file mode 100644 index 0000000000000000000000000000000000000000..c0b7595628c4bb8bd1859c490f6242ca6bdbf7cc GIT binary patch literal 35 gcmd;J5@2`YPRhwo&W&l+oNV1vjM r;ppbc&GjHDmpl$!=n)svf(<)5thn*ug~y4clu~5)pJ;^!Eu!)RQ05ig literal 0 HcmV?d00001 diff --git a/onnxruntime/test/python/testdata/node/test_col2im_strides/test_data_set_0/input_0.pb b/onnxruntime/test/python/testdata/node/test_col2im_strides/test_data_set_0/input_0.pb new file mode 100644 index 0000000000000000000000000000000000000000..f33a7620e97e8b2934587759212fbf1350d5effd GIT binary patch literal 162 qcmd;J=_tH!59V*5I{E@RXG6prw@7n literal 0 HcmV?d00001 diff --git a/onnxruntime/test/python/testdata/node/test_col2im_strides/test_data_set_0/input_1.pb b/onnxruntime/test/python/testdata/node/test_col2im_strides/test_data_set_0/input_1.pb new file mode 100644 index 0000000000000000000000000000000000000000..e2e47c174ce48b0b6cc775ccbad84426c3925a39 GIT binary patch literal 35 fcmd;J5@2`Y&dg0rPmM3mNGwS85@2P302mDbe=`L} literal 0 HcmV?d00001 diff --git a/onnxruntime/test/python/testdata/node/test_col2im_strides/test_data_set_0/input_2.pb b/onnxruntime/test/python/testdata/node/test_col2im_strides/test_data_set_0/input_2.pb new file mode 100644 index 0000000000000000000000000000000000000000..19b497c93ccceed2813a63a90e568d62835d8ed1 GIT binary patch literal 35 fcmd;J5@2`YPRhwo&W Date: Wed, 3 Aug 2022 16:46:35 -0400 Subject: [PATCH 07/30] Added Tests for 4D and 5D images 4d col2im works, 5d and higher doesn't --- .../core/providers/cpu/tensor/col2im.cc | 50 ++++++++---- onnxruntime/core/util/math_cpu.cc | 2 +- onnxruntime/test/contrib_ops/col2im_test.cc | 76 ++++++++++++++++++- 3 files changed, 110 insertions(+), 18 deletions(-) diff --git a/onnxruntime/core/providers/cpu/tensor/col2im.cc b/onnxruntime/core/providers/cpu/tensor/col2im.cc index 4c3f999766f5e..f1daf277c2d0e 100644 --- a/onnxruntime/core/providers/cpu/tensor/col2im.cc +++ b/onnxruntime/core/providers/cpu/tensor/col2im.cc @@ -66,20 +66,42 @@ Status Col2Im::Compute(OpKernelContext* context) const { std::cout << "\n\tStatus Col2Im::Compute() --> math::Col2imNd<>()" << std::endl; - math::Col2imNd( - col_input_data, // const T* data_col, - image_shape->Data(), // const int64_t* img_shape, - Yshape.Slice(2).GetDims().data(), // const int64_t* output_shape, - col_input_C, // int64_t channels_col, --> output_num_channels * kernel_shape_size - image_shape_size, // int64_t img_size, - kernel_shape->Data(), // const int64_t* kernel_shape, - col2im_attrs_.strides.data(), // const int64_t* stride, - col2im_attrs_.dilations.data(), // const int64_t* dilation, - col2im_attrs_.pads.data(), // const int64_t* pad, - kernel_shape->Shape().Size(), // ptrdiff_t N, --> number of spatial dims for image - 
Ydata, // T* data_img, - &CPUMathUtil::Instance() // Provider* provider - ); + if (image_shape->Shape()[0] == 2) { + std::cout << "image_shape->Shape()[0] == 2 --> Col2Im" << std::endl; + math::Col2im( + col_input_data, + col_input_C, + image_shape->Data()[0], + image_shape->Data()[1], + kernel_shape->Data()[0], + kernel_shape->Data()[1], + col2im_attrs_.dilations[0], + col2im_attrs_.dilations[1], + col2im_attrs_.pads[0], + col2im_attrs_.pads[1], + col2im_attrs_.pads[2], + col2im_attrs_.pads[3], + col2im_attrs_.strides[0], + col2im_attrs_.strides[1], + Ydata, + &CPUMathUtil::Instance()); + } else { + std::cout << "image_shape->Shape()[0] != 2 --> Col2ImNd (nd=" << image_shape->Shape()[0] << ") " << std::endl; + math::Col2imNd( + col_input_data, // const T* data_col, + image_shape->Data(), // const int64_t* img_shape, + Yshape.Slice(2).GetDims().data(), // const int64_t* output_shape, + col_input_C, // int64_t channels_col, --> output_num_channels * kernel_shape_size + image_shape_size, // int64_t img_size, + kernel_shape->Data(), // const int64_t* kernel_shape, + col2im_attrs_.strides.data(), // const int64_t* stride, + col2im_attrs_.dilations.data(), // const int64_t* dilation, + col2im_attrs_.pads.data(), // const int64_t* pad, + kernel_shape->Shape().Size(), // ptrdiff_t N, --> number of spatial dims for image + Ydata, // T* data_img, + &CPUMathUtil::Instance() // Provider* provider + ); + } std::cout << "\n\n Return Col2Im::Compute() --> "; for (auto i=0; i < Yshape.Size(); ++i) std::cout << Ydata[i] << ", "; std::cout << ") with shape " << Yshape << std::endl << std::endl; return Status::OK(); diff --git a/onnxruntime/core/util/math_cpu.cc b/onnxruntime/core/util/math_cpu.cc index 05b265715e407..709941a819f79 100644 --- a/onnxruntime/core/util/math_cpu.cc +++ b/onnxruntime/core/util/math_cpu.cc @@ -384,7 +384,7 @@ void Im2col::operator()( std::cout << ",\n\tconst int64_t* kernel_shape={"; for (auto i=0; i < rank; ++i) std::cout << kernel_shape[i] << ", "; std::cout << "}"; std::cout << ",\n\tconst int64_t* stride={"; for (auto i=0; i < rank; ++i) std::cout << stride[i] << ", "; std::cout << "}"; std::cout << ",\n\tconst int64_t* dilation={"; for (auto i=0; i < rank; ++i) std::cout << dilation[i] << ", "; std::cout << "}"; - std::cout << ",\n\tconst int64_t* pad={"; for (auto i=0; i < rank; ++i) std::cout << pad[i] << ", "; std::cout << "}"; + std::cout << ",\n\tconst int64_t* pad={"; for (auto i=0; i < 2*rank; ++i) std::cout << pad[i] << ", "; std::cout << "}"; std::cout << ",\n\tptrdiff_t rank=" << rank; std::cout << ",\n\tT* data_col= preallocated pointer to write at {"; for (auto i=0; i < output_shape_size; ++i) std::cout << data_col[i] << ", "; std::cout << "}"; std::cout << ",\n\tbool accumulate_output=" << accumulate_output; diff --git a/onnxruntime/test/contrib_ops/col2im_test.cc b/onnxruntime/test/contrib_ops/col2im_test.cc index 2a1b692673fc5..f98a2d754d049 100644 --- a/onnxruntime/test/contrib_ops/col2im_test.cc +++ b/onnxruntime/test/contrib_ops/col2im_test.cc @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
+#include #include "gtest/gtest.h" #include "test/providers/provider_test_utils.h" #include "core/util/math.h" @@ -8,18 +9,87 @@ namespace onnxruntime { namespace test { -TEST(Col2ImContribOpTest, simple) { +template +std::vector _transpose_1dvector(std::vector &input, size_t C, size_t H, size_t W) +{ + size_t n = input.size(); + if (n == 0){ + throw std::runtime_error("Invalid input"); + } + std::vector trans_vec(input); + + std::cout << "input: ("; + for(size_t i = 0; i < n; ++i) + std::cout << trans_vec[i] << ", "; + std::cout << ")" << std::endl; + + for(size_t c = 0; c < C; ++c) + for(size_t i = 0; i < H; ++i) + for(size_t j = i+1; j < W; ++j) + std::swap(trans_vec[c*(H*W) + (H*i + j)], trans_vec[c*(H*W) + (W*j + i)]); + + std::cout << "trans_vec: ("; + for(size_t i = 0; i < n; ++i) + std::cout << trans_vec[i] << ", "; + std::cout << ")" << std::endl; + + return trans_vec; +} + +TEST(Col2ImContribOpTest, simple4dNCHW) { OpTester test("Col2Im", 1, kMSDomain); test.AddAttribute("strides", std::vector{1, 1}); test.AddAttribute("dilations", std::vector{1, 1}); test.AddAttribute("pads", std::vector{0, 0, 0, 0}); - test.AddInput("input", {1, 5, 5}, std::vector{1.f, 6.f, 11.f, 16.f, 21.f, 2.f, 7.f, 12.f, 17.f, 22.f, 3.f, 8.f, 13.f, 18.f, 23.f, 4.f, 9.f, 14.f, 19.f, 24.f, 5.f, 0.f, 15.f, 20.f, 25.f}); + std::vector input(25); + std::vector output(25); + std::iota(output.begin(), output.end(), 1); + input = _transpose_1dvector(output, 1, 5, 5); + test.AddInput("input", {1, 5, 5}, input); test.AddInput("image_shape", {2}, std::vector{5, 5}); test.AddInput("block_shape", {2}, std::vector{1, 5}); - test.AddOutput("output", {1, 1, 5, 5}, std::vector{1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f, 17.f, 18.f, 19.f, 20.f, 21.f, 22.f, 23.f, 24.f, 25.f}); + test.AddOutput("output", {1, 1, 5, 5}, output); + test.Run(); +} + +TEST(Col2ImContribOpTest, with3channels4dNCHW) { + OpTester test("Col2Im", 1, kMSDomain); + + test.AddAttribute("strides", std::vector{1, 1}); + test.AddAttribute("dilations", std::vector{1, 1}); + test.AddAttribute("pads", std::vector{0, 0, 0, 0}); + + std::vector input(75); + std::vector output(75); + std::iota(output.begin(), output.end(), 1); + input = _transpose_1dvector(output, 3, 5, 5); + test.AddInput("input", {1, 15, 5}, input); + test.AddInput("image_shape", {2}, std::vector{5, 5}); + test.AddInput("block_shape", {2}, std::vector{1, 5}); + + test.AddOutput("output", {1, 3, 5, 5}, output); + test.Run(); +} + +TEST(Col2ImContribOpTest, simple5dNCHWD) { + OpTester test("Col2Im", 1, kMSDomain); + + test.AddAttribute("strides", std::vector{1, 1, 1}); + test.AddAttribute("dilations", std::vector{1, 1, 1}); + test.AddAttribute("pads", std::vector{0, 0, 0, 0, 0, 0}); + + std::vector input(25); + std::vector output(25); + std::iota(output.begin(), output.end(), 1); + input = _transpose_1dvector(output, 1, 5, 5); + test.AddInput("input", {1, 5, 5}, input); + test.AddInput("image_shape", {3}, std::vector{1, 5, 5}); + test.AddInput("block_shape", {3}, std::vector{1, 1, 5}); + + test.AddOutput("output", {1, 1, 1, 5, 5}, output); test.Run(); } From d00f2c5c0af888b407251d4cb81f7b759a9e29e1 Mon Sep 17 00:00:00 2001 From: Thiago Crepaldi Date: Thu, 4 Aug 2022 17:36:52 -0400 Subject: [PATCH 08/30] Add support to N>1 --- .../core/providers/cpu/tensor/col2im.cc | 84 +++++++++++-------- onnxruntime/test/contrib_ops/col2im_test.cc | 44 +++++++--- 2 files changed, 80 insertions(+), 48 deletions(-) diff --git 
a/onnxruntime/core/providers/cpu/tensor/col2im.cc b/onnxruntime/core/providers/cpu/tensor/col2im.cc index f1daf277c2d0e..fc514c453ae43 100644 --- a/onnxruntime/core/providers/cpu/tensor/col2im.cc +++ b/onnxruntime/core/providers/cpu/tensor/col2im.cc @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include #include "core/providers/cpu/tensor/col2im.h" #include "core/framework/element_type_lists.h" @@ -36,12 +37,15 @@ Status Col2Im::Compute(OpKernelContext* context) const { int64_t image_shape_size = 1; int64_t kernel_shape_size = 1; + int64_t kernel_shape_rank = 0; for (auto i=0; i < image_shape->Shape().Size(); ++i) { + ++kernel_shape_rank; image_shape_size *= image_shape->Data()[i]; kernel_shape_size *= kernel_shape->Data()[i]; // col_input_C computed as => (C*n-ary-prod{kernel_shape}) / n-ary-prod{kernel_shape} col_input_C /= kernel_shape->Data()[i]; } + const int64_t col_input_offset = col_input_C * image_shape_size; TensorShapeVector Y_dims; Y_dims.insert(Y_dims.begin(), {col_input_N, col_input_C}); @@ -51,8 +55,11 @@ Status Col2Im::Compute(OpKernelContext* context) const { TensorShape Yshape(Y_dims); Tensor* Y = context->Output(0, Yshape); T* Ydata = Y->template MutableData(); + for (auto i=0; i < Yshape.Size(); ++i) + Ydata[i] = -1; // just for debug (to know what has been written to Ydata in the end) + const int64_t Y_offset = Yshape.Size() / Yshape[0]; - std::cout << "\n\tInput 0: col_input = ("; for (auto i=0; i < Yshape.Size(); ++i) std::cout << col_input_data[i] << ", "; std::cout << ") with shape "<< Yshape << std::endl; + std::cout << "\n\tInput 0: col_input = ("; for (auto i=0; i < Yshape.Size(); ++i) std::cout << col_input_data[i] << ", "; std::cout << ") with shape "<< col_input_shape << std::endl; std::cout << "\tInput 1: image_shape = ("; for (auto i=0; i < image_shape->Shape().Size(); ++i) std::cout << image_shape->Data()[i] << ", "; std::cout << ")" << std::endl; std::cout << "\tInput 2: kernel_shape = ("; for (auto i=0; i < kernel_shape->Shape().Size(); ++i) std::cout << kernel_shape->Data()[i] << ", "; std::cout << ")" << std::endl; std::cout << "\tAttribute strides = ("; for (size_t i=0; i < col2im_attrs_.strides.size(); ++i) std::cout << col2im_attrs_.strides[i] << ", "; std::cout << ")"<< std::endl; @@ -66,41 +73,46 @@ Status Col2Im::Compute(OpKernelContext* context) const { std::cout << "\n\tStatus Col2Im::Compute() --> math::Col2imNd<>()" << std::endl; - if (image_shape->Shape()[0] == 2) { - std::cout << "image_shape->Shape()[0] == 2 --> Col2Im" << std::endl; - math::Col2im( - col_input_data, - col_input_C, - image_shape->Data()[0], - image_shape->Data()[1], - kernel_shape->Data()[0], - kernel_shape->Data()[1], - col2im_attrs_.dilations[0], - col2im_attrs_.dilations[1], - col2im_attrs_.pads[0], - col2im_attrs_.pads[1], - col2im_attrs_.pads[2], - col2im_attrs_.pads[3], - col2im_attrs_.strides[0], - col2im_attrs_.strides[1], - Ydata, - &CPUMathUtil::Instance()); - } else { - std::cout << "image_shape->Shape()[0] != 2 --> Col2ImNd (nd=" << image_shape->Shape()[0] << ") " << std::endl; - math::Col2imNd( - col_input_data, // const T* data_col, - image_shape->Data(), // const int64_t* img_shape, - Yshape.Slice(2).GetDims().data(), // const int64_t* output_shape, - col_input_C, // int64_t channels_col, --> output_num_channels * kernel_shape_size - image_shape_size, // int64_t img_size, - kernel_shape->Data(), // const int64_t* kernel_shape, - col2im_attrs_.strides.data(), // const int64_t* stride, - 
col2im_attrs_.dilations.data(), // const int64_t* dilation, - col2im_attrs_.pads.data(), // const int64_t* pad, - kernel_shape->Shape().Size(), // ptrdiff_t N, --> number of spatial dims for image - Ydata, // T* data_img, - &CPUMathUtil::Instance() // Provider* provider - ); + assert(image_shape_size == Y_offset); // just for temp debug + + for (auto image_id = 0; image_id < col_input_N; ++image_id) { + std::cout << "Image " << image_id+1 << " out of "<< col_input_N << std::endl; + if (image_shape->Shape()[0] == 2) { + std::cout << "image_shape->Shape()[0] == 2 --> Col2Im" << std::endl; + math::Col2im( + col_input_data + image_id * col_input_offset, + col_input_C, + image_shape->Data()[0], + image_shape->Data()[1], + kernel_shape->Data()[0], + kernel_shape->Data()[1], + col2im_attrs_.dilations[0], + col2im_attrs_.dilations[1], + col2im_attrs_.pads[0], + col2im_attrs_.pads[1], + col2im_attrs_.pads[2], + col2im_attrs_.pads[3], + col2im_attrs_.strides[0], + col2im_attrs_.strides[1], + Ydata + image_id * Y_offset, + &CPUMathUtil::Instance()); + } else { + std::cout << "image_shape->Shape()[0] != 2 --> Col2ImNd (nd=" << image_shape->Shape()[0] << ") " << std::endl; + math::Col2imNd( + col_input_data + image_id * col_input_offset, // const T* data_col, + image_shape->Data(), // const int64_t* img_shape, + Yshape.Slice(2).GetDims().data(), // const int64_t* output_shape, + col_input_C, // int64_t channels_col, + image_shape_size, // int64_t img_size, + kernel_shape->Data(), // const int64_t* kernel_shape, + col2im_attrs_.strides.data(), // const int64_t* stride, + col2im_attrs_.dilations.data(), // const int64_t* dilation, + col2im_attrs_.pads.data(), // const int64_t* pad, + kernel_shape->Shape().Size(), // ptrdiff_t N, --> #spatial_dims? + Ydata + image_id * Y_offset, // T* data_img, + &CPUMathUtil::Instance() // Provider* provider + ); + } } std::cout << "\n\n Return Col2Im::Compute() --> "; for (auto i=0; i < Yshape.Size(); ++i) std::cout << Ydata[i] << ", "; std::cout << ") with shape " << Yshape << std::endl << std::endl; diff --git a/onnxruntime/test/contrib_ops/col2im_test.cc b/onnxruntime/test/contrib_ops/col2im_test.cc index f98a2d754d049..6dacfa9f02301 100644 --- a/onnxruntime/test/contrib_ops/col2im_test.cc +++ b/onnxruntime/test/contrib_ops/col2im_test.cc @@ -10,26 +10,27 @@ namespace onnxruntime { namespace test { template -std::vector _transpose_1dvector(std::vector &input, size_t C, size_t H, size_t W) +std::vector _transpose_serialized_vector(std::vector &input, size_t N, size_t C, size_t H, size_t W) { - size_t n = input.size(); - if (n == 0){ + size_t input_size = input.size(); + if (input_size == 0){ throw std::runtime_error("Invalid input"); } std::vector trans_vec(input); std::cout << "input: ("; - for(size_t i = 0; i < n; ++i) + for(size_t i = 0; i < input_size; ++i) std::cout << trans_vec[i] << ", "; std::cout << ")" << std::endl; - for(size_t c = 0; c < C; ++c) - for(size_t i = 0; i < H; ++i) - for(size_t j = i+1; j < W; ++j) - std::swap(trans_vec[c*(H*W) + (H*i + j)], trans_vec[c*(H*W) + (W*j + i)]); + for(size_t n = 0; n < N; ++n) + for(size_t c = 0; c < C; ++c) + for(size_t i = 0; i < H; ++i) + for(size_t j = i+1; j < W; ++j) + std::swap(trans_vec[n*(C*H*W) + c*(H*W) + (H*i + j)], trans_vec[n*(C*H*W) + c*(H*W) + (W*j + i)]); std::cout << "trans_vec: ("; - for(size_t i = 0; i < n; ++i) + for(size_t i = 0; i < input_size; ++i) std::cout << trans_vec[i] << ", "; std::cout << ")" << std::endl; @@ -46,7 +47,7 @@ TEST(Col2ImContribOpTest, simple4dNCHW) { std::vector 
input(25); std::vector output(25); std::iota(output.begin(), output.end(), 1); - input = _transpose_1dvector(output, 1, 5, 5); + input = _transpose_serialized_vector(output, 1, 1, 5, 5); test.AddInput("input", {1, 5, 5}, input); test.AddInput("image_shape", {2}, std::vector{5, 5}); test.AddInput("block_shape", {2}, std::vector{1, 5}); @@ -65,7 +66,7 @@ TEST(Col2ImContribOpTest, with3channels4dNCHW) { std::vector input(75); std::vector output(75); std::iota(output.begin(), output.end(), 1); - input = _transpose_1dvector(output, 3, 5, 5); + input = _transpose_serialized_vector(output, 1, 3, 5, 5); test.AddInput("input", {1, 15, 5}, input); test.AddInput("image_shape", {2}, std::vector{5, 5}); test.AddInput("block_shape", {2}, std::vector{1, 5}); @@ -74,6 +75,25 @@ TEST(Col2ImContribOpTest, with3channels4dNCHW) { test.Run(); } +TEST(Col2ImContribOpTest, with2Images3channels4dNCHW) { + OpTester test("Col2Im", 1, kMSDomain); + + test.AddAttribute("strides", std::vector{1, 1}); + test.AddAttribute("dilations", std::vector{1, 1}); + test.AddAttribute("pads", std::vector{0, 0, 0, 0}); + + std::vector input(150); + std::vector output(150); + std::iota(output.begin(), output.end(), 1); + input = _transpose_serialized_vector(output, 2, 3, 5, 5); + test.AddInput("input", {2, 15, 5}, input); + test.AddInput("image_shape", {2}, std::vector{5, 5}); + test.AddInput("block_shape", {2}, std::vector{1, 5}); + + test.AddOutput("output", {2, 3, 5, 5}, output); + test.Run(); +} + TEST(Col2ImContribOpTest, simple5dNCHWD) { OpTester test("Col2Im", 1, kMSDomain); @@ -84,7 +104,7 @@ TEST(Col2ImContribOpTest, simple5dNCHWD) { std::vector input(25); std::vector output(25); std::iota(output.begin(), output.end(), 1); - input = _transpose_1dvector(output, 1, 5, 5); + input = _transpose_serialized_vector(output, 1, 1, 5, 5); test.AddInput("input", {1, 5, 5}, input); test.AddInput("image_shape", {3}, std::vector{1, 5, 5}); test.AddInput("block_shape", {3}, std::vector{1, 1, 5}); From 5b27cb1cfb679c3c2b805ad4c923a62c10f718bb Mon Sep 17 00:00:00 2001 From: Thiago Crepaldi Date: Tue, 9 Aug 2022 18:35:48 -0400 Subject: [PATCH 09/30] Add logs and Rama's suggestions --- .../core/graph/contrib_ops/contrib_defs.cc | 9 +- .../core/providers/cpu/tensor/col2im.cc | 71 ++++++---- onnxruntime/core/util/math_cpu.cc | 127 +++++++++++++----- onnxruntime/test/contrib_ops/col2im_test.cc | 69 ++++++++-- .../python/contrib_ops/onnx_test_col2im.py | 4 +- 5 files changed, 204 insertions(+), 76 deletions(-) diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc index 8fae65f390e4b..cc992a7dfbdbe 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc @@ -1031,7 +1031,7 @@ ONNX_MS_OPERATOR_SET_SCHEMA(Col2Im, 1, "The value represent the number of pixels added to the beginning " "and end part of the corresponding axis. `pads` format should be as follow " "[x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin is the number of pixels " - "added at the beginning of axis `i` and xi_end is the number of pixels added at the end of axis `i`. " + "added at the beginning of axis `i` and xi_end the same for the end of axis `i`. " "If not present, the padding defaults to 0 along start and end of each spatial axis.", AttributeProto::INTS, OPTIONAL_VALUE) @@ -1056,7 +1056,7 @@ ONNX_MS_OPERATOR_SET_SCHEMA(Col2Im, 1, 1, "image_shape", "The shape of the spatial dimensions of the image after rearranging the column blocks." 
- "This is a 1-dimensional tensor with size of at least 2, containing the value [H_img, W_img] " + "This is a 1-dim tensor with size of at least 2, containing the value [H_img, W_img] " " for a 2-D image or [dim_i1, dim_i2, ..., dim_iN] for a N-D image.", "tensor(int64)", OpSchema::Single, @@ -1067,8 +1067,9 @@ ONNX_MS_OPERATOR_SET_SCHEMA(Col2Im, 1, 2, "block_shape", "The shape of the block to apply on the input." - "This is a 1-dimensional tensor of size of at least 2, containing the value [H_block, W_block] " - " for a 2-D image or [dim_b1, dim_b2, ..., dim_bN] for a N-D block.", + "This is a 1-dim tensor of size of at least 2, containing the value [H_block, W_block] " + " for a 2-D image or [dim_b1, dim_b2, ..., dim_bN] for a N-D block." + "Dilations, pads and strides are applied to block_shape under the hood.", "tensor(int64)", OpSchema::Single, true, diff --git a/onnxruntime/core/providers/cpu/tensor/col2im.cc b/onnxruntime/core/providers/cpu/tensor/col2im.cc index fc514c453ae43..51af71364ca15 100644 --- a/onnxruntime/core/providers/cpu/tensor/col2im.cc +++ b/onnxruntime/core/providers/cpu/tensor/col2im.cc @@ -32,7 +32,6 @@ Status Col2Im::Compute(OpKernelContext* context) const { const T* col_input_data = col_input->template Data(); TensorShape col_input_shape = col_input->Shape(); - int64_t col_input_C = col_input_shape[1]; const auto col_input_N = col_input_shape[0]; int64_t image_shape_size = 1; @@ -42,13 +41,13 @@ Status Col2Im::Compute(OpKernelContext* context) const { ++kernel_shape_rank; image_shape_size *= image_shape->Data()[i]; kernel_shape_size *= kernel_shape->Data()[i]; - // col_input_C computed as => (C*n-ary-prod{kernel_shape}) / n-ary-prod{kernel_shape} - col_input_C /= kernel_shape->Data()[i]; } - const int64_t col_input_offset = col_input_C * image_shape_size; + const int64_t C = col_input_shape[1] / kernel_shape_size; + const int64_t col_output_stride = col_input_shape.SizeFromDimension(1); + const int64_t col_input_stride = C * image_shape_size; TensorShapeVector Y_dims; - Y_dims.insert(Y_dims.begin(), {col_input_N, col_input_C}); + Y_dims.insert(Y_dims.begin(), {col_input_N, C}); for (auto i=0; i < image_shape->Shape()[0]; ++i) { Y_dims.push_back(image_shape->Data()[i]); } @@ -56,32 +55,38 @@ Status Col2Im::Compute(OpKernelContext* context) const { Tensor* Y = context->Output(0, Yshape); T* Ydata = Y->template MutableData(); for (auto i=0; i < Yshape.Size(); ++i) - Ydata[i] = -1; // just for debug (to know what has been written to Ydata in the end) - const int64_t Y_offset = Yshape.Size() / Yshape[0]; - - std::cout << "\n\tInput 0: col_input = ("; for (auto i=0; i < Yshape.Size(); ++i) std::cout << col_input_data[i] << ", "; std::cout << ") with shape "<< col_input_shape << std::endl; - std::cout << "\tInput 1: image_shape = ("; for (auto i=0; i < image_shape->Shape().Size(); ++i) std::cout << image_shape->Data()[i] << ", "; std::cout << ")" << std::endl; - std::cout << "\tInput 2: kernel_shape = ("; for (auto i=0; i < kernel_shape->Shape().Size(); ++i) std::cout << kernel_shape->Data()[i] << ", "; std::cout << ")" << std::endl; - std::cout << "\tAttribute strides = ("; for (size_t i=0; i < col2im_attrs_.strides.size(); ++i) std::cout << col2im_attrs_.strides[i] << ", "; std::cout << ")"<< std::endl; - std::cout << "\tAttribute dilations = ("; for (size_t i=0; i < col2im_attrs_.dilations.size(); ++i) std::cout << col2im_attrs_.dilations[i] << ", "; std::cout << ")"<< std::endl; - std::cout << "\tAttribute pads = ("; for (size_t i=0; i < col2im_attrs_.pads.size(); 
++i) std::cout << col2im_attrs_.pads[i] << ", "; std::cout << ")"<< std::endl; - - std::cout << "\tVariable col_input_C: " << col_input_C << std::endl; + Ydata[i] = -1; // just for debug (to know what has been written to Ydata in the end) + // const int64_t Y_offset = Yshape.Size() / Yshape[0]; + + std::cout << "\n\tInput 0: col_input = ("; for (auto i=0; i < Yshape.Size(); ++i) std::cout << + col_input_data[i] << ", "; std::cout << ") with shape "<< col_input_shape << std::endl; + std::cout << "\tInput 1: image_shape = ("; for (auto i=0; i < image_shape->Shape().Size(); ++i) std::cout << + image_shape->Data()[i] << ", "; std::cout << ")" << std::endl; + std::cout << "\tInput 2: kernel_shape = ("; for (auto i=0; i < kernel_shape->Shape().Size(); ++i) std::cout << + kernel_shape->Data()[i] << ", "; std::cout << ")" << std::endl; + std::cout << "\tAttribute strides = ("; for (size_t i=0; i < col2im_attrs_.strides.size(); ++i) std::cout << + col2im_attrs_.strides[i] << ", "; std::cout << ")"<< std::endl; + std::cout << "\tAttribute dilations = ("; for (size_t i=0; i < col2im_attrs_.dilations.size(); ++i) std::cout << + col2im_attrs_.dilations[i] << ", "; std::cout << ")"<< std::endl; + std::cout << "\tAttribute pads = ("; for (size_t i=0; i < col2im_attrs_.pads.size(); ++i) std::cout << + col2im_attrs_.pads[i] << ", "; std::cout << ")"<< std::endl; + + std::cout << "\tVariable C: " << C << std::endl; std::cout << "\tVariable col_input_N = " << col_input_N << std::endl; std::cout << "\tVariable image_shape_size: " << image_shape_size << std::endl; std::cout << "\tVariable kernel_shape_size: " << kernel_shape_size << std::endl; std::cout << "\n\tStatus Col2Im::Compute() --> math::Col2imNd<>()" << std::endl; - assert(image_shape_size == Y_offset); // just for temp debug + assert(image_shape_size == Y_offset); // just for temp debug for (auto image_id = 0; image_id < col_input_N; ++image_id) { std::cout << "Image " << image_id+1 << " out of "<< col_input_N << std::endl; if (image_shape->Shape()[0] == 2) { std::cout << "image_shape->Shape()[0] == 2 --> Col2Im" << std::endl; math::Col2im( - col_input_data + image_id * col_input_offset, - col_input_C, + col_input_data + image_id * col_output_stride, + C, image_shape->Data()[0], image_shape->Data()[1], kernel_shape->Data()[0], @@ -94,27 +99,41 @@ Status Col2Im::Compute(OpKernelContext* context) const { col2im_attrs_.pads[3], col2im_attrs_.strides[0], col2im_attrs_.strides[1], - Ydata + image_id * Y_offset, + Ydata + image_id * col_input_stride, &CPUMathUtil::Instance()); } else { std::cout << "image_shape->Shape()[0] != 2 --> Col2ImNd (nd=" << image_shape->Shape()[0] << ") " << std::endl; math::Col2imNd( - col_input_data + image_id * col_input_offset, // const T* data_col, + col_input_data + image_id * col_output_stride, // const T* data_col, image_shape->Data(), // const int64_t* img_shape, Yshape.Slice(2).GetDims().data(), // const int64_t* output_shape, - col_input_C, // int64_t channels_col, + // col_input_shape[1], // int64_t channels_col, + // leads to output + // {1, -nan, 11, 16, 2.58141e+34, 2, 8.80295e+34, 12, 17, 22, 3, 4.59718e+24, 13, 18, 2.85144e+34, 4, + // -443.863, 14, -nan, 24, 5, 10, 15, 20, 25} + // that is similar to input with some spots with random values + + C, // int64_t channels_col, + // leads to output {1, 6, 11, 16, 21, 2, 7, 12, 17, 22, 3, 8, 13, 18, 23, 4, 9, 14, 19, 24, 5, 10, 15, 20, 25, } + // that is identical to input + + // col_input_shape[2], // int64_t channels_col, + // leads to output + // {1, 6, 1.92869e+31, 
4.84145e+30, 1.88774e+31, 2, 7, 12, 17, 22, 3, 8, 1.86549e+31, 3.40686e+25, + // 2.20182e+24, 4, -2.56655e+29, 5.08551e+31, -1.05888e+29, 1.51107e+29, 5, 10, 15, 20, 7.2793e+31} + // that is very similar to input, but with some rounded numbers and corrupted "25" value image_shape_size, // int64_t img_size, kernel_shape->Data(), // const int64_t* kernel_shape, col2im_attrs_.strides.data(), // const int64_t* stride, col2im_attrs_.dilations.data(), // const int64_t* dilation, col2im_attrs_.pads.data(), // const int64_t* pad, kernel_shape->Shape().Size(), // ptrdiff_t N, --> #spatial_dims? - Ydata + image_id * Y_offset, // T* data_img, - &CPUMathUtil::Instance() // Provider* provider - ); + Ydata + image_id * col_input_stride, // T* data_img, + &CPUMathUtil::Instance()); // Provider* provider } } - std::cout << "\n\n Return Col2Im::Compute() --> "; for (auto i=0; i < Yshape.Size(); ++i) std::cout << Ydata[i] << ", "; std::cout << ") with shape " << Yshape << std::endl << std::endl; + std::cout << "\n\n Return Col2Im::Compute() --> "; for (auto i=0; i < Yshape.Size(); ++i) std::cout << + Ydata[i] << ", "; std::cout << ") with shape " << Yshape << std::endl << std::endl; return Status::OK(); } diff --git a/onnxruntime/core/util/math_cpu.cc b/onnxruntime/core/util/math_cpu.cc index 709941a819f79..3b5fa0a549af9 100644 --- a/onnxruntime/core/util/math_cpu.cc +++ b/onnxruntime/core/util/math_cpu.cc @@ -36,6 +36,8 @@ using onnxruntime::concurrency::ThreadPool; namespace onnxruntime { namespace math { +using std::cout; + // MatMul implementation purely based on Eigen. #define EIGEN_MATMUL_FUNCTION(T) \ template <> \ @@ -308,23 +310,56 @@ void Im2col::operator()( int64_t stride_w, T* data_col, T padding_value) { + + cout << "void Im2col::operator()("; + cout << "\n\tconst T* data_im=" << data_im; + cout << "\n\tint64_t channels=" << channels; + cout << "\n\tint64_t heigh=" << height; + cout << "\n\tint64_t width=" << width; + cout << "\n\tint64_t kernel_h=" << kernel_h; + cout << "\n\tint64_t kernel_w=" << kernel_w; + cout << "\n\tint64_t dilation_h=" << dilation_h; + cout << "\n\tint64_t dilation_w=" << dilation_w; + cout << "\n\tint64_t pad_t=" << pad_t; + cout << "\n\tint64_t pad_l=" << pad_l; + cout << "\n\tint64_t pad_b=" << pad_b; + cout << "\n\tint64_t pad_r=" << pad_r; + cout << "\n\tint64_t stride_h=" << stride_h; + cout << "\n\tint64_t stride_w=" << stride_w; + cout << "\n\tT* data_col=" << data_col; + cout << "\n\tT padding_value=" << padding_value << ")" << std::endl; + + int data_col_offset = 0; const int64_t output_h = (height + pad_b + pad_t - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; const int64_t output_w = (width + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; - + cout << "output_h: " << output_h << std::endl; + cout << "output_w: " << output_w << std::endl; // From Intel, https://github.com/BVLC/caffe/pull/3536 int64_t channel_size = height * width; + cout << "channel_size (height * width): " << channel_size << std::endl; for (int64_t channel = channels; channel--; data_im += channel_size) { + cout << "for channel= " << channel << "/channel_size=" << channel_size << std::endl; for (int64_t kernel_row = 0; kernel_row < kernel_h; kernel_row++) { + cout << "\tfor kernel_row= " << kernel_row << "/kernel_h=" << kernel_h << std::endl; for (int64_t kernel_col = 0; kernel_col < kernel_w; kernel_col++) { + cout << "\t\tfor kernel_col= " << kernel_col << "/kernel_w=" << kernel_w << std::endl; int64_t input_row = -pad_t + kernel_row * dilation_h; + cout << 
"\t\t\tinput_row= " << input_row << std::endl; for (int64_t output_rows = output_h; output_rows; output_rows--) { + cout << "\t\t\tfor output_rows= " << output_rows << "/output_h=" << output_h << std::endl; if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { std::fill_n(data_col, output_w, padding_value); + cout << "\t\t\t\t(input_row out of bounds) data_col["<< data_col_offset << "] = " << padding_value << + " * " << output_w << " times"<< std::endl; + data_col_offset += output_w; data_col += output_w; } else { int64_t input_col = -pad_l + kernel_col * dilation_w; + cout << "\t\t\t\tinput_col= " << input_col << std::endl; + cout << "\t\t\t\tinput_pos= " << input_row * width + input_col << std::endl; const T* rdptr = data_im + input_row * width + input_col; for (int64_t i = 0; i < output_w;) { + cout << "\t\t\t\tfor i= " << i << "/output_w=" << output_w << std::endl; int64_t output_handled = 1; if (is_a_ge_zero_and_a_lt_b(input_col, width)) { if (stride_w == 1) { @@ -332,6 +367,10 @@ void Im2col::operator()( // and the number of output elements to produce. output_handled = std::min(width - input_col, output_w - i); data_col = std::copy_n(&rdptr[i], static_cast(output_handled), data_col); + cout << "\t\t\t\t\tdata_col["<< data_col_offset << "] = " << rdptr[i] << std::endl; + data_col_offset += output_handled; + // cout << "\t\t\t\t\t(stride 1) Copied " << output_handled << + // " element(s) from data_im to data_col: " << rdptr[i] << std::endl; } else if (stride_w == 2) { // Same as above except using the number of strided input elements. output_handled = std::min((width - input_col + 1) / 2, output_w - i); @@ -340,11 +379,16 @@ void Im2col::operator()( *(data_col++) = *local_rdptr; local_rdptr += 2; } + cout << "\t\t\t\t\t(stride 2) Copy " << output_handled + << " elements from data_im to data_col " << std::endl; } else { *(data_col++) = rdptr[i * stride_w]; + cout << "\t\t\t\t\t(stride >2) Copy 1 element from data_im to data_col " << std::endl; } } else { *(data_col++) = padding_value; + cout << "\t\t\t\t\t(input_col out of bounds) fill data_col with 1 padding_value= " << + padding_value << std::endl; } input_col += output_handled * stride_w; i += output_handled; @@ -376,21 +420,30 @@ void Im2col::operator()( int64_t output_shape_size = std::accumulate(output_shape, output_shape + rank, 1LL, std::multiplies()); int64_t kernel_shape_size = std::accumulate(kernel_shape, kernel_shape + rank, 1LL, std::multiplies()); - std::cout << "\n\nCalled void Im2col::operator()("; - std::cout << ",\n\tconst T* data_im={"; for (auto i=0; i < im_shape_size; ++i) std::cout << data_im[i] << ", "; std::cout << "}"; - std::cout << ",\n\tconst int64_t* im_shape={"; for (auto i=0; i < rank; ++i) std::cout << im_shape[i] << ", "; std::cout << "}"; - std::cout << ",\n\tconst int64_t* output_shape={"; for (auto i=0; i < rank; ++i) std::cout << output_shape[i] << ", "; std::cout << "}"; - std::cout << ",\n\tint64_t channels_col=" << channels_col; - std::cout << ",\n\tconst int64_t* kernel_shape={"; for (auto i=0; i < rank; ++i) std::cout << kernel_shape[i] << ", "; std::cout << "}"; - std::cout << ",\n\tconst int64_t* stride={"; for (auto i=0; i < rank; ++i) std::cout << stride[i] << ", "; std::cout << "}"; - std::cout << ",\n\tconst int64_t* dilation={"; for (auto i=0; i < rank; ++i) std::cout << dilation[i] << ", "; std::cout << "}"; - std::cout << ",\n\tconst int64_t* pad={"; for (auto i=0; i < 2*rank; ++i) std::cout << pad[i] << ", "; std::cout << "}"; - std::cout << ",\n\tptrdiff_t rank=" << rank; - std::cout << 
",\n\tT* data_col= preallocated pointer to write at {"; for (auto i=0; i < output_shape_size; ++i) std::cout << data_col[i] << ", "; std::cout << "}"; - std::cout << ",\n\tbool accumulate_output=" << accumulate_output; - std::cout << ",\n\tT padding_value=" << padding_value << ")"; - - std::cout << "\n\n\tVariable im_shape_size: " << im_shape_size << "\n\tVariable output_shape_size: "<::operator()("; + cout << ",\n\tconst T* data_im={"; for (auto i=0; i < im_shape_size; ++i) cout << data_im[i] << + ", "; cout << "}"; + cout << ",\n\tconst int64_t* im_shape={"; for (auto i=0; i < rank; ++i) cout << im_shape[i] << + ", "; cout << "}"; + cout << ",\n\tconst int64_t* output_shape={"; for (auto i=0; i < rank; ++i) cout << output_shape[i] << + ", "; cout << "}"; + cout << ",\n\tint64_t channels_col=" << channels_col; + cout << ",\n\tconst int64_t* kernel_shape={"; for (auto i=0; i < rank; ++i) cout << kernel_shape[i] << + ", "; cout << "}"; + cout << ",\n\tconst int64_t* stride={"; for (auto i=0; i < rank; ++i) cout << stride[i] << + ", "; cout << "}"; + cout << ",\n\tconst int64_t* dilation={"; for (auto i=0; i < rank; ++i) cout << dilation[i] << + ", "; cout << "}"; + cout << ",\n\tconst int64_t* pad={"; for (auto i=0; i < 2*rank; ++i) cout << pad[i] << ", "; + cout << "}"; + cout << ",\n\tptrdiff_t rank=" << rank; + cout << ",\n\tT* data_col= preallocated pointer to write at {"; for (auto i=0; i < output_shape_size; ++i) cout << + data_col[i] << ", "; cout << "}"; + cout << ",\n\tbool accumulate_output=" << accumulate_output; + cout << ",\n\tT padding_value=" << padding_value << ")"; + + cout << "\n\n\tVariable im_shape_size: " << im_shape_size << "\n\tVariable output_shape_size: " << + output_shape_size << "\n\tVariable kernel_shape_size: " << kernel_shape_size << std::endl << std::endl; std::vector d_offset(rank, 0); std::vector d_iter(rank, 0); @@ -430,7 +483,8 @@ void Im2col::operator()( } while (NextPosition(rank, output_shape, d_iter.data())); } // for (int c = 0; c < channels_col; ++c) { - std::cout << "Return void Im2col -> T* data_col={"; for (auto i=0; i < output_shape_size; ++i) std::cout << data_col[i] << ", "; std::cout << "}\n"; + cout << "Return void Im2col -> T* data_col={"; for (auto i=0; i < output_shape_size; ++i) cout << + data_col[i] << ", "; cout << "}\n"; } template struct Im2col; @@ -815,19 +869,26 @@ void Col2imNd(const float* data_col, ptrdiff_t N, float* data_img, CPUMathUtil* context) { - std::cout << "\n\nCalled void Col2imNd("; - std::cout << ",\n\tconst float* data_col={"; for (auto i=0; i < img_size; ++i) std::cout << data_col[i] << ", "; std::cout << "}"; - std::cout << ",\n\tconst int64_t* img_shape={"; for (auto i=0; i < N; ++i) std::cout << img_shape[i] << ", "; std::cout << "}"; - std::cout << ",\n\tconst int64_t* output_shape={"; for (auto i=0; i < N; ++i) std::cout << output_shape[i] << ", "; std::cout << "}"; - std::cout << ",\n\tint64_t channels_col=" << channels_col; - std::cout << ",\n\tint64_t img_size=" << img_size; - std::cout << ",\n\tconst int64_t* kernel_shape={"; for (auto i=0; i < N; ++i) std::cout << kernel_shape[i] << ", "; std::cout << "}"; - std::cout << ",\n\tconst int64_t* stride={"; for (auto i=0; i < N; ++i) std::cout << stride[i] << ", "; std::cout << "}"; - std::cout << ",\n\tconst int64_t* dilation={"; for (auto i=0; i < N; ++i) std::cout << dilation[i] << ", "; std::cout << "}"; - std::cout << ",\n\tconst int64_t* pad={"; for (auto i=0; i < 2*N; ++i) std::cout << pad[i] << ", "; std::cout << "}"; - std::cout << ",\n\tptrdiff_t N=" 
<< N; - std::cout << ",\n\tfloat* data_img= preallocated pointer to save at {"; for (auto i=0; i < img_size; ++i) std::cout << data_img[i] << ", "; std::cout << "}"; - std::cout << ",\n\tCPUMathUtil* context=...)" << std::endl; + cout << "\n\nCalled void Col2imNd("; + cout << ",\n\tconst float* data_col={"; for (auto i=0; i < img_size; ++i) cout << + data_col[i] << ", "; cout << "}"; + cout << ",\n\tconst int64_t* img_shape={"; for (auto i=0; i < N; ++i) cout << img_shape[i] << + ", "; cout << "}"; + cout << ",\n\tconst int64_t* output_shape={"; for (auto i=0; i < N; ++i) cout << output_shape[i] << + ", "; cout << "}"; + cout << ",\n\tint64_t channels_col=" << channels_col; + cout << ",\n\tint64_t img_size=" << img_size; + cout << ",\n\tconst int64_t* kernel_shape={"; for (auto i=0; i < N; ++i) cout << kernel_shape[i] << + ", "; cout << "}"; + cout << ",\n\tconst int64_t* stride={"; for (auto i=0; i < N; ++i) cout << stride[i] << ", "; + cout << "}"; + cout << ",\n\tconst int64_t* dilation={"; for (auto i=0; i < N; ++i) cout << dilation[i] << ", "; + cout << "}"; + cout << ",\n\tconst int64_t* pad={"; for (auto i=0; i < 2*N; ++i) cout << pad[i] << ", "; cout << "}"; + cout << ",\n\tptrdiff_t N=" << N; + cout << ",\n\tfloat* data_img= preallocated pointer to save at {"; for (auto i=0; i < img_size; ++i) cout << + data_img[i] << ", "; cout << "}"; + cout << ",\n\tCPUMathUtil* context=...)" << std::endl; Set(gsl::narrow(img_size), 0, data_img, context); Im2col()( @@ -841,10 +902,10 @@ void Col2imNd(const float* data_col, pad, // const int64_t* pad, N, // ptrdiff_t rank, data_img, // T* data_col, - true // bool accumulate_output, - ); + true); // bool accumulate_output, - std::cout << "Return void Col2imNd --> float* data_img= {"; for (auto i=0; i < img_size; ++i) std::cout << data_img[i] << ", "; std::cout << "}"; + cout << "Return void Col2imNd --> float* data_img= {"; for (auto i=0; i < img_size; ++i) cout << + data_img[i] << ", "; cout << "}"; } #define SPECIALIZED_COPYVECTOR(T) \ diff --git a/onnxruntime/test/contrib_ops/col2im_test.cc b/onnxruntime/test/contrib_ops/col2im_test.cc index 6dacfa9f02301..57638b08bcb89 100644 --- a/onnxruntime/test/contrib_ops/col2im_test.cc +++ b/onnxruntime/test/contrib_ops/col2im_test.cc @@ -10,27 +10,26 @@ namespace onnxruntime { namespace test { template -std::vector _transpose_serialized_vector(std::vector &input, size_t N, size_t C, size_t H, size_t W) -{ +std::vector _transpose_serialized_vector(std::vector &input, size_t N, size_t C, size_t H, size_t W) { size_t input_size = input.size(); - if (input_size == 0){ + if (input_size == 0) { throw std::runtime_error("Invalid input"); } std::vector trans_vec(input); std::cout << "input: ("; - for(size_t i = 0; i < input_size; ++i) + for (size_t i = 0; i < input_size; ++i) std::cout << trans_vec[i] << ", "; std::cout << ")" << std::endl; - for(size_t n = 0; n < N; ++n) - for(size_t c = 0; c < C; ++c) - for(size_t i = 0; i < H; ++i) - for(size_t j = i+1; j < W; ++j) - std::swap(trans_vec[n*(C*H*W) + c*(H*W) + (H*i + j)], trans_vec[n*(C*H*W) + c*(H*W) + (W*j + i)]); + for (size_t n = 0; n < N; ++n) + for (size_t c = 0; c < C; ++c) + for (size_t h = 0; h < H; ++h) + for (size_t w = 0; w < W; ++w) + trans_vec[n * (C * H * W) + c * (H * W) + (h + H * w)] = input[n * (C * H * W) + c * (H * W) + (w + W * h)]; std::cout << "trans_vec: ("; - for(size_t i = 0; i < input_size; ++i) + for (size_t i = 0; i < input_size; ++i) std::cout << trans_vec[i] << ", "; std::cout << ")" << std::endl; @@ -56,6 +55,25 @@ 
TEST(Col2ImContribOpTest, simple4dNCHW) { test.Run(); } +TEST(Col2ImContribOpTest, with2Images3channelsNonSquare4dNCHW) { + OpTester test("Col2Im", 1, kMSDomain); + + test.AddAttribute("strides", std::vector{1, 1}); + test.AddAttribute("dilations", std::vector{1, 1}); + test.AddAttribute("pads", std::vector{0, 0, 0, 0}); + + std::vector input(120); + std::vector output(120); + std::iota(output.begin(), output.end(), 1); + input = _transpose_serialized_vector(output, 2, 3, 4, 5); + test.AddInput("input", {2, 15, 4}, input); + test.AddInput("image_shape", {2}, std::vector{4, 5}); + test.AddInput("block_shape", {2}, std::vector{1, 5}); + + test.AddOutput("output", {2, 3, 4, 5}, output); + test.Run(); +} + TEST(Col2ImContribOpTest, with3channels4dNCHW) { OpTester test("Col2Im", 1, kMSDomain); @@ -108,11 +126,40 @@ TEST(Col2ImContribOpTest, simple5dNCHWD) { test.AddInput("input", {1, 5, 5}, input); test.AddInput("image_shape", {3}, std::vector{1, 5, 5}); test.AddInput("block_shape", {3}, std::vector{1, 1, 5}); - test.AddOutput("output", {1, 1, 1, 5, 5}, output); test.Run(); } +TEST(Im2ColContribOpTest, simple) { + std::vector input(24); + std::vector expected_output(24); + std::iota(input.begin(), input.end(), 1); + expected_output = {1, 5, 9, 2, 6, 10, 3, 7, 11, 4, 8, 12, 13, 17, 21, 14, 18, 22, 15, 19, 23, 16, 20, 24}; + float* actual_output = new float(24); + std::cout << "\nExpected output --> "; for (auto i=0; i < 24; ++i) std::cout << expected_output[i] << ", "; + std::cout << ")" << std::endl; + math::Im2col()( + input.data(), + int64_t(2), + int64_t(3), + int64_t(4), + int64_t(1), + int64_t(4), + int64_t(1), + int64_t(1), + int64_t(0), + int64_t(0), + int64_t(0), + int64_t(0), + int64_t(1), + int64_t(1), + actual_output, + 0.); + + std::cout << "\nActual output --> "; for (auto i=0; i < 24; ++i) std::cout << actual_output[i] << + ", "; std::cout << ")" << std::endl; + delete[] actual_output; +} } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/python/contrib_ops/onnx_test_col2im.py b/onnxruntime/test/python/contrib_ops/onnx_test_col2im.py index 97269d895a125..31c5f129fad6b 100644 --- a/onnxruntime/test/python/contrib_ops/onnx_test_col2im.py +++ b/onnxruntime/test/python/contrib_ops/onnx_test_col2im.py @@ -12,7 +12,7 @@ class ONNXReferenceImplementationTest(unittest.TestCase): def test_col2im(self) -> None: - input = np.array( + inputs = np.array( [ [ [1.0, 6.0, 11.0, 16.0, 21.0], # (1, 5, 5) @@ -45,7 +45,7 @@ def test_col2im(self) -> None: expect( node, - inputs=[input, image_shape, block_shape], + inputs=[inputs, image_shape, block_shape], outputs=[col2im_reference_implementation], name="test_col2im", ) From 9ed857f005d5aa1cd0c0db00561e4eeceb8b81a9 Mon Sep 17 00:00:00 2001 From: Thiago Crepaldi Date: Wed, 10 Aug 2022 20:32:12 -0400 Subject: [PATCH 10/30] Add dilated kernel shape as per Rama review --- .../core/providers/cpu/tensor/col2im.cc | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/onnxruntime/core/providers/cpu/tensor/col2im.cc b/onnxruntime/core/providers/cpu/tensor/col2im.cc index 51af71364ca15..0c9b2a30feca9 100644 --- a/onnxruntime/core/providers/cpu/tensor/col2im.cc +++ b/onnxruntime/core/providers/cpu/tensor/col2im.cc @@ -28,7 +28,14 @@ Status Col2Im::Compute(OpKernelContext* context) const { const auto* col_input = context->Input(0); const auto* image_shape = context->Input(1); const auto* kernel_shape = context->Input(2); + + // TODO(rama): Kernel with dilation + TensorShapeVector 
dilated_kernel_shape_dims; std::cout << "Status Col2Im::Compute(OpKernelContext* context)" << std::endl; + for (auto i=0; i < kernel_shape->Shape().Size(); ++i) { + dilated_kernel_shape_dims[i] = col2im_attrs_.dilations[i] * (kernel_shape->Data()[i] - 1) + 1; + } + TensorShape dilated_kernel_shape(dilated_kernel_shape_dims); const T* col_input_data = col_input->template Data(); TensorShape col_input_shape = col_input->Shape(); @@ -40,7 +47,7 @@ Status Col2Im::Compute(OpKernelContext* context) const { for (auto i=0; i < image_shape->Shape().Size(); ++i) { ++kernel_shape_rank; image_shape_size *= image_shape->Data()[i]; - kernel_shape_size *= kernel_shape->Data()[i]; + kernel_shape_size *= dilated_kernel_shape_dims[i]; } const int64_t C = col_input_shape[1] / kernel_shape_size; const int64_t col_output_stride = col_input_shape.SizeFromDimension(1); @@ -54,9 +61,6 @@ Status Col2Im::Compute(OpKernelContext* context) const { TensorShape Yshape(Y_dims); Tensor* Y = context->Output(0, Yshape); T* Ydata = Y->template MutableData(); - for (auto i=0; i < Yshape.Size(); ++i) - Ydata[i] = -1; // just for debug (to know what has been written to Ydata in the end) - // const int64_t Y_offset = Yshape.Size() / Yshape[0]; std::cout << "\n\tInput 0: col_input = ("; for (auto i=0; i < Yshape.Size(); ++i) std::cout << col_input_data[i] << ", "; std::cout << ") with shape "<< col_input_shape << std::endl; @@ -73,9 +77,10 @@ Status Col2Im::Compute(OpKernelContext* context) const { std::cout << "\tVariable C: " << C << std::endl; std::cout << "\tVariable col_input_N = " << col_input_N << std::endl; - std::cout << "\tVariable image_shape_size: " << image_shape_size << std::endl; - std::cout << "\tVariable kernel_shape_size: " << kernel_shape_size << std::endl; - + std::cout << "\tVariable image_shape_size: " << image_shape_size << std::endl; + std::cout << "\tVariable kernel_shape_size: " << kernel_shape_size << std::endl; + std::cout << "\tVariable: dilated_kernel_shape = ("; for (auto i=0; i < dilated_kernel_shape.Size(); ++i) std::cout << + dilated_kernel_shape[i] << ", "; std::cout << ")" << std::endl; std::cout << "\n\tStatus Col2Im::Compute() --> math::Col2imNd<>()" << std::endl; assert(image_shape_size == Y_offset); // just for temp debug @@ -123,11 +128,11 @@ Status Col2Im::Compute(OpKernelContext* context) const { // 2.20182e+24, 4, -2.56655e+29, 5.08551e+31, -1.05888e+29, 1.51107e+29, 5, 10, 15, 20, 7.2793e+31} // that is very similar to input, but with some rounded numbers and corrupted "25" value image_shape_size, // int64_t img_size, - kernel_shape->Data(), // const int64_t* kernel_shape, + dilated_kernel_shape_dims.data(), // const int64_t* kernel_shape, col2im_attrs_.strides.data(), // const int64_t* stride, col2im_attrs_.dilations.data(), // const int64_t* dilation, col2im_attrs_.pads.data(), // const int64_t* pad, - kernel_shape->Shape().Size(), // ptrdiff_t N, --> #spatial_dims? + dilated_kernel_shape.Size(), // ptrdiff_t N, --> #spatial_dims? 
Ydata + image_id * col_input_stride, // T* data_img, &CPUMathUtil::Instance()); // Provider* provider } From a09c151bf053948b5073a16969dc46bd12038a8d Mon Sep 17 00:00:00 2001 From: Thiago Crepaldi Date: Thu, 11 Aug 2022 15:17:36 -0400 Subject: [PATCH 11/30] Add support to dilation/padding/strides --- .../core/providers/cpu/tensor/col2im.cc | 31 +++++++-------- onnxruntime/test/contrib_ops/col2im_test.cc | 39 +++++++++++++++++++ 2 files changed, 53 insertions(+), 17 deletions(-) diff --git a/onnxruntime/core/providers/cpu/tensor/col2im.cc b/onnxruntime/core/providers/cpu/tensor/col2im.cc index 0c9b2a30feca9..08c315193479d 100644 --- a/onnxruntime/core/providers/cpu/tensor/col2im.cc +++ b/onnxruntime/core/providers/cpu/tensor/col2im.cc @@ -32,10 +32,6 @@ Status Col2Im::Compute(OpKernelContext* context) const { // TODO(rama): Kernel with dilation TensorShapeVector dilated_kernel_shape_dims; std::cout << "Status Col2Im::Compute(OpKernelContext* context)" << std::endl; - for (auto i=0; i < kernel_shape->Shape().Size(); ++i) { - dilated_kernel_shape_dims[i] = col2im_attrs_.dilations[i] * (kernel_shape->Data()[i] - 1) + 1; - } - TensorShape dilated_kernel_shape(dilated_kernel_shape_dims); const T* col_input_data = col_input->template Data(); TensorShape col_input_shape = col_input->Shape(); @@ -47,8 +43,10 @@ Status Col2Im::Compute(OpKernelContext* context) const { for (auto i=0; i < image_shape->Shape().Size(); ++i) { ++kernel_shape_rank; image_shape_size *= image_shape->Data()[i]; - kernel_shape_size *= dilated_kernel_shape_dims[i]; + kernel_shape_size *= kernel_shape->Data()[i]; + dilated_kernel_shape_dims.push_back(col2im_attrs_.dilations[i] * (kernel_shape->Data()[i] - 1) + 1); } + TensorShape dilated_kernel_shape(dilated_kernel_shape_dims); const int64_t C = col_input_shape[1] / kernel_shape_size; const int64_t col_output_stride = col_input_shape.SizeFromDimension(1); const int64_t col_input_stride = C * image_shape_size; @@ -79,12 +77,10 @@ Status Col2Im::Compute(OpKernelContext* context) const { std::cout << "\tVariable col_input_N = " << col_input_N << std::endl; std::cout << "\tVariable image_shape_size: " << image_shape_size << std::endl; std::cout << "\tVariable kernel_shape_size: " << kernel_shape_size << std::endl; - std::cout << "\tVariable: dilated_kernel_shape = ("; for (auto i=0; i < dilated_kernel_shape.Size(); ++i) std::cout << + std::cout << "\tVariable: dilated_kernel_shape = ("; for (size_t i=0; i < dilated_kernel_shape.NumDimensions(); ++i) std::cout << dilated_kernel_shape[i] << ", "; std::cout << ")" << std::endl; std::cout << "\n\tStatus Col2Im::Compute() --> math::Col2imNd<>()" << std::endl; - assert(image_shape_size == Y_offset); // just for temp debug - for (auto image_id = 0; image_id < col_input_N; ++image_id) { std::cout << "Image " << image_id+1 << " out of "<< col_input_N << std::endl; if (image_shape->Shape()[0] == 2) { @@ -112,27 +108,28 @@ Status Col2Im::Compute(OpKernelContext* context) const { col_input_data + image_id * col_output_stride, // const T* data_col, image_shape->Data(), // const int64_t* img_shape, Yshape.Slice(2).GetDims().data(), // const int64_t* output_shape, - // col_input_shape[1], // int64_t channels_col, + // col_input_shape[1], // int64_t channels_col, // leads to output - // {1, -nan, 11, 16, 2.58141e+34, 2, 8.80295e+34, 12, 17, 22, 3, 4.59718e+24, 13, 18, 2.85144e+34, 4, - // -443.863, 14, -nan, 24, 5, 10, 15, 20, 25} + // {1, 6, 11, 16, 21, 2, 7, 12, 17, 22, 3, 8, 13, 18, 23, + // 4, 9, 14, 3.13005e+12, 1.88865e+31, 5, 10, 15, 20, 
25,} // that is similar to input with some spots with random values - C, // int64_t channels_col, + C, // int64_t channels_col, // leads to output {1, 6, 11, 16, 21, 2, 7, 12, 17, 22, 3, 8, 13, 18, 23, 4, 9, 14, 19, 24, 5, 10, 15, 20, 25, } // that is identical to input - // col_input_shape[2], // int64_t channels_col, + // col_input_shape[2], // int64_t channels_col, // leads to output - // {1, 6, 1.92869e+31, 4.84145e+30, 1.88774e+31, 2, 7, 12, 17, 22, 3, 8, 1.86549e+31, 3.40686e+25, - // 2.20182e+24, 4, -2.56655e+29, 5.08551e+31, -1.05888e+29, 1.51107e+29, 5, 10, 15, 20, 7.2793e+31} + // {1, 6, 1.89906e+28, 7.00716e+22, 8.96572e+22, 2, 7, 6.09175e+22, 1.81786e+31, 3.50226e+29, 3, 8, + // 1.8001e+14, 2.67907e+20, 2.79522e+20, 4, 1.79858e+14, 4.74181e+30, 7.40484e+28, 1.80733e+28, 5, + // 10, 1.42889e+19, 6635.59, 2.46452e+11} // that is very similar to input, but with some rounded numbers and corrupted "25" value image_shape_size, // int64_t img_size, - dilated_kernel_shape_dims.data(), // const int64_t* kernel_shape, + dilated_kernel_shape.GetDims().data(), // const int64_t* kernel_shape, col2im_attrs_.strides.data(), // const int64_t* stride, col2im_attrs_.dilations.data(), // const int64_t* dilation, col2im_attrs_.pads.data(), // const int64_t* pad, - dilated_kernel_shape.Size(), // ptrdiff_t N, --> #spatial_dims? + image_shape->Shape().Size(), // ptrdiff_t N, --> #spatial_dims? Ydata + image_id * col_input_stride, // T* data_img, &CPUMathUtil::Instance()); // Provider* provider } diff --git a/onnxruntime/test/contrib_ops/col2im_test.cc b/onnxruntime/test/contrib_ops/col2im_test.cc index 57638b08bcb89..0e542aff9ac68 100644 --- a/onnxruntime/test/contrib_ops/col2im_test.cc +++ b/onnxruntime/test/contrib_ops/col2im_test.cc @@ -74,6 +74,45 @@ TEST(Col2ImContribOpTest, with2Images3channelsNonSquare4dNCHW) { test.Run(); } +TEST(Col2ImContribOpTest, with2Images2channelsNonSquareDilationPadStride4dNCHW) { + OpTester test("Col2Im", 1, kMSDomain); + + test.AddAttribute("strides", std::vector{2, 2}); + test.AddAttribute("dilations", std::vector{2, 2}); + test.AddAttribute("pads", std::vector{2, 2, 2, 2}); + + std::vector input{ 0., 0., 0., 0., 0., 1., 3., 5., 0., 11., 13., 15., 0., 0., 0., 0., + 0., 0., 0., 0., 1., 3., 5., 0., 11., 13., 15., 0., 0., 0., 0., 0., + 0., 0., 0., 0., 0., 21., 23., 25., 0., 31., 33., 35., 0., 0., 0., 0., + 0., 0., 0., 0., 21., 23., 25., 0., 31., 33., 35., 0., 0., 0., 0., 0., + 0., 0., 0., 0., 0., 41., 43., 45., 0., 51., 53., 55., 0., 0., 0., 0., + 0., 0., 0., 0., 41., 43., 45., 0., 51., 53., 55., 0., 0., 0., 0., 0., + 0., 0., 0., 0., 0., 61., 63., 65., 0., 71., 73., 75., 0., 0., 0., 0., + 0., 0., 0., 0., 61., 63., 65., 0., 71., 73., 75., 0., 0., 0., 0., 0.}; + std::vector output { 2., 0., 6., 0., 10., + 0., 0., 0., 0., 0., + 22., 0., 26., 0., 30., + 0., 0., 0., 0., 0., + 42., 0., 46., 0., 50., + 0., 0., 0., 0., 0., + 62., 0., 66., 0., 70., + 0., 0., 0., 0., 0., + 82., 0., 86., 0., 90., + 0., 0., 0., 0., 0., + 102., 0., 106., 0., 110., + 0., 0., 0., 0., 0., + 122., 0., 126., 0., 130., + 0., 0., 0., 0., 0., + 142., 0., 146., 0., 150., + 0., 0., 0., 0., 0.}; + test.AddInput("input", {2, 4, 16}, input); + test.AddInput("image_shape", {2}, std::vector{4, 5}); + test.AddInput("block_shape", {2}, std::vector{1, 2}); + + test.AddOutput("output", {2, 2, 4, 5}, output); + test.Run(); +} + TEST(Col2ImContribOpTest, with3channels4dNCHW) { OpTester test("Col2Im", 1, kMSDomain); From f65b85f66299c777710d17176804ac01664cd4bb Mon Sep 17 00:00:00 2001 From: Thiago Crepaldi Date: Thu, 
11 Aug 2022 17:41:32 -0400 Subject: [PATCH 12/30] Code cleanup --- .../core/graph/contrib_ops/contrib_defs.cc | 6 +- .../core/providers/cpu/tensor/col2im.cc | 113 ++++---------- .../providers/cpu/tensor/col2im_attributes.h | 5 - onnxruntime/core/util/math_cpu.cc | 146 +++--------------- .../tools/pytorch_export_contrib_ops.py | 2 + onnxruntime/test/contrib_ops/col2im_test.cc | 43 +++--- 6 files changed, 77 insertions(+), 238 deletions(-) diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc index cc992a7dfbdbe..715eb9cda97b5 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc @@ -917,7 +917,7 @@ void col2imShapeInference(InferenceContext& ctx) { if (ctx.getInputType(1)->tensor_type().shape().dim_size() != 1) { fail_shape_inference("image_shape tensor must have rank 1."); } - size_t n_input_dims = ctx.getInputType(1)->tensor_type().shape().dim(0).dim_value(); + size_t n_input_dims = static_cast(ctx.getInputType(1)->tensor_type().shape().dim(0).dim_value()); std::vector image_shape = {}; const TensorProto* image_shape_data = ctx.getInputData(1); if (image_shape_data) { @@ -1069,7 +1069,9 @@ ONNX_MS_OPERATOR_SET_SCHEMA(Col2Im, 1, "The shape of the block to apply on the input." "This is a 1-dim tensor of size of at least 2, containing the value [H_block, W_block] " " for a 2-D image or [dim_b1, dim_b2, ..., dim_bN] for a N-D block." - "Dilations, pads and strides are applied to block_shape under the hood.", + "Dilations, pads and strides are applied to block_shape under the hood." + "The kernel window start at the top-left of the block and slides to the right and down," + "similarly to how Convolution kernels do.", "tensor(int64)", OpSchema::Single, true, diff --git a/onnxruntime/core/providers/cpu/tensor/col2im.cc b/onnxruntime/core/providers/cpu/tensor/col2im.cc index 08c315193479d..3a75910f9ff6c 100644 --- a/onnxruntime/core/providers/cpu/tensor/col2im.cc +++ b/onnxruntime/core/providers/cpu/tensor/col2im.cc @@ -1,7 +1,6 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
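// ---------------------------------------------------------------------------
// Illustrative sketch (hypothetical helper, not part of this patch): the
// cleaned-up Compute() in this hunk recovers C from the col buffer and builds
// an "adjusted" (dilated) kernel extent per spatial axis. A standalone version
// of that arithmetic, under the same assumptions, looks like this:
#include <cstddef>
#include <cstdint>
#include <vector>

struct Col2ImShapes {
  int64_t C;                             // output channels
  std::vector<int64_t> adjusted_kernel;  // dilation * (k - 1) + 1 per axis
  std::vector<int64_t> block_positions;  // sliding positions per axis (unit stride, no padding)
};

inline Col2ImShapes ComputeCol2ImShapes(int64_t col_channels,  // col_shape[1] == C * prod(kernel_shape)
                                        const std::vector<int64_t>& image_shape,
                                        const std::vector<int64_t>& kernel_shape,
                                        const std::vector<int64_t>& dilations) {
  Col2ImShapes s{};
  int64_t kernel_size = 1;
  for (size_t i = 0; i < kernel_shape.size(); ++i) {
    kernel_size *= kernel_shape[i];
    const int64_t adjusted = dilations[i] * (kernel_shape[i] - 1) + 1;
    s.adjusted_kernel.push_back(adjusted);
    s.block_positions.push_back(image_shape[i] - adjusted + 1);
  }
  s.C = col_channels / kernel_size;
  return s;
}
// Example: col_channels=15, image_shape={5,5}, kernel_shape={1,5}, dilations={1,1}
// gives C=3, adjusted_kernel={1,5}, block_positions={5,1}, matching the
// three-channel 5x5 tests in this series.
// ---------------------------------------------------------------------------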
-#include #include "core/providers/cpu/tensor/col2im.h" #include "core/framework/element_type_lists.h" @@ -25,68 +24,39 @@ REGISTER_KERNEL_TYPED(float) template Status Col2Im::Compute(OpKernelContext* context) const { - const auto* col_input = context->Input(0); + const auto* col_tensor = context->Input(0); const auto* image_shape = context->Input(1); const auto* kernel_shape = context->Input(2); - // TODO(rama): Kernel with dilation - TensorShapeVector dilated_kernel_shape_dims; - std::cout << "Status Col2Im::Compute(OpKernelContext* context)" << std::endl; - - const T* col_input_data = col_input->template Data(); - TensorShape col_input_shape = col_input->Shape(); - const auto col_input_N = col_input_shape[0]; - int64_t image_shape_size = 1; int64_t kernel_shape_size = 1; - int64_t kernel_shape_rank = 0; + TensorShapeVector adjusted_kernel_shape_dims; for (auto i=0; i < image_shape->Shape().Size(); ++i) { - ++kernel_shape_rank; image_shape_size *= image_shape->Data()[i]; kernel_shape_size *= kernel_shape->Data()[i]; - dilated_kernel_shape_dims.push_back(col2im_attrs_.dilations[i] * (kernel_shape->Data()[i] - 1) + 1); + adjusted_kernel_shape_dims.push_back(col2im_attrs_.dilations[i] * (kernel_shape->Data()[i] - 1) + 1); } - TensorShape dilated_kernel_shape(dilated_kernel_shape_dims); - const int64_t C = col_input_shape[1] / kernel_shape_size; - const int64_t col_output_stride = col_input_shape.SizeFromDimension(1); - const int64_t col_input_stride = C * image_shape_size; - - TensorShapeVector Y_dims; - Y_dims.insert(Y_dims.begin(), {col_input_N, C}); + TensorShape col_shape = col_tensor->Shape(); + const auto N = col_shape[0]; + const int64_t C = col_shape[1] / kernel_shape_size; + const int64_t col_stride = C * image_shape_size; + TensorShape adjusted_kernel_shape(adjusted_kernel_shape_dims); + const int64_t col_data_stride = col_shape.SizeFromDimension(1); + + TensorShapeVector batched_image_shape_dims, adjusted_image_shape_dims; + batched_image_shape_dims.insert(batched_image_shape_dims.begin(), {N, C}); for (auto i=0; i < image_shape->Shape()[0]; ++i) { - Y_dims.push_back(image_shape->Data()[i]); + batched_image_shape_dims.push_back(image_shape->Data()[i]); + adjusted_image_shape_dims.push_back(image_shape->Data()[i]-adjusted_kernel_shape[i]+1); } - TensorShape Yshape(Y_dims); - Tensor* Y = context->Output(0, Yshape); - T* Ydata = Y->template MutableData(); - - std::cout << "\n\tInput 0: col_input = ("; for (auto i=0; i < Yshape.Size(); ++i) std::cout << - col_input_data[i] << ", "; std::cout << ") with shape "<< col_input_shape << std::endl; - std::cout << "\tInput 1: image_shape = ("; for (auto i=0; i < image_shape->Shape().Size(); ++i) std::cout << - image_shape->Data()[i] << ", "; std::cout << ")" << std::endl; - std::cout << "\tInput 2: kernel_shape = ("; for (auto i=0; i < kernel_shape->Shape().Size(); ++i) std::cout << - kernel_shape->Data()[i] << ", "; std::cout << ")" << std::endl; - std::cout << "\tAttribute strides = ("; for (size_t i=0; i < col2im_attrs_.strides.size(); ++i) std::cout << - col2im_attrs_.strides[i] << ", "; std::cout << ")"<< std::endl; - std::cout << "\tAttribute dilations = ("; for (size_t i=0; i < col2im_attrs_.dilations.size(); ++i) std::cout << - col2im_attrs_.dilations[i] << ", "; std::cout << ")"<< std::endl; - std::cout << "\tAttribute pads = ("; for (size_t i=0; i < col2im_attrs_.pads.size(); ++i) std::cout << - col2im_attrs_.pads[i] << ", "; std::cout << ")"<< std::endl; - - std::cout << "\tVariable C: " << C << std::endl; - std::cout << 
"\tVariable col_input_N = " << col_input_N << std::endl; - std::cout << "\tVariable image_shape_size: " << image_shape_size << std::endl; - std::cout << "\tVariable kernel_shape_size: " << kernel_shape_size << std::endl; - std::cout << "\tVariable: dilated_kernel_shape = ("; for (size_t i=0; i < dilated_kernel_shape.NumDimensions(); ++i) std::cout << - dilated_kernel_shape[i] << ", "; std::cout << ")" << std::endl; - std::cout << "\n\tStatus Col2Im::Compute() --> math::Col2imNd<>()" << std::endl; + TensorShape batched_image_shape(batched_image_shape_dims), adjusted_image_shape(adjusted_image_shape_dims); + T* image_data = context->Output(0, batched_image_shape)->template MutableData(); - for (auto image_id = 0; image_id < col_input_N; ++image_id) { - std::cout << "Image " << image_id+1 << " out of "<< col_input_N << std::endl; + const T* col_data = col_tensor->template Data(); + for (auto image_id = 0; image_id < N; ++image_id) { if (image_shape->Shape()[0] == 2) { - std::cout << "image_shape->Shape()[0] == 2 --> Col2Im" << std::endl; math::Col2im( - col_input_data + image_id * col_output_stride, + col_data + image_id * col_data_stride, C, image_shape->Data()[0], image_shape->Data()[1], @@ -100,43 +70,24 @@ Status Col2Im::Compute(OpKernelContext* context) const { col2im_attrs_.pads[3], col2im_attrs_.strides[0], col2im_attrs_.strides[1], - Ydata + image_id * col_input_stride, + image_data + image_id * col_stride, &CPUMathUtil::Instance()); } else { - std::cout << "image_shape->Shape()[0] != 2 --> Col2ImNd (nd=" << image_shape->Shape()[0] << ") " << std::endl; math::Col2imNd( - col_input_data + image_id * col_output_stride, // const T* data_col, - image_shape->Data(), // const int64_t* img_shape, - Yshape.Slice(2).GetDims().data(), // const int64_t* output_shape, - // col_input_shape[1], // int64_t channels_col, - // leads to output - // {1, 6, 11, 16, 21, 2, 7, 12, 17, 22, 3, 8, 13, 18, 23, - // 4, 9, 14, 3.13005e+12, 1.88865e+31, 5, 10, 15, 20, 25,} - // that is similar to input with some spots with random values - - C, // int64_t channels_col, - // leads to output {1, 6, 11, 16, 21, 2, 7, 12, 17, 22, 3, 8, 13, 18, 23, 4, 9, 14, 19, 24, 5, 10, 15, 20, 25, } - // that is identical to input - - // col_input_shape[2], // int64_t channels_col, - // leads to output - // {1, 6, 1.89906e+28, 7.00716e+22, 8.96572e+22, 2, 7, 6.09175e+22, 1.81786e+31, 3.50226e+29, 3, 8, - // 1.8001e+14, 2.67907e+20, 2.79522e+20, 4, 1.79858e+14, 4.74181e+30, 7.40484e+28, 1.80733e+28, 5, - // 10, 1.42889e+19, 6635.59, 2.46452e+11} - // that is very similar to input, but with some rounded numbers and corrupted "25" value - image_shape_size, // int64_t img_size, - dilated_kernel_shape.GetDims().data(), // const int64_t* kernel_shape, - col2im_attrs_.strides.data(), // const int64_t* stride, - col2im_attrs_.dilations.data(), // const int64_t* dilation, - col2im_attrs_.pads.data(), // const int64_t* pad, - image_shape->Shape().Size(), // ptrdiff_t N, --> #spatial_dims? 
- Ydata + image_id * col_input_stride, // T* data_img, - &CPUMathUtil::Instance()); // Provider* provider + col_data + image_id * col_data_stride, + image_shape->Data(), + adjusted_image_shape.GetDims().data(), + kernel_shape_size * C, + image_shape_size, + adjusted_kernel_shape.GetDims().data(), + col2im_attrs_.strides.data(), + col2im_attrs_.dilations.data(), + col2im_attrs_.pads.data(), + image_shape->Shape().Size(), + image_data + image_id * col_stride, + &CPUMathUtil::Instance()); } } - std::cout << "\n\n Return Col2Im::Compute() --> "; for (auto i=0; i < Yshape.Size(); ++i) std::cout << - Ydata[i] << ", "; std::cout << ") with shape " << Yshape << std::endl << std::endl; - return Status::OK(); } diff --git a/onnxruntime/core/providers/cpu/tensor/col2im_attributes.h b/onnxruntime/core/providers/cpu/tensor/col2im_attributes.h index 9639718db5ecf..eaef183334ba6 100644 --- a/onnxruntime/core/providers/cpu/tensor/col2im_attributes.h +++ b/onnxruntime/core/providers/cpu/tensor/col2im_attributes.h @@ -34,15 +34,10 @@ struct Col2ImAttributes { explicit Col2ImAttributes(const OpKernelInfo& info) { auto status = info.GetAttrs("strides", strides); - // ORT_ENFORCE(status.IsOK()); - gsl::span pads_span; status = info.GetAttrsAsSpan("pads", pads_span); - // ORT_ENFORCE(status.IsOK()); pads.assign(pads_span.cbegin(), pads_span.cend()); - status = info.GetAttrs("dilations", dilations); - // ORT_ENFORCE(status.IsOK()); } ~Col2ImAttributes() = default; diff --git a/onnxruntime/core/util/math_cpu.cc b/onnxruntime/core/util/math_cpu.cc index 3b5fa0a549af9..164e88573c4cb 100644 --- a/onnxruntime/core/util/math_cpu.cc +++ b/onnxruntime/core/util/math_cpu.cc @@ -31,13 +31,10 @@ #pragma GCC diagnostic pop #endif using onnxruntime::concurrency::ThreadPool; -#include namespace onnxruntime { namespace math { -using std::cout; - // MatMul implementation purely based on Eigen. 
#define EIGEN_MATMUL_FUNCTION(T) \ template <> \ @@ -310,56 +307,23 @@ void Im2col::operator()( int64_t stride_w, T* data_col, T padding_value) { - - cout << "void Im2col::operator()("; - cout << "\n\tconst T* data_im=" << data_im; - cout << "\n\tint64_t channels=" << channels; - cout << "\n\tint64_t heigh=" << height; - cout << "\n\tint64_t width=" << width; - cout << "\n\tint64_t kernel_h=" << kernel_h; - cout << "\n\tint64_t kernel_w=" << kernel_w; - cout << "\n\tint64_t dilation_h=" << dilation_h; - cout << "\n\tint64_t dilation_w=" << dilation_w; - cout << "\n\tint64_t pad_t=" << pad_t; - cout << "\n\tint64_t pad_l=" << pad_l; - cout << "\n\tint64_t pad_b=" << pad_b; - cout << "\n\tint64_t pad_r=" << pad_r; - cout << "\n\tint64_t stride_h=" << stride_h; - cout << "\n\tint64_t stride_w=" << stride_w; - cout << "\n\tT* data_col=" << data_col; - cout << "\n\tT padding_value=" << padding_value << ")" << std::endl; - - int data_col_offset = 0; const int64_t output_h = (height + pad_b + pad_t - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; const int64_t output_w = (width + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; - cout << "output_h: " << output_h << std::endl; - cout << "output_w: " << output_w << std::endl; + // From Intel, https://github.com/BVLC/caffe/pull/3536 int64_t channel_size = height * width; - cout << "channel_size (height * width): " << channel_size << std::endl; for (int64_t channel = channels; channel--; data_im += channel_size) { - cout << "for channel= " << channel << "/channel_size=" << channel_size << std::endl; for (int64_t kernel_row = 0; kernel_row < kernel_h; kernel_row++) { - cout << "\tfor kernel_row= " << kernel_row << "/kernel_h=" << kernel_h << std::endl; for (int64_t kernel_col = 0; kernel_col < kernel_w; kernel_col++) { - cout << "\t\tfor kernel_col= " << kernel_col << "/kernel_w=" << kernel_w << std::endl; int64_t input_row = -pad_t + kernel_row * dilation_h; - cout << "\t\t\tinput_row= " << input_row << std::endl; for (int64_t output_rows = output_h; output_rows; output_rows--) { - cout << "\t\t\tfor output_rows= " << output_rows << "/output_h=" << output_h << std::endl; if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { std::fill_n(data_col, output_w, padding_value); - cout << "\t\t\t\t(input_row out of bounds) data_col["<< data_col_offset << "] = " << padding_value << - " * " << output_w << " times"<< std::endl; - data_col_offset += output_w; data_col += output_w; } else { int64_t input_col = -pad_l + kernel_col * dilation_w; - cout << "\t\t\t\tinput_col= " << input_col << std::endl; - cout << "\t\t\t\tinput_pos= " << input_row * width + input_col << std::endl; const T* rdptr = data_im + input_row * width + input_col; for (int64_t i = 0; i < output_w;) { - cout << "\t\t\t\tfor i= " << i << "/output_w=" << output_w << std::endl; int64_t output_handled = 1; if (is_a_ge_zero_and_a_lt_b(input_col, width)) { if (stride_w == 1) { @@ -367,10 +331,6 @@ void Im2col::operator()( // and the number of output elements to produce. output_handled = std::min(width - input_col, output_w - i); data_col = std::copy_n(&rdptr[i], static_cast(output_handled), data_col); - cout << "\t\t\t\t\tdata_col["<< data_col_offset << "] = " << rdptr[i] << std::endl; - data_col_offset += output_handled; - // cout << "\t\t\t\t\t(stride 1) Copied " << output_handled << - // " element(s) from data_im to data_col: " << rdptr[i] << std::endl; } else if (stride_w == 2) { // Same as above except using the number of strided input elements. 
output_handled = std::min((width - input_col + 1) / 2, output_w - i); @@ -379,16 +339,11 @@ void Im2col::operator()( *(data_col++) = *local_rdptr; local_rdptr += 2; } - cout << "\t\t\t\t\t(stride 2) Copy " << output_handled - << " elements from data_im to data_col " << std::endl; } else { *(data_col++) = rdptr[i * stride_w]; - cout << "\t\t\t\t\t(stride >2) Copy 1 element from data_im to data_col " << std::endl; } } else { *(data_col++) = padding_value; - cout << "\t\t\t\t\t(input_col out of bounds) fill data_col with 1 padding_value= " << - padding_value << std::endl; } input_col += output_handled * stride_w; i += output_handled; @@ -415,36 +370,7 @@ void Im2col::operator()( T* data_col, bool accumulate_output, T padding_value) { - - int64_t im_shape_size = std::accumulate(im_shape, im_shape + rank, 1LL, std::multiplies()); - int64_t output_shape_size = std::accumulate(output_shape, output_shape + rank, 1LL, std::multiplies()); - int64_t kernel_shape_size = std::accumulate(kernel_shape, kernel_shape + rank, 1LL, std::multiplies()); - - cout << "\n\nCalled void Im2col::operator()("; - cout << ",\n\tconst T* data_im={"; for (auto i=0; i < im_shape_size; ++i) cout << data_im[i] << - ", "; cout << "}"; - cout << ",\n\tconst int64_t* im_shape={"; for (auto i=0; i < rank; ++i) cout << im_shape[i] << - ", "; cout << "}"; - cout << ",\n\tconst int64_t* output_shape={"; for (auto i=0; i < rank; ++i) cout << output_shape[i] << - ", "; cout << "}"; - cout << ",\n\tint64_t channels_col=" << channels_col; - cout << ",\n\tconst int64_t* kernel_shape={"; for (auto i=0; i < rank; ++i) cout << kernel_shape[i] << - ", "; cout << "}"; - cout << ",\n\tconst int64_t* stride={"; for (auto i=0; i < rank; ++i) cout << stride[i] << - ", "; cout << "}"; - cout << ",\n\tconst int64_t* dilation={"; for (auto i=0; i < rank; ++i) cout << dilation[i] << - ", "; cout << "}"; - cout << ",\n\tconst int64_t* pad={"; for (auto i=0; i < 2*rank; ++i) cout << pad[i] << ", "; - cout << "}"; - cout << ",\n\tptrdiff_t rank=" << rank; - cout << ",\n\tT* data_col= preallocated pointer to write at {"; for (auto i=0; i < output_shape_size; ++i) cout << - data_col[i] << ", "; cout << "}"; - cout << ",\n\tbool accumulate_output=" << accumulate_output; - cout << ",\n\tT padding_value=" << padding_value << ")"; - - cout << "\n\n\tVariable im_shape_size: " << im_shape_size << "\n\tVariable output_shape_size: " << - output_shape_size << "\n\tVariable kernel_shape_size: " << kernel_shape_size << std::endl << std::endl; - + int64_t kernel_size = std::accumulate(kernel_shape, kernel_shape + rank, 1LL, std::multiplies()); std::vector d_offset(rank, 0); std::vector d_iter(rank, 0); for (int64_t c_col = 0; c_col < channels_col; ++c_col) { @@ -460,7 +386,7 @@ void Im2col::operator()( // Loop over spatial axes in forward order to compute the indices in the // image and column, and whether the index lies in the padding. 
int64_t index_col = c_col; - int64_t index_im = c_col / kernel_shape_size; + int64_t index_im = c_col / kernel_size; bool is_padding = false; for (ptrdiff_t d_i = 0; d_i < rank; ++d_i) { int64_t d = d_iter[d_i]; @@ -482,9 +408,6 @@ void Im2col::operator()( } } while (NextPosition(rank, output_shape, d_iter.data())); } // for (int c = 0; c < channels_col; ++c) { - - cout << "Return void Im2col -> T* data_col={"; for (auto i=0; i < output_shape_size; ++i) cout << - data_col[i] << ", "; cout << "}\n"; } template struct Im2col; @@ -857,55 +780,24 @@ void Col2im(const float* data_col, int64 } template <> -void Col2imNd(const float* data_col, - const int64_t* img_shape, - const int64_t* output_shape, - int64_t channels_col, - int64_t img_size, - const int64_t* kernel_shape, - const int64_t* stride, - const int64_t* dilation, - const int64_t* pad, - ptrdiff_t N, - float* data_img, - CPUMathUtil* context) { - cout << "\n\nCalled void Col2imNd("; - cout << ",\n\tconst float* data_col={"; for (auto i=0; i < img_size; ++i) cout << - data_col[i] << ", "; cout << "}"; - cout << ",\n\tconst int64_t* img_shape={"; for (auto i=0; i < N; ++i) cout << img_shape[i] << - ", "; cout << "}"; - cout << ",\n\tconst int64_t* output_shape={"; for (auto i=0; i < N; ++i) cout << output_shape[i] << - ", "; cout << "}"; - cout << ",\n\tint64_t channels_col=" << channels_col; - cout << ",\n\tint64_t img_size=" << img_size; - cout << ",\n\tconst int64_t* kernel_shape={"; for (auto i=0; i < N; ++i) cout << kernel_shape[i] << - ", "; cout << "}"; - cout << ",\n\tconst int64_t* stride={"; for (auto i=0; i < N; ++i) cout << stride[i] << ", "; - cout << "}"; - cout << ",\n\tconst int64_t* dilation={"; for (auto i=0; i < N; ++i) cout << dilation[i] << ", "; - cout << "}"; - cout << ",\n\tconst int64_t* pad={"; for (auto i=0; i < 2*N; ++i) cout << pad[i] << ", "; cout << "}"; - cout << ",\n\tptrdiff_t N=" << N; - cout << ",\n\tfloat* data_img= preallocated pointer to save at {"; for (auto i=0; i < img_size; ++i) cout << - data_img[i] << ", "; cout << "}"; - cout << ",\n\tCPUMathUtil* context=...)" << std::endl; - +void Col2imNd(const float* data_col, const int64_t* img_shape, + const int64_t* output_shape, int64_t channels_col, int64_t img_size, + const int64_t* kernel_shape, const int64_t* stride, + const int64_t* dilation, const int64_t* pad, ptrdiff_t N, + float* data_img, CPUMathUtil* context) { Set(gsl::narrow(img_size), 0, data_img, context); Im2col()( - data_col, // const T* data_im, - img_shape, // const int64_t* im_shape, - output_shape, // const int64_t* output_shape, - channels_col, // int64_t channels_col, - kernel_shape, // const int64_t* kernel_shape, - stride, // const int64_t* stride, - dilation, // const int64_t* dilation, - pad, // const int64_t* pad, - N, // ptrdiff_t rank, - data_img, // T* data_col, - true); // bool accumulate_output, - - cout << "Return void Col2imNd --> float* data_img= {"; for (auto i=0; i < img_size; ++i) cout << - data_img[i] << ", "; cout << "}"; + data_col, + img_shape, + output_shape, + channels_col, + kernel_shape, + stride, + dilation, + pad, + N, + data_img, + true); } #define SPECIALIZED_COPYVECTOR(T) \ diff --git a/onnxruntime/python/tools/pytorch_export_contrib_ops.py b/onnxruntime/python/tools/pytorch_export_contrib_ops.py index 6d11f6ebeb6ae..82a5c558a2f59 100644 --- a/onnxruntime/python/tools/pytorch_export_contrib_ops.py +++ b/onnxruntime/python/tools/pytorch_export_contrib_ops.py @@ -94,6 +94,8 @@ def tril(g, self, diagonal): def col2im(g, self: torch._C.Value, 
image_shape, block_shape): return g.op("com.microsoft::Col2Im", self, image_shape, block_shape) + _reg(col2im) + def unregister(): """Unregister ONNX Runtime's built-in contrib ops.""" diff --git a/onnxruntime/test/contrib_ops/col2im_test.cc b/onnxruntime/test/contrib_ops/col2im_test.cc index 0e542aff9ac68..e4ed7908418a4 100644 --- a/onnxruntime/test/contrib_ops/col2im_test.cc +++ b/onnxruntime/test/contrib_ops/col2im_test.cc @@ -17,25 +17,25 @@ std::vector _transpose_serialized_vector(std::vector &input, size_t N, siz } std::vector trans_vec(input); - std::cout << "input: ("; - for (size_t i = 0; i < input_size; ++i) - std::cout << trans_vec[i] << ", "; - std::cout << ")" << std::endl; - for (size_t n = 0; n < N; ++n) for (size_t c = 0; c < C; ++c) for (size_t h = 0; h < H; ++h) for (size_t w = 0; w < W; ++w) - trans_vec[n * (C * H * W) + c * (H * W) + (h + H * w)] = input[n * (C * H * W) + c * (H * W) + (w + W * h)]; - - std::cout << "trans_vec: ("; - for (size_t i = 0; i < input_size; ++i) - std::cout << trans_vec[i] << ", "; - std::cout << ")" << std::endl; + trans_vec[n * (C * H * W) + c * (H * W) + (h + H * w)] = \ + input[n * (C * H * W) + c * (H * W) + (w + W * h)]; return trans_vec; } +struct float_iota { + explicit float_iota(float inc, float init_value = 0.0) : _value(init_value), _inc(inc) {} + + operator float() const { return _value; } + float_iota& operator++() { _value += _inc; return *this; } + float _value; + float _inc; +}; + TEST(Col2ImContribOpTest, simple4dNCHW) { OpTester test("Col2Im", 1, kMSDomain); @@ -45,7 +45,8 @@ TEST(Col2ImContribOpTest, simple4dNCHW) { std::vector input(25); std::vector output(25); - std::iota(output.begin(), output.end(), 1); + std::iota(output.begin(), output.end(), float_iota(1., 1.)); + input = _transpose_serialized_vector(output, 1, 1, 5, 5); test.AddInput("input", {1, 5, 5}, input); test.AddInput("image_shape", {2}, std::vector{5, 5}); @@ -64,7 +65,7 @@ TEST(Col2ImContribOpTest, with2Images3channelsNonSquare4dNCHW) { std::vector input(120); std::vector output(120); - std::iota(output.begin(), output.end(), 1); + std::iota(output.begin(), output.end(), float_iota(1., 1.)); input = _transpose_serialized_vector(output, 2, 3, 4, 5); test.AddInput("input", {2, 15, 4}, input); test.AddInput("image_shape", {2}, std::vector{4, 5}); @@ -122,7 +123,7 @@ TEST(Col2ImContribOpTest, with3channels4dNCHW) { std::vector input(75); std::vector output(75); - std::iota(output.begin(), output.end(), 1); + std::iota(output.begin(), output.end(), float_iota(1., 1.)); input = _transpose_serialized_vector(output, 1, 3, 5, 5); test.AddInput("input", {1, 15, 5}, input); test.AddInput("image_shape", {2}, std::vector{5, 5}); @@ -141,7 +142,7 @@ TEST(Col2ImContribOpTest, with2Images3channels4dNCHW) { std::vector input(150); std::vector output(150); - std::iota(output.begin(), output.end(), 1); + std::iota(output.begin(), output.end(), float_iota(1., 1.)); input = _transpose_serialized_vector(output, 2, 3, 5, 5); test.AddInput("input", {2, 15, 5}, input); test.AddInput("image_shape", {2}, std::vector{5, 5}); @@ -160,7 +161,7 @@ TEST(Col2ImContribOpTest, simple5dNCHWD) { std::vector input(25); std::vector output(25); - std::iota(output.begin(), output.end(), 1); + std::iota(output.begin(), output.end(), float_iota(1., 1.)); input = _transpose_serialized_vector(output, 1, 1, 5, 5); test.AddInput("input", {1, 5, 5}, input); test.AddInput("image_shape", {3}, std::vector{1, 5, 5}); @@ -172,11 +173,9 @@ TEST(Col2ImContribOpTest, simple5dNCHWD) { TEST(Im2ColContribOpTest, 
simple) { std::vector input(24); std::vector expected_output(24); - std::iota(input.begin(), input.end(), 1); + std::iota(input.begin(), input.end(), float_iota(1., 1.)); expected_output = {1, 5, 9, 2, 6, 10, 3, 7, 11, 4, 8, 12, 13, 17, 21, 14, 18, 22, 15, 19, 23, 16, 20, 24}; - float* actual_output = new float(24); - std::cout << "\nExpected output --> "; for (auto i=0; i < 24; ++i) std::cout << expected_output[i] << ", "; - std::cout << ")" << std::endl; + float* actual_output = new float[24]; math::Im2col()( input.data(), int64_t(2), @@ -195,9 +194,7 @@ TEST(Im2ColContribOpTest, simple) { actual_output, 0.); - std::cout << "\nActual output --> "; for (auto i=0; i < 24; ++i) std::cout << actual_output[i] << - ", "; std::cout << ")" << std::endl; - delete[] actual_output; + delete [] actual_output; } } // namespace test From 8b033a744fb2c25936f26507daa08d08b9c692a9 Mon Sep 17 00:00:00 2001 From: Thiago Crepaldi Date: Tue, 16 Aug 2022 13:13:37 -0400 Subject: [PATCH 13/30] Update documentation --- docs/ContribOperators.md | 54 ++++++++++++++++++++++++++++++++++++++++ docs/OperatorKernels.md | 1 + 2 files changed, 55 insertions(+) diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md index e35bc530338d6..6f49c85fcf24e 100644 --- a/docs/ContribOperators.md +++ b/docs/ContribOperators.md @@ -13,6 +13,7 @@ Do not modify directly.* * com.microsoft.BitmaskBiasDropout * com.microsoft.BitmaskDropout * com.microsoft.CDist + * com.microsoft.Col2Im * com.microsoft.ComplexMul * com.microsoft.ComplexMulConj * com.microsoft.ConvTransposeWithDynamicPads @@ -753,6 +754,59 @@ This version of the operator has been available since version 1 of the 'com.micr +### **com.microsoft.Col2Im** + + The operator rearranges column blocks back into a multidimensional image + + Col2Im behaves similarly to PyTorch's fold https://pytorch.org/docs/stable/generated/torch.nn.Fold.html, + but it only supports *batched* multi-dimensional image tensors. + + NOTE: Although specifying image_shape looks redundant because it could be calculated from + convolution formulas, it is required as input for more advanced scenarios as explained + at PyTorch's implementation (https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/Col2Im.cpp#L10) + + +#### Version + +This version of the operator has been available since version 1 of the 'com.microsoft' operator set. + +#### Attributes + +
+<dl>
+<dt><tt>dilations</tt> : list of ints</dt>
+<dd>1-dimensional tensor with dilation value along each spatial axis of the image. If not present, the dilation defaults to 1 along each spatial axis of the image.</dd>
+<dt><tt>pads</tt> : list of ints</dt>
+<dd>1-dimensional tensor with padding value for the beginning and ending along each spatial axis, it can take any value greater than or equal to 0. The value represent the number of pixels added to the beginning and end part of the corresponding axis. `pads` format should be as follow [x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin is the number of pixels added at the beginning of axis `i` and xi_end the same for the end of axis `i`. If not present, the padding defaults to 0 along start and end of each spatial axis.</dd>
+<dt><tt>strides</tt> : list of ints</dt>
+<dd>1-dimensional tensor with stride value along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.</dd>
+</dl>
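As an aside (not part of the patch), the number of column blocks L expected in the input can be derived from these attributes together with image_shape and block_shape. The sketch below is illustrative only and follows the torch.nn.Fold convention cited in the operator description; the helper name `num_blocks` is invented for this example.

```python
def num_blocks(image_shape, block_shape, dilations=None, pads=None, strides=None):
    """Number of sliding block positions L, assuming the torch.nn.Fold formula."""
    n = len(image_shape)
    dilations = dilations or [1] * n
    strides = strides or [1] * n
    pads = pads or [0] * (2 * n)  # [x1_begin, x2_begin, ..., x1_end, x2_end, ...]
    blocks = 1
    for i in range(n):
        window = dilations[i] * (block_shape[i] - 1) + 1  # dilated block extent
        blocks *= (image_shape[i] + pads[i] + pads[n + i] - window) // strides[i] + 1
    return blocks

# A 5x5 image folded from 1x5 blocks with default attributes has 5 block positions,
# so the operator input has shape [N, C * 5, 5].
assert num_blocks([5, 5], [1, 5]) == 5
```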
+
+#### Inputs
+
+<dl>
+<dt><tt>input</tt> : T</dt>
+<dd>Input data tensor to be rearranged from column blocks back into an image. This is a 3-dimensional tensor containing [N, C * n-ary-product(block_shape), L], where N is batch dimension, C is image channel dimension and L is number of blocks.</dd>
+<dt><tt>image_shape</tt> : tensor(int64)</dt>
+<dd>The shape of the spatial dimensions of the image after rearranging the column blocks.This is a 1-dim tensor with size of at least 2, containing the value [H_img, W_img] for a 2-D image or [dim_i1, dim_i2, ..., dim_iN] for a N-D image.</dd>
+<dt><tt>block_shape</tt> : tensor(int64)</dt>
+<dd>The shape of the block to apply on the input.This is a 1-dim tensor of size of at least 2, containing the value [H_block, W_block] for a 2-D image or [dim_b1, dim_b2, ..., dim_bN] for a N-D block.Dilations, pads and strides are applied to block_shape under the hood.The kernel window start at the top-left of the block and slides to the right and down,similarly to how Convolution kernels do.</dd>
+</dl>
+
+#### Outputs
+
+<dl>
+<dt><tt>output</tt> : T</dt>
+<dd>Output tensor produced by rearranging blocks into an image.</dd>
+</dl>
+
+#### Type Constraints
+
+<dl>
+<dt><tt>T</tt> : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128)</dt>
+<dd>Constrain input and output types to all numeric tensor types.</dd>
+</dl>
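For reference, the sketch below shows one way to exercise the operator documented above from Python, mirroring the simple4dNCHW unit test added earlier in this series. It is illustrative only (not part of the patch) and assumes an onnxruntime build that registers the com.microsoft Col2Im CPU kernel; the graph name and tensor names are invented for this example.

```python
import numpy as np
from onnx import TensorProto, helper
import onnxruntime as ort

# Single Col2Im node in the com.microsoft domain: fold five 1x5 blocks back into a 5x5 image.
node = helper.make_node(
    "Col2Im", ["input", "image_shape", "block_shape"], ["output"],
    domain="com.microsoft", dilations=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1])
graph = helper.make_graph(
    [node], "col2im_example",
    [helper.make_tensor_value_info("input", TensorProto.FLOAT, [1, 5, 5]),
     helper.make_tensor_value_info("image_shape", TensorProto.INT64, [2]),
     helper.make_tensor_value_info("block_shape", TensorProto.INT64, [2])],
    [helper.make_tensor_value_info("output", TensorProto.FLOAT, [1, 1, 5, 5])])
model = helper.make_model(
    graph, opset_imports=[helper.make_opsetid("", 13), helper.make_opsetid("com.microsoft", 1)])

# The column buffer for 1x5 blocks is the transpose of the target image,
# matching the TransposeSerializedVector helper used by the C++ tests.
image = np.arange(1, 26, dtype=np.float32).reshape(5, 5)
col = image.T.reshape(1, 5, 5)

sess = ort.InferenceSession(model.SerializeToString(), providers=["CPUExecutionProvider"])
(out,) = sess.run(None, {"input": col,
                         "image_shape": np.array([5, 5], dtype=np.int64),
                         "block_shape": np.array([1, 5], dtype=np.int64)})
assert np.array_equal(out, image.reshape(1, 1, 5, 5))
```

When blocks overlap (stride smaller than the dilated block extent), overlapping positions are summed into the output, which is why the non-overlapping 1x5 blocks above reproduce the image exactly.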
+ + ### **com.microsoft.ComplexMul** #### Version diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index 0b5ae058a3474..2f84ec528646a 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -399,6 +399,7 @@ Do not modify directly.* |BiasGelu|*in* A:**T**
 *in* B:**T**<br> *out* C:**T**|1+|**T** = tensor(float)|
 |BifurcationDetector|*in* src_tokens:**T**<br> *in* cur_tokens:**T**<br> *in* prev_suffix_match_idx:**T**<br> *in* pred_tokens:**T**<br> *out* tokens:**T**<br> *out* suffix_match_idx:**T**|1+|**T** = tensor(int64)|
 |CDist|*in* A:**T**<br> *in* B:**T**<br> *out* C:**T**|1+|**T** = tensor(double), tensor(float)|
+|Col2Im|*in* input:**T**<br> *in* image_shape:**tensor(int64)**<br> *in* block_shape:**tensor(int64)**<br> *out* output:**T**|1+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
 |ConvTransposeWithDynamicPads|*in* X:**T**<br> *in* W:**T**<br> *in* Pads:**tensor(int64)**<br> *in* B:**T**<br> *out* Y:**T**|1+|**T** = tensor(float)|
 |CropAndResize|*in* X:**T1**<br> *in* rois:**T1**<br> *in* batch_indices:**T2**<br> *in* crop_size:**T2**<br> *out* Y:**T1**|1+|**T1** = tensor(float)<br> **T2** = tensor(int32)|
 |DequantizeLinear|*in* x:**T1**<br> *in* x_scale:**T2**<br> *in* x_zero_point:**T1**<br> *out* y:**T2**|1+|**T1** = tensor(int8), tensor(uint8)<br>
**T2** = tensor(float)| From 5c2d137968dade45180d5d16d382537b584d74c4 Mon Sep 17 00:00:00 2001 From: Thiago Crepaldi Date: Mon, 22 Aug 2022 10:42:50 -0400 Subject: [PATCH 14/30] Address comments --- onnxruntime/test/contrib_ops/col2im_test.cc | 33 ++++----------------- 1 file changed, 6 insertions(+), 27 deletions(-) diff --git a/onnxruntime/test/contrib_ops/col2im_test.cc b/onnxruntime/test/contrib_ops/col2im_test.cc index e4ed7908418a4..2e750f2e9f276 100644 --- a/onnxruntime/test/contrib_ops/col2im_test.cc +++ b/onnxruntime/test/contrib_ops/col2im_test.cc @@ -7,8 +7,11 @@ #include "core/util/math.h" namespace onnxruntime { +namespace contrib { namespace test { +using namespace onnxruntime::test; +namespace { template std::vector _transpose_serialized_vector(std::vector &input, size_t N, size_t C, size_t H, size_t W) { size_t input_size = input.size(); @@ -36,6 +39,8 @@ struct float_iota { float _inc; }; +} // namespace + TEST(Col2ImContribOpTest, simple4dNCHW) { OpTester test("Col2Im", 1, kMSDomain); @@ -170,32 +175,6 @@ TEST(Col2ImContribOpTest, simple5dNCHWD) { test.Run(); } -TEST(Im2ColContribOpTest, simple) { - std::vector input(24); - std::vector expected_output(24); - std::iota(input.begin(), input.end(), float_iota(1., 1.)); - expected_output = {1, 5, 9, 2, 6, 10, 3, 7, 11, 4, 8, 12, 13, 17, 21, 14, 18, 22, 15, 19, 23, 16, 20, 24}; - float* actual_output = new float[24]; - math::Im2col()( - input.data(), - int64_t(2), - int64_t(3), - int64_t(4), - int64_t(1), - int64_t(4), - int64_t(1), - int64_t(1), - int64_t(0), - int64_t(0), - int64_t(0), - int64_t(0), - int64_t(1), - int64_t(1), - actual_output, - 0.); - - delete [] actual_output; -} - } // namespace test +} // namespace contrib } // namespace onnxruntime From bda84f2c3d349c6571a080bc165c8f93448e875f Mon Sep 17 00:00:00 2001 From: Thiago Crepaldi Date: Mon, 29 Aug 2022 17:55:17 -0400 Subject: [PATCH 15/30] Address comments --- .../core/providers/cpu/tensor/col2im.cc | 27 ++++++-------- .../core/providers/cpu/tensor/col2im.h | 6 ---- .../providers/cpu/tensor/col2im_attributes.h | 9 +++-- onnxruntime/test/contrib_ops/col2im_test.cc | 35 ++++++++----------- 4 files changed, 29 insertions(+), 48 deletions(-) diff --git a/onnxruntime/core/providers/cpu/tensor/col2im.cc b/onnxruntime/core/providers/cpu/tensor/col2im.cc index 3a75910f9ff6c..0625999303e6d 100644 --- a/onnxruntime/core/providers/cpu/tensor/col2im.cc +++ b/onnxruntime/core/providers/cpu/tensor/col2im.cc @@ -2,25 +2,20 @@ // Licensed under the MIT License. 
#include "core/providers/cpu/tensor/col2im.h" +#include "core/util/math_cpuonly.h" -#include "core/framework/element_type_lists.h" -#include "core/framework/TensorSeq.h" -#include "core/providers/common.h" -#include "core/framework/copy.h" -#include "core/common/safeint.h" -#include "core/providers/op_kernel_type_control.h" namespace onnxruntime { -#define REGISTER_KERNEL_TYPED(T) \ +#define REGISTER_COL2IM_TYPED_KERNEL(OP_TYPE, VERSION, TYPE, KERNEL_CLASS) \ ONNX_CPU_OPERATOR_TYPED_KERNEL( \ - Col2Im, \ - 1, \ - T, \ - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::AllTensorTypes()), \ - Col2Im); + OP_TYPE, \ + VERSION, \ + TYPE, \ + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + KERNEL_CLASS); -REGISTER_KERNEL_TYPED(float) +REGISTER_COL2IM_TYPED_KERNEL(Col2Im, 1, float, Col2Im); template Status Col2Im::Compute(OpKernelContext* context) const { @@ -49,13 +44,13 @@ Status Col2Im::Compute(OpKernelContext* context) const { batched_image_shape_dims.push_back(image_shape->Data()[i]); adjusted_image_shape_dims.push_back(image_shape->Data()[i]-adjusted_kernel_shape[i]+1); } - TensorShape batched_image_shape(batched_image_shape_dims), adjusted_image_shape(adjusted_image_shape_dims); + TensorShape batched_image_shape(batched_image_shape_dims); T* image_data = context->Output(0, batched_image_shape)->template MutableData(); const T* col_data = col_tensor->template Data(); for (auto image_id = 0; image_id < N; ++image_id) { if (image_shape->Shape()[0] == 2) { - math::Col2im( + math::Col2im( col_data + image_id * col_data_stride, C, image_shape->Data()[0], @@ -76,7 +71,7 @@ Status Col2Im::Compute(OpKernelContext* context) const { math::Col2imNd( col_data + image_id * col_data_stride, image_shape->Data(), - adjusted_image_shape.GetDims().data(), + adjusted_image_shape_dims.data(), kernel_shape_size * C, image_shape_size, adjusted_kernel_shape.GetDims().data(), diff --git a/onnxruntime/core/providers/cpu/tensor/col2im.h b/onnxruntime/core/providers/cpu/tensor/col2im.h index 8cbefd2ec668b..b5849ecc9426b 100644 --- a/onnxruntime/core/providers/cpu/tensor/col2im.h +++ b/onnxruntime/core/providers/cpu/tensor/col2im.h @@ -5,12 +5,6 @@ #include "core/providers/cpu/tensor/col2im_attributes.h" -#include "core/common/common.h" -#include "core/framework/op_kernel.h" -#include "core/util/math_cpuonly.h" -#include "core/framework/tensor.h" -#include "core/providers/cpu/tensor/concatbase.h" - namespace onnxruntime { template diff --git a/onnxruntime/core/providers/cpu/tensor/col2im_attributes.h b/onnxruntime/core/providers/cpu/tensor/col2im_attributes.h index eaef183334ba6..3b94ed213e8bb 100644 --- a/onnxruntime/core/providers/cpu/tensor/col2im_attributes.h +++ b/onnxruntime/core/providers/cpu/tensor/col2im_attributes.h @@ -17,11 +17,9 @@ #pragma once -#ifndef SHARED_PROVIDER #include "core/common/common.h" #include "core/providers/common.h" #include "core/util/math.h" -#endif #include "core/common/inlined_containers.h" #include "core/framework/op_kernel.h" @@ -33,11 +31,12 @@ struct Col2ImAttributes { using Col2ImPadVector = InlinedVector; explicit Col2ImAttributes(const OpKernelInfo& info) { - auto status = info.GetAttrs("strides", strides); + // Make sure empty strides, pads or dilations are defaulted to 1 if necessary + ORT_ENFORCE(info.GetAttrs("strides", strides).IsOK()); gsl::span pads_span; - status = info.GetAttrsAsSpan("pads", pads_span); + ORT_ENFORCE(info.GetAttrsAsSpan("pads", pads_span).IsOK()); pads.assign(pads_span.cbegin(), pads_span.cend()); - status = 
info.GetAttrs("dilations", dilations); + ORT_ENFORCE(info.GetAttrs("dilations", dilations).IsOK()); } ~Col2ImAttributes() = default; diff --git a/onnxruntime/test/contrib_ops/col2im_test.cc b/onnxruntime/test/contrib_ops/col2im_test.cc index 2e750f2e9f276..0801f3792509e 100644 --- a/onnxruntime/test/contrib_ops/col2im_test.cc +++ b/onnxruntime/test/contrib_ops/col2im_test.cc @@ -4,16 +4,18 @@ #include #include "gtest/gtest.h" #include "test/providers/provider_test_utils.h" + +using namespace onnxruntime::test; #include "core/util/math.h" namespace onnxruntime { namespace contrib { namespace test { -using namespace onnxruntime::test; + namespace { template -std::vector _transpose_serialized_vector(std::vector &input, size_t N, size_t C, size_t H, size_t W) { +std::vector TransposeSerializedVector(std::vector &input, size_t N, size_t C, size_t H, size_t W) { size_t input_size = input.size(); if (input_size == 0) { throw std::runtime_error("Invalid input"); @@ -30,15 +32,6 @@ std::vector _transpose_serialized_vector(std::vector &input, size_t N, siz return trans_vec; } -struct float_iota { - explicit float_iota(float inc, float init_value = 0.0) : _value(init_value), _inc(inc) {} - - operator float() const { return _value; } - float_iota& operator++() { _value += _inc; return *this; } - float _value; - float _inc; -}; - } // namespace TEST(Col2ImContribOpTest, simple4dNCHW) { @@ -50,9 +43,9 @@ TEST(Col2ImContribOpTest, simple4dNCHW) { std::vector input(25); std::vector output(25); - std::iota(output.begin(), output.end(), float_iota(1., 1.)); + std::iota(output.begin(), output.end(), 1.0f); - input = _transpose_serialized_vector(output, 1, 1, 5, 5); + input = TransposeSerializedVector(output, 1, 1, 5, 5); test.AddInput("input", {1, 5, 5}, input); test.AddInput("image_shape", {2}, std::vector{5, 5}); test.AddInput("block_shape", {2}, std::vector{1, 5}); @@ -70,8 +63,8 @@ TEST(Col2ImContribOpTest, with2Images3channelsNonSquare4dNCHW) { std::vector input(120); std::vector output(120); - std::iota(output.begin(), output.end(), float_iota(1., 1.)); - input = _transpose_serialized_vector(output, 2, 3, 4, 5); + std::iota(output.begin(), output.end(), 1.0f); + input = TransposeSerializedVector(output, 2, 3, 4, 5); test.AddInput("input", {2, 15, 4}, input); test.AddInput("image_shape", {2}, std::vector{4, 5}); test.AddInput("block_shape", {2}, std::vector{1, 5}); @@ -128,8 +121,8 @@ TEST(Col2ImContribOpTest, with3channels4dNCHW) { std::vector input(75); std::vector output(75); - std::iota(output.begin(), output.end(), float_iota(1., 1.)); - input = _transpose_serialized_vector(output, 1, 3, 5, 5); + std::iota(output.begin(), output.end(), 1.0f); + input = TransposeSerializedVector(output, 1, 3, 5, 5); test.AddInput("input", {1, 15, 5}, input); test.AddInput("image_shape", {2}, std::vector{5, 5}); test.AddInput("block_shape", {2}, std::vector{1, 5}); @@ -147,8 +140,8 @@ TEST(Col2ImContribOpTest, with2Images3channels4dNCHW) { std::vector input(150); std::vector output(150); - std::iota(output.begin(), output.end(), float_iota(1., 1.)); - input = _transpose_serialized_vector(output, 2, 3, 5, 5); + std::iota(output.begin(), output.end(), 1.0f); + input = TransposeSerializedVector(output, 2, 3, 5, 5); test.AddInput("input", {2, 15, 5}, input); test.AddInput("image_shape", {2}, std::vector{5, 5}); test.AddInput("block_shape", {2}, std::vector{1, 5}); @@ -166,8 +159,8 @@ TEST(Col2ImContribOpTest, simple5dNCHWD) { std::vector input(25); std::vector output(25); - std::iota(output.begin(), output.end(), 
float_iota(1., 1.)); - input = _transpose_serialized_vector(output, 1, 1, 5, 5); + std::iota(output.begin(), output.end(), 1.0f); + input = TransposeSerializedVector(output, 1, 1, 5, 5); test.AddInput("input", {1, 5, 5}, input); test.AddInput("image_shape", {3}, std::vector{1, 5, 5}); test.AddInput("block_shape", {3}, std::vector{1, 1, 5}); From 1d36599e78fe063e284f6cbe3fb3739efe2c1baa Mon Sep 17 00:00:00 2001 From: Thiago Crepaldi Date: Thu, 1 Sep 2022 12:46:46 -0400 Subject: [PATCH 16/30] Address comments --- onnxruntime/contrib_ops/cpu/col2im.cc | 2 +- onnxruntime/core/providers/cpu/tensor/col2im_attributes.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/col2im.cc b/onnxruntime/contrib_ops/cpu/col2im.cc index 50689ccb6b4ab..2f66b3a5ace4b 100644 --- a/onnxruntime/contrib_ops/cpu/col2im.cc +++ b/onnxruntime/contrib_ops/cpu/col2im.cc @@ -13,7 +13,7 @@ namespace contrib { 1, \ T, \ kCpuExecutionProvider, \ - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::AllTensorTypes()), \ + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), \ Col2Im); REGISTER_KERNEL_TYPED(float) diff --git a/onnxruntime/core/providers/cpu/tensor/col2im_attributes.h b/onnxruntime/core/providers/cpu/tensor/col2im_attributes.h index 3b94ed213e8bb..2e9db6ca0bc15 100644 --- a/onnxruntime/core/providers/cpu/tensor/col2im_attributes.h +++ b/onnxruntime/core/providers/cpu/tensor/col2im_attributes.h @@ -32,11 +32,11 @@ struct Col2ImAttributes { explicit Col2ImAttributes(const OpKernelInfo& info) { // Make sure empty strides, pads or dilations are defaulted to 1 if necessary - ORT_ENFORCE(info.GetAttrs("strides", strides).IsOK()); + ORT_THROW_IF_ERROR(info.GetAttrs("strides", strides).IsOK()); gsl::span pads_span; - ORT_ENFORCE(info.GetAttrsAsSpan("pads", pads_span).IsOK()); + ORT_THROW_IF_ERROR(info.GetAttrsAsSpan("pads", pads_span).IsOK()); pads.assign(pads_span.cbegin(), pads_span.cend()); - ORT_ENFORCE(info.GetAttrs("dilations", dilations).IsOK()); + ORT_THROW_IF_ERROR(info.GetAttrs("dilations", dilations).IsOK()); } ~Col2ImAttributes() = default; From 9a8c6568d11acee81ddb686f3c10581ea79ee138 Mon Sep 17 00:00:00 2001 From: Thiago Crepaldi Date: Tue, 6 Sep 2022 13:18:38 -0400 Subject: [PATCH 17/30] Address comments --- onnxruntime/contrib_ops/cpu/col2im.cc | 14 +- .../core/graph/contrib_ops/contrib_defs.cc | 153 +++++++++--------- .../core/providers/cpu/tensor/col2im.cc | 71 ++++---- .../providers/cpu/tensor/col2im_attributes.h | 6 +- .../tools/pytorch_export_contrib_ops.py | 38 ++--- onnxruntime/test/contrib_ops/col2im_test.cc | 115 +++++++------ 6 files changed, 198 insertions(+), 199 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/col2im.cc b/onnxruntime/contrib_ops/cpu/col2im.cc index 2f66b3a5ace4b..d6ed5495e49aa 100644 --- a/onnxruntime/contrib_ops/cpu/col2im.cc +++ b/onnxruntime/contrib_ops/cpu/col2im.cc @@ -6,13 +6,13 @@ namespace onnxruntime { namespace contrib { -#define REGISTER_KERNEL_TYPED(T) \ - ONNX_OPERATOR_TYPED_KERNEL_EX( \ - Col2Im, \ - kMSDomain, \ - 1, \ - T, \ - kCpuExecutionProvider, \ +#define REGISTER_KERNEL_TYPED(T) \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + Col2Im, \ + kMSDomain, \ + 1, \ + T, \ + kCpuExecutionProvider, \ KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), \ Col2Im); diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc index 715eb9cda97b5..0e3e27643ff8b 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc +++ 
b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc @@ -762,7 +762,8 @@ ONNX_MS_OPERATOR_SET_SCHEMA(BiasSoftmax, 1, "Y = softmax(scores + bias)) with simple broadcast on bias. " "Intended to specialize softmax(scores + additive_mask) commonly found in transformer models.") .Attr("axis", "apply softmax to elements for dimensions axis or higher", AttributeProto::INT, static_cast(1)) - .Attr("is_inner_broadcast", "true if broadcast bias across input for dimensions broadcast_axis to axis-1, " + .Attr("is_inner_broadcast", + "true if broadcast bias across input for dimensions broadcast_axis to axis-1, " "otherwise broadcast bias across input for dimensions 0 to broadcast_axis - 1", AttributeProto::INT) .Input(0, "data", "The input data as Tensor.", "T") @@ -1011,86 +1012,86 @@ but it only supports *batched* multi-dimensional image tensors. NOTE: Although specifying image_shape looks redundant because it could be calculated from convolution formulas, it is required as input for more advanced scenarios as explained - at PyTorch's implementation (https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/Col2Im.cpp#L10) + at PyTorch's implementation (https://github.com/pytorch/pytorch/blob/faac3dbce20a6068a3e530c11788896e81a73c64/aten/src/ATen/native/Col2Im.cpp#L10) )DOC"; ONNX_MS_OPERATOR_SET_SCHEMA(Col2Im, 1, OpSchema() - .SetDoc(Col2Im_ver1_doc) - .Attr( - "dilations", - "1-dimensional tensor with dilation value along each spatial axis of the image. " - "If not present, the dilation defaults to 1 along each spatial axis of the image.", - AttributeProto::INTS, - OPTIONAL_VALUE) - .Attr( - "pads", - "1-dimensional tensor with padding value for the beginning and ending along each" - " spatial axis, it can take any value greater than or equal to 0. " - "The value represent the number of pixels added to the beginning " - "and end part of the corresponding axis. `pads` format should be as follow " - "[x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin is the number of pixels " - "added at the beginning of axis `i` and xi_end the same for the end of axis `i`. " - "If not present, the padding defaults to 0 along start and end of each spatial axis.", - AttributeProto::INTS, - OPTIONAL_VALUE) - .Attr( - "strides", - "1-dimensional tensor with stride value along each spatial axis. " - "If not present, the stride defaults to 1 along each spatial axis.", - AttributeProto::INTS, - OPTIONAL_VALUE) - .Input( - 0, - "input", - "Input data tensor to be rearranged from column blocks back into an image." - " This is a 3-dimensional tensor containing [N, C * n-ary-product(block_shape), L]," - " where N is batch dimension, C is image channel dimension and L is number of blocks.", - "T", - OpSchema::Single, - true, - 1, - OpSchema::Differentiable) - .Input( - 1, - "image_shape", - "The shape of the spatial dimensions of the image after rearranging the column blocks." - "This is a 1-dim tensor with size of at least 2, containing the value [H_img, W_img] " - " for a 2-D image or [dim_i1, dim_i2, ..., dim_iN] for a N-D image.", - "tensor(int64)", - OpSchema::Single, - true, - 1, - OpSchema::NonDifferentiable) - .Input( - 2, - "block_shape", - "The shape of the block to apply on the input." - "This is a 1-dim tensor of size of at least 2, containing the value [H_block, W_block] " - " for a 2-D image or [dim_b1, dim_b2, ..., dim_bN] for a N-D block." - "Dilations, pads and strides are applied to block_shape under the hood." 
- "The kernel window start at the top-left of the block and slides to the right and down," - "similarly to how Convolution kernels do.", - "tensor(int64)", - OpSchema::Single, - true, - 1, - OpSchema::NonDifferentiable) - .Output( - 0, - "output", - "Output tensor produced by rearranging blocks into an image.", - "T", - OpSchema::Single, - true, - 1, - OpSchema::Differentiable) - .TypeConstraint( - "T", - OpSchema::all_tensor_types_with_bfloat(), - "Constrain input and output types to all numeric tensor types.") - .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { col2imShapeInference(ctx); })); + .SetDoc(Col2Im_ver1_doc) + .Attr( + "dilations", + "1-dimensional tensor with dilation value along each spatial axis of the image. " + "If not present, the dilation defaults to 1 along each spatial axis of the image.", + AttributeProto::INTS, + OPTIONAL_VALUE) + .Attr( + "pads", + "1-dimensional tensor with padding value for the beginning and ending along each " + "spatial axis, it can take any value greater than or equal to 0. " + "The value represent the number of pixels added to the beginning " + "and end part of the corresponding axis. `pads` format should be as follow " + "[x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin is the number of pixels " + "added at the beginning of axis `i` and xi_end the same for the end of axis `i`. " + "If not present, the padding defaults to 0 along start and end of each spatial axis.", + AttributeProto::INTS, + OPTIONAL_VALUE) + .Attr( + "strides", + "1-dimensional tensor with stride value along each spatial axis. " + "If not present, the stride defaults to 1 along each spatial axis.", + AttributeProto::INTS, + OPTIONAL_VALUE) + .Input( + 0, + "input", + "Input data tensor to be rearranged from column blocks back into an image. " + "This is a 3-dimensional tensor containing [N, C * n-ary-product(block_shape), L], " + "where N is batch dimension, C is image channel dimension and L is number of blocks.", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .Input( + 1, + "image_shape", + "The shape of the spatial dimensions of the image after rearranging the column blocks. " + "This is a 1-dim tensor with size of at least 2, containing the value [H_img, W_img] " + "for a 2-D image or [dim_i1, dim_i2, ..., dim_iN] for a N-D image.", + "tensor(int64)", + OpSchema::Single, + true, + 1, + OpSchema::NonDifferentiable) + .Input( + 2, + "block_shape", + "The shape of the block to apply on the input." + "This is a 1-dim tensor of size of at least 2, containing the value [H_block, W_block] " + "for a 2-D image or [dim_b1, dim_b2, ..., dim_bN] for a N-D block. " + "Dilations, pads and strides are applied to block_shape under the hood. " + "The kernel window start at the top-left of the block and slides to the right and down, " + "similarly to how Convolution kernels do.", + "tensor(int64)", + OpSchema::Single, + true, + 1, + OpSchema::NonDifferentiable) + .Output( + 0, + "output", + "Output tensor produced by rearranging blocks into an image.", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .TypeConstraint( + "T", + OpSchema::all_tensor_types_with_bfloat(), + "Constrain input and output types to all numeric tensor types.") + .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { col2imShapeInference(ctx); })); constexpr const char* GridSample_ver1_doc = R"DOC( Given an `input` and a flow-field `grid`, computes the `output` using `input` values and pixel locations from `grid`. 
diff --git a/onnxruntime/core/providers/cpu/tensor/col2im.cc b/onnxruntime/core/providers/cpu/tensor/col2im.cc index 0625999303e6d..adb1d68b8727e 100644 --- a/onnxruntime/core/providers/cpu/tensor/col2im.cc +++ b/onnxruntime/core/providers/cpu/tensor/col2im.cc @@ -4,11 +4,10 @@ #include "core/providers/cpu/tensor/col2im.h" #include "core/util/math_cpuonly.h" - namespace onnxruntime { -#define REGISTER_COL2IM_TYPED_KERNEL(OP_TYPE, VERSION, TYPE, KERNEL_CLASS) \ - ONNX_CPU_OPERATOR_TYPED_KERNEL( \ +#define REGISTER_COL2IM_TYPED_KERNEL(OP_TYPE, VERSION, TYPE, KERNEL_CLASS) \ + ONNX_CPU_OPERATOR_TYPED_MS_KERNEL( \ OP_TYPE, \ VERSION, \ TYPE, \ @@ -26,9 +25,9 @@ Status Col2Im::Compute(OpKernelContext* context) const { int64_t image_shape_size = 1; int64_t kernel_shape_size = 1; TensorShapeVector adjusted_kernel_shape_dims; - for (auto i=0; i < image_shape->Shape().Size(); ++i) { - image_shape_size *= image_shape->Data()[i]; - kernel_shape_size *= kernel_shape->Data()[i]; + for (auto i = 0; i < image_shape->Shape().Size(); ++i) { + image_shape_size *= image_shape->Data()[i]; + kernel_shape_size *= kernel_shape->Data()[i]; adjusted_kernel_shape_dims.push_back(col2im_attrs_.dilations[i] * (kernel_shape->Data()[i] - 1) + 1); } TensorShape col_shape = col_tensor->Shape(); @@ -40,9 +39,9 @@ Status Col2Im::Compute(OpKernelContext* context) const { TensorShapeVector batched_image_shape_dims, adjusted_image_shape_dims; batched_image_shape_dims.insert(batched_image_shape_dims.begin(), {N, C}); - for (auto i=0; i < image_shape->Shape()[0]; ++i) { + for (auto i = 0; i < image_shape->Shape()[0]; ++i) { batched_image_shape_dims.push_back(image_shape->Data()[i]); - adjusted_image_shape_dims.push_back(image_shape->Data()[i]-adjusted_kernel_shape[i]+1); + adjusted_image_shape_dims.push_back(image_shape->Data()[i] - adjusted_kernel_shape[i] + 1); } TensorShape batched_image_shape(batched_image_shape_dims); T* image_data = context->Output(0, batched_image_shape)->template MutableData(); @@ -51,36 +50,36 @@ Status Col2Im::Compute(OpKernelContext* context) const { for (auto image_id = 0; image_id < N; ++image_id) { if (image_shape->Shape()[0] == 2) { math::Col2im( - col_data + image_id * col_data_stride, - C, - image_shape->Data()[0], - image_shape->Data()[1], - kernel_shape->Data()[0], - kernel_shape->Data()[1], - col2im_attrs_.dilations[0], - col2im_attrs_.dilations[1], - col2im_attrs_.pads[0], - col2im_attrs_.pads[1], - col2im_attrs_.pads[2], - col2im_attrs_.pads[3], - col2im_attrs_.strides[0], - col2im_attrs_.strides[1], - image_data + image_id * col_stride, - &CPUMathUtil::Instance()); + col_data + image_id * col_data_stride, + C, + image_shape->Data()[0], + image_shape->Data()[1], + kernel_shape->Data()[0], + kernel_shape->Data()[1], + col2im_attrs_.dilations[0], + col2im_attrs_.dilations[1], + col2im_attrs_.pads[0], + col2im_attrs_.pads[1], + col2im_attrs_.pads[2], + col2im_attrs_.pads[3], + col2im_attrs_.strides[0], + col2im_attrs_.strides[1], + image_data + image_id * col_stride, + &CPUMathUtil::Instance()); } else { math::Col2imNd( - col_data + image_id * col_data_stride, - image_shape->Data(), - adjusted_image_shape_dims.data(), - kernel_shape_size * C, - image_shape_size, - adjusted_kernel_shape.GetDims().data(), - col2im_attrs_.strides.data(), - col2im_attrs_.dilations.data(), - col2im_attrs_.pads.data(), - image_shape->Shape().Size(), - image_data + image_id * col_stride, - &CPUMathUtil::Instance()); + col_data + image_id * col_data_stride, + image_shape->Data(), + adjusted_image_shape_dims.data(), + 
kernel_shape_size * C, + image_shape_size, + adjusted_kernel_shape.GetDims().data(), + col2im_attrs_.strides.data(), + col2im_attrs_.dilations.data(), + col2im_attrs_.pads.data(), + image_shape->Shape().Size(), + image_data + image_id * col_stride, + &CPUMathUtil::Instance()); } } return Status::OK(); diff --git a/onnxruntime/core/providers/cpu/tensor/col2im_attributes.h b/onnxruntime/core/providers/cpu/tensor/col2im_attributes.h index 2e9db6ca0bc15..49ddbe1cbb300 100644 --- a/onnxruntime/core/providers/cpu/tensor/col2im_attributes.h +++ b/onnxruntime/core/providers/cpu/tensor/col2im_attributes.h @@ -32,11 +32,11 @@ struct Col2ImAttributes { explicit Col2ImAttributes(const OpKernelInfo& info) { // Make sure empty strides, pads or dilations are defaulted to 1 if necessary - ORT_THROW_IF_ERROR(info.GetAttrs("strides", strides).IsOK()); + ORT_THROW_IF_ERROR(info.GetAttrs("strides", strides)); gsl::span pads_span; - ORT_THROW_IF_ERROR(info.GetAttrsAsSpan("pads", pads_span).IsOK()); + ORT_THROW_IF_ERROR(info.GetAttrsAsSpan("pads", pads_span)); pads.assign(pads_span.cbegin(), pads_span.cend()); - ORT_THROW_IF_ERROR(info.GetAttrs("dilations", dilations).IsOK()); + ORT_THROW_IF_ERROR(info.GetAttrs("dilations", dilations)); } ~Col2ImAttributes() = default; diff --git a/onnxruntime/python/tools/pytorch_export_contrib_ops.py b/onnxruntime/python/tools/pytorch_export_contrib_ops.py index 82a5c558a2f59..8271822673421 100644 --- a/onnxruntime/python/tools/pytorch_export_contrib_ops.py +++ b/onnxruntime/python/tools/pytorch_export_contrib_ops.py @@ -1,5 +1,5 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. +#Copyright(c) Microsoft Corporation.All rights reserved. +#Licensed under the MIT License. """ Support for registering ONNX Runtime's built-in contrib ops with @@ -8,7 +8,7 @@ import typing try: - # TODO(justinchuby): Create a function to alert users when torch is not installed +#TODO(justinchuby) : Create a function to alert users when torch is not installed import torch except ModuleNotFoundError: raise ModuleNotFoundError( @@ -35,26 +35,26 @@ def register(): """ def grid_sampler(g, input, grid, mode, padding_mode, align_corners): - # mode - # 'bilinear' : onnx::Constant[value={0}] - # 'nearest' : onnx::Constant[value={1}] - # 'bicubic' : onnx::Constant[value={2}] - # padding_mode - # 'zeros' : onnx::Constant[value={0}] - # 'border' : onnx::Constant[value={1}] - # 'reflection' : onnx::Constant[value={2}] +#mode +#'bilinear' : onnx::Constant[value = {0 }] +#'nearest' : onnx::Constant[value = {1 }] +#'bicubic' : onnx::Constant[value = {2 }] +#padding_mode +#'zeros' : onnx::Constant[value = {0 }] +#'border' : onnx::Constant[value = {1 }] +#'reflection' : onnx::Constant[value = {2 }] mode = sym_help._maybe_get_const(mode, "i") padding_mode = sym_help._maybe_get_const(padding_mode, "i") mode_str = ["bilinear", "nearest", "bicubic"][mode] padding_mode_str = ["zeros", "border", "reflection"][padding_mode] align_corners = int(sym_help._maybe_get_const(align_corners, "b")) - # From opset v13 onward, the output shape can be specified with - # (N, C, H, W) (N, H_out, W_out, 2) => (N, C, H_out, W_out) - # input_shape = input.type().sizes() - # gird_shape = grid.type().sizes() - # output_shape = input_shape[:2] + gird_shape[1:3] - # g.op(...).setType(input.type().with_sizes(output_shape)) +#From opset v13 onward, the output shape can be specified with +#(N, C, H, W)(N, H_out, W_out, 2) =>(N, C, H_out, W_out) +#input_shape = input.type().sizes() +#gird_shape = 
grid.type().sizes() +#output_shape = input_shape[ : 2] + gird_shape[1 : 3] +#g.op(...).setType(input.type().with_sizes(output_shape)) return g.op( "com.microsoft::GridSample", @@ -74,7 +74,7 @@ def inverse(g, self): @torch.onnx.symbolic_helper.parse_args("v", "s") def gelu(g, self: torch._C.Value, approximate: str = "none"): - # Use microsoft::Gelu for performance if possible. It only supports approximate == "none" +#Use microsoft::Gelu for performance if possible.It only supports approximate == "none" if approximate == "none": return g.op("com.microsoft::Gelu", self).setType(self.type()) return torch.onnx.symbolic_opset9.gelu(g, self, approximate) @@ -103,7 +103,7 @@ def unregister(): try: torch.onnx.unregister_custom_op_symbolic(name, _OPSET_VERSION) except AttributeError: - # unregister_custom_op_symbolic is not available before PyTorch 1.12 +#unregister_custom_op_symbolic is not available before PyTorch 1.12 namespace, kind = name.split("::") for version in sym_help._onnx_stable_opsets: if version >= _OPSET_VERSION and sym_registry.is_registered_op(kind, namespace, version): diff --git a/onnxruntime/test/contrib_ops/col2im_test.cc b/onnxruntime/test/contrib_ops/col2im_test.cc index 0801f3792509e..3031975c0df2d 100644 --- a/onnxruntime/test/contrib_ops/col2im_test.cc +++ b/onnxruntime/test/contrib_ops/col2im_test.cc @@ -12,24 +12,23 @@ namespace onnxruntime { namespace contrib { namespace test { - namespace { template -std::vector TransposeSerializedVector(std::vector &input, size_t N, size_t C, size_t H, size_t W) { - size_t input_size = input.size(); - if (input_size == 0) { - throw std::runtime_error("Invalid input"); - } - std::vector trans_vec(input); - - for (size_t n = 0; n < N; ++n) - for (size_t c = 0; c < C; ++c) - for (size_t h = 0; h < H; ++h) - for (size_t w = 0; w < W; ++w) - trans_vec[n * (C * H * W) + c * (H * W) + (h + H * w)] = \ - input[n * (C * H * W) + c * (H * W) + (w + W * h)]; - - return trans_vec; +std::vector TransposeSerializedVector(std::vector& input, size_t N, size_t C, size_t H, size_t W) { + size_t input_size = input.size(); + if (input_size == 0) { + throw std::runtime_error("Invalid input"); + } + std::vector trans_vec(input); + + for (size_t n = 0; n < N; ++n) + for (size_t c = 0; c < C; ++c) + for (size_t h = 0; h < H; ++h) + for (size_t w = 0; w < W; ++w) + trans_vec[n * (C * H * W) + c * (H * W) + (h + H * w)] = + input[n * (C * H * W) + c * (H * W) + (w + W * h)]; + + return trans_vec; } } // namespace @@ -46,9 +45,9 @@ TEST(Col2ImContribOpTest, simple4dNCHW) { std::iota(output.begin(), output.end(), 1.0f); input = TransposeSerializedVector(output, 1, 1, 5, 5); - test.AddInput("input", {1, 5, 5}, input); - test.AddInput("image_shape", {2}, std::vector{5, 5}); - test.AddInput("block_shape", {2}, std::vector{1, 5}); + test.AddInput("input", {1, 5, 5}, input); + test.AddInput("image_shape", {2}, std::vector{5, 5}); + test.AddInput("block_shape", {2}, std::vector{1, 5}); test.AddOutput("output", {1, 1, 5, 5}, output); test.Run(); @@ -65,9 +64,9 @@ TEST(Col2ImContribOpTest, with2Images3channelsNonSquare4dNCHW) { std::vector output(120); std::iota(output.begin(), output.end(), 1.0f); input = TransposeSerializedVector(output, 2, 3, 4, 5); - test.AddInput("input", {2, 15, 4}, input); - test.AddInput("image_shape", {2}, std::vector{4, 5}); - test.AddInput("block_shape", {2}, std::vector{1, 5}); + test.AddInput("input", {2, 15, 4}, input); + test.AddInput("image_shape", {2}, std::vector{4, 5}); + test.AddInput("block_shape", {2}, std::vector{1, 5}); 
test.AddOutput("output", {2, 3, 4, 5}, output); test.Run(); @@ -80,33 +79,33 @@ TEST(Col2ImContribOpTest, with2Images2channelsNonSquareDilationPadStride4dNCHW) test.AddAttribute("dilations", std::vector{2, 2}); test.AddAttribute("pads", std::vector{2, 2, 2, 2}); - std::vector input{ 0., 0., 0., 0., 0., 1., 3., 5., 0., 11., 13., 15., 0., 0., 0., 0., - 0., 0., 0., 0., 1., 3., 5., 0., 11., 13., 15., 0., 0., 0., 0., 0., - 0., 0., 0., 0., 0., 21., 23., 25., 0., 31., 33., 35., 0., 0., 0., 0., - 0., 0., 0., 0., 21., 23., 25., 0., 31., 33., 35., 0., 0., 0., 0., 0., - 0., 0., 0., 0., 0., 41., 43., 45., 0., 51., 53., 55., 0., 0., 0., 0., - 0., 0., 0., 0., 41., 43., 45., 0., 51., 53., 55., 0., 0., 0., 0., 0., - 0., 0., 0., 0., 0., 61., 63., 65., 0., 71., 73., 75., 0., 0., 0., 0., - 0., 0., 0., 0., 61., 63., 65., 0., 71., 73., 75., 0., 0., 0., 0., 0.}; - std::vector output { 2., 0., 6., 0., 10., - 0., 0., 0., 0., 0., - 22., 0., 26., 0., 30., - 0., 0., 0., 0., 0., - 42., 0., 46., 0., 50., - 0., 0., 0., 0., 0., - 62., 0., 66., 0., 70., - 0., 0., 0., 0., 0., - 82., 0., 86., 0., 90., - 0., 0., 0., 0., 0., - 102., 0., 106., 0., 110., - 0., 0., 0., 0., 0., - 122., 0., 126., 0., 130., - 0., 0., 0., 0., 0., - 142., 0., 146., 0., 150., - 0., 0., 0., 0., 0.}; - test.AddInput("input", {2, 4, 16}, input); - test.AddInput("image_shape", {2}, std::vector{4, 5}); - test.AddInput("block_shape", {2}, std::vector{1, 2}); + std::vector input{0., 0., 0., 0., 0., 1., 3., 5., 0., 11., 13., 15., 0., 0., 0., 0., + 0., 0., 0., 0., 1., 3., 5., 0., 11., 13., 15., 0., 0., 0., 0., 0., + 0., 0., 0., 0., 0., 21., 23., 25., 0., 31., 33., 35., 0., 0., 0., 0., + 0., 0., 0., 0., 21., 23., 25., 0., 31., 33., 35., 0., 0., 0., 0., 0., + 0., 0., 0., 0., 0., 41., 43., 45., 0., 51., 53., 55., 0., 0., 0., 0., + 0., 0., 0., 0., 41., 43., 45., 0., 51., 53., 55., 0., 0., 0., 0., 0., + 0., 0., 0., 0., 0., 61., 63., 65., 0., 71., 73., 75., 0., 0., 0., 0., + 0., 0., 0., 0., 61., 63., 65., 0., 71., 73., 75., 0., 0., 0., 0., 0.}; + std::vector output{2., 0., 6., 0., 10., + 0., 0., 0., 0., 0., + 22., 0., 26., 0., 30., + 0., 0., 0., 0., 0., + 42., 0., 46., 0., 50., + 0., 0., 0., 0., 0., + 62., 0., 66., 0., 70., + 0., 0., 0., 0., 0., + 82., 0., 86., 0., 90., + 0., 0., 0., 0., 0., + 102., 0., 106., 0., 110., + 0., 0., 0., 0., 0., + 122., 0., 126., 0., 130., + 0., 0., 0., 0., 0., + 142., 0., 146., 0., 150., + 0., 0., 0., 0., 0.}; + test.AddInput("input", {2, 4, 16}, input); + test.AddInput("image_shape", {2}, std::vector{4, 5}); + test.AddInput("block_shape", {2}, std::vector{1, 2}); test.AddOutput("output", {2, 2, 4, 5}, output); test.Run(); @@ -123,9 +122,9 @@ TEST(Col2ImContribOpTest, with3channels4dNCHW) { std::vector output(75); std::iota(output.begin(), output.end(), 1.0f); input = TransposeSerializedVector(output, 1, 3, 5, 5); - test.AddInput("input", {1, 15, 5}, input); - test.AddInput("image_shape", {2}, std::vector{5, 5}); - test.AddInput("block_shape", {2}, std::vector{1, 5}); + test.AddInput("input", {1, 15, 5}, input); + test.AddInput("image_shape", {2}, std::vector{5, 5}); + test.AddInput("block_shape", {2}, std::vector{1, 5}); test.AddOutput("output", {1, 3, 5, 5}, output); test.Run(); @@ -142,9 +141,9 @@ TEST(Col2ImContribOpTest, with2Images3channels4dNCHW) { std::vector output(150); std::iota(output.begin(), output.end(), 1.0f); input = TransposeSerializedVector(output, 2, 3, 5, 5); - test.AddInput("input", {2, 15, 5}, input); - test.AddInput("image_shape", {2}, std::vector{5, 5}); - test.AddInput("block_shape", {2}, std::vector{1, 5}); + 
test.AddInput("input", {2, 15, 5}, input); + test.AddInput("image_shape", {2}, std::vector{5, 5}); + test.AddInput("block_shape", {2}, std::vector{1, 5}); test.AddOutput("output", {2, 3, 5, 5}, output); test.Run(); @@ -161,9 +160,9 @@ TEST(Col2ImContribOpTest, simple5dNCHWD) { std::vector output(25); std::iota(output.begin(), output.end(), 1.0f); input = TransposeSerializedVector(output, 1, 1, 5, 5); - test.AddInput("input", {1, 5, 5}, input); - test.AddInput("image_shape", {3}, std::vector{1, 5, 5}); - test.AddInput("block_shape", {3}, std::vector{1, 1, 5}); + test.AddInput("input", {1, 5, 5}, input); + test.AddInput("image_shape", {3}, std::vector{1, 5, 5}); + test.AddInput("block_shape", {3}, std::vector{1, 1, 5}); test.AddOutput("output", {1, 1, 1, 5, 5}, output); test.Run(); } From 7d176824ce5824eacc26a7da19ddf514ac75c9c9 Mon Sep 17 00:00:00 2001 From: Liqun Fu Date: Wed, 11 Jan 2023 16:34:06 -0800 Subject: [PATCH 18/30] use onnx spec for Col2Im Signed-off-by: Liqun Fu --- .../core/graph/contrib_ops/contrib_defs.cc | 188 ------------------ onnxruntime/core/graph/contrib_ops/ms_opset.h | 2 - .../providers/cpu/cpu_execution_provider.cc | 2 + .../core/providers/cpu/tensor/col2im.cc | 53 +++-- .../core/providers/cpu/tensor/col2im.h | 1 + .../providers/cpu/tensor/col2im_attributes.h | 16 +- .../tools/pytorch_export_contrib_ops.py | 20 +- onnxruntime/test/contrib_ops/col2im_test.cc | 172 ---------------- .../python/contrib_ops/onnx_test_col2im.py | 55 ----- .../test_col2im/test_data_set_0/input_0.pb | Bin 117 -> 0 bytes .../test_col2im/test_data_set_0/input_1.pb | Bin 35 -> 0 bytes .../test_col2im/test_data_set_0/input_2.pb | Bin 35 -> 0 bytes .../test_col2im/test_data_set_0/output_0.pb | Bin 120 -> 0 bytes .../test_col2im_5d/test_data_set_0/input_0.pb | Bin 498 -> 0 bytes .../test_col2im_5d/test_data_set_0/input_1.pb | Bin 43 -> 0 bytes .../test_col2im_5d/test_data_set_0/input_2.pb | Bin 43 -> 0 bytes .../test_data_set_0/output_0.pb | Bin 503 -> 0 bytes .../test_data_set_0/input_0.pb | Bin 97 -> 0 bytes .../test_data_set_0/input_1.pb | Bin 35 -> 0 bytes .../test_data_set_0/input_2.pb | Bin 35 -> 0 bytes .../test_data_set_0/output_0.pb | Bin 165 -> 0 bytes .../test_data_set_0/input_0.pb | Bin 318 -> 0 bytes .../test_data_set_0/input_1.pb | Bin 35 -> 0 bytes .../test_data_set_0/input_2.pb | Bin 35 -> 0 bytes .../test_data_set_0/output_0.pb | Bin 120 -> 0 bytes .../test_data_set_0/input_0.pb | Bin 162 -> 0 bytes .../test_data_set_0/input_1.pb | Bin 35 -> 0 bytes .../test_data_set_0/input_2.pb | Bin 35 -> 0 bytes .../test_data_set_0/output_0.pb | Bin 120 -> 0 bytes .../test_col2im/test_data_set_0/input_0.pb | Bin 117 -> 0 bytes .../test_col2im/test_data_set_0/input_1.pb | Bin 35 -> 0 bytes .../test_col2im/test_data_set_0/input_2.pb | Bin 35 -> 0 bytes .../test_col2im/test_data_set_0/output_0.pb | Bin 145 -> 0 bytes 33 files changed, 52 insertions(+), 457 deletions(-) delete mode 100644 onnxruntime/test/contrib_ops/col2im_test.cc delete mode 100644 onnxruntime/test/python/contrib_ops/onnx_test_col2im.py delete mode 100644 onnxruntime/test/python/testdata/node/test_col2im/test_data_set_0/input_0.pb delete mode 100644 onnxruntime/test/python/testdata/node/test_col2im/test_data_set_0/input_1.pb delete mode 100644 onnxruntime/test/python/testdata/node/test_col2im/test_data_set_0/input_2.pb delete mode 100644 onnxruntime/test/python/testdata/node/test_col2im/test_data_set_0/output_0.pb delete mode 100644 onnxruntime/test/python/testdata/node/test_col2im_5d/test_data_set_0/input_0.pb delete mode 
100644 onnxruntime/test/python/testdata/node/test_col2im_5d/test_data_set_0/input_1.pb delete mode 100644 onnxruntime/test/python/testdata/node/test_col2im_5d/test_data_set_0/input_2.pb delete mode 100644 onnxruntime/test/python/testdata/node/test_col2im_5d/test_data_set_0/output_0.pb delete mode 100644 onnxruntime/test/python/testdata/node/test_col2im_dilations/test_data_set_0/input_0.pb delete mode 100644 onnxruntime/test/python/testdata/node/test_col2im_dilations/test_data_set_0/input_1.pb delete mode 100644 onnxruntime/test/python/testdata/node/test_col2im_dilations/test_data_set_0/input_2.pb delete mode 100644 onnxruntime/test/python/testdata/node/test_col2im_dilations/test_data_set_0/output_0.pb delete mode 100644 onnxruntime/test/python/testdata/node/test_col2im_pads/test_data_set_0/input_0.pb delete mode 100644 onnxruntime/test/python/testdata/node/test_col2im_pads/test_data_set_0/input_1.pb delete mode 100644 onnxruntime/test/python/testdata/node/test_col2im_pads/test_data_set_0/input_2.pb delete mode 100644 onnxruntime/test/python/testdata/node/test_col2im_pads/test_data_set_0/output_0.pb delete mode 100644 onnxruntime/test/python/testdata/node/test_col2im_strides/test_data_set_0/input_0.pb delete mode 100644 onnxruntime/test/python/testdata/node/test_col2im_strides/test_data_set_0/input_1.pb delete mode 100644 onnxruntime/test/python/testdata/node/test_col2im_strides/test_data_set_0/input_2.pb delete mode 100644 onnxruntime/test/python/testdata/node/test_col2im_strides/test_data_set_0/output_0.pb delete mode 100644 onnxruntime/test/python/testdata/test_col2im/test_data_set_0/input_0.pb delete mode 100644 onnxruntime/test/python/testdata/test_col2im/test_data_set_0/input_1.pb delete mode 100644 onnxruntime/test/python/testdata/test_col2im/test_data_set_0/input_2.pb delete mode 100644 onnxruntime/test/python/testdata/test_col2im/test_data_set_0/output_0.pb diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc index f05112b7a628a..15f8599f52534 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc @@ -955,194 +955,6 @@ ONNX_MS_OPERATOR_SET_SCHEMA(IsAllFinite, 1, updateOutputElemType(ctx, 0, ONNX_NAMESPACE::TensorProto::BOOL); })); -void col2imShapeInference(InferenceContext& ctx) { - propagateElemTypeFromInputToOutput(ctx, 0, 0); - - // All inputs shapes are required - if (!hasNInputShapes(ctx, 3)) { - return; - } - - // Assuming image_shape has correct spatial dimensions and reused for next validation steps - // An alternative is get the the number of spatial dimensions as an input - if (ctx.getInputType(1)->tensor_type().shape().dim_size() != 1) { - fail_shape_inference("image_shape tensor must have rank 1."); - } - size_t n_input_dims = static_cast(ctx.getInputType(1)->tensor_type().shape().dim(0).dim_value()); - std::vector image_shape = {}; - const TensorProto* image_shape_data = ctx.getInputData(1); - if (image_shape_data) { - image_shape = ParseData(image_shape_data); - if (image_shape.size() != n_input_dims) { - fail_shape_inference("image_shape tensor must have ", n_input_dims, " spatial dimensions."); - } - } - - std::vector pads = {}; - if (getRepeatedAttribute(ctx, "pads", pads)) { - if ((pads.size() != 0) && (pads.size() != n_input_dims * 2)) { - fail_shape_inference("Attribute pads has incorrect size"); - } - } - - std::vector dilations = {}; - if (getRepeatedAttribute(ctx, "dilations", dilations)) { - if ((dilations.size() != 0) && 
(dilations.size() != n_input_dims)) { - fail_shape_inference("Attribute dilations has incorrect size"); - } - } - - std::vector strides = {}; - if (getRepeatedAttribute(ctx, "strides", strides)) { - if ((strides.size() != 0) && (strides.size() != n_input_dims)) { - fail_shape_inference("Attribute strides has incorrect size"); - } - } - - auto input_shape = ctx.getInputType(0)->tensor_type().shape(); - if (input_shape.dim_size() != 3) { - fail_shape_inference("input must have rank 3."); - } - - std::vector block_shape = {}; - const TensorProto* block_shape_data = ctx.getInputData(2); - if (block_shape_data) { - block_shape = ParseData(block_shape_data); - if (block_shape.size() != n_input_dims) { - fail_shape_inference("block_shape tensor must have ", n_input_dims, " spatial dimensions."); - } - } - if (ctx.getInputType(2)->tensor_type().shape().dim_size() != 1) { - fail_shape_inference("block_shape tensor must have rank 1."); - } else if ( - (ctx.getInputType(2)->tensor_type().shape().dim(0).has_dim_value()) && - (ctx.getInputType(2)->tensor_type().shape().dim(0).dim_value() != static_cast(n_input_dims))) { - fail_shape_inference("block_shape tensor must have ", n_input_dims, " spatial dimensions."); - } - - int64_t block_shape_size = 0; - if (static_cast(block_shape.size()) > 0) { - block_shape_size = 1; - for (const auto& dim : block_shape) { - block_shape_size *= dim; - } - } - - // Final shape will be (N, C, dim_1, ..., dim_N) - auto final_image_shape = ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape(); - - // Dimensions N and C are always present - Dim N, C; - if (ctx.getInputType(0)->tensor_type().shape().dim(0).has_dim_value()) { - N = input_shape.dim(0); // Otherwise, N is unknown. - } - *final_image_shape->add_dim() = N; - - if (block_shape_size > 0) { - C = input_shape.dim(1) / block_shape_size; // Otherwise, C is unknown. - } - *final_image_shape->add_dim() = C; - - // Image dimensions are dynamic - for (size_t i = 0; i < n_input_dims; ++i) { - Dim image_dim_i; - if (image_shape.size() > 0) { - image_dim_i.set_dim_value(image_shape[i]); // Otherwise, spatial dimensions are unknown - } - *final_image_shape->add_dim() = image_dim_i; - } - return; -} - -constexpr const char* Col2Im_ver1_doc = R"DOC( -The operator rearranges column blocks back into a multidimensional image - -Col2Im behaves similarly to PyTorch's fold https://pytorch.org/docs/stable/generated/torch.nn.Fold.html, -but it only supports *batched* multi-dimensional image tensors. - -NOTE: Although specifying image_shape looks redundant because it could be calculated from - convolution formulas, it is required as input for more advanced scenarios as explained - at PyTorch's implementation (https://github.com/pytorch/pytorch/blob/faac3dbce20a6068a3e530c11788896e81a73c64/aten/src/ATen/native/Col2Im.cpp#L10) - -)DOC"; - -ONNX_MS_OPERATOR_SET_SCHEMA(Col2Im, 1, - OpSchema() - .SetDoc(Col2Im_ver1_doc) - .Attr( - "dilations", - "1-dimensional tensor with dilation value along each spatial axis of the image. " - "If not present, the dilation defaults to 1 along each spatial axis of the image.", - AttributeProto::INTS, - OPTIONAL_VALUE) - .Attr( - "pads", - "1-dimensional tensor with padding value for the beginning and ending along each " - "spatial axis, it can take any value greater than or equal to 0. " - "The value represent the number of pixels added to the beginning " - "and end part of the corresponding axis. 
`pads` format should be as follow " - "[x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin is the number of pixels " - "added at the beginning of axis `i` and xi_end the same for the end of axis `i`. " - "If not present, the padding defaults to 0 along start and end of each spatial axis.", - AttributeProto::INTS, - OPTIONAL_VALUE) - .Attr( - "strides", - "1-dimensional tensor with stride value along each spatial axis. " - "If not present, the stride defaults to 1 along each spatial axis.", - AttributeProto::INTS, - OPTIONAL_VALUE) - .Input( - 0, - "input", - "Input data tensor to be rearranged from column blocks back into an image. " - "This is a 3-dimensional tensor containing [N, C * n-ary-product(block_shape), L], " - "where N is batch dimension, C is image channel dimension and L is number of blocks.", - "T", - OpSchema::Single, - true, - 1, - OpSchema::Differentiable) - .Input( - 1, - "image_shape", - "The shape of the spatial dimensions of the image after rearranging the column blocks. " - "This is a 1-dim tensor with size of at least 2, containing the value [H_img, W_img] " - "for a 2-D image or [dim_i1, dim_i2, ..., dim_iN] for a N-D image.", - "tensor(int64)", - OpSchema::Single, - true, - 1, - OpSchema::NonDifferentiable) - .Input( - 2, - "block_shape", - "The shape of the block to apply on the input." - "This is a 1-dim tensor of size of at least 2, containing the value [H_block, W_block] " - "for a 2-D image or [dim_b1, dim_b2, ..., dim_bN] for a N-D block. " - "Dilations, pads and strides are applied to block_shape under the hood. " - "The kernel window start at the top-left of the block and slides to the right and down, " - "similarly to how Convolution kernels do.", - "tensor(int64)", - OpSchema::Single, - true, - 1, - OpSchema::NonDifferentiable) - .Output( - 0, - "output", - "Output tensor produced by rearranging blocks into an image.", - "T", - OpSchema::Single, - true, - 1, - OpSchema::Differentiable) - .TypeConstraint( - "T", - OpSchema::all_tensor_types_with_bfloat(), - "Constrain input and output types to all numeric tensor types.") - .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { col2imShapeInference(ctx); })); - constexpr const char* GridSample_ver1_doc = R"DOC( Given an `input` and a flow-field `grid`, computes the `output` using `input` values and pixel locations from `grid`. Currently, only spatial (4-D) inputs are supported. 
For `input` with shape (N, C, H, W) and `grid` with shape (N, H_out, W_out, 2), diff --git a/onnxruntime/core/graph/contrib_ops/ms_opset.h b/onnxruntime/core/graph/contrib_ops/ms_opset.h index 538a0ce1081eb..1f0af31a4bdd0 100644 --- a/onnxruntime/core/graph/contrib_ops/ms_opset.h +++ b/onnxruntime/core/graph/contrib_ops/ms_opset.h @@ -68,7 +68,6 @@ class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, GatherND); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Gelu); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QuickGelu); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, GreedySearch); -class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Col2Im); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, GridSample); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Inverse); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Irfft); @@ -155,7 +154,6 @@ class OpSet_Microsoft_ver1 { fn(GetOpSchema()); fn(GetOpSchema()); fn(GetOpSchema()); - fn(GetOpSchema()); fn(GetOpSchema()); fn(GetOpSchema()); fn(GetOpSchema()); diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc index d1ad5dc5b715f..08352b2fb88d0 100644 --- a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc +++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc @@ -830,6 +830,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, float, ReduceSumSquare); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, double, ReduceSumSquare); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, int32_t, ReduceSumSquare); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, Col2Im); // Opset 18 #if !defined(DISABLE_OPTIONAL_TYPE) class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 18, OptionalHasElement); @@ -2128,6 +2129,7 @@ Status RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) { ReduceSumSquare)>, BuildKernelCreateInfo, + BuildKernelCreateInfo, #if !defined(DISABLE_OPTIONAL_TYPE) BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/core/providers/cpu/tensor/col2im.cc b/onnxruntime/core/providers/cpu/tensor/col2im.cc index adb1d68b8727e..bf91aa82d31cb 100644 --- a/onnxruntime/core/providers/cpu/tensor/col2im.cc +++ b/onnxruntime/core/providers/cpu/tensor/col2im.cc @@ -6,15 +6,11 @@ namespace onnxruntime { -#define REGISTER_COL2IM_TYPED_KERNEL(OP_TYPE, VERSION, TYPE, KERNEL_CLASS) \ - ONNX_CPU_OPERATOR_TYPED_MS_KERNEL( \ - OP_TYPE, \ - VERSION, \ - TYPE, \ - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - KERNEL_CLASS); - -REGISTER_COL2IM_TYPED_KERNEL(Col2Im, 1, float, Col2Im); +ONNX_CPU_OPERATOR_KERNEL( + Col2Im, + 18, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + Col2Im); template Status Col2Im::Compute(OpKernelContext* context) const { @@ -22,13 +18,28 @@ Status Col2Im::Compute(OpKernelContext* context) const { const auto* image_shape = context->Input(1); const auto* kernel_shape = context->Input(2); + size_t image_dim_size = image_shape->Shape().Size(); + TensorShapeVector pads = col2im_attrs_.pads; + TensorShapeVector dilations = col2im_attrs_.dilations; + TensorShapeVector strides = col2im_attrs_.strides; + + if (dilations.empty()) { + dilations.resize(image_dim_size, 1); + } + if (pads.empty()) { + 
pads.resize(image_dim_size * 2, 0); + } + if (strides.empty()) { + strides.resize(image_dim_size, 1); + } + int64_t image_shape_size = 1; int64_t kernel_shape_size = 1; TensorShapeVector adjusted_kernel_shape_dims; for (auto i = 0; i < image_shape->Shape().Size(); ++i) { image_shape_size *= image_shape->Data()[i]; kernel_shape_size *= kernel_shape->Data()[i]; - adjusted_kernel_shape_dims.push_back(col2im_attrs_.dilations[i] * (kernel_shape->Data()[i] - 1) + 1); + adjusted_kernel_shape_dims.push_back(dilations[i] * (kernel_shape->Data()[i] - 1) + 1); } TensorShape col_shape = col_tensor->Shape(); const auto N = col_shape[0]; @@ -56,14 +67,14 @@ Status Col2Im::Compute(OpKernelContext* context) const { image_shape->Data()[1], kernel_shape->Data()[0], kernel_shape->Data()[1], - col2im_attrs_.dilations[0], - col2im_attrs_.dilations[1], - col2im_attrs_.pads[0], - col2im_attrs_.pads[1], - col2im_attrs_.pads[2], - col2im_attrs_.pads[3], - col2im_attrs_.strides[0], - col2im_attrs_.strides[1], + dilations[0], + dilations[1], + pads[0], + pads[1], + pads[2], + pads[3], + strides[0], + strides[1], image_data + image_id * col_stride, &CPUMathUtil::Instance()); } else { @@ -74,9 +85,9 @@ Status Col2Im::Compute(OpKernelContext* context) const { kernel_shape_size * C, image_shape_size, adjusted_kernel_shape.GetDims().data(), - col2im_attrs_.strides.data(), - col2im_attrs_.dilations.data(), - col2im_attrs_.pads.data(), + strides.data(), + dilations.data(), + pads.data(), image_shape->Shape().Size(), image_data + image_id * col_stride, &CPUMathUtil::Instance()); diff --git a/onnxruntime/core/providers/cpu/tensor/col2im.h b/onnxruntime/core/providers/cpu/tensor/col2im.h index b5849ecc9426b..50bdad46a72a0 100644 --- a/onnxruntime/core/providers/cpu/tensor/col2im.h +++ b/onnxruntime/core/providers/cpu/tensor/col2im.h @@ -3,6 +3,7 @@ #pragma once +#include "core/framework/op_kernel.h" #include "core/providers/cpu/tensor/col2im_attributes.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/cpu/tensor/col2im_attributes.h b/onnxruntime/core/providers/cpu/tensor/col2im_attributes.h index 49ddbe1cbb300..c8e01e396656a 100644 --- a/onnxruntime/core/providers/cpu/tensor/col2im_attributes.h +++ b/onnxruntime/core/providers/cpu/tensor/col2im_attributes.h @@ -28,20 +28,18 @@ namespace onnxruntime { struct Col2ImAttributes { - using Col2ImPadVector = InlinedVector; - explicit Col2ImAttributes(const OpKernelInfo& info) { - // Make sure empty strides, pads or dilations are defaulted to 1 if necessary - ORT_THROW_IF_ERROR(info.GetAttrs("strides", strides)); - gsl::span pads_span; - ORT_THROW_IF_ERROR(info.GetAttrsAsSpan("pads", pads_span)); - pads.assign(pads_span.cbegin(), pads_span.cend()); - ORT_THROW_IF_ERROR(info.GetAttrs("dilations", dilations)); + if (!info.GetAttrs("strides", strides).IsOK()) + ORT_ENFORCE(strides.empty()); + if (!info.GetAttrs("dilations", dilations).IsOK()) + ORT_ENFORCE(dilations.empty()); + if (!info.GetAttrs("pads", pads).IsOK()) + ORT_ENFORCE(pads.empty()); } ~Col2ImAttributes() = default; - Col2ImPadVector pads; + TensorShapeVector pads; TensorShapeVector dilations; TensorShapeVector strides; }; diff --git a/onnxruntime/python/tools/pytorch_export_contrib_ops.py b/onnxruntime/python/tools/pytorch_export_contrib_ops.py index 8b40b7fedb7c5..7df091df53dd5 100644 --- a/onnxruntime/python/tools/pytorch_export_contrib_ops.py +++ b/onnxruntime/python/tools/pytorch_export_contrib_ops.py @@ -1,5 +1,5 @@ -#Copyright(c) Microsoft Corporation.All rights reserved. 
-#Licensed under the MIT License. +# Copyright(c) Microsoft Corporation.All rights reserved. +# Licensed under the MIT License. """ Support for registering ONNX Runtime's built-in contrib ops with @@ -8,7 +8,7 @@ import typing try: -O(justinchuby) : Create a function to alert users when torch is not installed + # TODO(justinchuby): Create a function to alert users when torch is not installed import torch except ModuleNotFoundError: raise ModuleNotFoundError( @@ -48,12 +48,12 @@ def grid_sampler(g, input, grid, mode, padding_mode, align_corners): padding_mode_str = ["zeros", "border", "reflection"][padding_mode] align_corners = int(symbolic_helper._maybe_get_const(align_corners, "b")) -#From opset v13 onward, the output shape can be specified with -#(N, C, H, W)(N, H_out, W_out, 2) =>(N, C, H_out, W_out) -#input_shape = input.type().sizes() -#gird_shape = grid.type().sizes() -#output_shape = input_shape[ : 2] + gird_shape[1 : 3] -#g.op(...).setType(input.type().with_sizes(output_shape)) + # From opset v13 onward, the output shape can be specified with + # (N, C, H, W)(N, H_out, W_out, 2) =>(N, C, H_out, W_out) + # input_shape = input.type().sizes() + # gird_shape = grid.type().sizes() + # output_shape = input_shape[ : 2] + gird_shape[1 : 3] + # g.op(...).setType(input.type().with_sizes(output_shape)) return g.op( "com.microsoft::GridSample", @@ -73,7 +73,7 @@ def inverse(g, self): @torch.onnx.symbolic_helper.parse_args("v", "s") def gelu(g, self: torch._C.Value, approximate: str = "none"): -#Use microsoft::Gelu for performance if possible.It only supports approximate == "none" + # Use microsoft::Gelu for performance if possible.It only supports approximate == "none" if approximate == "none": return g.op("com.microsoft::Gelu", self).setType(self.type()) return torch.onnx.symbolic_opset9.gelu(g, self, approximate) diff --git a/onnxruntime/test/contrib_ops/col2im_test.cc b/onnxruntime/test/contrib_ops/col2im_test.cc deleted file mode 100644 index 3031975c0df2d..0000000000000 --- a/onnxruntime/test/contrib_ops/col2im_test.cc +++ /dev/null @@ -1,172 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -#include -#include "gtest/gtest.h" -#include "test/providers/provider_test_utils.h" - -using namespace onnxruntime::test; -#include "core/util/math.h" - -namespace onnxruntime { -namespace contrib { -namespace test { - -namespace { -template -std::vector TransposeSerializedVector(std::vector& input, size_t N, size_t C, size_t H, size_t W) { - size_t input_size = input.size(); - if (input_size == 0) { - throw std::runtime_error("Invalid input"); - } - std::vector trans_vec(input); - - for (size_t n = 0; n < N; ++n) - for (size_t c = 0; c < C; ++c) - for (size_t h = 0; h < H; ++h) - for (size_t w = 0; w < W; ++w) - trans_vec[n * (C * H * W) + c * (H * W) + (h + H * w)] = - input[n * (C * H * W) + c * (H * W) + (w + W * h)]; - - return trans_vec; -} - -} // namespace - -TEST(Col2ImContribOpTest, simple4dNCHW) { - OpTester test("Col2Im", 1, kMSDomain); - - test.AddAttribute("strides", std::vector{1, 1}); - test.AddAttribute("dilations", std::vector{1, 1}); - test.AddAttribute("pads", std::vector{0, 0, 0, 0}); - - std::vector input(25); - std::vector output(25); - std::iota(output.begin(), output.end(), 1.0f); - - input = TransposeSerializedVector(output, 1, 1, 5, 5); - test.AddInput("input", {1, 5, 5}, input); - test.AddInput("image_shape", {2}, std::vector{5, 5}); - test.AddInput("block_shape", {2}, std::vector{1, 5}); - - test.AddOutput("output", {1, 1, 5, 5}, output); - test.Run(); -} - -TEST(Col2ImContribOpTest, with2Images3channelsNonSquare4dNCHW) { - OpTester test("Col2Im", 1, kMSDomain); - - test.AddAttribute("strides", std::vector{1, 1}); - test.AddAttribute("dilations", std::vector{1, 1}); - test.AddAttribute("pads", std::vector{0, 0, 0, 0}); - - std::vector input(120); - std::vector output(120); - std::iota(output.begin(), output.end(), 1.0f); - input = TransposeSerializedVector(output, 2, 3, 4, 5); - test.AddInput("input", {2, 15, 4}, input); - test.AddInput("image_shape", {2}, std::vector{4, 5}); - test.AddInput("block_shape", {2}, std::vector{1, 5}); - - test.AddOutput("output", {2, 3, 4, 5}, output); - test.Run(); -} - -TEST(Col2ImContribOpTest, with2Images2channelsNonSquareDilationPadStride4dNCHW) { - OpTester test("Col2Im", 1, kMSDomain); - - test.AddAttribute("strides", std::vector{2, 2}); - test.AddAttribute("dilations", std::vector{2, 2}); - test.AddAttribute("pads", std::vector{2, 2, 2, 2}); - - std::vector input{0., 0., 0., 0., 0., 1., 3., 5., 0., 11., 13., 15., 0., 0., 0., 0., - 0., 0., 0., 0., 1., 3., 5., 0., 11., 13., 15., 0., 0., 0., 0., 0., - 0., 0., 0., 0., 0., 21., 23., 25., 0., 31., 33., 35., 0., 0., 0., 0., - 0., 0., 0., 0., 21., 23., 25., 0., 31., 33., 35., 0., 0., 0., 0., 0., - 0., 0., 0., 0., 0., 41., 43., 45., 0., 51., 53., 55., 0., 0., 0., 0., - 0., 0., 0., 0., 41., 43., 45., 0., 51., 53., 55., 0., 0., 0., 0., 0., - 0., 0., 0., 0., 0., 61., 63., 65., 0., 71., 73., 75., 0., 0., 0., 0., - 0., 0., 0., 0., 61., 63., 65., 0., 71., 73., 75., 0., 0., 0., 0., 0.}; - std::vector output{2., 0., 6., 0., 10., - 0., 0., 0., 0., 0., - 22., 0., 26., 0., 30., - 0., 0., 0., 0., 0., - 42., 0., 46., 0., 50., - 0., 0., 0., 0., 0., - 62., 0., 66., 0., 70., - 0., 0., 0., 0., 0., - 82., 0., 86., 0., 90., - 0., 0., 0., 0., 0., - 102., 0., 106., 0., 110., - 0., 0., 0., 0., 0., - 122., 0., 126., 0., 130., - 0., 0., 0., 0., 0., - 142., 0., 146., 0., 150., - 0., 0., 0., 0., 0.}; - test.AddInput("input", {2, 4, 16}, input); - test.AddInput("image_shape", {2}, std::vector{4, 5}); - test.AddInput("block_shape", {2}, std::vector{1, 2}); - - test.AddOutput("output", {2, 2, 4, 5}, output); 
- test.Run(); -} - -TEST(Col2ImContribOpTest, with3channels4dNCHW) { - OpTester test("Col2Im", 1, kMSDomain); - - test.AddAttribute("strides", std::vector{1, 1}); - test.AddAttribute("dilations", std::vector{1, 1}); - test.AddAttribute("pads", std::vector{0, 0, 0, 0}); - - std::vector input(75); - std::vector output(75); - std::iota(output.begin(), output.end(), 1.0f); - input = TransposeSerializedVector(output, 1, 3, 5, 5); - test.AddInput("input", {1, 15, 5}, input); - test.AddInput("image_shape", {2}, std::vector{5, 5}); - test.AddInput("block_shape", {2}, std::vector{1, 5}); - - test.AddOutput("output", {1, 3, 5, 5}, output); - test.Run(); -} - -TEST(Col2ImContribOpTest, with2Images3channels4dNCHW) { - OpTester test("Col2Im", 1, kMSDomain); - - test.AddAttribute("strides", std::vector{1, 1}); - test.AddAttribute("dilations", std::vector{1, 1}); - test.AddAttribute("pads", std::vector{0, 0, 0, 0}); - - std::vector input(150); - std::vector output(150); - std::iota(output.begin(), output.end(), 1.0f); - input = TransposeSerializedVector(output, 2, 3, 5, 5); - test.AddInput("input", {2, 15, 5}, input); - test.AddInput("image_shape", {2}, std::vector{5, 5}); - test.AddInput("block_shape", {2}, std::vector{1, 5}); - - test.AddOutput("output", {2, 3, 5, 5}, output); - test.Run(); -} - -TEST(Col2ImContribOpTest, simple5dNCHWD) { - OpTester test("Col2Im", 1, kMSDomain); - - test.AddAttribute("strides", std::vector{1, 1, 1}); - test.AddAttribute("dilations", std::vector{1, 1, 1}); - test.AddAttribute("pads", std::vector{0, 0, 0, 0, 0, 0}); - - std::vector input(25); - std::vector output(25); - std::iota(output.begin(), output.end(), 1.0f); - input = TransposeSerializedVector(output, 1, 1, 5, 5); - test.AddInput("input", {1, 5, 5}, input); - test.AddInput("image_shape", {3}, std::vector{1, 5, 5}); - test.AddInput("block_shape", {3}, std::vector{1, 1, 5}); - test.AddOutput("output", {1, 1, 1, 5, 5}, output); - test.Run(); -} - -} // namespace test -} // namespace contrib -} // namespace onnxruntime diff --git a/onnxruntime/test/python/contrib_ops/onnx_test_col2im.py b/onnxruntime/test/python/contrib_ops/onnx_test_col2im.py deleted file mode 100644 index 31c5f129fad6b..0000000000000 --- a/onnxruntime/test/python/contrib_ops/onnx_test_col2im.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -# Test reference implementation and model for ONNX Runtime conrtib op trilu - -import unittest - -import numpy as np -import onnx -from onnx_contrib_ops_helper import expect - - -class ONNXReferenceImplementationTest(unittest.TestCase): - def test_col2im(self) -> None: - inputs = np.array( - [ - [ - [1.0, 6.0, 11.0, 16.0, 21.0], # (1, 5, 5) - [2.0, 7.0, 12.0, 17.0, 22.0], - [3.0, 8.0, 13.0, 18.0, 23.0], - [4.0, 9.0, 14.0, 19.0, 24.0], - [5.0, 0.0, 15.0, 20.0, 25.0], - ] - ] - ).astype(np.float32) - image_shape = np.array([5, 5]).astype(np.int64) - block_shape = np.array([1, 5]).astype(np.int64) - node = onnx.helper.make_node( - "Col2Im", ["input", "image_shape", "block_shape"], ["col2im_reference_implementation"] - ) - - col2im_reference_implementation = np.array( - [ - [ - [ - [1.0, 2.0, 3.0, 4.0, 5.0], # (1, 1, 5, 5) - [6.0, 7.0, 8.0, 9.0, 0.0], - [11.0, 12.0, 13.0, 14.0, 15.0], - [16.0, 17.0, 18.0, 19.0, 20.0], - [21.0, 22.0, 23.0, 24.0, 25.0], - ] - ] - ] - ).astype(np.float32) - - expect( - node, - inputs=[inputs, image_shape, block_shape], - outputs=[col2im_reference_implementation], - name="test_col2im", - ) - - -if __name__ == "__main__": - unittest.main(module=__name__, buffer=True) diff --git a/onnxruntime/test/python/testdata/node/test_col2im/test_data_set_0/input_0.pb b/onnxruntime/test/python/testdata/node/test_col2im/test_data_set_0/input_0.pb deleted file mode 100644 index 164166b2c84e8c0968a316c70ceb85e9b5fea07e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 117 zcmd;J@x2I3Qr0LGRS@&Et; diff --git a/onnxruntime/test/python/testdata/node/test_col2im/test_data_set_0/input_1.pb b/onnxruntime/test/python/testdata/node/test_col2im/test_data_set_0/input_1.pb deleted file mode 100644 index e2e47c174ce48b0b6cc775ccbad84426c3925a39..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 35 fcmd;J5@2`Y&dg0rPmM3mNGwS85@2P302mDbe=`L} diff --git a/onnxruntime/test/python/testdata/node/test_col2im/test_data_set_0/input_2.pb b/onnxruntime/test/python/testdata/node/test_col2im/test_data_set_0/input_2.pb deleted file mode 100644 index c0b7595628c4bb8bd1859c490f6242ca6bdbf7cc..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 35 gcmd;J5@2`YPRhwo&WQoO#oXPz20+Gu0V(5@+l l6vss*_afp583h$D#EV$`>v)F=GXgR;>^M+x;=+xJ$A70x6@35z diff --git a/onnxruntime/test/python/testdata/node/test_col2im_5d/test_data_set_0/input_0.pb b/onnxruntime/test/python/testdata/node/test_col2im_5d/test_data_set_0/input_0.pb deleted file mode 100644 index 0b66e3fbccc21c2a88060142326527a6fd6ca537..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 498 zcmWm4F(?FJ0EXfJLR~W4U@#aA27|$1Fev9TNQN5>27|#sNjID_7%oYYBuST~OVTAt zk}gS?q)U<{U6L+Il60fdV|ePT_lriN_0fh{BvCgpm6}cuRBA%V^n_51htSb$ljoja zVu=u1>0*#kf^UVmzQ_?}g4ReI-6R~B!C){L32vFx)ItN!|DNQDqHveCFrHrryWZMNHC!cM#FHff*z z4m#wpBaS-exD!sAHsg%5W}S211s7d%*%foHy5_nYZo1{Rd3W4(&w~3Nc<7PGo_K1} nGta%SCh$jHC c1Caj!hz)@30wA6N#0@|`14s_U2I2*d02b*GVE_OC diff --git a/onnxruntime/test/python/testdata/node/test_col2im_dilations/test_data_set_0/input_1.pb b/onnxruntime/test/python/testdata/node/test_col2im_dilations/test_data_set_0/input_1.pb deleted file mode 100644 index ed056b38ede071201a58c4d489ee72565a9de9e6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 35 fcmd;J5@2`Y&dg0rPmM3mNGwS85@2J102mDbe@g{O diff --git a/onnxruntime/test/python/testdata/node/test_col2im_dilations/test_data_set_0/input_2.pb 
b/onnxruntime/test/python/testdata/node/test_col2im_dilations/test_data_set_0/input_2.pb deleted file mode 100644 index ea04f67ddf5b80dd13a9f42589cd7104b5e46f7a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 35 fcmd;J5@2`YPRhwo&Wm@T6DQ&$b=c0U)Xk;@xhu+-7W|J8_u|5#FQsW-_*9pD@(pf jbbB;7=A0W!?s;bJTeB^g^T`iex*CU^a6!Qx54_}m`4%=I diff --git a/onnxruntime/test/python/testdata/node/test_col2im_pads/test_data_set_0/input_1.pb b/onnxruntime/test/python/testdata/node/test_col2im_pads/test_data_set_0/input_1.pb deleted file mode 100644 index e2e47c174ce48b0b6cc775ccbad84426c3925a39..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 35 fcmd;J5@2`Y&dg0rPmM3mNGwS85@2P302mDbe=`L} diff --git a/onnxruntime/test/python/testdata/node/test_col2im_pads/test_data_set_0/input_2.pb b/onnxruntime/test/python/testdata/node/test_col2im_pads/test_data_set_0/input_2.pb deleted file mode 100644 index c0b7595628c4bb8bd1859c490f6242ca6bdbf7cc..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 35 gcmd;J5@2`YPRhwo&W&l+oNV1vjM r;ppbc&GjHDmpl$!=n)svf(<)5thn*ug~y4clu~5)pJ;^!Eu!)RQ05ig diff --git a/onnxruntime/test/python/testdata/node/test_col2im_strides/test_data_set_0/input_0.pb b/onnxruntime/test/python/testdata/node/test_col2im_strides/test_data_set_0/input_0.pb deleted file mode 100644 index f33a7620e97e8b2934587759212fbf1350d5effd..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 162 qcmd;J=_tH!59V*5I{E@RXG6prw@7n diff --git a/onnxruntime/test/python/testdata/node/test_col2im_strides/test_data_set_0/input_1.pb b/onnxruntime/test/python/testdata/node/test_col2im_strides/test_data_set_0/input_1.pb deleted file mode 100644 index e2e47c174ce48b0b6cc775ccbad84426c3925a39..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 35 fcmd;J5@2`Y&dg0rPmM3mNGwS85@2P302mDbe=`L} diff --git a/onnxruntime/test/python/testdata/node/test_col2im_strides/test_data_set_0/input_2.pb b/onnxruntime/test/python/testdata/node/test_col2im_strides/test_data_set_0/input_2.pb deleted file mode 100644 index 19b497c93ccceed2813a63a90e568d62835d8ed1..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 35 fcmd;J5@2`YPRhwo&W@x2I3Qr0LGRS@&Et; diff --git a/onnxruntime/test/python/testdata/test_col2im/test_data_set_0/input_1.pb b/onnxruntime/test/python/testdata/test_col2im/test_data_set_0/input_1.pb deleted file mode 100644 index e2e47c174ce48b0b6cc775ccbad84426c3925a39..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 35 fcmd;J5@2`Y&dg0rPmM3mNGwS85@2P302mDbe=`L} diff --git a/onnxruntime/test/python/testdata/test_col2im/test_data_set_0/input_2.pb b/onnxruntime/test/python/testdata/test_col2im/test_data_set_0/input_2.pb deleted file mode 100644 index c0b7595628c4bb8bd1859c490f6242ca6bdbf7cc..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 35 gcmd;J5@2`YPRhwo&WST6h+bR0?8#CWCg?mft0DT0YT(PvV;vn4B(QIG9@J?WlGA7!6@83-J@rX zH8!zHYHpXNYjfwTLEk!PXtZ*^Z`ErU>(O=L{t}5-5h(}}3w9ikQY6KH8DHRmCwfFo OnDN1a6&t?T@heYoXdiw6 From a867572ee3d49479ac4d2a012ad2abd738015119 Mon Sep 17 00:00:00 2001 From: Liqun Fu Date: Wed, 11 Jan 2023 16:42:02 -0800 Subject: [PATCH 19/30] remove undeeded changes Signed-off-by: Liqun Fu --- onnxruntime/contrib_ops/cpu/col2im.cc | 22 ------------------- .../contrib_ops/cpu/cpu_contrib_kernels.cc | 2 -- .../tools/pytorch_export_contrib_ops.py | 13 
++++------- 3 files changed, 4 insertions(+), 33 deletions(-) delete mode 100644 onnxruntime/contrib_ops/cpu/col2im.cc diff --git a/onnxruntime/contrib_ops/cpu/col2im.cc b/onnxruntime/contrib_ops/cpu/col2im.cc deleted file mode 100644 index d6ed5495e49aa..0000000000000 --- a/onnxruntime/contrib_ops/cpu/col2im.cc +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. -#include "core/providers/cpu/tensor/col2im.h" -#include "core/providers/common.h" - -namespace onnxruntime { -namespace contrib { - -#define REGISTER_KERNEL_TYPED(T) \ - ONNX_OPERATOR_TYPED_KERNEL_EX( \ - Col2Im, \ - kMSDomain, \ - 1, \ - T, \ - kCpuExecutionProvider, \ - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - Col2Im); - -REGISTER_KERNEL_TYPED(float) - -} // namespace contrib -} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc index 7c02f4055ac8f..a04ef0d71b113 100644 --- a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc @@ -10,7 +10,6 @@ namespace contrib { class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, SampleOp); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, Col2Im); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, GridSample); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, Attention); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, BeamSearch); @@ -193,7 +192,6 @@ Status RegisterCpuContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, // add more kernels here - BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/python/tools/pytorch_export_contrib_ops.py b/onnxruntime/python/tools/pytorch_export_contrib_ops.py index 7df091df53dd5..d3c537035f6ab 100644 --- a/onnxruntime/python/tools/pytorch_export_contrib_ops.py +++ b/onnxruntime/python/tools/pytorch_export_contrib_ops.py @@ -1,4 +1,4 @@ -# Copyright(c) Microsoft Corporation.All rights reserved. +# Copyright(c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. """ @@ -49,10 +49,10 @@ def grid_sampler(g, input, grid, mode, padding_mode, align_corners): align_corners = int(symbolic_helper._maybe_get_const(align_corners, "b")) # From opset v13 onward, the output shape can be specified with - # (N, C, H, W)(N, H_out, W_out, 2) =>(N, C, H_out, W_out) + # (N, C, H, W) (N, H_out, W_out, 2) => (N, C, H_out, W_out) # input_shape = input.type().sizes() # gird_shape = grid.type().sizes() - # output_shape = input_shape[ : 2] + gird_shape[1 : 3] + # output_shape = input_shape[:2] + gird_shape[1:3] # g.op(...).setType(input.type().with_sizes(output_shape)) return g.op( @@ -73,7 +73,7 @@ def inverse(g, self): @torch.onnx.symbolic_helper.parse_args("v", "s") def gelu(g, self: torch._C.Value, approximate: str = "none"): - # Use microsoft::Gelu for performance if possible.It only supports approximate == "none" + # Use microsoft::Gelu for performance if possible. 
It only supports approximate == "none" if approximate == "none": return g.op("com.microsoft::Gelu", self).setType(self.type()) return torch.onnx.symbolic_opset9.gelu(g, self, approximate) @@ -90,11 +90,6 @@ def tril(g, self, diagonal): _reg(tril) - def col2im(g, self: torch._C.Value, image_shape, block_shape): - return g.op("com.microsoft::Col2Im", self, image_shape, block_shape) - - _reg(col2im) - def unregister(): """Unregister ONNX Runtime's built-in contrib ops.""" From 028b5550e20e31c957038b3042ae75f187de7ef5 Mon Sep 17 00:00:00 2001 From: Liqun Fu Date: Wed, 11 Jan 2023 22:30:19 -0800 Subject: [PATCH 20/30] fix col2imnd Signed-off-by: Liqun Fu --- .../core/providers/cpu/tensor/col2im.cc | 66 +++++++++++-------- .../core/providers/cpu/tensor/col2im.h | 13 +++- .../providers/cpu/tensor/col2im_attributes.h | 47 ------------- .../tools/pytorch_export_contrib_ops.py | 2 +- .../onnx_backend_test_series_filters.jsonc | 1 - 5 files changed, 51 insertions(+), 78 deletions(-) delete mode 100644 onnxruntime/core/providers/cpu/tensor/col2im_attributes.h diff --git a/onnxruntime/core/providers/cpu/tensor/col2im.cc b/onnxruntime/core/providers/cpu/tensor/col2im.cc index bf91aa82d31cb..c022fe77b6650 100644 --- a/onnxruntime/core/providers/cpu/tensor/col2im.cc +++ b/onnxruntime/core/providers/cpu/tensor/col2im.cc @@ -2,10 +2,12 @@ // Licensed under the MIT License. #include "core/providers/cpu/tensor/col2im.h" +#include "core/util/math.h" #include "core/util/math_cpuonly.h" namespace onnxruntime { +// math::Col2im and math::Col2imNd only support float data type ONNX_CPU_OPERATOR_KERNEL( Col2Im, 18, @@ -18,28 +20,40 @@ Status Col2Im::Compute(OpKernelContext* context) const { const auto* image_shape = context->Input(1); const auto* kernel_shape = context->Input(2); - size_t image_dim_size = image_shape->Shape().Size(); - TensorShapeVector pads = col2im_attrs_.pads; - TensorShapeVector dilations = col2im_attrs_.dilations; - TensorShapeVector strides = col2im_attrs_.strides; - - if (dilations.empty()) { - dilations.resize(image_dim_size, 1); + size_t image_dim_number = image_shape->Shape().Size(); + TensorShapeVector dilations; + if (dilations_.empty()) { + dilations.resize(image_dim_number, 1); + } else { + ORT_ENFORCE(dilations_.size() == image_dim_number, "size of 'dilations' attribute, if provided, should equal to the number of image dimmensions."); + dilations = dilations_; } - if (pads.empty()) { - pads.resize(image_dim_size * 2, 0); + + TensorShapeVector pads; + if (pads_.empty()) { + pads.resize(image_dim_number * 2, 0); + } else { + ORT_ENFORCE(pads_.size() == 2 * image_dim_number, "size of 'pads' attribute, if provided, should equal to twice the number of image dimmensions."); + pads = pads_; } - if (strides.empty()) { - strides.resize(image_dim_size, 1); + + TensorShapeVector strides; + if (strides_.empty()) { + strides.resize(image_dim_number, 1); + } else { + ORT_ENFORCE(strides_.size() == image_dim_number, "size of 'strides' attribute, if provided, should equal to the number of image dimmensions."); + strides = strides_; } int64_t image_shape_size = 1; int64_t kernel_shape_size = 1; TensorShapeVector adjusted_kernel_shape_dims; - for (auto i = 0; i < image_shape->Shape().Size(); ++i) { - image_shape_size *= image_shape->Data()[i]; - kernel_shape_size *= kernel_shape->Data()[i]; - adjusted_kernel_shape_dims.push_back(dilations[i] * (kernel_shape->Data()[i] - 1) + 1); + auto image_dims = image_shape->Data(); + auto kernel_dims = kernel_shape->Data(); + for (auto i = 0; i < image_dim_number; 
++i) { + image_shape_size *= image_dims[i]; + kernel_shape_size *= kernel_dims[i]; + adjusted_kernel_shape_dims.push_back(dilations[i] * (kernel_dims[i] - 1) + 1); } TensorShape col_shape = col_tensor->Shape(); const auto N = col_shape[0]; @@ -50,23 +64,23 @@ Status Col2Im::Compute(OpKernelContext* context) const { TensorShapeVector batched_image_shape_dims, adjusted_image_shape_dims; batched_image_shape_dims.insert(batched_image_shape_dims.begin(), {N, C}); - for (auto i = 0; i < image_shape->Shape()[0]; ++i) { - batched_image_shape_dims.push_back(image_shape->Data()[i]); - adjusted_image_shape_dims.push_back(image_shape->Data()[i] - adjusted_kernel_shape[i] + 1); + for (auto i = 0; i < image_dim_number; ++i) { + batched_image_shape_dims.push_back(image_dims[i]); + adjusted_image_shape_dims.push_back(image_dims[i] - adjusted_kernel_shape[i] + 1); } TensorShape batched_image_shape(batched_image_shape_dims); T* image_data = context->Output(0, batched_image_shape)->template MutableData(); const T* col_data = col_tensor->template Data(); for (auto image_id = 0; image_id < N; ++image_id) { - if (image_shape->Shape()[0] == 2) { + if (image_dim_number == 2) { math::Col2im( col_data + image_id * col_data_stride, C, - image_shape->Data()[0], - image_shape->Data()[1], - kernel_shape->Data()[0], - kernel_shape->Data()[1], + image_dims[0], + image_dims[1], + kernel_dims[0], + kernel_dims[1], dilations[0], dilations[1], pads[0], @@ -80,15 +94,15 @@ Status Col2Im::Compute(OpKernelContext* context) const { } else { math::Col2imNd( col_data + image_id * col_data_stride, - image_shape->Data(), + image_dims, adjusted_image_shape_dims.data(), kernel_shape_size * C, - image_shape_size, + image_shape_size * C, adjusted_kernel_shape.GetDims().data(), strides.data(), dilations.data(), pads.data(), - image_shape->Shape().Size(), + image_dim_number, image_data + image_id * col_stride, &CPUMathUtil::Instance()); } diff --git a/onnxruntime/core/providers/cpu/tensor/col2im.h b/onnxruntime/core/providers/cpu/tensor/col2im.h index 50bdad46a72a0..2f2894a7f22fc 100644 --- a/onnxruntime/core/providers/cpu/tensor/col2im.h +++ b/onnxruntime/core/providers/cpu/tensor/col2im.h @@ -4,20 +4,27 @@ #pragma once #include "core/framework/op_kernel.h" -#include "core/providers/cpu/tensor/col2im_attributes.h" namespace onnxruntime { template class Col2Im final : public OpKernel { public: - explicit Col2Im(const OpKernelInfo& info) : OpKernel(info), col2im_attrs_(info) { + explicit Col2Im(const OpKernelInfo& info) : OpKernel(info) { + if (!info.GetAttrs("strides", strides_).IsOK()) + ORT_ENFORCE(strides_.empty()); + if (!info.GetAttrs("dilations", dilations_).IsOK()) + ORT_ENFORCE(dilations_.empty()); + if (!info.GetAttrs("pads", pads_).IsOK()) + ORT_ENFORCE(pads_.empty()); } Status Compute(OpKernelContext* context) const override; private: - Col2ImAttributes col2im_attrs_; + TensorShapeVector pads_; + TensorShapeVector dilations_; + TensorShapeVector strides_; }; } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/tensor/col2im_attributes.h b/onnxruntime/core/providers/cpu/tensor/col2im_attributes.h deleted file mode 100644 index c8e01e396656a..0000000000000 --- a/onnxruntime/core/providers/cpu/tensor/col2im_attributes.h +++ /dev/null @@ -1,47 +0,0 @@ -/** -* Copyright (c) 2016-present, Facebook, Inc. -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ -/* Modifications Copyright (c) Microsoft. */ - -#pragma once - -#include "core/common/common.h" -#include "core/providers/common.h" -#include "core/util/math.h" - -#include "core/common/inlined_containers.h" -#include "core/framework/op_kernel.h" -#include "core/framework/op_node_proto_helper.h" - -namespace onnxruntime { - -struct Col2ImAttributes { - explicit Col2ImAttributes(const OpKernelInfo& info) { - if (!info.GetAttrs("strides", strides).IsOK()) - ORT_ENFORCE(strides.empty()); - if (!info.GetAttrs("dilations", dilations).IsOK()) - ORT_ENFORCE(dilations.empty()); - if (!info.GetAttrs("pads", pads).IsOK()) - ORT_ENFORCE(pads.empty()); - } - - ~Col2ImAttributes() = default; - - TensorShapeVector pads; - TensorShapeVector dilations; - TensorShapeVector strides; -}; - -} // namespace onnxruntime diff --git a/onnxruntime/python/tools/pytorch_export_contrib_ops.py b/onnxruntime/python/tools/pytorch_export_contrib_ops.py index d3c537035f6ab..9af57eda6ae90 100644 --- a/onnxruntime/python/tools/pytorch_export_contrib_ops.py +++ b/onnxruntime/python/tools/pytorch_export_contrib_ops.py @@ -1,4 +1,4 @@ -# Copyright(c) Microsoft Corporation. All rights reserved. +# Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. """ diff --git a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc index 6d49d33e94841..503ca8de0eba4 100644 --- a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc +++ b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc @@ -122,7 +122,6 @@ "^test_center_crop_pad_*", "^test_clip_default_int8_max_expanded_cpu", "^test_clip_default_int8_min_expanded_cpu", - "^test_col2im_*", "^test_constant_pad_axes_cpu", "^test_constant_pad_cpu", "^test_edge_pad_cpu", From ffa42f22bae91beebd6fad6cde3142ce01349f50 Mon Sep 17 00:00:00 2001 From: Thiago Crepaldi Date: Wed, 18 Jan 2023 13:12:32 -0500 Subject: [PATCH 21/30] Linting --- onnxruntime/core/providers/cpu/tensor/col2im.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/cpu/tensor/col2im.cc b/onnxruntime/core/providers/cpu/tensor/col2im.cc index c022fe77b6650..013c2d1f7933b 100644 --- a/onnxruntime/core/providers/cpu/tensor/col2im.cc +++ b/onnxruntime/core/providers/cpu/tensor/col2im.cc @@ -50,7 +50,7 @@ Status Col2Im::Compute(OpKernelContext* context) const { TensorShapeVector adjusted_kernel_shape_dims; auto image_dims = image_shape->Data(); auto kernel_dims = kernel_shape->Data(); - for (auto i = 0; i < image_dim_number; ++i) { + for (size_t i = 0; i < image_dim_number; ++i) { image_shape_size *= image_dims[i]; kernel_shape_size *= kernel_dims[i]; adjusted_kernel_shape_dims.push_back(dilations[i] * (kernel_dims[i] - 1) + 1); @@ -64,7 +64,7 @@ Status Col2Im::Compute(OpKernelContext* context) const { TensorShapeVector batched_image_shape_dims, adjusted_image_shape_dims; batched_image_shape_dims.insert(batched_image_shape_dims.begin(), {N, C}); - for (auto i = 0; i < image_dim_number; ++i) { + for (size_t i = 0; i < image_dim_number; 
++i) { batched_image_shape_dims.push_back(image_dims[i]); adjusted_image_shape_dims.push_back(image_dims[i] - adjusted_kernel_shape[i] + 1); } From c1a77f9cd60e8280a0d14e4019d8c50ba5ba18d1 Mon Sep 17 00:00:00 2001 From: Liqun Fu Date: Thu, 19 Jan 2023 15:01:45 -0800 Subject: [PATCH 22/30] to use narrow cast Signed-off-by: Liqun Fu --- onnxruntime/core/providers/cpu/tensor/col2im.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/cpu/tensor/col2im.cc b/onnxruntime/core/providers/cpu/tensor/col2im.cc index 013c2d1f7933b..b2e7d1c8e0bad 100644 --- a/onnxruntime/core/providers/cpu/tensor/col2im.cc +++ b/onnxruntime/core/providers/cpu/tensor/col2im.cc @@ -20,7 +20,7 @@ Status Col2Im::Compute(OpKernelContext* context) const { const auto* image_shape = context->Input(1); const auto* kernel_shape = context->Input(2); - size_t image_dim_number = image_shape->Shape().Size(); + size_t image_dim_number = onnxruntime::narrow(image_shape->Shape().Size()); TensorShapeVector dilations; if (dilations_.empty()) { dilations.resize(image_dim_number, 1); From ac4e2f17702661a790cd6defd67e567eb4d66798 Mon Sep 17 00:00:00 2001 From: Liqun Fu Date: Thu, 19 Jan 2023 15:16:46 -0800 Subject: [PATCH 23/30] remove test_col2im_pads Signed-off-by: Liqun Fu --- onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc | 1 + 1 file changed, 1 insertion(+) diff --git a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc index d5bf210804f18..3d937e87ee712 100644 --- a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc +++ b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc @@ -102,6 +102,7 @@ "^test_if_opt", "^test_loop16_seq_none", "^test_identity_opt", + "^test_col2im_pads*", // remove this when using ONNX with this: https://github.com/onnx/onnx/pull/4769 // Following tests are for opset 16 ops and are not yet implemented in ORT "^test_roialign_aligned_*", //GPU failures From a44b07dbb418181f80963c5c4b34dc2adafd543a Mon Sep 17 00:00:00 2001 From: Liqun Fu Date: Thu, 19 Jan 2023 18:47:29 -0800 Subject: [PATCH 24/30] disable android col2im_pads tests, update doc Signed-off-by: Liqun Fu --- docs/ContribOperators.md | 54 ----------------------------------- docs/OperatorKernels.md | 2 +- onnxruntime/test/onnx/main.cc | 1 + 3 files changed, 2 insertions(+), 55 deletions(-) diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md index 55d63ea551ce3..7f327c80cf989 100644 --- a/docs/ContribOperators.md +++ b/docs/ContribOperators.md @@ -13,7 +13,6 @@ Do not modify directly.* * com.microsoft.BitmaskBiasDropout * com.microsoft.BitmaskDropout * com.microsoft.CDist - * com.microsoft.Col2Im * com.microsoft.ComplexMul * com.microsoft.ComplexMulConj * com.microsoft.ConvTransposeWithDynamicPads @@ -795,59 +794,6 @@ This version of the operator has been available since version 1 of the 'com.micr -### **com.microsoft.Col2Im** - - The operator rearranges column blocks back into a multidimensional image - - Col2Im behaves similarly to PyTorch's fold https://pytorch.org/docs/stable/generated/torch.nn.Fold.html, - but it only supports *batched* multi-dimensional image tensors. 
-
- NOTE: Although specifying image_shape looks redundant because it could be calculated from
-       convolution formulas, it is required as input for more advanced scenarios as explained
-       at PyTorch's implementation (https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/Col2Im.cpp#L10)
-
-
-#### Version
-
-This version of the operator has been available since version 1 of the 'com.microsoft' operator set.
-
-#### Attributes
-
-<dl>
-<dt><tt>dilations</tt> : list of ints</dt>
-<dd>1-dimensional tensor with dilation value along each spatial axis of the image. If not present, the dilation defaults to 1 along each spatial axis of the image.</dd>
-<dt><tt>pads</tt> : list of ints</dt>
-<dd>1-dimensional tensor with padding value for the beginning and ending along each spatial axis, it can take any value greater than or equal to 0. The value represent the number of pixels added to the beginning and end part of the corresponding axis. `pads` format should be as follow [x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin is the number of pixels added at the beginning of axis `i` and xi_end the same for the end of axis `i`. If not present, the padding defaults to 0 along start and end of each spatial axis.</dd>
-<dt><tt>strides</tt> : list of ints</dt>
-<dd>1-dimensional tensor with stride value along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.</dd>
-</dl>
-
-#### Inputs
-
-<dl>
-<dt><tt>input</tt> : T</dt>
-<dd>Input data tensor to be rearranged from column blocks back into an image. This is a 3-dimensional tensor containing [N, C * n-ary-product(block_shape), L], where N is batch dimension, C is image channel dimension and L is number of blocks.</dd>
-<dt><tt>image_shape</tt> : tensor(int64)</dt>
-<dd>The shape of the spatial dimensions of the image after rearranging the column blocks. This is a 1-dim tensor with size of at least 2, containing the value [H_img, W_img] for a 2-D image or [dim_i1, dim_i2, ..., dim_iN] for a N-D image.</dd>
-<dt><tt>block_shape</tt> : tensor(int64)</dt>
-<dd>The shape of the block to apply on the input. This is a 1-dim tensor of size of at least 2, containing the value [H_block, W_block] for a 2-D image or [dim_b1, dim_b2, ..., dim_bN] for a N-D block. Dilations, pads and strides are applied to block_shape under the hood. The kernel window start at the top-left of the block and slides to the right and down, similarly to how Convolution kernels do.</dd>
-</dl>
-
-#### Outputs
-
-<dl>
-<dt><tt>output</tt> : T</dt>
-<dd>Output tensor produced by rearranging blocks into an image.</dd>
-</dl>
-
-#### Type Constraints
-
-<dl>
-<dt><tt>T</tt> : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128)</dt>
-<dd>Constrain input and output types to all numeric tensor types.</dd>
-</dl>
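(For a concrete instance of the shape contract documented above, using values that appear in the with2Images3channelsNonSquare4dNCHW unit test elsewhere in this series: an input of shape [2, 15, 4] with image_shape = [4, 5] and block_shape = [1, 5] gives N = 2, C = 15 / (1 * 5) = 3 and L = 4 blocks, so the rearranged output has shape [2, 3, 4, 5].)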
-
-
 ### **com.microsoft.ComplexMul**
 
 #### Version
diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
index 1090d55129039..66f441e6db0f6 100644
--- a/docs/OperatorKernels.md
+++ b/docs/OperatorKernels.md
@@ -54,6 +54,7 @@ Do not modify directly.*
 |||12|**T** = tensor(double), tensor(float), tensor(int64), tensor(int8), tensor(uint64), tensor(uint8)|
 |||11|**T** = tensor(float)|
 |||[6, 10]|**T** = tensor(float)|
+|Col2Im|*in* input:**T**<br> *in* image_shape:**tensor(int64)**<br> *in* block_shape:**tensor(int64)**<br> *out* output:**T**|18+|**T** = tensor(float)|
 |Compress|*in* input:**T**<br> *in* condition:**T1**<br> *out* output:**T**|11+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)<br> **T1** = tensor(bool)|
 |||[9, 10]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)<br> **T1** = tensor(bool)|
 |Concat|*in* inputs:**T**<br> *out* concat_result:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
@@ -416,7 +417,6 @@ Do not modify directly.*
 |BiasGelu|*in* A:**T**<br> *in* B:**T**<br> *out* C:**T**|1+|**T** = tensor(float)|
 |BifurcationDetector|*in* src_tokens:**T**<br> *in* cur_tokens:**T**<br> *in* prev_suffix_match_idx:**T**<br> *in* pred_tokens:**T**<br> *out* tokens:**T**<br> *out* suffix_match_idx:**T**|1+|**T** = tensor(int64)|
 |CDist|*in* A:**T**<br> *in* B:**T**<br> *out* C:**T**|1+|**T** = tensor(double), tensor(float)|
-|Col2Im|*in* input:**T**<br> *in* image_shape:**tensor(int64)**<br> *in* block_shape:**tensor(int64)**<br> *out* output:**T**|1+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
 |ConvTransposeWithDynamicPads|*in* X:**T**<br> *in* W:**T**<br> *in* Pads:**tensor(int64)**<br> *in* B:**T**<br> *out* Y:**T**|1+|**T** = tensor(float)|
 |CropAndResize|*in* X:**T1**<br> *in* rois:**T1**<br> *in* batch_indices:**T2**<br> *in* crop_size:**T2**<br> *out* Y:**T1**|1+|**T1** = tensor(float)<br> **T2** = tensor(int32)|
 |DequantizeLinear|*in* x:**T1**<br> *in* x_scale:**T2**<br> *in* x_zero_point:**T1**<br> *out* y:**T2**|1+|**T1** = tensor(int8), tensor(uint8)<br>
**T2** = tensor(float)| diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc index 82298db8191ae..fecc9bf8320d4 100644 --- a/onnxruntime/test/onnx/main.cc +++ b/onnxruntime/test/onnx/main.cc @@ -685,6 +685,7 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); {"test_scatternd_add", "Opset 16 not supported yet."}, {"test_scatternd_multiply", "Opset 16 not supported yet."}, {"test_scatter_elements_with_duplicate_indices", "Opset 16 not supported yet."}, + {"test_col2im_pad", "onnx 18 test data error."}, #if defined(DISABLE_OPTIONAL_TYPE) {"test_optional_get_element", "Optional type not supported in this build flavor."}, From ab8ec1d60509b7dfcc5e1b99445bd63a2b997dbe Mon Sep 17 00:00:00 2001 From: Liqun Fu Date: Fri, 20 Jan 2023 10:05:45 -0800 Subject: [PATCH 25/30] typo Signed-off-by: Liqun Fu --- onnxruntime/test/onnx/main.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc index fecc9bf8320d4..5c099c18b041c 100644 --- a/onnxruntime/test/onnx/main.cc +++ b/onnxruntime/test/onnx/main.cc @@ -685,7 +685,7 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); {"test_scatternd_add", "Opset 16 not supported yet."}, {"test_scatternd_multiply", "Opset 16 not supported yet."}, {"test_scatter_elements_with_duplicate_indices", "Opset 16 not supported yet."}, - {"test_col2im_pad", "onnx 18 test data error."}, + {"test_col2im_pads", "onnx 18 test data error."}, #if defined(DISABLE_OPTIONAL_TYPE) {"test_optional_get_element", "Optional type not supported in this build flavor."}, From 7dc5f1ce9e79f90a5a4664ff662e7120df98257a Mon Sep 17 00:00:00 2001 From: Liqun Fu Date: Fri, 20 Jan 2023 11:38:08 -0800 Subject: [PATCH 26/30] col2im_pads Signed-off-by: Liqun Fu --- onnxruntime/test/onnx/main.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc index 5c099c18b041c..922c8a4c5047a 100644 --- a/onnxruntime/test/onnx/main.cc +++ b/onnxruntime/test/onnx/main.cc @@ -686,6 +686,7 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); {"test_scatternd_multiply", "Opset 16 not supported yet."}, {"test_scatter_elements_with_duplicate_indices", "Opset 16 not supported yet."}, {"test_col2im_pads", "onnx 18 test data error."}, + {"col2im_pads", "onnx 18 test data error."}, #if defined(DISABLE_OPTIONAL_TYPE) {"test_optional_get_element", "Optional type not supported in this build flavor."}, From ac500b6ba03161f48b2973365bd7c5d7bd0aa2d1 Mon Sep 17 00:00:00 2001 From: Liqun Fu Date: Fri, 20 Jan 2023 12:26:16 -0800 Subject: [PATCH 27/30] remove test_col2im_pads Signed-off-by: Liqun Fu --- onnxruntime/test/onnx/main.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc index 922c8a4c5047a..11d635519dfe7 100644 --- a/onnxruntime/test/onnx/main.cc +++ b/onnxruntime/test/onnx/main.cc @@ -685,7 +685,6 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. 
\n)"); {"test_scatternd_add", "Opset 16 not supported yet."}, {"test_scatternd_multiply", "Opset 16 not supported yet."}, {"test_scatter_elements_with_duplicate_indices", "Opset 16 not supported yet."}, - {"test_col2im_pads", "onnx 18 test data error."}, {"col2im_pads", "onnx 18 test data error."}, #if defined(DISABLE_OPTIONAL_TYPE) From 00b5555b7145cb1918d3d83bb7506b474a36f5b5 Mon Sep 17 00:00:00 2001 From: Liqun Fu Date: Mon, 23 Jan 2023 13:29:12 -0800 Subject: [PATCH 28/30] bring back test Signed-off-by: Liqun Fu --- .../test/providers/cpu/tensor/col2im_test.cc | 169 ++++++++++++++++++ 1 file changed, 169 insertions(+) create mode 100644 onnxruntime/test/providers/cpu/tensor/col2im_test.cc diff --git a/onnxruntime/test/providers/cpu/tensor/col2im_test.cc b/onnxruntime/test/providers/cpu/tensor/col2im_test.cc new file mode 100644 index 0000000000000..3a4539024e5a9 --- /dev/null +++ b/onnxruntime/test/providers/cpu/tensor/col2im_test.cc @@ -0,0 +1,169 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include +#include "gtest/gtest.h" +#include "test/providers/provider_test_utils.h" + +#include "core/util/math.h" + +namespace onnxruntime { +namespace test { + +namespace { +template +std::vector TransposeSerializedVector(std::vector& input, size_t N, size_t C, size_t H, size_t W) { + size_t input_size = input.size(); + if (input_size == 0) { + throw std::runtime_error("Invalid input"); + } + std::vector trans_vec(input); + + for (size_t n = 0; n < N; ++n) + for (size_t c = 0; c < C; ++c) + for (size_t h = 0; h < H; ++h) + for (size_t w = 0; w < W; ++w) + trans_vec[n * (C * H * W) + c * (H * W) + (h + H * w)] = + input[n * (C * H * W) + c * (H * W) + (w + W * h)]; + + return trans_vec; +} + +} // namespace + +TEST(Col2ImOpTest, Simple4dNCHW) { + OpTester test("Col2Im", 18); + + test.AddAttribute("strides", std::vector{1, 1}); + test.AddAttribute("dilations", std::vector{1, 1}); + test.AddAttribute("pads", std::vector{0, 0, 0, 0}); + + std::vector input(25); + std::vector output(25); + std::iota(output.begin(), output.end(), 1.0f); + + input = TransposeSerializedVector(output, 1, 1, 5, 5); + test.AddInput("input", {1, 5, 5}, input); + test.AddInput("image_shape", {2}, std::vector{5, 5}); + test.AddInput("block_shape", {2}, std::vector{1, 5}); + + test.AddOutput("output", {1, 1, 5, 5}, output); + test.Run(); +} + +TEST(Col2ImOpTest, With2Images3channelsNonSquare4dNCHW) { + OpTester test("Col2Im", 18); + + test.AddAttribute("strides", std::vector{1, 1}); + test.AddAttribute("dilations", std::vector{1, 1}); + test.AddAttribute("pads", std::vector{0, 0, 0, 0}); + + std::vector input(120); + std::vector output(120); + std::iota(output.begin(), output.end(), 1.0f); + input = TransposeSerializedVector(output, 2, 3, 4, 5); + test.AddInput("input", {2, 15, 4}, input); + test.AddInput("image_shape", {2}, std::vector{4, 5}); + test.AddInput("block_shape", {2}, std::vector{1, 5}); + + test.AddOutput("output", {2, 3, 4, 5}, output); + test.Run(); +} + +TEST(Col2ImOpTest, With2Images2channelsNonSquareDilationPadStride4dNCHW) { + OpTester test("Col2Im", 18); + + test.AddAttribute("strides", std::vector{2, 2}); + test.AddAttribute("dilations", std::vector{2, 2}); + test.AddAttribute("pads", std::vector{2, 2, 2, 2}); + + std::vector input{0., 0., 0., 0., 0., 1., 3., 5., 0., 11., 13., 15., 0., 0., 0., 0., + 0., 0., 0., 0., 1., 3., 5., 0., 11., 13., 15., 0., 0., 0., 0., 0., + 0., 0., 0., 0., 0., 21., 23., 25., 0., 31., 33., 35., 0., 0., 0., 0., + 0., 0., 
0., 0., 21., 23., 25., 0., 31., 33., 35., 0., 0., 0., 0., 0., + 0., 0., 0., 0., 0., 41., 43., 45., 0., 51., 53., 55., 0., 0., 0., 0., + 0., 0., 0., 0., 41., 43., 45., 0., 51., 53., 55., 0., 0., 0., 0., 0., + 0., 0., 0., 0., 0., 61., 63., 65., 0., 71., 73., 75., 0., 0., 0., 0., + 0., 0., 0., 0., 61., 63., 65., 0., 71., 73., 75., 0., 0., 0., 0., 0.}; + std::vector output{2., 0., 6., 0., 10., + 0., 0., 0., 0., 0., + 22., 0., 26., 0., 30., + 0., 0., 0., 0., 0., + 42., 0., 46., 0., 50., + 0., 0., 0., 0., 0., + 62., 0., 66., 0., 70., + 0., 0., 0., 0., 0., + 82., 0., 86., 0., 90., + 0., 0., 0., 0., 0., + 102., 0., 106., 0., 110., + 0., 0., 0., 0., 0., + 122., 0., 126., 0., 130., + 0., 0., 0., 0., 0., + 142., 0., 146., 0., 150., + 0., 0., 0., 0., 0.}; + test.AddInput("input", {2, 4, 16}, input); + test.AddInput("image_shape", {2}, std::vector{4, 5}); + test.AddInput("block_shape", {2}, std::vector{1, 2}); + + test.AddOutput("output", {2, 2, 4, 5}, output); + test.Run(); +} + +TEST(Col2ImOpTest, With3channels4dNCHW) { + OpTester test("Col2Im", 18); + + test.AddAttribute("strides", std::vector{1, 1}); + test.AddAttribute("dilations", std::vector{1, 1}); + test.AddAttribute("pads", std::vector{0, 0, 0, 0}); + + std::vector input(75); + std::vector output(75); + std::iota(output.begin(), output.end(), 1.0f); + input = TransposeSerializedVector(output, 1, 3, 5, 5); + test.AddInput("input", {1, 15, 5}, input); + test.AddInput("image_shape", {2}, std::vector{5, 5}); + test.AddInput("block_shape", {2}, std::vector{1, 5}); + + test.AddOutput("output", {1, 3, 5, 5}, output); + test.Run(); +} + +TEST(Col2ImOpTest, With2Images3channels4dNCHW) { + OpTester test("Col2Im", 18); + + test.AddAttribute("strides", std::vector{1, 1}); + test.AddAttribute("dilations", std::vector{1, 1}); + test.AddAttribute("pads", std::vector{0, 0, 0, 0}); + + std::vector input(150); + std::vector output(150); + std::iota(output.begin(), output.end(), 1.0f); + input = TransposeSerializedVector(output, 2, 3, 5, 5); + test.AddInput("input", {2, 15, 5}, input); + test.AddInput("image_shape", {2}, std::vector{5, 5}); + test.AddInput("block_shape", {2}, std::vector{1, 5}); + + test.AddOutput("output", {2, 3, 5, 5}, output); + test.Run(); +} + +TEST(Col2ImOpTest, Simple5dNCHWD) { + OpTester test("Col2Im", 18); + + test.AddAttribute("strides", std::vector{1, 1, 1}); + test.AddAttribute("dilations", std::vector{1, 1, 1}); + test.AddAttribute("pads", std::vector{0, 0, 0, 0, 0, 0}); + + std::vector input(25); + std::vector output(25); + std::iota(output.begin(), output.end(), 1.0f); + input = TransposeSerializedVector(output, 1, 1, 5, 5); + test.AddInput("input", {1, 5, 5}, input); + test.AddInput("image_shape", {3}, std::vector{1, 5, 5}); + test.AddInput("block_shape", {3}, std::vector{1, 1, 5}); + test.AddOutput("output", {1, 1, 1, 5, 5}, output); + test.Run(); +} + +} // namespace test +} // namespace onnxruntime From 7a8b8bcbea17a2281ad33e49be72e99012d55c0f Mon Sep 17 00:00:00 2001 From: Liqun Fu Date: Tue, 24 Jan 2023 12:22:05 -0800 Subject: [PATCH 29/30] remove col2im_test.cc to experiment React Native CI Signed-off-by: Liqun Fu --- .../test/providers/cpu/tensor/col2im_test.cc | 169 ------------------ 1 file changed, 169 deletions(-) delete mode 100644 onnxruntime/test/providers/cpu/tensor/col2im_test.cc diff --git a/onnxruntime/test/providers/cpu/tensor/col2im_test.cc b/onnxruntime/test/providers/cpu/tensor/col2im_test.cc deleted file mode 100644 index 3a4539024e5a9..0000000000000 --- 
a/onnxruntime/test/providers/cpu/tensor/col2im_test.cc +++ /dev/null @@ -1,169 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include -#include "gtest/gtest.h" -#include "test/providers/provider_test_utils.h" - -#include "core/util/math.h" - -namespace onnxruntime { -namespace test { - -namespace { -template -std::vector TransposeSerializedVector(std::vector& input, size_t N, size_t C, size_t H, size_t W) { - size_t input_size = input.size(); - if (input_size == 0) { - throw std::runtime_error("Invalid input"); - } - std::vector trans_vec(input); - - for (size_t n = 0; n < N; ++n) - for (size_t c = 0; c < C; ++c) - for (size_t h = 0; h < H; ++h) - for (size_t w = 0; w < W; ++w) - trans_vec[n * (C * H * W) + c * (H * W) + (h + H * w)] = - input[n * (C * H * W) + c * (H * W) + (w + W * h)]; - - return trans_vec; -} - -} // namespace - -TEST(Col2ImOpTest, Simple4dNCHW) { - OpTester test("Col2Im", 18); - - test.AddAttribute("strides", std::vector{1, 1}); - test.AddAttribute("dilations", std::vector{1, 1}); - test.AddAttribute("pads", std::vector{0, 0, 0, 0}); - - std::vector input(25); - std::vector output(25); - std::iota(output.begin(), output.end(), 1.0f); - - input = TransposeSerializedVector(output, 1, 1, 5, 5); - test.AddInput("input", {1, 5, 5}, input); - test.AddInput("image_shape", {2}, std::vector{5, 5}); - test.AddInput("block_shape", {2}, std::vector{1, 5}); - - test.AddOutput("output", {1, 1, 5, 5}, output); - test.Run(); -} - -TEST(Col2ImOpTest, With2Images3channelsNonSquare4dNCHW) { - OpTester test("Col2Im", 18); - - test.AddAttribute("strides", std::vector{1, 1}); - test.AddAttribute("dilations", std::vector{1, 1}); - test.AddAttribute("pads", std::vector{0, 0, 0, 0}); - - std::vector input(120); - std::vector output(120); - std::iota(output.begin(), output.end(), 1.0f); - input = TransposeSerializedVector(output, 2, 3, 4, 5); - test.AddInput("input", {2, 15, 4}, input); - test.AddInput("image_shape", {2}, std::vector{4, 5}); - test.AddInput("block_shape", {2}, std::vector{1, 5}); - - test.AddOutput("output", {2, 3, 4, 5}, output); - test.Run(); -} - -TEST(Col2ImOpTest, With2Images2channelsNonSquareDilationPadStride4dNCHW) { - OpTester test("Col2Im", 18); - - test.AddAttribute("strides", std::vector{2, 2}); - test.AddAttribute("dilations", std::vector{2, 2}); - test.AddAttribute("pads", std::vector{2, 2, 2, 2}); - - std::vector input{0., 0., 0., 0., 0., 1., 3., 5., 0., 11., 13., 15., 0., 0., 0., 0., - 0., 0., 0., 0., 1., 3., 5., 0., 11., 13., 15., 0., 0., 0., 0., 0., - 0., 0., 0., 0., 0., 21., 23., 25., 0., 31., 33., 35., 0., 0., 0., 0., - 0., 0., 0., 0., 21., 23., 25., 0., 31., 33., 35., 0., 0., 0., 0., 0., - 0., 0., 0., 0., 0., 41., 43., 45., 0., 51., 53., 55., 0., 0., 0., 0., - 0., 0., 0., 0., 41., 43., 45., 0., 51., 53., 55., 0., 0., 0., 0., 0., - 0., 0., 0., 0., 0., 61., 63., 65., 0., 71., 73., 75., 0., 0., 0., 0., - 0., 0., 0., 0., 61., 63., 65., 0., 71., 73., 75., 0., 0., 0., 0., 0.}; - std::vector output{2., 0., 6., 0., 10., - 0., 0., 0., 0., 0., - 22., 0., 26., 0., 30., - 0., 0., 0., 0., 0., - 42., 0., 46., 0., 50., - 0., 0., 0., 0., 0., - 62., 0., 66., 0., 70., - 0., 0., 0., 0., 0., - 82., 0., 86., 0., 90., - 0., 0., 0., 0., 0., - 102., 0., 106., 0., 110., - 0., 0., 0., 0., 0., - 122., 0., 126., 0., 130., - 0., 0., 0., 0., 0., - 142., 0., 146., 0., 150., - 0., 0., 0., 0., 0.}; - test.AddInput("input", {2, 4, 16}, input); - test.AddInput("image_shape", {2}, std::vector{4, 5}); - test.AddInput("block_shape", 
{2}, std::vector{1, 2}); - - test.AddOutput("output", {2, 2, 4, 5}, output); - test.Run(); -} - -TEST(Col2ImOpTest, With3channels4dNCHW) { - OpTester test("Col2Im", 18); - - test.AddAttribute("strides", std::vector{1, 1}); - test.AddAttribute("dilations", std::vector{1, 1}); - test.AddAttribute("pads", std::vector{0, 0, 0, 0}); - - std::vector input(75); - std::vector output(75); - std::iota(output.begin(), output.end(), 1.0f); - input = TransposeSerializedVector(output, 1, 3, 5, 5); - test.AddInput("input", {1, 15, 5}, input); - test.AddInput("image_shape", {2}, std::vector{5, 5}); - test.AddInput("block_shape", {2}, std::vector{1, 5}); - - test.AddOutput("output", {1, 3, 5, 5}, output); - test.Run(); -} - -TEST(Col2ImOpTest, With2Images3channels4dNCHW) { - OpTester test("Col2Im", 18); - - test.AddAttribute("strides", std::vector{1, 1}); - test.AddAttribute("dilations", std::vector{1, 1}); - test.AddAttribute("pads", std::vector{0, 0, 0, 0}); - - std::vector input(150); - std::vector output(150); - std::iota(output.begin(), output.end(), 1.0f); - input = TransposeSerializedVector(output, 2, 3, 5, 5); - test.AddInput("input", {2, 15, 5}, input); - test.AddInput("image_shape", {2}, std::vector{5, 5}); - test.AddInput("block_shape", {2}, std::vector{1, 5}); - - test.AddOutput("output", {2, 3, 5, 5}, output); - test.Run(); -} - -TEST(Col2ImOpTest, Simple5dNCHWD) { - OpTester test("Col2Im", 18); - - test.AddAttribute("strides", std::vector{1, 1, 1}); - test.AddAttribute("dilations", std::vector{1, 1, 1}); - test.AddAttribute("pads", std::vector{0, 0, 0, 0, 0, 0}); - - std::vector input(25); - std::vector output(25); - std::iota(output.begin(), output.end(), 1.0f); - input = TransposeSerializedVector(output, 1, 1, 5, 5); - test.AddInput("input", {1, 5, 5}, input); - test.AddInput("image_shape", {3}, std::vector{1, 5, 5}); - test.AddInput("block_shape", {3}, std::vector{1, 1, 5}); - test.AddOutput("output", {1, 1, 1, 5, 5}, output); - test.Run(); -} - -} // namespace test -} // namespace onnxruntime From bc25103f3b1ee6e92dc76c147819529f68c442f0 Mon Sep 17 00:00:00 2001 From: Liqun Fu Date: Tue, 24 Jan 2023 12:29:04 -0800 Subject: [PATCH 30/30] add col2im_test.cc back because the main branch is having the same error with ReactNative CI Signed-off-by: Liqun Fu --- .../test/providers/cpu/tensor/col2im_test.cc | 169 ++++++++++++++++++ 1 file changed, 169 insertions(+) create mode 100644 onnxruntime/test/providers/cpu/tensor/col2im_test.cc diff --git a/onnxruntime/test/providers/cpu/tensor/col2im_test.cc b/onnxruntime/test/providers/cpu/tensor/col2im_test.cc new file mode 100644 index 0000000000000..3a4539024e5a9 --- /dev/null +++ b/onnxruntime/test/providers/cpu/tensor/col2im_test.cc @@ -0,0 +1,169 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include +#include "gtest/gtest.h" +#include "test/providers/provider_test_utils.h" + +#include "core/util/math.h" + +namespace onnxruntime { +namespace test { + +namespace { +template +std::vector TransposeSerializedVector(std::vector& input, size_t N, size_t C, size_t H, size_t W) { + size_t input_size = input.size(); + if (input_size == 0) { + throw std::runtime_error("Invalid input"); + } + std::vector trans_vec(input); + + for (size_t n = 0; n < N; ++n) + for (size_t c = 0; c < C; ++c) + for (size_t h = 0; h < H; ++h) + for (size_t w = 0; w < W; ++w) + trans_vec[n * (C * H * W) + c * (H * W) + (h + H * w)] = + input[n * (C * H * W) + c * (H * W) + (w + W * h)]; + + return trans_vec; +} + +} // namespace + +TEST(Col2ImOpTest, Simple4dNCHW) { + OpTester test("Col2Im", 18); + + test.AddAttribute("strides", std::vector{1, 1}); + test.AddAttribute("dilations", std::vector{1, 1}); + test.AddAttribute("pads", std::vector{0, 0, 0, 0}); + + std::vector input(25); + std::vector output(25); + std::iota(output.begin(), output.end(), 1.0f); + + input = TransposeSerializedVector(output, 1, 1, 5, 5); + test.AddInput("input", {1, 5, 5}, input); + test.AddInput("image_shape", {2}, std::vector{5, 5}); + test.AddInput("block_shape", {2}, std::vector{1, 5}); + + test.AddOutput("output", {1, 1, 5, 5}, output); + test.Run(); +} + +TEST(Col2ImOpTest, With2Images3channelsNonSquare4dNCHW) { + OpTester test("Col2Im", 18); + + test.AddAttribute("strides", std::vector{1, 1}); + test.AddAttribute("dilations", std::vector{1, 1}); + test.AddAttribute("pads", std::vector{0, 0, 0, 0}); + + std::vector input(120); + std::vector output(120); + std::iota(output.begin(), output.end(), 1.0f); + input = TransposeSerializedVector(output, 2, 3, 4, 5); + test.AddInput("input", {2, 15, 4}, input); + test.AddInput("image_shape", {2}, std::vector{4, 5}); + test.AddInput("block_shape", {2}, std::vector{1, 5}); + + test.AddOutput("output", {2, 3, 4, 5}, output); + test.Run(); +} + +TEST(Col2ImOpTest, With2Images2channelsNonSquareDilationPadStride4dNCHW) { + OpTester test("Col2Im", 18); + + test.AddAttribute("strides", std::vector{2, 2}); + test.AddAttribute("dilations", std::vector{2, 2}); + test.AddAttribute("pads", std::vector{2, 2, 2, 2}); + + std::vector input{0., 0., 0., 0., 0., 1., 3., 5., 0., 11., 13., 15., 0., 0., 0., 0., + 0., 0., 0., 0., 1., 3., 5., 0., 11., 13., 15., 0., 0., 0., 0., 0., + 0., 0., 0., 0., 0., 21., 23., 25., 0., 31., 33., 35., 0., 0., 0., 0., + 0., 0., 0., 0., 21., 23., 25., 0., 31., 33., 35., 0., 0., 0., 0., 0., + 0., 0., 0., 0., 0., 41., 43., 45., 0., 51., 53., 55., 0., 0., 0., 0., + 0., 0., 0., 0., 41., 43., 45., 0., 51., 53., 55., 0., 0., 0., 0., 0., + 0., 0., 0., 0., 0., 61., 63., 65., 0., 71., 73., 75., 0., 0., 0., 0., + 0., 0., 0., 0., 61., 63., 65., 0., 71., 73., 75., 0., 0., 0., 0., 0.}; + std::vector output{2., 0., 6., 0., 10., + 0., 0., 0., 0., 0., + 22., 0., 26., 0., 30., + 0., 0., 0., 0., 0., + 42., 0., 46., 0., 50., + 0., 0., 0., 0., 0., + 62., 0., 66., 0., 70., + 0., 0., 0., 0., 0., + 82., 0., 86., 0., 90., + 0., 0., 0., 0., 0., + 102., 0., 106., 0., 110., + 0., 0., 0., 0., 0., + 122., 0., 126., 0., 130., + 0., 0., 0., 0., 0., + 142., 0., 146., 0., 150., + 0., 0., 0., 0., 0.}; + test.AddInput("input", {2, 4, 16}, input); + test.AddInput("image_shape", {2}, std::vector{4, 5}); + test.AddInput("block_shape", {2}, std::vector{1, 2}); + + test.AddOutput("output", {2, 2, 4, 5}, output); + test.Run(); +} + +TEST(Col2ImOpTest, With3channels4dNCHW) { + OpTester test("Col2Im", 18); + + 
test.AddAttribute("strides", std::vector{1, 1}); + test.AddAttribute("dilations", std::vector{1, 1}); + test.AddAttribute("pads", std::vector{0, 0, 0, 0}); + + std::vector input(75); + std::vector output(75); + std::iota(output.begin(), output.end(), 1.0f); + input = TransposeSerializedVector(output, 1, 3, 5, 5); + test.AddInput("input", {1, 15, 5}, input); + test.AddInput("image_shape", {2}, std::vector{5, 5}); + test.AddInput("block_shape", {2}, std::vector{1, 5}); + + test.AddOutput("output", {1, 3, 5, 5}, output); + test.Run(); +} + +TEST(Col2ImOpTest, With2Images3channels4dNCHW) { + OpTester test("Col2Im", 18); + + test.AddAttribute("strides", std::vector{1, 1}); + test.AddAttribute("dilations", std::vector{1, 1}); + test.AddAttribute("pads", std::vector{0, 0, 0, 0}); + + std::vector input(150); + std::vector output(150); + std::iota(output.begin(), output.end(), 1.0f); + input = TransposeSerializedVector(output, 2, 3, 5, 5); + test.AddInput("input", {2, 15, 5}, input); + test.AddInput("image_shape", {2}, std::vector{5, 5}); + test.AddInput("block_shape", {2}, std::vector{1, 5}); + + test.AddOutput("output", {2, 3, 5, 5}, output); + test.Run(); +} + +TEST(Col2ImOpTest, Simple5dNCHWD) { + OpTester test("Col2Im", 18); + + test.AddAttribute("strides", std::vector{1, 1, 1}); + test.AddAttribute("dilations", std::vector{1, 1, 1}); + test.AddAttribute("pads", std::vector{0, 0, 0, 0, 0, 0}); + + std::vector input(25); + std::vector output(25); + std::iota(output.begin(), output.end(), 1.0f); + input = TransposeSerializedVector(output, 1, 1, 5, 5); + test.AddInput("input", {1, 5, 5}, input); + test.AddInput("image_shape", {3}, std::vector{1, 5, 5}); + test.AddInput("block_shape", {3}, std::vector{1, 1, 5}); + test.AddOutput("output", {1, 1, 1, 5, 5}, output); + test.Run(); +} + +} // namespace test +} // namespace onnxruntime