diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake
index 615491d3b0b54..0447e4814d37d 100644
--- a/cmake/onnxruntime_providers.cmake
+++ b/cmake/onnxruntime_providers.cmake
@@ -77,6 +77,17 @@ if(HAS_DEPRECATED_COPY)
   set_source_files_properties("${ONNXRUNTIME_ROOT}/core/providers/cpu/tensor/onehot.cc" PROPERTIES COMPILE_FLAGS -Wno-deprecated-copy)
   set_source_files_properties("${ONNXRUNTIME_ROOT}/core/providers/cpu/tensor/where_op.cc" PROPERTIES COMPILE_FLAGS -Wno-deprecated-copy)
 endif()
+
+if((CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64") AND NOT MSVC)
+  # For x86 platforms it is important to pass this flag to the compiler; without it, gemmlowp falls back to slow reference code.
+  # These optimizations are not enabled for MSVC, so it is excluded.
+  message("enabling optimizations for gemmlowp")
+  set_source_files_properties("${ONNXRUNTIME_ROOT}/core/providers/cpu/math/matmul_integer.cc" PROPERTIES COMPILE_FLAGS "-msse4.1")
+  set_source_files_properties("${ONNXRUNTIME_ROOT}/core/providers/cpu/math/quantize_linear_matmul.cc" PROPERTIES COMPILE_FLAGS "-msse4.1")
+  set_source_files_properties("${ONNXRUNTIME_ROOT}/core/providers/cpu/nn/qlinearconv.cc" PROPERTIES COMPILE_FLAGS "-msse4.1")
+  set_source_files_properties("${ONNXRUNTIME_ROOT}/core/providers/cpu/nn/conv_integer.cc" PROPERTIES COMPILE_FLAGS "-msse4.1")
+endif()
+
 set(gemmlowp_src ${PROJECT_SOURCE_DIR}/external/gemmlowp)
 set(re2_src ${ONNXRUNTIME_ROOT}/../cmake/external/re2)
 target_include_directories(onnxruntime_providers PRIVATE ${ONNXRUNTIME_ROOT} ${eigen_INCLUDE_DIRS} ${gemmlowp_src} ${re2_src})
diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/InferenceSession.cs b/csharp/src/Microsoft.ML.OnnxRuntime/InferenceSession.cs
index 1738fc4ba2067..5f89bad8bbe9b 100644
--- a/csharp/src/Microsoft.ML.OnnxRuntime/InferenceSession.cs
+++ b/csharp/src/Microsoft.ML.OnnxRuntime/InferenceSession.cs
@@ -303,7 +303,7 @@ internal static NodeMetadata GetMetadataFromTypeInfo(IntPtr typeInfo)
             OnnxValueType valueType;
             unsafe
             {
-                NativeApiStatus.VerifySuccess(NativeMethods.OrtOnnxTypeFromTypeInfo(typeInfo, new IntPtr(&valueType)));
+                NativeApiStatus.VerifySuccess(NativeMethods.OrtGetOnnxTypeFromTypeInfo(typeInfo, new IntPtr(&valueType)));
             }
             if (valueType != OnnxValueType.ONNX_TYPE_TENSOR && valueType != OnnxValueType.ONNX_TYPE_SPARSETENSOR)
             {
diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.cs b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.cs
index 7fd2c33c6a6cb..4c213ec66d58e 100644
--- a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.cs
+++ b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.cs
@@ -87,7 +87,7 @@ IntPtr[] outputValues /* An array of output value pointers. Array must be alloca
         [DllImport(nativeLib, CharSet = charSet)]
         public static extern IntPtr /*(OrtStatus*)*/OrtSessionGetOutputName(
                                 IntPtr /*(OrtSession*)*/ session,
-                                UIntPtr index,
+                                UIntPtr index,
                                 IntPtr /*(OrtAllocator*)*/ allocator,
                                 out IntPtr /*(char**)*/name);
@@ -253,7 +253,7 @@ public enum MemoryType
         public static extern IntPtr /*(OrtStatus*)*/ OrtGetValueType(IntPtr /*(OrtValue*)*/ value, IntPtr /*(OnnxValueType*)*/ onnxtype);

         [DllImport(nativeLib, CharSet = charSet)]
-        public static extern IntPtr /*(OrtStatus*)*/ OrtOnnxTypeFromTypeInfo(IntPtr /*(OrtTypeInfo*)*/ typeinfo, IntPtr /*(OnnxValueType*)*/ onnxtype);
+        public static extern IntPtr /*(OrtStatus*)*/ OrtGetOnnxTypeFromTypeInfo(IntPtr /*(OrtTypeInfo*)*/ typeinfo, IntPtr /*(OnnxValueType*)*/ onnxtype);

         [DllImport(nativeLib, CharSet = charSet)]
         public static extern IntPtr /*(OrtStatus*)*/ OrtGetValueCount(IntPtr /*(OrtValue*)*/ value, out IntPtr /*(size_t*)*/ count);
diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h
index 1b14ae61f34cd..6848fc31e453c 100644
--- a/include/onnxruntime/core/session/onnxruntime_c_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -186,10 +186,10 @@ ORT_API_STATUS(OrtCreateEnvWithCustomLogger, OrtLoggingFunction logging_function
 // execution of OrtCreateSession, or does the OrtSession retain a handle to the file/directory
 // and continue to access throughout the OrtSession lifetime?
 // What sort of access is needed to model_path : read or read/write?
-ORT_API_STATUS(OrtCreateSession, _In_ OrtEnv* env, _In_ const ORTCHAR_T* model_path,
+ORT_API_STATUS(OrtCreateSession, _In_ const OrtEnv* env, _In_ const ORTCHAR_T* model_path,
                _In_ const OrtSessionOptions* options, _Outptr_ OrtSession** out);
-ORT_API_STATUS(OrtCreateSessionFromArray, _In_ OrtEnv* env, _In_ const void* model_data, size_t model_data_length,
+ORT_API_STATUS(OrtCreateSessionFromArray, _In_ const OrtEnv* env, _In_ const void* model_data, size_t model_data_length,
                _In_ const OrtSessionOptions* options, _Outptr_ OrtSession** out);
 ORT_API_STATUS(OrtRun, _Inout_ OrtSession* sess,
@@ -203,43 +203,43 @@ ORT_API_STATUS(OrtRun, _Inout_ OrtSession* sess,
 ORT_API_STATUS(OrtCreateSessionOptions, _Outptr_ OrtSessionOptions** options);
 // create a copy of an existing OrtSessionOptions
-ORT_API_STATUS(OrtCloneSessionOptions, _In_ OrtSessionOptions* in_options, _Outptr_ OrtSessionOptions** out_options);
-ORT_API_STATUS(OrtEnableSequentialExecution, _In_ OrtSessionOptions* options);
-ORT_API_STATUS(OrtDisableSequentialExecution, _In_ OrtSessionOptions* options);
+ORT_API_STATUS(OrtCloneSessionOptions, _In_ const OrtSessionOptions* in_options, _Outptr_ OrtSessionOptions** out_options);
+ORT_API_STATUS(OrtEnableSequentialExecution, _Inout_ OrtSessionOptions* options);
+ORT_API_STATUS(OrtDisableSequentialExecution, _Inout_ OrtSessionOptions* options);
 // Enable profiling for this session.
-ORT_API_STATUS(OrtEnableProfiling, _In_ OrtSessionOptions* options, _In_ const ORTCHAR_T* profile_file_prefix);
-ORT_API_STATUS(OrtDisableProfiling, _In_ OrtSessionOptions* options);
+ORT_API_STATUS(OrtEnableProfiling, _Inout_ OrtSessionOptions* options, _In_ const ORTCHAR_T* profile_file_prefix);
+ORT_API_STATUS(OrtDisableProfiling, _Inout_ OrtSessionOptions* options);
 // Enable the memory pattern optimization.
 // The idea is that if the input shapes are the same, we could trace the internal memory allocation
 // and generate a memory pattern for future requests. So next time we could just do one allocation
 // with a big chunk for all the internal memory allocation.
 // Note: memory pattern optimization is only available when SequentialExecution is enabled.
-ORT_API_STATUS(OrtEnableMemPattern, _In_ OrtSessionOptions* options);
-ORT_API_STATUS(OrtDisableMemPattern, _In_ OrtSessionOptions* options);
+ORT_API_STATUS(OrtEnableMemPattern, _Inout_ OrtSessionOptions* options);
+ORT_API_STATUS(OrtDisableMemPattern, _Inout_ OrtSessionOptions* options);
 // Enable the memory arena on CPU
 // Arena may pre-allocate memory for future usage.
 // set this option to false if you don't want it.
-ORT_API_STATUS(OrtEnableCpuMemArena, _In_ OrtSessionOptions* options);
-ORT_API_STATUS(OrtDisableCpuMemArena, _In_ OrtSessionOptions* options);
+ORT_API_STATUS(OrtEnableCpuMemArena, _Inout_ OrtSessionOptions* options);
+ORT_API_STATUS(OrtDisableCpuMemArena, _Inout_ OrtSessionOptions* options);
 // < logger id to use for session output
-ORT_API_STATUS(OrtSetSessionLogId, _In_ OrtSessionOptions* options, const char* logid);
+ORT_API_STATUS(OrtSetSessionLogId, _Inout_ OrtSessionOptions* options, const char* logid);
 // < applies to session load, initialization, etc
-ORT_API_STATUS(OrtSetSessionLogVerbosityLevel, _In_ OrtSessionOptions* options, int session_log_verbosity_level);
+ORT_API_STATUS(OrtSetSessionLogVerbosityLevel, _Inout_ OrtSessionOptions* options, int session_log_verbosity_level);
 // Set Graph optimization level.
 // Available options are: 0, 1, 2.
 // 0 -> Disable all optimizations
 // 1 -> Enable basic optimizations
 // 2 -> Enable all optimizations
-ORT_API_STATUS(OrtSetSessionGraphOptimizationLevel, _In_ OrtSessionOptions* options, int graph_optimization_level);
+ORT_API_STATUS(OrtSetSessionGraphOptimizationLevel, _Inout_ OrtSessionOptions* options, int graph_optimization_level);
 // How many threads in the session thread pool.
-ORT_API_STATUS(OrtSetSessionThreadPoolSize, _In_ OrtSessionOptions* options, int session_thread_pool_size);
+ORT_API_STATUS(OrtSetSessionThreadPoolSize, _Inout_ OrtSessionOptions* options, int session_thread_pool_size);
 /**
  * To use additional providers, you must build ORT with the extra providers enabled. Then call one of these
@@ -278,7 +278,7 @@ ORT_API_STATUS(OrtSessionGetOutputName, _In_ const OrtSession* sess, size_t inde
  */
 ORT_API_STATUS(OrtCreateRunOptions, _Outptr_ OrtRunOptions** out);
-ORT_API_STATUS(OrtRunOptionsSetRunLogVerbosityLevel, _In_ OrtRunOptions* options, int value);
+ORT_API_STATUS(OrtRunOptionsSetRunLogVerbosityLevel, _Inout_ OrtRunOptions* options, int value);
 ORT_API_STATUS(OrtRunOptionsSetRunTag, _In_ OrtRunOptions*, _In_ const char* run_tag);
 ORT_API_STATUS(OrtRunOptionsGetRunLogVerbosityLevel, _In_ const OrtRunOptions* options, _Out_ int* out);
@@ -286,8 +286,8 @@ ORT_API_STATUS(OrtRunOptionsGetRunTag, _In_ const OrtRunOptions*, _Out_ const ch
 // Set a flag so that any running OrtRun* calls that are using this instance of OrtRunOptions
 // will exit as soon as possible if the flag is true.
-// flag can be either 1 (true) or 0 (false)
-ORT_API_STATUS(OrtRunOptionsSetTerminate, _In_ OrtRunOptions* options, _In_ int flag);
+ORT_API_STATUS(OrtRunOptionsEnableTerminate, _Inout_ OrtRunOptions* options);
+ORT_API_STATUS(OrtRunOptionsDisableTerminate, _Inout_ OrtRunOptions* options);
 /**
  * Create a tensor from an allocator. OrtReleaseValue will also release the buffer inside the output value
@@ -321,7 +321,7 @@ ORT_API_STATUS(OrtIsTensor, _In_ const OrtValue* value, _Out_ int* out);
 * \param s An array of strings. Each string in this array must be null terminated.
 * \param s_len length of s
 */
-ORT_API_STATUS(OrtFillStringTensor, _In_ OrtValue* value, _In_ const char* const* s, size_t s_len);
+ORT_API_STATUS(OrtFillStringTensor, _Inout_ OrtValue* value, _In_ const char* const* s, size_t s_len);
 /**
 * \param value A tensor created from OrtCreateTensor... function.
 * \param len total data length, not including the trailing '\0' chars.
@@ -368,19 +368,19 @@ ORT_API_STATUS(OrtGetTensorMemSizeInBytesFromTensorProto, _In_ const void* input
 /**
 * Don't free the 'out' value
 */
-ORT_API_STATUS(OrtCastTypeInfoToTensorInfo, _In_ OrtTypeInfo*, _Out_ const OrtTensorTypeAndShapeInfo** out);
+ORT_API_STATUS(OrtCastTypeInfoToTensorInfo, _In_ const OrtTypeInfo*, _Out_ const OrtTensorTypeAndShapeInfo** out);
 /**
 * Return OnnxType from OrtTypeInfo
 */
-ORT_API_STATUS(OrtOnnxTypeFromTypeInfo, _In_ const OrtTypeInfo*, _Out_ enum ONNXType* out);
+ORT_API_STATUS(OrtGetOnnxTypeFromTypeInfo, _In_ const OrtTypeInfo*, _Out_ enum ONNXType* out);
 /**
 * The 'out' value should be released by calling OrtReleaseTensorTypeAndShapeInfo
 */
 ORT_API_STATUS(OrtCreateTensorTypeAndShapeInfo, _Outptr_ OrtTensorTypeAndShapeInfo** out);
-ORT_API_STATUS(OrtSetTensorElementType, _In_ OrtTensorTypeAndShapeInfo*, enum ONNXTensorElementDataType type);
+ORT_API_STATUS(OrtSetTensorElementType, _Inout_ OrtTensorTypeAndShapeInfo*, enum ONNXTensorElementDataType type);
 /**
 * \param info Created from OrtCreateTensorTypeAndShapeInfo() function
@@ -525,7 +525,7 @@ ORT_API_STATUS(OrtGetValueCount, _In_ const OrtValue* value, _Out_ size_t* out);
 * sequence. 'in' should be an array of N OrtValues.
 * \value_type should be either map or sequence.
 */
-ORT_API_STATUS(OrtCreateValue, _In_ OrtValue** in, size_t num_values, enum ONNXType value_type,
+ORT_API_STATUS(OrtCreateValue, _In_ const OrtValue* const* in, size_t num_values, enum ONNXType value_type,
                _Outptr_ OrtValue** out);
 /*
@@ -561,12 +561,12 @@ struct OrtCustomOpApi {
   OrtStatus*(ORT_API_CALL* SetDimensions)(OrtTensorTypeAndShapeInfo* info, _In_ const int64_t* dim_values, size_t dim_count);
   OrtStatus*(ORT_API_CALL* GetTensorMutableData)(_Inout_ OrtValue* value, _Outptr_ void** data);
-  void(ORT_API_CALL* ReleaseTensorTypeAndShapeInfo)(OrtTensorTypeAndShapeInfo* input);
+  void(ORT_API_CALL* ReleaseTensorTypeAndShapeInfo)(_In_ OrtTensorTypeAndShapeInfo* input);
-  OrtStatus*(ORT_API_CALL* KernelContext_GetInputCount)(const OrtKernelContext* context, _Out_ size_t* out);
-  OrtStatus*(ORT_API_CALL* KernelContext_GetInput)(const OrtKernelContext* context, _In_ size_t index, _Out_ const OrtValue** out);
-  OrtStatus*(ORT_API_CALL* KernelContext_GetOutputCount)(const OrtKernelContext* context, _Out_ size_t* out);
-  OrtStatus*(ORT_API_CALL* KernelContext_GetOutput)(OrtKernelContext* context, _In_ size_t index, _In_ const int64_t* dim_values, size_t dim_count, _Outptr_ OrtValue** out);
+  OrtStatus*(ORT_API_CALL* KernelContext_GetInputCount)(_In_ const OrtKernelContext* context, _Out_ size_t* out);
+  OrtStatus*(ORT_API_CALL* KernelContext_GetInput)(_In_ const OrtKernelContext* context, _In_ size_t index, _Out_ const OrtValue** out);
+  OrtStatus*(ORT_API_CALL* KernelContext_GetOutputCount)(_In_ const OrtKernelContext* context, _Out_ size_t* out);
+  OrtStatus*(ORT_API_CALL* KernelContext_GetOutput)(_Inout_ OrtKernelContext* context, _In_ size_t index, _In_ const int64_t* dim_values, size_t dim_count, _Outptr_ OrtValue** out);
 };
 typedef struct OrtCustomOpApi OrtCustomOpApi;
@@ -607,13 +607,13 @@ ORT_API_STATUS(OrtCreateCustomOpDomain, _In_ const char* domain, _Outptr_ OrtCus
 * Add custom ops to the OrtCustomOpDomain
 * Note: The OrtCustomOp* pointer must remain valid until the OrtCustomOpDomain using it is released
 */
-ORT_API_STATUS(OrtCustomOpDomain_Add, _In_ OrtCustomOpDomain* custom_op_domain, _In_ OrtCustomOp* op);
+ORT_API_STATUS(OrtCustomOpDomain_Add, _Inout_ OrtCustomOpDomain* custom_op_domain, _In_ OrtCustomOp* op);
 /*
 * Add a custom op domain to the OrtSessionOptions
 * Note: The OrtCustomOpDomain* must not be deleted until the sessions using it are released
 */
-ORT_API_STATUS(OrtAddCustomOpDomain, _In_ OrtSessionOptions* options, _In_ OrtCustomOpDomain* custom_op_domain);
+ORT_API_STATUS(OrtAddCustomOpDomain, _Inout_ OrtSessionOptions* options, _In_ OrtCustomOpDomain* custom_op_domain);
 /*
 * END EXPERIMENTAL
 */
diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
index dc10322a01b94..e21e87596781e 100644
--- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
@@ -123,7 +123,8 @@ struct RunOptions : Base<OrtRunOptions> {
   RunOptions& SetRunTag(const char* run_tag);
   const char* GetRunTag() const;
-  RunOptions& SetTerminate(bool flag);
+  RunOptions& EnableTerminate();
+  RunOptions& DisableTerminate();
 };
 struct SessionOptions : Base<OrtSessionOptions> {
diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
index c03b61137de92..0fbbbde445b16 100644
--- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
+++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
@@ -113,8 +113,13 @@ inline const char* RunOptions::GetRunTag() const {
   return out;
 }
-inline RunOptions& RunOptions::SetTerminate(bool flag) {
-  ORT_THROW_ON_ERROR(OrtRunOptionsSetTerminate(p_, flag ? 1 : 0));
+inline RunOptions& RunOptions::EnableTerminate() {
+  ORT_THROW_ON_ERROR(OrtRunOptionsEnableTerminate(p_));
+  return *this;
+}
+
+inline RunOptions& RunOptions::DisableTerminate() {
+  ORT_THROW_ON_ERROR(OrtRunOptionsDisableTerminate(p_));
   return *this;
 }
@@ -284,7 +289,7 @@ inline Unowned<TensorTypeAndShapeInfo> TypeInfo::GetTensorTypeAndShapeInfo() con
 inline ONNXType TypeInfo::GetONNXType() const {
   ONNXType out;
-  ORT_THROW_ON_ERROR(OrtOnnxTypeFromTypeInfo(p_, &out));
+  ORT_THROW_ON_ERROR(OrtGetOnnxTypeFromTypeInfo(p_, &out));
   return out;
 }
@@ -405,7 +410,7 @@ inline std::string CustomOpApi::KernelInfoGetAttribute(_In_ const O
     OrtReleaseStatus(status);
     out.resize(size);
     ORT_THROW_ON_ERROR(api_.KernelInfoGetAttribute_string(info, name, &out[0], &size));
-    out.resize(size - 1); // remove the terminating character '\0'
+    out.resize(size - 1);  // remove the terminating character '\0'
   } else {
     ORT_THROW_ON_ERROR(status);
   }
diff --git a/onnxruntime/core/common/profiler.cc b/onnxruntime/core/common/profiler.cc
index 18c46a994f4d2..d8eb1b2354027 100644
--- a/onnxruntime/core/common/profiler.cc
+++ b/onnxruntime/core/common/profiler.cc
@@ -72,6 +72,11 @@ std::string Profiler::EndProfiling() {
     profile_with_logger_ = false;
     return std::string();
  }
+
+  if (session_logger_) {
+    LOGS(*session_logger_, INFO) << "Writing profiler data to file " << profile_stream_file_;
+  }
+
   std::lock_guard<std::mutex> lock(mutex_);
   profile_stream_ << "[\n";
diff --git a/onnxruntime/core/common/profiler.h b/onnxruntime/core/common/profiler.h
index 3e0496282719c..48ecf5747467a 100644
--- a/onnxruntime/core/common/profiler.h
+++ b/onnxruntime/core/common/profiler.h
@@ -44,7 +44,10 @@ class Profiler {
   */
   TimePoint StartTime() const;
-  bool FEnabled() const {
+  /*
+  Whether data collection and output from this profiler is enabled.
+  */
+  bool IsEnabled() const {
     return enabled_;
   }
diff --git a/onnxruntime/core/framework/onnxruntime_typeinfo.cc b/onnxruntime/core/framework/onnxruntime_typeinfo.cc
index fcb6c143e5397..4f00ec89dde4a 100644
--- a/onnxruntime/core/framework/onnxruntime_typeinfo.cc
+++ b/onnxruntime/core/framework/onnxruntime_typeinfo.cc
@@ -21,12 +21,12 @@ OrtTypeInfo::~OrtTypeInfo() {
   OrtReleaseTensorTypeAndShapeInfo(data);
 }
-ORT_API_STATUS_IMPL(OrtOnnxTypeFromTypeInfo, _In_ const struct OrtTypeInfo* input, ONNXType* out) {
+ORT_API_STATUS_IMPL(OrtGetOnnxTypeFromTypeInfo, _In_ const struct OrtTypeInfo* input, ONNXType* out) {
   *out = input->type;
   return nullptr;
 }
-ORT_API_STATUS_IMPL(OrtCastTypeInfoToTensorInfo, _In_ struct OrtTypeInfo* input, const struct OrtTensorTypeAndShapeInfo** out) {
+ORT_API_STATUS_IMPL(OrtCastTypeInfoToTensorInfo, _In_ const struct OrtTypeInfo* input, const struct OrtTensorTypeAndShapeInfo** out) {
   *out = input->type == ONNX_TYPE_TENSOR ? input->data : nullptr;
   return nullptr;
 }
diff --git a/onnxruntime/core/framework/parallel_executor.cc b/onnxruntime/core/framework/parallel_executor.cc
index dccdfc46d96de..72ee80cd421ee 100644
--- a/onnxruntime/core/framework/parallel_executor.cc
+++ b/onnxruntime/core/framework/parallel_executor.cc
@@ -35,8 +35,8 @@ Status ParallelExecutor::Execute(const SessionState& session_state, const std::v
                                  const std::unordered_map<size_t, CustomAllocator>& fetch_allocators,
                                  const logging::Logger& logger) {
   TimePoint tp;
-  bool f_profiler_enabled = session_state.Profiler().FEnabled();
-  if (f_profiler_enabled) {
+  const bool is_profiler_enabled = session_state.Profiler().IsEnabled();
+  if (is_profiler_enabled) {
     tp = session_state.Profiler().StartTime();
   }
@@ -102,7 +102,7 @@ Status ParallelExecutor::Execute(const SessionState& session_state, const std::v
     }
   }
-  if (f_profiler_enabled) {
+  if (is_profiler_enabled) {
     session_state.Profiler().EndTimeAndRecordEvent(profiling::SESSION_EVENT, "ParallelExecutor::Execute", tp);
   }
@@ -121,7 +121,7 @@ Status ParallelExecutor::RunNodeAsync(size_t p_node_index,
   auto graph_viewer = session_state.GetGraphViewer();
   TimePoint sync_time_begin;
   TimePoint kernel_begin_time;
-  bool f_profiler_enabled = session_state.Profiler().FEnabled();
+  const bool f_profiler_enabled = session_state.Profiler().IsEnabled();
   // Avoid context switching if possible.
   while (keep_running) {
diff --git a/onnxruntime/core/framework/run_options.cc b/onnxruntime/core/framework/run_options.cc
index d446dc42ca3fc..079be56fc5ae4 100644
--- a/onnxruntime/core/framework/run_options.cc
+++ b/onnxruntime/core/framework/run_options.cc
@@ -33,10 +33,12 @@ ORT_API_STATUS_IMPL(OrtRunOptionsGetRunTag, _In_ const OrtRunOptions* options, c
   return nullptr;
 }
-ORT_API_STATUS_IMPL(OrtRunOptionsSetTerminate, _In_ OrtRunOptions* options, int flag) {
-  if (!(flag == 0 || flag == 1)) {
-    return OrtCreateStatus(ORT_INVALID_ARGUMENT, "Invalid value for flag. Should be either 0 or 1");
-  }
-  options->terminate = flag;
+ORT_API_STATUS_IMPL(OrtRunOptionsEnableTerminate, _Inout_ OrtRunOptions* options) {
+  options->terminate = true;
+  return nullptr;
+}
+
+ORT_API_STATUS_IMPL(OrtRunOptionsDisableTerminate, _Inout_ OrtRunOptions* options) {
+  options->terminate = false;
   return nullptr;
 }
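Reviewer note: a minimal usage sketch (not part of this PR's changes) for the OrtRunOptionsEnableTerminate/OrtRunOptionsDisableTerminate pair that replaces OrtRunOptionsSetTerminate. Error handling is elided, and `session`, `input_names`, `inputs`, and `output_names` are hypothetical placeholders created elsewhere:

OrtRunOptions* run_options = NULL;
OrtCreateRunOptions(&run_options);

/* Inference thread: a long-running OrtRun observes the flag via run_options. */
OrtValue* output = NULL;
OrtRun(session, run_options, input_names, inputs, 1, output_names, 1, &output);

/* Watchdog thread: request that any in-flight OrtRun using these options
   exit as soon as possible... */
OrtRunOptionsEnableTerminate(run_options);

/* ...and clear the flag later so the same options object can be reused. */
OrtRunOptionsDisableTerminate(run_options);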
diff --git a/onnxruntime/core/framework/sequential_executor.cc b/onnxruntime/core/framework/sequential_executor.cc
index 44425c4ec705e..bd45bbfdc0b01 100644
--- a/onnxruntime/core/framework/sequential_executor.cc
+++ b/onnxruntime/core/framework/sequential_executor.cc
@@ -27,12 +27,12 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
                                    std::vector<OrtValue>& fetches,
                                    const std::unordered_map<size_t, CustomAllocator>& fetch_allocators,
                                    const logging::Logger& logger) {
-  bool f_profiler_enabled = session_state.Profiler().FEnabled();
+  const bool is_profiler_enabled = session_state.Profiler().IsEnabled();
   TimePoint tp;
   TimePoint sync_time_begin;
   TimePoint kernel_begin_time;
-  if (f_profiler_enabled) {
+  if (is_profiler_enabled) {
     tp = session_state.Profiler().StartTime();
   }
@@ -65,7 +65,7 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
     OpKernelContextInternal op_kernel_context(session_state, frame, *p_op_kernel, logger,
                                               p_op_kernel->Node().ImplicitInputDefs(), terminate_flag_);
     // TODO: log kernel outputs?
-    if (f_profiler_enabled) {
+    if (is_profiler_enabled) {
       sync_time_begin = session_state.Profiler().StartTime();
     }
@@ -104,7 +104,7 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
     utils::DumpNodeInputs(op_kernel_context, p_op_kernel->Node());
 #endif
-    if (f_profiler_enabled) {
+    if (is_profiler_enabled) {
       session_state.Profiler().EndTimeAndRecordEvent(profiling::NODE_EVENT,
                                                      p_op_kernel->Node().Name() + "_fence_before",
                                                      sync_time_begin,
@@ -128,7 +128,7 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
       return Status(compute_status.Category(), compute_status.Code(), msg_string);
     }
-    if (f_profiler_enabled) {
+    if (is_profiler_enabled) {
       session_state.Profiler().EndTimeAndRecordEvent(profiling::NODE_EVENT,
                                                      p_op_kernel->Node().Name() + "_kernel_time",
                                                      kernel_begin_time,
@@ -159,7 +159,7 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
       }
     }
-    if (f_profiler_enabled) {
+    if (is_profiler_enabled) {
       session_state.Profiler().EndTimeAndRecordEvent(profiling::NODE_EVENT,
                                                      p_op_kernel->Node().Name() + "_fence_after",
                                                      sync_time_begin,
@@ -199,7 +199,7 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
     }
   }
-  if (f_profiler_enabled) {
+  if (is_profiler_enabled) {
     session_state.Profiler().EndTimeAndRecordEvent(profiling::SESSION_EVENT, "SequentialExecutor::Execute", tp);
   }
diff --git a/onnxruntime/core/optimizer/matmul_add_fusion.cc b/onnxruntime/core/optimizer/matmul_add_fusion.cc
index a0c392ef75354..33bc507990a40 100644
--- a/onnxruntime/core/optimizer/matmul_add_fusion.cc
+++ b/onnxruntime/core/optimizer/matmul_add_fusion.cc
@@ -43,10 +43,13 @@ Status MatMulAddFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level)
     auto matmul_input_defs = matmul_node.MutableInputDefs();
     auto add_input_defs = add_node.MutableInputDefs();
-    // Gemm only support float, so the inputs of MatMul
+    // Gemm requires that inputs be the same data type and both floating point (float32/float16).
     auto matmul_type = matmul_input_defs[0]->Type();
     auto add_type = add_input_defs[0]->Type();
-    if ((*matmul_type) != "tensor(float)" || (*add_type) != "tensor(float)") {
+    if ((*matmul_type) != (*add_type)) {
+      continue;
+    }
+    if ((*matmul_type) != "tensor(float)" && (*matmul_type) != "tensor(float16)") {
       continue;
     }
diff --git a/onnxruntime/core/optimizer/nchwc_transformer.cc b/onnxruntime/core/optimizer/nchwc_transformer.cc
index e195afb8ebc33..1a2003aca9b2d 100644
--- a/onnxruntime/core/optimizer/nchwc_transformer.cc
+++ b/onnxruntime/core/optimizer/nchwc_transformer.cc
@@ -18,14 +18,18 @@ class NchwcTransformerImpl {
   void Transform(Node& node);
   void Finalize(bool& modified);
-  static constexpr int kNchwcDims = 4;
+  static constexpr int kNchwcBatchChannelDims = 2;
+  static constexpr int kNchwcSpatialDims = 2;
+  static constexpr int kNchwcDims = kNchwcBatchChannelDims + kNchwcSpatialDims;
 private:
   // Associate the following state with each created NCHWc output keyed off the
   // original NodeArg.
   struct NchwcArgument {
     // Symbolic shape information for this NCHWc output. Each dimension stores
-    // the original NodeArg* that sourced the value.
+    // the original NodeArg* that sourced the value. Spatial dimensions also
+    // track the number of times the original value has been shifted down due
+    // to a stride count of 2.
    //
    // For example, the first Conv node that takes NCHW input will create a
    // NchwcArgument with the shape referencing itself. Other NCHWc nodes that
@@ -39,6 +43,30 @@ class NchwcTransformerImpl {
     // fusion that can be detected using this additional shape hint.
     struct Shape {
       const NodeArg* dims_[kNchwcDims];
+      size_t shifts_[kNchwcSpatialDims];
+
+      Shape(const NodeArg* initial_dim) {
+        std::fill_n(dims_, kNchwcDims, initial_dim);
+        std::fill_n(shifts_, kNchwcSpatialDims, 0);
+      }
+
+      bool IsDimEqual(const Shape& other, int dim) const {
+        bool is_dim_equal = false;
+        // Test if this dimension is derived from the same NodeArg.
+        if (dims_[dim] == other.dims_[dim]) {
+          if (dim >= kNchwcBatchChannelDims) {
+            // Test if the NodeArg has been shifted down the same number of
+            // times due to striding.
+            int spatial_dim = dim - kNchwcBatchChannelDims;
+            if (shifts_[spatial_dim] == other.shifts_[spatial_dim]) {
+              is_dim_equal = true;
+            }
+          } else {
+            is_dim_equal = true;
+          }
+        }
+        return is_dim_equal;
+      }
     };
     // Stores the node that generated the NCHWc output.
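Reviewer note: the dims_/shifts_ bookkeeping above is easier to see outside diff form. The following standalone C++ sketch (invented names, not code from this PR) distills the rule that Shape::IsDimEqual implements for spatial dimensions:

#include <cstddef>

// Two spatial dimensions are considered equal only if they derive from the
// same symbolic source AND have been halved by stride-2 operations the same
// number of times.
struct SymbolicDim {
  const void* source;  // stands in for the originating NodeArg*
  std::size_t shift;   // times the value has been halved by a stride of 2
};

inline bool IsDimEqual(const SymbolicDim& a, const SymbolicDim& b) {
  return a.source == b.source && a.shift == b.shift;
}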
@@ -181,7 +209,7 @@ void NchwcTransformerImpl::ConvPoolShapeInference(const Node& node,
                                                   NchwcArgument::Shape& output_shape,
                                                   const ONNX_NAMESPACE::TensorProto* filter_shape) {
   // Skip the leading batch and channel counts.
-  const int kernel_size = kNchwcDims - 2;
+  const int kernel_size = kNchwcSpatialDims;
   // Maintain the batch count dimension from the NCHWc input.
   output_shape.dims_[0] = input_shape.dims_[0];
@@ -221,11 +249,18 @@ void NchwcTransformerImpl::ConvPoolShapeInference(const Node& node,
   }
   for (int i = 0; i < kernel_size; i++) {
-    if ((strides_attr != nullptr && strides_attr->ints(i) != 1) ||
-        (dilations_attr != nullptr && dilations_attr->ints(i) != 1)) {
+    if (dilations_attr != nullptr && dilations_attr->ints(i) != 1) {
       continue;
     }
+    int64_t stride = 1;
+    if (strides_attr != nullptr) {
+      stride = strides_attr->ints(i);
+      if (stride != 1 && stride != 2) {
+        continue;
+      }
+    }
+
     int64_t padding = 0;
     if (pads_attr != nullptr) {
       padding = pads_attr->ints(i) + pads_attr->ints(i + kernel_size);
@@ -238,8 +273,14 @@ void NchwcTransformerImpl::ConvPoolShapeInference(const Node& node,
       kernel = filter_shape->dims(2 + i);
     }
+    // Maintain the spatial dimension from the NCHWc input if the implicit or
+    // explicit padding results in the same symbolic dimension before applying
+    // the stride. When the stride is 2, the actual output dimension is
+    // half the original value. Track the number of times the symbolic dimension
+    // has been halved in the shifts field.
     if (padding + 1 == kernel || auto_pad_same_shape) {
-      output_shape.dims_[2 + i] = input_shape.dims_[2 + i];
+      output_shape.dims_[kNchwcBatchChannelDims + i] = input_shape.dims_[kNchwcBatchChannelDims + i];
+      output_shape.shifts_[i] = input_shape.shifts_[i] + static_cast<size_t>(stride) - 1;
     }
   }
 }
@@ -396,8 +437,7 @@ void NchwcTransformerImpl::TransformConv(Node& node) {
     nchwc_node.MutableInputDefs()[2] = nchwc_conv_B_arg;
   }
-  NchwcArgument::Shape output_shape;
-  std::fill_n(output_shape.dims_, kNchwcDims, output_defs[0]);
+  NchwcArgument::Shape output_shape(output_defs[0]);
   if (do_reorder_input) {
     auto it = nchwc_args_.find(input_defs[0]);
@@ -450,8 +490,7 @@ void NchwcTransformerImpl::TransformPool(Node& node) {
                                     kMSNchwcDomain);
   nchwc_node.SetExecutionProviderType(node.GetExecutionProviderType());
-  NchwcArgument::Shape output_shape;
-  std::fill_n(output_shape.dims_, kNchwcDims, output_defs[0]);
+  NchwcArgument::Shape output_shape(output_defs[0]);
   auto it = nchwc_args_.find(input_defs[0]);
   if (it == nchwc_args_.end()) {
@@ -492,7 +531,7 @@ void NchwcTransformerImpl::TransformAdd(Node& node) {
     auto* nchwc_input_n = nchwc_inputs[n];
     for (int i = 0; i < kNchwcDims; i++) {
       // Test if this dimension is derived from the same NodeArg.
-      if (nchwc_input_0->shape_.dims_[i] != nchwc_input_n->shape_.dims_[i]) {
+      if (!nchwc_input_0->shape_.IsDimEqual(nchwc_input_n->shape_, i)) {
         // Check if ONNX shape inferencing has computed a precise dimension value.
         auto* nchwc_input_n_shape = input_defs[n]->Shape();
         if ((nchwc_input_0_shape == nullptr) || (nchwc_input_n_shape == nullptr)) {
diff --git a/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.cc b/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.cc
index 8c202586f25ea..c5be268f59e2d 100644
--- a/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.cc
+++ b/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.cc
@@ -238,7 +238,7 @@ class UniDirectionalGru {
 // #define DUMP_MATRIXES to provide lots of diagnostic output
 #if defined(DUMP_MATRIXES)
-#define DumpMatrix(...) ::onnxruntime::rnn::detail::DumpMatrixImpl(__VA_ARGS__)
+#define DumpMatrix(...) onnxruntime::rnn::detail::DumpMatrixImpl(__VA_ARGS__)
 #else
 #define DumpMatrix(...) ((void)0)
 #endif
@@ -591,8 +591,9 @@ void UniDirectionalGru<T>::Compute(const gsl::span<const T>& inputs_arg,
   // for each item in sequence run all calculations
   for (int step = 0; step < max_sequence_length; step++) {
+#if defined(DUMP_MATRIXES)
     const std::string seqno_str = " [seqno=" + std::to_string(step) + "]";
-
+#endif
     DumpMatrix("Ht-1" + seqno_str, &*prev_Ht, batch_size_, hidden_size_);
     out_added_offset = (step * batch_size_) * hidden_size_x3;
@@ -657,7 +658,9 @@ void UniDirectionalGru<T>::Compute(const gsl::span<const T>& inputs_arg,
       }
     }
+#if defined(DUMP_MATRIXES)
     std::string label = linear_before_reset_ ? "rt (.) (Ht-1 * (Rh^T) + Rbh)" : "rt (.) Ht-1";
+#endif
     DumpMatrix(label + seqno_str, &*cur_h_local, batch_size_, hidden_size_);
     if (linear_before_reset_) {
@@ -676,7 +679,9 @@ void UniDirectionalGru<T>::Compute(const gsl::span<const T>& inputs_arg,
         }
       }
     } else {
+#if defined(DUMP_MATRIXES)
       label += " * Rh^T";
+#endif
       // out_H currently contains Xt*(Wh^T).
       auto out_H = outputZRH_.begin() + out_added_offset + hidden_size_x2;
@@ -708,9 +713,11 @@ void UniDirectionalGru<T>::Compute(const gsl::span<const T>& inputs_arg,
     for (int r = 0; r < batch_size_; r++) {
       if (step >= min_sequence_length && step >= sequence_lengths[r]) {
-        if (output_sequence) {
+        // If we need output for every step,
+        // or we need to set prev_Ht for an empty sequence, to avoid warnings about using uninitialized values
+        if (output_sequence || (step == 0 && sequence_lengths[r] == 0)) {
           auto fill_output = output + r * hidden_size_;
-          std::fill_n(fill_output, hidden_size_, T{});
+          std::fill_n(&*fill_output, hidden_size_, T{});
         }
         continue;
@@ -772,28 +779,29 @@ void UniDirectionalGru<T>::Compute(const gsl::span<const T>& inputs_arg,
   // copy last output to final_hidden_state
   for (int i = 0; i < batch_size_; i++) {
     const int seq_len = sequence_lengths[i];
-    if (seq_len == 0) {
-      auto final_hidden_state_dst = final_hidden_state.begin() + i * hidden_size_;
-      std::fill_n(final_hidden_state_dst, hidden_size_, T{});
-      continue;
-    }
     if (output_sequence) {
-      auto src = outputs.subspan((seq_len - 1) * output_step_length + i * hidden_size_, hidden_size_);
-      auto dest = final_hidden_state.subspan(i * hidden_size_, hidden_size_);
-      gsl::copy(src, dest);
+      if (seq_len == 0) {
+        auto final_hidden_state_dst = final_hidden_state.begin() + i * hidden_size_;
+        std::fill_n(&*final_hidden_state_dst, hidden_size_, T{});
+      } else {
+        auto src = outputs.subspan((seq_len - 1) * output_step_length + i * hidden_size_, hidden_size_);
+        auto dest = final_hidden_state.subspan(i * hidden_size_, hidden_size_);
+        gsl::copy(src, dest);
+      }
     }
   }
-  // zero any values beyond the evaluated steps
+  // zero any values beyond the evaluated steps if the maximum explicit sequence length we saw (max_sequence_length)
+  // was shorter than the maximum possible sequence length (seq_length_)
   if (output_sequence && max_sequence_length < seq_length_) {
     if (output_step_length == batch_size_ * hidden_size_) {  // contiguous
       const auto span_to_zero = outputs.subspan(
           max_sequence_length * output_step_length, (seq_length_ - max_sequence_length) * output_step_length);
-      std::fill_n(span_to_zero.begin(), span_to_zero.size(), T{});
+      std::fill_n(&*span_to_zero.begin(), span_to_zero.size(), T{});
     } else {
       for (int i = max_sequence_length; i < seq_length_; ++i) {  // non-contiguous
         const auto span_to_zero = outputs.subspan(i * output_step_length, batch_size_ * hidden_size_);
-        std::fill_n(span_to_zero.begin(), span_to_zero.size(), T{});
+        std::fill_n(&*span_to_zero.begin(), span_to_zero.size(), T{});
       }
     }
   }
diff --git a/onnxruntime/core/providers/cpu/symbols.txt b/onnxruntime/core/providers/cpu/symbols.txt
index fc4859442c667..fc7560f5b7696 100644
--- a/onnxruntime/core/providers/cpu/symbols.txt
+++ b/onnxruntime/core/providers/cpu/symbols.txt
@@ -51,7 +51,7 @@ OrtGetValueCount
 OrtGetValueType
 OrtGetVersionString
 OrtIsTensor
-OrtOnnxTypeFromTypeInfo
+OrtGetOnnxTypeFromTypeInfo
 OrtReleaseAllocator
 OrtReleaseAllocatorInfo
 OrtReleaseCustomOpDomain
@@ -69,7 +69,8 @@ OrtRunOptionsGetRunLogVerbosityLevel
 OrtRunOptionsGetRunTag
 OrtRunOptionsSetRunLogVerbosityLevel
 OrtRunOptionsSetRunTag
-OrtRunOptionsSetTerminate
+OrtRunOptionsEnableTerminate
+OrtRunOptionsDisableTerminate
 OrtSessionGetInputCount
 OrtSessionGetInputName
 OrtSessionGetInputTypeInfo
diff --git a/onnxruntime/core/providers/cpu/tensor/identity_op.h b/onnxruntime/core/providers/cpu/tensor/identity_op.h
index 5a583e48f1679..7cea426ff9a5d 100644
--- a/onnxruntime/core/providers/cpu/tensor/identity_op.h
+++ b/onnxruntime/core/providers/cpu/tensor/identity_op.h
@@ -43,7 +43,18 @@ class IdentityOp final : public OpKernel {
     }
     if (is_dropout) {
-      context->Output(1, std::vector<int64_t>());
+      Tensor* mask = context->Output(1, shape);
+      // a 'nullptr' returned would make it an unused optional output
+      if (mask != nullptr) {
+        // Opset 7 differs from Opset 10 in that the type of the 'mask'
+        // output is tied to the type of the input in Opset 7, whereas
+        // the type of 'mask' in Opset 10 is always 'bool',
+        // so we use a common solution
+        void* mask_data = mask->MutableDataRaw();
+        // In 'test'/'inference' mode, there are no input values dropped out
+        // so fill the buffer with 0/false
+        memset(mask_data, 0, mask->SizeInBytes());
+      }
     }
     return Status::OK();
diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
index bd7544396c11d..6509cf01fdf9a 100644
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -64,10 +64,6 @@ CUDAExecutionProvider::PerThreadContext::~PerThreadContext() {
 CUDAExecutionProvider::CUDAExecutionProvider(const CUDAExecutionProviderInfo& info)
     : IExecutionProvider{onnxruntime::kCudaExecutionProvider}, device_id_(info.device_id) {
   CUDA_CALL_THROW(cudaSetDevice(device_id_));
-  // create streams, default is nullptr
-  streams_[kCudaStreamDefault] = nullptr;
-  CUDA_CALL_THROW(cudaStreamCreateWithFlags(&streams_[kCudaStreamCopyIn], cudaStreamNonBlocking));
-  CUDA_CALL_THROW(cudaStreamCreateWithFlags(&streams_[kCudaStreamCopyOut], cudaStreamNonBlocking));
   DeviceAllocatorRegistrationInfo default_allocator_info(
       {OrtMemTypeDefault, [](int id) { return std::make_unique<CUDAAllocator>(id); }, std::numeric_limits<size_t>::max()});
@@ -93,9 +89,6 @@ CUDAExecutionProvider::~CUDAExecutionProvider() {
     CUDA_CALL_THROW(cudaEventDestroy(e));
     it = deferred_release_cpu_ptr_.erase(it);
   }
-  CUDA_CALL_THROW(cudaStreamDestroy(streams_[kCudaStreamCopyIn]));
-  CUDA_CALL_THROW(cudaStreamDestroy(streams_[kCudaStreamCopyOut]));
-
   ReleasePerThreadStuffs();
 }
@@ -207,7 +200,7 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, Un
 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 8, Flatten);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, Squeeze);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, Identity);
-class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, Dropout);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 9, Dropout);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, Gather);
 class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 8, float, Gemm);
 class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 8, double, Gemm);
@@ -522,6 +515,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, float, Shrink);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, double, Shrink);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, MLFloat16, Shrink);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, Dropout);
 static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
   static const BuildKernelCreateInfoFn function_table[] = {
@@ -532,7 +526,7 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
       BuildKernelCreateInfo,
      BuildKernelCreateInfo,
      BuildKernelCreateInfo,
-      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, Dropout)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 9, Dropout)>,
      BuildKernelCreateInfo,
      BuildKernelCreateInfo,
      BuildKernelCreateInfo,
@@ -847,6 +841,7 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
       BuildKernelCreateInfo,
      BuildKernelCreateInfo,
      BuildKernelCreateInfo,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, Dropout)>,
   };
   for (auto& function_table_entry : function_table) {
diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.h b/onnxruntime/core/providers/cuda/cuda_execution_provider.h
index ed3a509f853e4..bd6e25b18b7bb 100644
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.h
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.h
@@ -40,11 +40,6 @@ class CUDAExecutionProvider : public IExecutionProvider {
     return GetPerThreadContext().CudnnHandle();
   }
-  cudaStream_t GetStream(int queue_id) const {
-    ORT_ENFORCE(queue_id >= 0 && queue_id < kTotalCudaStreams);
-    return streams_[queue_id];
-  }
-
   template <typename T>
   const T* GetConstOnes(size_t count) {
     return GetPerThreadContext().template GetConstOnes<T>(count);
@@ -69,7 +64,6 @@ class CUDAExecutionProvider : public IExecutionProvider {
   int GetDeviceId() const { return device_id_; }
 private:
-  cudaStream_t streams_[kTotalCudaStreams];
   int device_id_;
   struct DeferredReleaseCPUPtrs {
diff --git a/onnxruntime/core/providers/cuda/gpu_data_transfer.cc b/onnxruntime/core/providers/cuda/gpu_data_transfer.cc
index ec930946aa8dc..8fae7ae8b0d34 100644
--- a/onnxruntime/core/providers/cuda/gpu_data_transfer.cc
+++ b/onnxruntime/core/providers/cuda/gpu_data_transfer.cc
@@ -12,6 +12,11 @@ GPUDataTransfer::GPUDataTransfer() {
   CUDA_CALL_THROW(cudaStreamCreateWithFlags(&streams_[kCudaStreamCopyOut], cudaStreamNonBlocking));
 }
+GPUDataTransfer::~GPUDataTransfer() {
+  CUDA_CALL(cudaStreamDestroy(streams_[kCudaStreamCopyIn]));
+  CUDA_CALL(cudaStreamDestroy(streams_[kCudaStreamCopyOut]));
+}
+
 bool GPUDataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const {
   return src_device.Type() == OrtDevice::GPU || src_device.MemType() == OrtDevice::MemType::CUDA_PINNED ||
          dst_device.Type() == OrtDevice::GPU || dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED;
diff --git a/onnxruntime/core/providers/cuda/gpu_data_transfer.h b/onnxruntime/core/providers/cuda/gpu_data_transfer.h
index f0acae0f8d448..0f3d4687eb5e5 100644
--- a/onnxruntime/core/providers/cuda/gpu_data_transfer.h
+++ b/onnxruntime/core/providers/cuda/gpu_data_transfer.h
@@ -18,6 +18,7 @@ enum CUDAStreamType : int {
 class GPUDataTransfer : public IDataTransfer {
  public:
   GPUDataTransfer();
+  ~GPUDataTransfer();
   bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override;
diff --git a/onnxruntime/core/providers/cuda/tensor/expand.cc b/onnxruntime/core/providers/cuda/tensor/expand.cc
index 599fa39f406b2..897e981877ad0 100644
--- a/onnxruntime/core/providers/cuda/tensor/expand.cc
+++ b/onnxruntime/core/providers/cuda/tensor/expand.cc
@@ -21,6 +21,11 @@ Status Expand::ComputeInternal(OpKernelContext* ctx) const {
   ORT_RETURN_IF_ERROR(ComputeOutputShape(Node().Name(), input0.Shape(), output_dims, output_shape));
   auto rank = output_shape.NumDimensions();
   auto& output_tensor = *ctx->Output(0, output_shape);
+
+  if (0 == output_shape.Size()) {
+    return Status::OK();
+  }
+
   auto input_shape = input0.Shape().GetDims();
   // pad input_dims with 1 to make ranks match
@@ -40,6 +45,8 @@ Status Expand::ComputeInternal(OpKernelContext* ctx) const {
   for (auto i = 0; i < rank; i++) {
     in_span[i] = fast_divmod(static_cast<int>(input_shape[i]));
     out_span[i] = fast_divmod(static_cast<int>(output_shape[i]));
+    // output_shape[i] won't be 0 here: that case is covered by the
+    // (0 == output_shape.Size()) early return above
     subdim_size /= output_shape[i];
     sdm_span[i] = static_cast<int>(subdim_size);
   }
diff --git a/onnxruntime/core/providers/cuda/tensor/identity_op.cc b/onnxruntime/core/providers/cuda/tensor/identity_op.cc
index e9a4125c43188..890bdf5cacf87 100644
--- a/onnxruntime/core/providers/cuda/tensor/identity_op.cc
+++ b/onnxruntime/core/providers/cuda/tensor/identity_op.cc
@@ -5,13 +5,28 @@
 namespace onnxruntime {
 namespace cuda {
+ONNX_OPERATOR_VERSIONED_KERNEL_EX(
+    Dropout,
+    kOnnxDomain,
+    7, 9,
+    kCudaExecutionProvider,
+    KernelDefBuilder()
+        .TypeConstraint("T", {DataTypeImpl::GetTensorType<float>(),
+                              DataTypeImpl::GetTensorType<double>(),
+                              DataTypeImpl::GetTensorType<MLFloat16>()})
+        .Alias(0, 0),
+    IdentityOp<true>);
+
 ONNX_OPERATOR_KERNEL_EX(
     Dropout,
     kOnnxDomain,
-    7,
+    10,
     kCudaExecutionProvider,
     KernelDefBuilder()
-        .TypeConstraint("T", {DataTypeImpl::GetTensorType<float>(), DataTypeImpl::GetTensorType<double>(), DataTypeImpl::GetTensorType<MLFloat16>()})
+        .TypeConstraint("T", {DataTypeImpl::GetTensorType<float>(),
+                              DataTypeImpl::GetTensorType<double>(),
+                              DataTypeImpl::GetTensorType<MLFloat16>()})
+        .TypeConstraint("T1", DataTypeImpl::GetTensorType<bool>())
         .Alias(0, 0),
     IdentityOp<true>);
diff --git a/onnxruntime/core/providers/cuda/tensor/identity_op.h b/onnxruntime/core/providers/cuda/tensor/identity_op.h
index d83fd541b3cb6..31dd544030b20 100644
--- a/onnxruntime/core/providers/cuda/tensor/identity_op.h
+++ b/onnxruntime/core/providers/cuda/tensor/identity_op.h
@@ -30,7 +30,18 @@ class IdentityOp final : public CudaKernel {
     }
     if (is_dropout) {
-      context->Output(1, std::vector<int64_t>());
+      Tensor* mask = context->Output(1, shape);
+      // a 'nullptr' returned would make it an unused optional output
+      if (mask != nullptr) {
+        // Opset 7 differs from Opset 10 in that the type of the 'mask'
+        // output is tied to the type of the input in Opset 7, whereas
+        // the type of 'mask' in Opset 10 is always 'bool',
+        // so we use a common solution
+        void* mask_data = mask->MutableDataRaw();
+        // In 'test'/'inference' mode, there are no input values dropped out
+        // so fill the buffer with 0/false
+        CUDA_RETURN_IF_ERROR(cudaMemsetAsync(mask_data, 0, mask->SizeInBytes()));
+      }
     }
     return Status::OK();
diff --git a/onnxruntime/core/session/abi_session_options.cc b/onnxruntime/core/session/abi_session_options.cc
index fadb56745a765..710ab2db8121f 100644
--- a/onnxruntime/core/session/abi_session_options.cc
+++ b/onnxruntime/core/session/abi_session_options.cc
@@ -28,7 +28,7 @@ ORT_API(void, OrtReleaseSessionOptions, OrtSessionOptions* ptr) {
   delete ptr;
 }
-ORT_API_STATUS_IMPL(OrtCloneSessionOptions, OrtSessionOptions* input, OrtSessionOptions** out) {
+ORT_API_STATUS_IMPL(OrtCloneSessionOptions, const OrtSessionOptions* input, OrtSessionOptions** out) {
   API_IMPL_BEGIN
   *out = new OrtSessionOptions(*input);
   return nullptr;
diff --git a/onnxruntime/core/session/custom_ops.cc b/onnxruntime/core/session/custom_ops.cc
index 3605850cfad7a..38b7699aa4d35 100644
--- a/onnxruntime/core/session/custom_ops.cc
+++ b/onnxruntime/core/session/custom_ops.cc
@@ -29,28 +29,28 @@ ORT_API_STATUS_IMPL(OrtKernelInfoGetAttribute_int64, _In_ const OrtKernelInfo* i
   return onnxruntime::ToOrtStatus(status);
 }
-ORT_API_STATUS_IMPL(OrtKernelContext_GetInputCount, const OrtKernelContext* context, _Out_ size_t* out) {
+ORT_API_STATUS_IMPL(OrtKernelContext_GetInputCount, _In_ const OrtKernelContext* context, _Out_ size_t* out) {
   *out = reinterpret_cast<const onnxruntime::OpKernelContext*>(context)->InputCount();
   return nullptr;
 };
-ORT_API_STATUS_IMPL(OrtKernelContext_GetOutputCount, const OrtKernelContext* context, _Out_ size_t* out) {
+ORT_API_STATUS_IMPL(OrtKernelContext_GetOutputCount, _In_ const OrtKernelContext* context, _Out_ size_t* out) {
   *out = reinterpret_cast<const onnxruntime::OpKernelContext*>(context)->OutputCount();
   return nullptr;
 };
-ORT_API_STATUS_IMPL(OrtKernelContext_GetInput, const OrtKernelContext* context, _In_ size_t index, _Out_ const OrtValue** out) {
+ORT_API_STATUS_IMPL(OrtKernelContext_GetInput, _In_ const OrtKernelContext* context, _In_ size_t index, _Out_ const OrtValue** out) {
   *out = reinterpret_cast<const OrtValue*>(reinterpret_cast<const onnxruntime::OpKernelContext*>(context)->GetInputMLValue(index));
   return nullptr;
 };
-ORT_API_STATUS_IMPL(OrtKernelContext_GetOutput, OrtKernelContext* context, _In_ size_t index, _In_ const int64_t* dim_values, size_t dim_count, _Out_ OrtValue** out) {
+ORT_API_STATUS_IMPL(OrtKernelContext_GetOutput, _Inout_ OrtKernelContext* context, _In_ size_t index, _In_ const int64_t* dim_values, size_t dim_count, _Out_ OrtValue** out) {
   onnxruntime::TensorShape shape(dim_values, dim_count);
   *out = reinterpret_cast<OrtValue*>(reinterpret_cast<onnxruntime::OpKernelContext*>(context)->OutputMLValue(index, shape));
   return nullptr;
 };
-ORT_API_STATUS_IMPL(OrtKernelInfoGetAttribute_string, _In_ const OrtKernelInfo* info, _In_ const char* name, _Out_ char* out, _Inout_ size_t *size) {
+ORT_API_STATUS_IMPL(OrtKernelInfoGetAttribute_string, _In_ const OrtKernelInfo* info, _In_ const char* name, _Out_ char* out, _Inout_ size_t* size) {
   std::string value;
   auto status = reinterpret_cast<const onnxruntime::OpKernelInfo*>(info)->GetAttr<std::string>(name, &value);
   if (status.IsOK()) {
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index 98cbb126cbf2e..10ceef943af0d 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -220,7 +220,7 @@ common::Status InferenceSession::Load(std::functionMutableData();
 auto len = static_cast(tensor->Shape().Size());
diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc
--- a/onnxruntime/core/session/onnxruntime_c_api.cc
+++ b/onnxruntime/core/session/onnxruntime_c_api.cc
@@ -365,7 +365,7 @@ ORT_API_STATUS_IMPL(OrtAddCustomOpDomain, _In_ OrtSessionOptions* options, OrtCu
 namespace {
 template <typename Loader>
-OrtStatus* CreateSessionImpl(_In_ OrtEnv* env, _In_ const OrtSessionOptions* options,
+OrtStatus* CreateSessionImpl(_In_ const OrtEnv* env, _In_ const OrtSessionOptions* options,
                              Loader loader, _Outptr_ OrtSession** out) {
   auto sess = std::make_unique<::onnxruntime::InferenceSession>(
       options == nullptr ? onnxruntime::SessionOptions() : options->value, env->loggingManager);
@@ -395,7 +395,7 @@ OrtStatus* CreateSessionImpl(_In_ OrtEnv* env, _In_ const OrtSessionOptions* opt
 }
 }  // namespace
-ORT_API_STATUS_IMPL(OrtCreateSession, _In_ OrtEnv* env, _In_ const ORTCHAR_T* model_path,
+ORT_API_STATUS_IMPL(OrtCreateSession, _In_ const OrtEnv* env, _In_ const ORTCHAR_T* model_path,
                     _In_ const OrtSessionOptions* options, _Outptr_ OrtSession** out) {
   API_IMPL_BEGIN
   const auto loader = [model_path](InferenceSession& sess) {
@@ -405,7 +405,7 @@ ORT_API_STATUS_IMPL(OrtCreateSession, _In_ OrtEnv* env, _In_ const ORTCHAR_T* mo
   API_IMPL_END
 }
-ORT_API_STATUS_IMPL(OrtCreateSessionFromArray, _In_ OrtEnv* env, _In_ const void* model_data, size_t model_data_length,
+ORT_API_STATUS_IMPL(OrtCreateSessionFromArray, _In_ const OrtEnv* env, _In_ const void* model_data, size_t model_data_length,
                     _In_ const OrtSessionOptions* options, _Outptr_ OrtSession** out) {
   API_IMPL_BEGIN
   const auto loader = [model_data, model_data_length](InferenceSession& sess) {
@@ -415,7 +415,7 @@ ORT_API_STATUS_IMPL(OrtCreateSessionFromArray, _In_ OrtEnv* env, _In_ const void
   API_IMPL_END
 }
-ORT_API_STATUS_IMPL(OrtRun, _In_ OrtSession* sess,
+ORT_API_STATUS_IMPL(OrtRun, _Inout_ OrtSession* sess,
                     _In_ const OrtRunOptions* run_options,
                     _In_ const char* const* input_names, _In_ const OrtValue* const* input, size_t input_len,
                     _In_ const char* const* output_names1, size_t output_names_len, _Outptr_ OrtValue** output) {
@@ -477,7 +477,7 @@ ORT_API_STATUS_IMPL(OrtRun, _In_ OrtSession* sess,
   API_IMPL_END
 }
-ORT_API_STATUS_IMPL(OrtGetTensorMutableData, _In_ OrtValue* value, _Outptr_ void** output) {
+ORT_API_STATUS_IMPL(OrtGetTensorMutableData, _Inout_ OrtValue* value, _Outptr_ void** output) {
   TENSOR_READWRITE_API_BEGIN
   //TODO: test if it's a string tensor
   *output = tensor->MutableDataRaw();
@@ -933,7 +933,7 @@ ORT_API_STATUS_IMPL(OrtGetValue, const OrtValue* value, int index, OrtAllocator*
 ///////////////////
 // OrtCreateValue
 template <typename T>
-static OrtStatus* OrtCreateValueImplSeqHelperMap(OrtValue** const in, size_t num_values, OrtValue** out) {
+static OrtStatus* OrtCreateValueImplSeqHelperMap(const OrtValue* const* in, size_t num_values, OrtValue** out) {
   using SeqType = std::vector<T>;
   auto vec_ptr = std::make_unique<SeqType>();
   vec_ptr->reserve(num_values);
@@ -951,7 +951,7 @@ static OrtStatus* OrtCreateValueImplSeqHelperMap(OrtValue** const in, size_t num
 }
 template <typename T>
-static OrtStatus* OrtCreateValueImplSeqHelper(OrtValue** in, size_t num_values, OrtValue** out) {
+static OrtStatus* OrtCreateValueImplSeqHelper(const OrtValue* const* in, size_t num_values, OrtValue** out) {
   using SeqType = std::vector<T>;
   auto vec_ptr = std::make_unique<SeqType>();
   vec_ptr->reserve(num_values);
@@ -972,7 +972,7 @@ static OrtStatus* OrtCreateValueImplSeqHelper(OrtValue** in, size_t num_values,
   return nullptr;
 }
-static OrtStatus* OrtCreateValueImplSeq(OrtValue** in, size_t num_values, OrtValue** out) {
+static OrtStatus* OrtCreateValueImplSeq(const OrtValue* const* in, size_t num_values, OrtValue** out) {
   // We only support limited sequence types. For the sake of simplicity the type of the first
   // OrtValue* in OrtValue** will determine the type of the vector used to create the output OrtValue
   // this type should be either a tensor of limited types or map of limited types
@@ -1069,7 +1069,7 @@ static OrtStatus* OrtCreateValueImplMapHelper(const Tensor& key_tensor, const Te
   }
 }
-static OrtStatus* OrtCreateValueImplMap(OrtValue** in, size_t num_values, OrtValue** out) {
+static OrtStatus* OrtCreateValueImplMap(const OrtValue* const* in, size_t num_values, OrtValue** out) {
   if (num_values != NUM_MAP_INDICES) {
     return OrtCreateStatus(ORT_FAIL, "For map type num_values MUST be 2");
   }
@@ -1102,7 +1102,7 @@ static OrtStatus* OrtCreateValueImplMap(OrtValue** in, size_t num_values, OrtVal
   return OrtCreateStatus(ORT_FAIL, "Key type is not supported yet.");
 }
-static OrtStatus* OrtCreateValueImpl(OrtValue** in, size_t num_values, enum ONNXType value_type, OrtValue** out) {
+static OrtStatus* OrtCreateValueImpl(const OrtValue* const* in, size_t num_values, enum ONNXType value_type, OrtValue** out) {
   if (num_values <= 0) {
     return OrtCreateStatus(ORT_FAIL, "Number of values should be at least 1.");
   }
@@ -1115,7 +1115,7 @@ static OrtStatus* OrtCreateValueImpl(OrtValue** in, size_t num_values, enum ONNX
   return OrtCreateStatus(ORT_FAIL, "Input is not of type sequence or map.");
 }
-ORT_API_STATUS_IMPL(OrtCreateValue, OrtValue** in, size_t num_values, enum ONNXType value_type, OrtValue** out) {
+ORT_API_STATUS_IMPL(OrtCreateValue, const OrtValue* const* in, size_t num_values, enum ONNXType value_type, OrtValue** out) {
   API_IMPL_BEGIN
   return OrtCreateValueImpl(in, num_values, value_type, out);
   API_IMPL_END
 }
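Reviewer note: a sketch (not part of this PR) of how a caller builds a sequence value against the const-corrected OrtCreateValue signature above. `tensor_a` and `tensor_b` are placeholder OrtValue* tensors created elsewhere; error checks are elided:

// Inputs are now taken as const OrtValue* const*, so callers no longer
// need non-const pointers just to assemble a sequence.
const OrtValue* elements[] = {tensor_a, tensor_b};
OrtValue* sequence = nullptr;
OrtStatus* status = OrtCreateValue(elements, 2, ONNX_TYPE_SEQUENCE, &sequence);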
NchwcOptimizerTester(build_test_case, check_nchwc_graph); } +TEST(NchwcOptimizerTests, ShapeInferencing2) { + auto build_test_case = [&](NchwcTestHelper& helper) { + ONNX_NAMESPACE::TypeProto type_proto; + type_proto.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + type_proto.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1); + type_proto.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1); + type_proto.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_param("input_height"); + type_proto.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_param("input_width"); + + auto* input_arg = helper.MakeInput({1, 1, 49, 98}, type_proto); + auto* output_arg = helper.MakeOutput(); + + auto* conv1_output_arg = helper.MakeIntermediate(); + helper.AddConvNode(input_arg, conv1_output_arg, {16, 1, 1, 1}); + + auto* conv2a1_output_arg = helper.MakeIntermediate(); + auto& conv2a1_node = helper.AddConvNode(conv1_output_arg, conv2a1_output_arg, {16, 16, 2, 2}); + conv2a1_node.AddAttribute("pads", std::vector{1, 1, 0, 0}); + conv2a1_node.AddAttribute("strides", std::vector{2, 2}); + + auto* conv2a_output_arg = helper.MakeIntermediate(); + auto& conv2a2_node = helper.AddConvNode(conv2a1_output_arg, conv2a_output_arg, {16, 16, 2, 2}); + conv2a2_node.AddAttribute("auto_pad", "SAME_UPPER"); + + auto* conv2b_output_arg = helper.MakeIntermediate(); + auto& conv2b_node = helper.AddConvNode(conv1_output_arg, conv2b_output_arg, {16, 16, 1, 1}); + conv2b_node.AddAttribute("strides", std::vector{2, 2}); + + helper.AddNode("Add", {conv2a_output_arg, conv2b_output_arg}, {output_arg}); + }; + + auto check_nchwc_graph = [&](NchwcInferenceSession& session) { + auto op_to_count = session.CountOpsInGraph(); + EXPECT_EQ(op_to_count["nchwc.Conv"], 4); + EXPECT_EQ(op_to_count["nchwc.ReorderInput"], 0); + EXPECT_EQ(op_to_count["nchwc.ReorderOutput"], 1); + EXPECT_EQ(op_to_count["Add"], 0); + }; + + // Verify that convolutions using strides of 2 and variable height/width are + // recognized as eligible for Conv/Add fusion. This pattern occurs in models + // such as Faster-RCNN. 
+ NchwcOptimizerTester(build_test_case, check_nchwc_graph); +} + TEST(NchwcOptimizerTests, MixedOutputUsage) { auto build_test_case = [&](NchwcTestHelper& helper) { auto* input_arg = helper.MakeInput({6, 5, 11, 11}); diff --git a/onnxruntime/test/providers/cpu/nn/dropout_op_test.cc b/onnxruntime/test/providers/cpu/nn/dropout_op_test.cc index 7b2e91b95afdf..802aaa84b310e 100644 --- a/onnxruntime/test/providers/cpu/nn/dropout_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/dropout_op_test.cc @@ -23,5 +23,29 @@ TEST(Dropout, Opset10) { test.Run(); } +TEST(Dropout, WithOptionalOutputOpset10) { + OpTester test("Dropout", 10, kOnnxDomain); + std::vector dims{2, 2}; + test.AddInput("X", dims, {1.0f, 2.0f, 3.0f, 5.0f}); + test.AddOutput("Y", dims, {1.0f, 2.0f, 3.0f, 5.0f}); + test.AddOutput("mask", dims, {false, false, false, false}); + // The NGraph execution provider doesn't seem to support 'Dropout' with optional mask output + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kNGraphExecutionProvider}); +} + +TEST(Dropout, WithOptionalOutputOpset7) { + // Opset 7 differs with Opset 10 in that the type of the 'mask' + // output is tied with the type of the input in Opset 7 whereas + // the type of 'mask' in Opset 10 is 'bool' always + OpTester test("Dropout", 7, kOnnxDomain); + std::vector dims{2, 2}; + test.AddInput("X", dims, {1.0f, 2.0f, 3.0f, 5.0f}); + test.AddOutput("Y", dims, {1.0f, 2.0f, 3.0f, 5.0f}); + test.AddOutput("mask", dims, {0.0f, 0.0f, 0.0f, 0.0f}); + // The NGraph execution provider doesn't seem to support 'Dropout' with optional mask output + // The TensorRT execution provider doesn't seem to support 'Dropout' with non-boolean mask output + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kNGraphExecutionProvider, kTensorrtExecutionProvider}); +} + } // namespace test } // namespace onnxruntime diff --git a/tools/ci_build/github/azure-pipelines/azure-pipelines-py-packaging.yml b/tools/ci_build/github/azure-pipelines/azure-pipelines-py-packaging.yml index 751c351a7804d..3ac6d3bc198b7 100644 --- a/tools/ci_build/github/azure-pipelines/azure-pipelines-py-packaging.yml +++ b/tools/ci_build/github/azure-pipelines/azure-pipelines-py-packaging.yml @@ -209,3 +209,45 @@ jobs: displayName: 'Component Detection' - template: templates/clean-agent-build-directory-step.yml + +- job: MacOS_py_Wheels + pool: + vmImage: 'macOS-10.13' + strategy: + matrix: + Python35: + python.version: '3.5' + Python36: + python.version: '3.6' + Python37: + python.version: '3.7' + steps: + - task: CondaEnvironment@1 + inputs: + createCustomEnvironment: true + environmentName: 'py$(python.version)' + packageSpecs: 'python=$(python.version)' + cleanEnvironment: true + + - script: | + sudo python -m pip install numpy==1.15.0 + sudo xcode-select --switch /Applications/Xcode_10.app/Contents/Developer + ./build.sh --config Release --skip_submodule_sync --parallel --use_openmp --build_wheel + displayName: 'Command Line Script' + + - task: CopyFiles@2 + displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' + inputs: + SourceFolder: '$(Build.SourcesDirectory)/build/Linux/Release/dist' + Contents: '*.whl' + TargetFolder: '$(Build.ArtifactStagingDirectory)' + + - task: PublishBuildArtifacts@1 + displayName: 'Publish Artifact: ONNXRuntime python wheel' + inputs: + ArtifactName: onnxruntime + + - task: ms.vss-governance-buildtask.governance-build-task-component-detection.ComponentGovernanceComponentDetection@0 + displayName: 'Component Detection' + + - template: 
diff --git a/tools/ci_build/github/azure-pipelines/azure-pipelines-py-packaging.yml b/tools/ci_build/github/azure-pipelines/azure-pipelines-py-packaging.yml index 751c351a7804d..3ac6d3bc198b7 100644 --- a/tools/ci_build/github/azure-pipelines/azure-pipelines-py-packaging.yml +++ b/tools/ci_build/github/azure-pipelines/azure-pipelines-py-packaging.yml @@ -209,3 +209,45 @@ jobs: displayName: 'Component Detection' - template: templates/clean-agent-build-directory-step.yml + +- job: MacOS_py_Wheels + pool: + vmImage: 'macOS-10.13' + strategy: + matrix: + Python35: + python.version: '3.5' + Python36: + python.version: '3.6' + Python37: + python.version: '3.7' + steps: + - task: CondaEnvironment@1 + inputs: + createCustomEnvironment: true + environmentName: 'py$(python.version)' + packageSpecs: 'python=$(python.version)' + cleanEnvironment: true + + - script: | + sudo python -m pip install numpy==1.15.0 + sudo xcode-select --switch /Applications/Xcode_10.app/Contents/Developer + ./build.sh --config Release --skip_submodule_sync --parallel --use_openmp --build_wheel + displayName: 'Command Line Script' + + - task: CopyFiles@2 + displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' + inputs: + SourceFolder: '$(Build.SourcesDirectory)/build/Linux/Release/dist' + Contents: '*.whl' + TargetFolder: '$(Build.ArtifactStagingDirectory)' + + - task: PublishBuildArtifacts@1 + displayName: 'Publish Artifact: ONNXRuntime python wheel' + inputs: + ArtifactName: onnxruntime + + - task: ms.vss-governance-buildtask.governance-build-task-component-detection.ComponentGovernanceComponentDetection@0 + displayName: 'Component Detection' + + - template: templates/clean-agent-build-directory-step.yml \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/cpu-nocontribops-arm64.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/cpu-nocontribops-arm64.yml index c69c292936d36..3d152e09ed92c 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/cpu-nocontribops-arm64.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/cpu-nocontribops-arm64.yml @@ -51,6 +51,8 @@ jobs: NuPackScript: | mkdir $(Build.BinariesDirectory)\arm64\runtimes\win10-arm\native cd $(Build.BinariesDirectory)\arm64 + copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\onnxruntime.pdb $(Build.BinariesDirectory)\arm64\runtimes\win10-arm\native + copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\onnxruntime.lib $(Build.BinariesDirectory)\arm64\runtimes\win10-arm\native copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\onnxruntime.dll $(Build.BinariesDirectory)\arm64\runtimes\win10-arm\native powershell -Command "Invoke-WebRequest http://stahlworks.com/dev/zip.exe -OutFile zip.exe" zip -r win10-arm.zip runtimes @@ -186,7 +188,9 @@ jobs: move win-x86\runtimes\win-x86\native\onnxruntime.dll %%~ni\runtimes\win-x86\native\onnxruntime.dll move win-x86\runtimes\win-x86\native\onnxruntime.lib %%~ni\runtimes\win-x86\native\onnxruntime.lib move win-x86\runtimes\win-x86\native\onnxruntime.pdb %%~ni\runtimes\win-x86\native\onnxruntime.pdb - move win10-arm\runtimes\win-x64\native\onnxruntime.dll %%~ni\runtimes\win10-arm\native\onnxruntime.dll + move win10-arm\runtimes\win10-arm\native\onnxruntime.lib %%~ni\runtimes\win10-arm\native\onnxruntime.lib + move win10-arm\runtimes\win10-arm\native\onnxruntime.dll %%~ni\runtimes\win10-arm\native\onnxruntime.dll + move win10-arm\runtimes\win10-arm\native\onnxruntime.pdb %%~ni\runtimes\win10-arm\native\onnxruntime.pdb move linux-x64\linux-x64\libonnxruntime.so %%~ni\runtimes\linux-x64\native\libonnxruntime.so move linux-x86\linux-x86\libonnxruntime.so %%~ni\runtimes\linux-x86\native\libonnxruntime.so unzip osx-x64.zip -d osx-x64 diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/cpu.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/cpu.yml index 93c94f35b6786..24805a2264f51 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/cpu.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/cpu.yml @@ -100,6 +100,7 @@ jobs: - MacOS_CI_Dev condition: succeeded() steps: + - task: DownloadPipelineArtifact@0 displayName: 'Download Pipeline Artifact - NuGet' inputs: @@ -107,6 +108,7 @@ targetPath: '$(Build.BinariesDirectory)/nuget-artifact' continueOnError: true + - task: DownloadPipelineArtifact@0 displayName: 'Download Pipeline Artifact - Win-x86' inputs: @@ -150,4 +152,39 @@ jobs: artifactName: 'drop-signed-nuget' targetPath: '$(Build.ArtifactStagingDirectory)' + - template: test_all_os.yml + +- job: Publish_NuGet_Package_And_Report + variables: + - group: Dashboard_MySQL_Secret + pool: + name: Hosted Windows 2019 with VS2019 + # The AzureFileCopy@3 task has a bug: it depends on a particular version of Azure PowerShell + # that is not available on the OnnxRuntime build VMs but is available on the latest hosted agents.
+ # So all the copy/publish jobs run on a hosted agent. + # TODO: install the desired Azure PowerShell version on our VMs or use a later, bug-fixed version of AzureFileCopy + demands: azureps + condition: and (${{ parameters.DoEsrp }}, eq(variables['Build.SourceBranch'], 'refs/heads/master')) + dependsOn: + - NuGet_Test_Win + - NuGet_Test_Linux + - NuGet_Test_MacOS + steps: + + - template: ../../templates/set-version-number-variables-step.yml + - template: upload-binary-sizes-from-nuget-package.yml + parameters: + downloadPath: $(Build.BinariesDirectory)/nuget-artifact/final-package + gitCommitHash: $(OnnxRuntimeGitCommitHashShort) + + - task: AzureFileCopy@3 + displayName: 'Copy Signed NuGet Package to Blob Store' + condition: ne(variables['IsReleaseBuild'], 'true') # release builds have a different package naming scheme + inputs: + sourcePath: '$(Build.BinariesDirectory)/nuget-artifact/final-package/Microsoft.ML.OnnxRuntime.$(OnnxRuntimeVersion)-dev-$(OnnxRuntimeGitCommitHashShort).nupkg' + azureSubscription: 'AIInfraBuildOnnxRuntimeOSS' + destination: azureBlob + storage: ortpackages + containerName: ortpackages + diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/upload-binary-sizes-from-nuget-package.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/upload-binary-sizes-from-nuget-package.yml new file mode 100644 index 0000000000000..2b1e0aca9a537 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/upload-binary-sizes-from-nuget-package.yml @@ -0,0 +1,49 @@ +parameters: + gitCommitHash: '' + downloadPath: $(Build.BinariesDirectory)/nuget-artifact/final-package + +steps: +- task: DownloadPipelineArtifact@0 + displayName: 'Download Pipeline Artifact - Signed NuGet Package' + inputs: + artifactName: 'drop-signed-nuget' + targetPath: '${{ parameters.downloadPath }}' + +- task: UsePythonVersion@0 + inputs: + versionSpec: '3.7' + addToPath: true + architecture: 'x64' + +- task: CmdLine@1 + displayName: 'Install conda modules mysql-connector-python' + inputs: + filename: '%CONDA%\condabin\conda.bat' + arguments: 'install -q --insecure -y mysql-connector-python' + timeoutInMinutes: 10 + +- task: CmdLine@2 + displayName: 'Post binary sizes to the dashboard database using command line' + inputs: + script: | + echo changing directory to artifact download path + pushd "${{ parameters.downloadPath }}" + echo processing nupkg + FOR /R %%i IN (*.nupkg) do ( + echo processing %%~ni.nupkg + copy %%~ni.nupkg %%~ni.zip + echo copied to zip + echo listing lib files in the zip + REM collect the data into a single CSV-formatted file + echo os,arch,build_config,size > binary_size_data.txt + 7z.exe l -slt %%~ni.zip runtimes\linux-x64\native\libonnxruntime.so | findstr /R /C:"^Size = [0-9]*" | for /F "tokens=3" %%a in ('more') do if not "%%a" == "" echo linux,x64,openmp,%%a >> binary_size_data.txt + 7z.exe l -slt %%~ni.zip runtimes\linux-x86\native\libonnxruntime.so | findstr /R /C:"^Size = [0-9]*" | for /F "tokens=3" %%a in ('more') do if not "%%a" == "" echo linux,x86,openmp,%%a >> binary_size_data.txt + 7z.exe l -slt %%~ni.zip runtimes\osx-x64\native\libonnxruntime.dylib | findstr /R /C:"^Size = [0-9]*" | for /F "tokens=3" %%a in ('more') do if not "%%a" == "" echo osx,x64,openmp,%%a >> binary_size_data.txt + 7z.exe l -slt %%~ni.zip runtimes\win-x64\native\onnxruntime.dll | findstr /R /C:"^Size = [0-9]*" | for /F "tokens=3" %%a in ('more') do if not "%%a" == "" echo win,x64,openmp,%%a >> binary_size_data.txt + 7z.exe l -slt %%~ni.zip runtimes\win-x86\native\onnxruntime.dll | findstr /R /C:"^Size = [0-9]*" | for /F "tokens=3" %%a in ('more') do if not "%%a" == "" echo win,x86,openmp,%%a >> binary_size_data.txt + echo calling python script to post to database + %CONDA%\python.exe $(Build.SourcesDirectory)\tools\ci_build\github\windows\post_binary_sizes_to_dashboard.py --commit_hash=${{ parameters.gitCommitHash }} --size_data_file=binary_size_data.txt --build_project=Lotus --build_id=$(Build.BuildId) + ) + + env: + DASHBOARD_MYSQL_ORT_PASSWORD: $(dashboard-mysql-ort-password)
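A .nupkg is an ordinary zip archive, which is what makes the 7z.exe size-listing trick above work. Should the cmd/findstr pipeline become hard to maintain, the same binary_size_data.txt could be produced with Python's standard library. A hypothetical sketch (the dump_binary_sizes helper and its arguments are illustrative, not part of the pipeline):

import csv
import zipfile

# (os, arch, archive member) triples matching the 7z.exe invocations above.
TARGETS = [
    ("linux", "x64", "runtimes/linux-x64/native/libonnxruntime.so"),
    ("linux", "x86", "runtimes/linux-x86/native/libonnxruntime.so"),
    ("osx", "x64", "runtimes/osx-x64/native/libonnxruntime.dylib"),
    ("win", "x64", "runtimes/win-x64/native/onnxruntime.dll"),
    ("win", "x86", "runtimes/win-x86/native/onnxruntime.dll"),
]

def dump_binary_sizes(nupkg_path, out_path="binary_size_data.txt"):
    # ZipInfo.file_size is the uncompressed size of an archive member,
    # the same number 7z.exe reports in its "Size = ..." field.
    with zipfile.ZipFile(nupkg_path) as pkg, open(out_path, "w", newline="") as out:
        writer = csv.writer(out)
        writer.writerow(["os", "arch", "build_config", "size"])
        for os_name, arch, member in TARGETS:
            writer.writerow([os_name, arch, "openmp", pkg.getinfo(member).file_size])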
diff --git a/tools/ci_build/github/azure-pipelines/templates/set-version-number-variables-step.yml b/tools/ci_build/github/azure-pipelines/templates/set-version-number-variables-step.yml index 007d40b7aef0f..dcbdcffb730d8 100644 --- a/tools/ci_build/github/azure-pipelines/templates/set-version-number-variables-step.yml +++ b/tools/ci_build/github/azure-pipelines/templates/set-version-number-variables-step.yml @@ -12,6 +12,10 @@ steps: FOR /F "tokens=* USEBACKQ" %%F IN (`git rev-parse HEAD`) DO ( @echo ##vso[task.setvariable variable=OnnxRuntimeGitCommitHash;]%%F ) + + FOR /F "tokens=* USEBACKQ" %%F IN (`git rev-parse --short HEAD`) DO ( + @echo ##vso[task.setvariable variable=OnnxRuntimeGitCommitHashShort;]%%F + ) workingDirectory: '$(Build.SourcesDirectory)' condition: eq(variables['Agent.OS'], 'Windows_NT') @@ -26,5 +30,8 @@ steps: _OnnxRuntimeGitCommitHash=$(git rev-parse HEAD) echo "##vso[task.setvariable variable=OnnxRuntimeGitCommitHash;]$_OnnxRuntimeGitCommitHash" + _OnnxRuntimeGitCommitHash=$(git rev-parse --short=8 HEAD) + echo "##vso[task.setvariable variable=OnnxRuntimeGitCommitHashShort;]$_OnnxRuntimeGitCommitHash" + workingDirectory: '$(Build.SourcesDirectory)' condition: not(eq(variables['Agent.OS'], 'Windows_NT')) \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index 8ce6984dc55b4..bcd1cb8da9b76 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -13,10 +13,12 @@ parameters: MsbuildArguments: '/m' EnvSetupScript: 'setup_env.bat' CudaVersion: '' + AgentPool: 'Win-CPU' jobs: - job: ${{ parameters.JobName }} timeoutInMinutes: 120 + pool: ${{ parameters.AgentPool }} variables: buildDirectory: '$(Build.BinariesDirectory)' BuildCommand: ${{ parameters.BuildCommand }} diff --git a/tools/ci_build/github/linux/docker/scripts/install_manylinux2010.sh b/tools/ci_build/github/linux/docker/scripts/install_manylinux2010.sh index 664684bd00c45..50c2b9880f719 100755 --- a/tools/ci_build/github/linux/docker/scripts/install_manylinux2010.sh +++ b/tools/ci_build/github/linux/docker/scripts/install_manylinux2010.sh @@ -26,7 +26,7 @@ if [ !
-f /opt/onnxruntime-python/bin/python${PYTHON_VER} ]; then ln -s python /opt/onnxruntime-python/bin/python${PYTHON_VER} fi python -m pip install --upgrade --force-reinstall pip==19.1.1 -python -m pip install --upgrade --force-reinstall numpy==1.16.3 +python -m pip install --upgrade --force-reinstall numpy==1.15.0 python -m pip install --upgrade --force-reinstall requests==2.21.0 python -m pip install --upgrade --force-reinstall wheel==0.31.1 python -m pip install --upgrade --force-reinstall setuptools==41.0.1 diff --git a/tools/ci_build/github/windows/post_binary_sizes_to_dashboard.py b/tools/ci_build/github/windows/post_binary_sizes_to_dashboard.py new file mode 100644 index 0000000000000..7161b6f897457 --- /dev/null +++ b/tools/ci_build/github/windows/post_binary_sizes_to_dashboard.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + + +import argparse +import mysql.connector +import xml.etree.ElementTree as ET +import sys +import os + +def parse_arguments(): + parser = argparse.ArgumentParser(description="ONNXRuntime binary size uploader for dashboard") + parser.add_argument("--commit_hash", help="Full Git commit hash") + parser.add_argument("--build_project", default='Lotus', choices=['Lotus','onnxruntime'], help="Lotus or onnxruntime build project, to construct the build URL") + parser.add_argument("--build_id", help="Build Id") + parser.add_argument("--size_data_file", help="Path to file that contains the binary size data") + + return parser.parse_args() + +# Assumes size_data_file is a CSV file with a header line, containing binary sizes and other attributes +# CSV fields are: +# os,arch,build_config,size +# No empty lines or extra spaces between fields are expected +def get_binary_sizes(size_data_file): + binary_size = [] + with open(size_data_file, 'r') as f: + line = f.readline() + headers = line.strip().split(',') + while line: + line = f.readline() + if not line: + break + linedata = line.strip().split(',') + tablerow = {} + for i in range(len(headers)): + if headers[i] == 'size': + tablerow[headers[i]] = int(linedata[i]) + else: + tablerow[headers[i]] = linedata[i] + binary_size.append(tablerow) + return binary_size + + +def write_to_db(binary_size_data, args): + # connect to database + + cnx = mysql.connector.connect( + user='ort@onnxruntimedashboard', + password=os.environ.get('DASHBOARD_MYSQL_ORT_PASSWORD'), + host='onnxruntimedashboard.mysql.database.azure.com', + database='onnxruntime') + + try: + cursor = cnx.cursor() + + # delete old records + delete_query = ('DELETE FROM onnxruntime.binary_size ' + 'WHERE build_time < DATE_SUB(Now(), INTERVAL 30 DAY);' + ) + + cursor.execute(delete_query) + + # insert current records + for row in binary_size_data: + insert_query = ('INSERT INTO onnxruntime.binary_size ' + '(build_time, build_project, build_id, commit_id, os, arch, build_config, size) ' + 'VALUES (Now(), "%s", "%s", "%s", "%s", "%s", "%s", %d) ' + 'ON DUPLICATE KEY UPDATE ' + 'build_time=Now(), build_project="%s", build_id="%s", size=%d;' + ) % ( + args.build_project, + args.build_id, + args.commit_hash, + row['os'], + row['arch'], + row['build_config'], + row['size'], + + args.build_project, + args.build_id, + row['size'] + ) + cursor.execute(insert_query) + + cnx.commit() + + # # Use below for debugging: + # cursor.execute('select * from onnxruntime.binary_size') + # for r in cursor: + # print(r) + + cursor.close() + cnx.close() + except BaseException as e: + cnx.close() + raise e + + +if
__name__ == "__main__": + try: + args = parse_arguments() + binary_size_data = get_binary_sizes(args.size_data_file) + write_to_db(binary_size_data, args) + except BaseException as e: + print(str(e)) + sys.exit(1) + + +
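As a side note on get_binary_sizes above: Python's standard csv module expresses the same parsing more robustly, since DictReader consumes the header row itself and tolerates quoted fields. A minimal equivalent sketch:

import csv

def get_binary_sizes(size_data_file):
    # Same contract as the hand-rolled parser above: one dict per data row,
    # with the 'size' column converted to int for the database insert.
    rows = []
    with open(size_data_file, 'r', newline='') as f:
        for row in csv.DictReader(f):
            row['size'] = int(row['size'])
            rows.append(row)
    return rows

Similarly, write_to_db could pass a parameterized query to cursor.execute (placeholders plus a tuple of values), letting mysql-connector-python handle quoting instead of building the INSERT statement with % string formatting.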