diff --git a/BUILD.md b/BUILD.md
index 0cd27c95ff6e7..f6027ca8d8535 100644
--- a/BUILD.md
+++ b/BUILD.md
@@ -363,3 +363,17 @@ ls -l /code/onnxruntime/build/Linux/MinSizeRel/dist/*.whl
 
 ### Using other compilers
 (TODO)
+
+## Android Builds
+
+### Cross compiling on Linux
+
+1. Get the Android NDK from https://developer.android.com/ndk/downloads and unzip it after downloading.
+
+2. Get a pre-compiled protoc:
+
+   You may get it from https://github.com/protocolbuffers/protobuf/releases/download/v3.6.1/protoc-3.6.1-linux-x86_64.zip. Unzip it after downloading.
+
+3. Let $ANDROID_NDK denote the directory the NDK was unzipped to in step 1. Append `-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DONNX_CUSTOM_PROTOC_EXECUTABLE=path/to/protoc` to your cmake args, then run cmake and make to build.
+
+Note: For 32-bit devices, replace `-DANDROID_ABI=arm64-v8a` with `-DANDROID_ABI=armeabi-v7a`.
diff --git a/cmake/onnxruntime_server.cmake b/cmake/onnxruntime_server.cmake
index 7e85aae615bda..64889d3d84011 100644
--- a/cmake/onnxruntime_server.cmake
+++ b/cmake/onnxruntime_server.cmake
@@ -108,6 +108,14 @@ if(NOT WIN32)
   endif()
 endif()
 
+set(onnxruntime_SERVER_VERSION "local-build" CACHE STRING "Server version")
+target_compile_definitions(${SERVER_APP_NAME} PUBLIC SRV_VERSION="${onnxruntime_SERVER_VERSION}")
+message(STATUS "ONNX Runtime Server version set to: ${onnxruntime_SERVER_VERSION}")
+
+set(onnxruntime_LATEST_COMMIT_ID "default" CACHE STRING "The latest commit id")
+target_compile_definitions(${SERVER_APP_NAME} PUBLIC LATEST_COMMIT_ID="${onnxruntime_LATEST_COMMIT_ID}")
+message(STATUS "ONNX Runtime Server latest commit id is: ${onnxruntime_LATEST_COMMIT_ID}")
+
 onnxruntime_add_include_to_target(${SERVER_APP_NAME} onnxruntime_session onnxruntime_server_lib gsl onnx onnx_proto server_proto)
 
 target_include_directories(${SERVER_APP_NAME} PRIVATE
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Capi/CXX_Api_Sample.cpp b/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Capi/CXX_Api_Sample.cpp
index 39b9cccb19727..abb3a6fedc91d 100644
--- a/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Capi/CXX_Api_Sample.cpp
+++ b/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Capi/CXX_Api_Sample.cpp
@@ -98,7 +98,7 @@ int main(int argc, char* argv[]) {
 
   // create input tensor object from data values
   Ort::AllocatorInfo allocator_info = Ort::AllocatorInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
-  Ort::Value input_tensor = Ort::Value::CreateTensor(allocator_info, input_tensor_values.data(), input_tensor_size * sizeof(float), input_node_dims.data(), 4, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT);
+  Ort::Value input_tensor = Ort::Value::CreateTensor<float>(allocator_info, input_tensor_values.data(), input_tensor_size, input_node_dims.data(), 4);
   assert(input_tensor.IsTensor());
 
   // score model & input tensor, get back output tensor
diff --git a/include/onnxruntime/core/optimizer/rewrite_rule.h b/include/onnxruntime/core/optimizer/rewrite_rule.h
index f481439e5ff00..fa8583bb1c922 100644
--- a/include/onnxruntime/core/optimizer/rewrite_rule.h
+++ b/include/onnxruntime/core/optimizer/rewrite_rule.h
@@ -66,7 +66,7 @@ class RewriteRule {
      @param[in] node The Node to apply the rewrite to.
      @param[out] rule_effect Enum to indicate if and how the graph was modified as a result of the rule application.
     @returns Status indicating success or providing error information */
-  common::Status CheckConditionAndApply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect) {
+  common::Status CheckConditionAndApply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect) const {
     return SatisfyCondition(graph, node) ? Apply(graph, node, rule_effect) : Status::OK();
   }
 
@@ -79,11 +79,11 @@ class RewriteRule {
      evaluated if this condition function returns true. This can include a more complex pattern matching (conditions
      on the ascending or descending nodes of the node for which this rule was triggered) or some other properties of
      the nodes. */
-  virtual bool SatisfyCondition(const Graph& graph, const Node& node) = 0;
+  virtual bool SatisfyCondition(const Graph& graph, const Node& node) const = 0;
 
   /** This is the actual body of the rule that performs the graph transformation. The transformation happens in-place.
      The return-value of node may be different from the input-value due to rewriting.
      The value of "rule_effect" indicates whether and how the graph was modified by the rule. */
-  virtual common::Status Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect) = 0;
+  virtual common::Status Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect) const = 0;
 };
 
 }  // namespace onnxruntime
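Editor's note: the doc comments above pin down the contract — SatisfyCondition gates Apply, and both are now const, so a rule may not mutate its own state while being applied. As orientation, a minimal rule against this const-qualified interface might look as follows. The rule name, target op type, and condition are hypothetical, invented purely to illustrate the interface (real rules such as FuseReluClip later in this diff have the same shape); the graph_utils helpers are the same ones the real rules in this diff use.

// Hypothetical rule, sketched only to show the const-qualified overrides.
// (A real no-op-Transpose rule would also check the 'perm' attribute; omitted for brevity.)
class EliminateNoOpTranspose : public RewriteRule {
 public:
  EliminateNoOpTranspose() noexcept : RewriteRule("EliminateNoOpTranspose") {}

  // CheckConditionAndApply is only invoked for nodes with these op types.
  std::vector<std::string> TargetOpTypes() const noexcept override {
    return {"Transpose"};
  }

 private:
  // const: evaluating the condition must not mutate the rule object.
  bool SatisfyCondition(const Graph& graph, const Node& node) const override {
    return graph_utils::IsSingleInSingleOutNode(node) &&
           !graph.IsNodeOutputsInGraphOutputs(node);
  }

  // const: all state changes go into the Graph and the rule_effect out-param.
  common::Status Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect) const override {
    if (graph_utils::RemoveNode(graph, node)) {
      rule_effect = RewriteRuleEffect::kRemovedCurrentNode;
    }
    return common::Status::OK();
  }
};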
diff --git a/include/onnxruntime/core/optimizer/rule_based_graph_transformer.h b/include/onnxruntime/core/optimizer/rule_based_graph_transformer.h
index 4a3fe8159a4bb..97ce347873d8c 100644
--- a/include/onnxruntime/core/optimizer/rule_based_graph_transformer.h
+++ b/include/onnxruntime/core/optimizer/rule_based_graph_transformer.h
@@ -39,14 +39,14 @@ class RuleBasedGraphTransformer : public GraphTransformer {
   /** Gets the list of registered rewrite rules that will be triggered on nodes with the given op type
       by this rule-based transformer.
       @returns a pointer to the vector containing all the registered rewrite rules. */
-  const std::vector<std::unique_ptr<RewriteRule>>* GetRewriteRulesForOpType(const std::string& op_type) const {
+  const std::vector<std::reference_wrapper<const RewriteRule>>* GetRewriteRulesForOpType(const std::string& op_type) const {
     auto rules = op_type_to_rules_.find(op_type);
     return (rules != op_type_to_rules_.cend()) ? &rules->second : nullptr;
   }
 
   /** Gets the rewrite rules that are evaluated on all nodes irrespective of their op type.
       @returns a pointer to the vector containing all such rewrite rules or nullptr if no such rule. */
-  const std::vector<std::unique_ptr<RewriteRule>>* GetAnyOpRewriteRules() const {
+  const std::vector<std::reference_wrapper<const RewriteRule>>* GetAnyOpRewriteRules() const {
     return &any_op_type_rules_;
   }
 
@@ -62,16 +62,18 @@ class RuleBasedGraphTransformer : public GraphTransformer {
       applying rules on this node.
       @returns Status indicating success or providing error information. */
   common::Status ApplyRulesOnNode(Graph& graph, Node& node,
-                                  const std::vector<std::unique_ptr<RewriteRule>>& rules,
+                                  const std::vector<std::reference_wrapper<const RewriteRule>>& rules,
                                   RewriteRule::RewriteRuleEffect& rule_effect) const;
 
  private:
  using RuleEffect = RewriteRule::RewriteRuleEffect;
 
+  // The list of unique pointers for all rules (so that rules can be registered for several op types).
+  std::vector<std::unique_ptr<RewriteRule>> rules_;
   // Map that associates a node's op type with the vector of rules that are registered to be triggered for that node.
-  std::unordered_map<std::string, std::vector<std::unique_ptr<RewriteRule>>> op_type_to_rules_;
+  std::unordered_map<std::string, std::vector<std::reference_wrapper<const RewriteRule>>> op_type_to_rules_;
   // Rules that will be evaluated regardless of the op type of the node.
-  std::vector<std::unique_ptr<RewriteRule>> any_op_type_rules_;
+  std::vector<std::reference_wrapper<const RewriteRule>> any_op_type_rules_;
 
   // Performs a single top-down traversal of the graph and applies all registered rules.
   common::Status ApplyImpl(Graph& graph, bool& modified, int graph_level) const override;
diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h
index de35625185e39..6f1b72388253c 100644
--- a/include/onnxruntime/core/session/onnxruntime_c_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -188,11 +188,11 @@ ORT_API_STATUS(OrtCreateEnvWithCustomLogger, OrtLoggingFunction logging_function
 ORT_API_STATUS(OrtCreateSession, _In_ OrtEnv* env, _In_ const ORTCHAR_T* model_path,
                _In_ const OrtSessionOptions* options, _Out_ OrtSession** out);
 
-ORT_API_STATUS(OrtCreateSessionFromArray, _In_ OrtEnv* env, _In_ const void* model_data, int model_data_len,
+ORT_API_STATUS(OrtCreateSessionFromArray, _In_ OrtEnv* env, _In_ const void* model_data, size_t model_data_length,
                _In_ const OrtSessionOptions* options, _Out_ OrtSession** out);
 
 ORT_API_STATUS(OrtRun, _Inout_ OrtSession* sess,
-               _In_ OrtRunOptions* run_options,
+               _In_ const OrtRunOptions* run_options,
                _In_ const char* const* input_names, _In_ const OrtValue* const* input, size_t input_len,
                _In_ const char* const* output_names, size_t output_names_len, _Out_ OrtValue** output);
diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
index 7e75c6b9caab4..674aa4f4c8f89 100644
--- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
@@ -5,33 +5,11 @@
 #include "onnxruntime_c_api.h"
 #include <cstddef>
 #include <array>
-#include <memory>
 #include <stdexcept>
 #include <string>
 #include <vector>
 #include <utility>
 
-#define ORT_REDIRECT_SIMPLE_FUNCTION_CALL(NAME) \
-  decltype(Ort##NAME(value.get())) NAME() {     \
-    return Ort##NAME(value.get());              \
-  }
-
-#define ORT_DEFINE_DELETER(NAME)      \
-  template <>                         \
-  struct default_delete<Ort##NAME> {  \
-    void operator()(Ort##NAME* ptr) { \
-      OrtRelease##NAME(ptr);          \
-    }                                 \
-  };
-
-namespace std {
-ORT_DEFINE_DELETER(Allocator);
-ORT_DEFINE_DELETER(TypeInfo);
-ORT_DEFINE_DELETER(RunOptions);
-ORT_DEFINE_DELETER(SessionOptions);
-ORT_DEFINE_DELETER(TensorTypeAndShapeInfo);
-}  // namespace std
-
 namespace Ort {
 
 using std::nullptr_t;
@@ -56,7 +34,7 @@ struct Exception : std::exception {
   }
 
 #define ORT_DEFINE_RELEASE(NAME) \
-  inline void Release(Ort##NAME* ptr) { OrtRelease##NAME(ptr); }
+  inline void OrtRelease(Ort##NAME* ptr) { OrtRelease##NAME(ptr); }
 
 ORT_DEFINE_RELEASE(Allocator);
 ORT_DEFINE_RELEASE(AllocatorInfo);
@@ -69,11 +47,36 @@
 ORT_DEFINE_RELEASE(TensorTypeAndShapeInfo);
 ORT_DEFINE_RELEASE(TypeInfo);
 ORT_DEFINE_RELEASE(Value);
 
+template <typename T>
+struct TypeToTensorType;
+template <>
+struct TypeToTensorType<float> { static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; };
+template <>
+struct TypeToTensorType<double> { static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE; };
+template <>
+struct TypeToTensorType<int8_t> { static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8; };
+template <>
+struct TypeToTensorType<int16_t> { static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16; };
+template <>
+struct TypeToTensorType<int32_t> { static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32; };
+template <>
+struct TypeToTensorType<int64_t> { static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64; };
+template <>
+struct TypeToTensorType<uint8_t> { static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8; };
+template <>
+struct TypeToTensorType<uint16_t> { static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16; };
+template <>
+struct TypeToTensorType<uint32_t> { static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32; };
+template <>
+struct TypeToTensorType<uint64_t> { static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64; };
+
 template <typename T>
 struct Base {
   Base() = default;
-  Base(T* p) : p_{p} {}
-  ~Base() { Release(p_); }
+  Base(T* p) : p_{p} {
+    if (!p) throw Ort::Exception("Allocation failure", ORT_FAIL);
+  }
+  ~Base() { OrtRelease(p_); }
 
   operator T*() { return p_; }
   operator const T*() const { return p_; }
@@ -88,7 +91,7 @@ struct Base {
   Base(const Base&) = delete;
   Base(Base&& v) : p_{v.p_} { v.p_ = nullptr; }
   void operator=(Base&& v) {
-    Release(p_);
+    OrtRelease(p_);
     p_ = v.p_;
     v.p_ = nullptr;
   }
@@ -115,6 +118,8 @@ struct Value;
 struct Env : Base<OrtEnv> {
   Env(nullptr_t) {}
   Env(OrtLoggingLevel default_warning_level, _In_ const char* logid);
+  Env(OrtLoggingLevel default_warning_level, const char* logid, OrtLoggingFunction logging_function, void* logger_param);
+
   explicit Env(OrtEnv* p) : Base<OrtEnv>{p} {}
 };
 
 struct CustomOpDomain : Base<OrtCustomOpDomain> {
@@ -150,6 +155,9 @@ struct SessionOptions : Base<OrtSessionOptions> {
   SessionOptions& EnableCpuMemArena();
   SessionOptions& DisableCpuMemArena();
 
+  SessionOptions& EnableProfiling(const ORTCHAR_T* profile_file_prefix);
+  SessionOptions& DisableProfiling();
+
   SessionOptions& EnableMemPattern();
   SessionOptions& DisableMemPattern();
 
@@ -164,9 +172,14 @@ struct SessionOptions : Base<OrtSessionOptions> {
 struct Session : Base<OrtSession> {
   explicit Session(nullptr_t) {}
   Session(Env& env, const ORTCHAR_T* model_path, const SessionOptions& options);
+  Session(Env& env, const void* model_data, size_t model_data_length, const SessionOptions& options);
 
-  std::vector<Value> Run(RunOptions& run_options, const char* const* input_names, Value* input_values, size_t input_count,
-                         const char* const* output_names, size_t output_names_count);
+  // Run that will allocate the output values
+  std::vector<Value> Run(const RunOptions& run_options, const char* const* input_names, Value* input_values, size_t input_count,
+                         const char* const* output_names, size_t output_count);
+  // Run for when there is a list of preallocated outputs
+  void Run(const RunOptions& run_options, const char* const* input_names, Value* input_values, size_t input_count,
+           const char* const* output_names, Value* output_values, size_t output_count);
 
   size_t GetInputCount() const;
   size_t GetOutputCount() const;
@@ -183,6 +196,7 @@ struct TensorTypeAndShapeInfo : Base<OrtTensorTypeAndShapeInfo> {
   explicit TensorTypeAndShapeInfo(OrtTensorTypeAndShapeInfo* p) : Base<OrtTensorTypeAndShapeInfo>{p} {}
 
   ONNXTensorElementDataType GetElementType() const;
+  size_t GetElementCount() const;
   size_t GetDimensionsCount() const;
   void GetDimensions(int64_t* values, size_t values_count) const;
 
@@ -194,11 +208,18 @@ struct TypeInfo : Base<OrtTypeInfo> {
   explicit TypeInfo(OrtTypeInfo* p) : Base<OrtTypeInfo>{p} {}
 
   Unowned<TensorTypeAndShapeInfo> GetTensorTypeAndShapeInfo() const;
+  ONNXType GetONNXType() const;
 };
 
 struct Value : Base<OrtValue> {
-  static Value CreateTensor(const AllocatorInfo& info, void* p_data, size_t p_data_len, const int64_t* shape, size_t shape_len,
-                            ONNXTensorElementDataType type);
+  template <typename T>
+  static Value CreateTensor(const OrtAllocatorInfo* info, T* p_data, size_t p_data_element_count, const int64_t* shape, size_t shape_len);
+  static Value CreateTensor(const OrtAllocatorInfo* info, void* p_data, size_t p_data_byte_count, const int64_t* shape, size_t shape_len,
+                            ONNXTensorElementDataType type);
+  template <typename T>
+  static Value CreateTensor(OrtAllocator* allocator, const int64_t* shape, size_t shape_len);
+  static Value CreateTensor(OrtAllocator* allocator, const int64_t* shape, size_t shape_len, ONNXTensorElementDataType type);
+
   static Value CreateMap(Value& keys, Value& values);
   static Value CreateSequence(std::vector<Value>& values);
 
@@ -215,6 +236,7 @@ struct Value : Base<OrtValue> {
   template <typename T>
   T* GetTensorMutableData();
 
+  TypeInfo GetTypeInfo() const;
   TensorTypeAndShapeInfo GetTensorTypeAndShapeInfo() const;
 };
@@ -275,6 +297,10 @@ inline Env::Env(OrtLoggingLevel default_warning_level, _In_ const char* logid) {
   ORT_THROW_ON_ERROR(OrtCreateEnv(default_warning_level, logid, &p_));
 }
 
+inline Env::Env(OrtLoggingLevel default_warning_level, const char* logid, OrtLoggingFunction logging_function, void* logger_param) {
+  ORT_THROW_ON_ERROR(OrtCreateEnvWithCustomLogger(logging_function, logger_param, default_warning_level, logid, &p_));
+}
+
 inline CustomOpDomain::CustomOpDomain(const char* domain) : Base{OrtCreateCustomOpDomain(domain)} {
 }
 
@@ -327,6 +353,16 @@ inline SessionOptions& SessionOptions::SetGraphOptimizationLevel(uint32_t graph_
   return *this;
 }
 
+inline SessionOptions& SessionOptions::EnableProfiling(const ORTCHAR_T* profile_file_prefix) {
+  OrtEnableProfiling(p_, profile_file_prefix);
+  return *this;
+}
+
+inline SessionOptions& SessionOptions::DisableProfiling() {
+  OrtDisableProfiling(p_);
+  return *this;
+}
+
 inline SessionOptions& SessionOptions::EnableMemPattern() {
   OrtEnableMemPattern(p_);
   return *this;
@@ -370,13 +406,25 @@ inline Session::Session(Env& env, const ORTCHAR_T* model_path, const SessionOpti
   ORT_THROW_ON_ERROR(OrtCreateSession(env, model_path, options, &p_));
 }
 
-inline std::vector<Value> Session::Run(RunOptions& run_options, const char* const* input_names, Value* input_values, size_t input_count,
+inline Session::Session(Env& env, const void* model_data, size_t model_data_length, const SessionOptions& options) {
+  ORT_THROW_ON_ERROR(OrtCreateSessionFromArray(env, model_data, model_data_length, options, &p_));
+}
+
+inline std::vector<Value> Session::Run(const RunOptions& run_options, const char* const* input_names, Value* input_values, size_t input_count,
                                        const char* const* output_names, size_t output_names_count) {
-  std::vector<OrtValue*> ort_input_values(input_values, input_values + input_count);
-  std::vector<OrtValue*> ort_out(output_names_count);
-  ORT_THROW_ON_ERROR(OrtRun(p_, run_options, input_names, ort_input_values.data(), ort_input_values.size(), output_names, output_names_count, ort_out.data()));
-  std::vector<Value> out(ort_out.begin(), ort_out.end());
-  return out;
+  std::vector<Value> output_values;
+  for (size_t i = 0; i < output_names_count; i++)
+    output_values.emplace_back(nullptr);
+  Run(run_options, input_names, input_values, input_count, output_names, output_values.data(), output_names_count);
+  return output_values;
+}
+
+inline void Session::Run(const RunOptions& run_options, const char* const* input_names, Value* input_values, size_t input_count,
+                         const char* const* output_names, Value* output_values, size_t output_count) {
+  static_assert(sizeof(Value) == sizeof(OrtValue*), "Value is really just an array of OrtValue* in memory, so we can reinterpret_cast safely");
+  auto ort_input_values = reinterpret_cast<const OrtValue**>(input_values);
+  auto ort_output_values = reinterpret_cast<OrtValue**>(output_values);
+  ORT_THROW_ON_ERROR(OrtRun(p_, run_options, input_names, ort_input_values, input_count, output_names, output_count, ort_output_values));
 }
 
 inline size_t Session::GetInputCount() const {
@@ -419,6 +467,10 @@ inline ONNXTensorElementDataType TensorTypeAndShapeInfo::GetElementType() const
   return OrtGetTensorElementType(p_);
 }
 
+inline size_t TensorTypeAndShapeInfo::GetElementCount() const {
+  return static_cast<size_t>(OrtGetTensorShapeElementCount(p_));
+}
+
 inline size_t TensorTypeAndShapeInfo::GetDimensionsCount() const {
   return OrtGetDimensionsCount(p_);
 }
@@ -437,13 +489,37 @@ inline Unowned<TensorTypeAndShapeInfo> TypeInfo::GetTensorTypeAndShapeInfo() con
   return Unowned<TensorTypeAndShapeInfo>{const_cast<OrtTensorTypeAndShapeInfo*>(OrtCastTypeInfoToTensorInfo(p_))};
 }
 
-inline Value Value::CreateTensor(const AllocatorInfo& info, void* p_data, size_t p_data_len, const int64_t* shape, size_t shape_len,
+inline ONNXType TypeInfo::GetONNXType() const {
+  return OrtOnnxTypeFromTypeInfo(p_);
+}
+
+template <typename T>
+inline Value Value::CreateTensor(const OrtAllocatorInfo* info, T* p_data, size_t p_data_element_count, const int64_t* shape, size_t shape_len) {
+  return CreateTensor(info, p_data, p_data_element_count * sizeof(T), shape, shape_len, TypeToTensorType<T>::type);
+}
+
+inline Value Value::CreateTensor(const OrtAllocatorInfo* info, void* p_data, size_t p_data_byte_count, const int64_t* shape, size_t shape_len,
                                  ONNXTensorElementDataType type) {
   OrtValue* out;
-  ORT_THROW_ON_ERROR(OrtCreateTensorWithDataAsOrtValue(info, p_data, p_data_len, shape, shape_len, type, &out));
+  ORT_THROW_ON_ERROR(OrtCreateTensorWithDataAsOrtValue(info, p_data, p_data_byte_count, shape, shape_len, type, &out));
+  return Value{out};
+}
+
+template <typename T>
+inline Value Value::CreateTensor(OrtAllocator* allocator, const int64_t* shape, size_t shape_len) {
+  return CreateTensor(allocator, shape, shape_len, TypeToTensorType<T>::type);
+}
+
+inline Value Value::CreateTensor(OrtAllocator* allocator, const int64_t* shape, size_t shape_len, ONNXTensorElementDataType type) {
+  OrtValue* out;
+  ORT_THROW_ON_ERROR(OrtCreateTensorAsOrtValue(allocator, shape, shape_len, type, &out));
   return Value{out};
 }
 
+ORT_API_STATUS(OrtCreateTensorAsOrtValue, _Inout_ OrtAllocator* allocator,
+               _In_ const int64_t* shape, size_t shape_len, ONNXTensorElementDataType type,
+               _Out_ OrtValue** out);
+
 inline Value Value::CreateMap(Value& keys, Value& values) {
   OrtValue* out;
   OrtValue* inputs[2] = {keys, values};
@@ -456,7 +532,7 @@ inline Value Value::CreateSequence(std::vector<Value>& values) {
   std::vector<OrtValue*> values_ort{values.data(), values.data() + values.size()};
   ORT_THROW_ON_ERROR(OrtCreateValue(values_ort.data(), values_ort.size(), ONNX_TYPE_SEQUENCE, &out));
   return Value{out};
-}  // namespace Ort
+}
 
 inline bool Value::IsTensor() const {
   return OrtIsTensor(p_) != 0;
@@ -491,6 +567,12 @@ T* Value::GetTensorMutableData() {
   return out;
 }
 
+inline TypeInfo Value::GetTypeInfo() const {
+  OrtTypeInfo* output;
+  ORT_THROW_ON_ERROR(OrtGetTypeInfo(p_, &output));
+  return TypeInfo{output};
+}
+
 inline TensorTypeAndShapeInfo Value::GetTensorTypeAndShapeInfo() const {
   OrtTensorTypeAndShapeInfo* output;
   ORT_THROW_ON_ERROR(OrtGetTensorTypeAndShape(p_, &output));
@@ -499,79 +581,6 @@ inline TensorTypeAndShapeInfo Value::GetTensorTypeAndShapeInfo() const {
 }
 
 }  // namespace Ort
 
-// Deprecated: Will be removed once all dependencies of it are removed
-#if 1
-namespace onnxruntime {
-
-class SessionOptionsWrapper {
- private:
-  std::unique_ptr<OrtSessionOptions> value;
-  OrtEnv* env_;
-  SessionOptionsWrapper(_In_ OrtEnv* env, OrtSessionOptions* p) : value(p), env_(env){};
-
- public:
-  operator OrtSessionOptions*() { return value.get(); }
-
-  //TODO: for the input arg, should we call addref here?
-  SessionOptionsWrapper(_In_ OrtEnv* env) : value(OrtCreateSessionOptions()), env_(env){};
-  ORT_REDIRECT_SIMPLE_FUNCTION_CALL(EnableSequentialExecution)
-  ORT_REDIRECT_SIMPLE_FUNCTION_CALL(DisableSequentialExecution)
-  ORT_REDIRECT_SIMPLE_FUNCTION_CALL(DisableProfiling)
-  ORT_REDIRECT_SIMPLE_FUNCTION_CALL(EnableMemPattern)
-  ORT_REDIRECT_SIMPLE_FUNCTION_CALL(DisableMemPattern)
-  ORT_REDIRECT_SIMPLE_FUNCTION_CALL(EnableCpuMemArena)
-  ORT_REDIRECT_SIMPLE_FUNCTION_CALL(DisableCpuMemArena)
-  void EnableProfiling(_In_ const ORTCHAR_T* profile_file_prefix) {
-    OrtEnableProfiling(value.get(), profile_file_prefix);
-  }
-
-  void SetSessionLogId(const char* logid) {
-    OrtSetSessionLogId(value.get(), logid);
-  }
-  void SetSessionLogVerbosityLevel(uint32_t session_log_verbosity_level) {
-    OrtSetSessionLogVerbosityLevel(value.get(), session_log_verbosity_level);
-  }
-  int SetSessionGraphOptimizationLevel(uint32_t graph_optimization_level) {
-    return OrtSetSessionGraphOptimizationLevel(value.get(), graph_optimization_level);
-  }
-  void SetSessionThreadPoolSize(int session_thread_pool_size) {
-    OrtSetSessionThreadPoolSize(value.get(), session_thread_pool_size);
-  }
-
-  SessionOptionsWrapper clone() const {
-    OrtSessionOptions* p = OrtCloneSessionOptions(value.get());
-    return SessionOptionsWrapper(env_, p);
-  }
-
-  OrtSession* OrtCreateSession(_In_ const ORTCHAR_T* model_path) {
-    OrtSession* ret = nullptr;
-    ORT_THROW_ON_ERROR(::OrtCreateSession(env_, model_path, value.get(), &ret));
-    return ret;
-  }
-};
-
-inline OrtValue* OrtCreateTensorAsOrtValue(_Inout_ OrtAllocator* env, const std::vector<int64_t>& shape, ONNXTensorElementDataType type) {
-  OrtValue* ret;
-  ORT_THROW_ON_ERROR(::OrtCreateTensorAsOrtValue(env, shape.data(), shape.size(), type, &ret));
-  return ret;
-}
-
-inline OrtValue* OrtCreateTensorWithDataAsOrtValue(_In_ const OrtAllocatorInfo* info, _In_ void* p_data, size_t p_data_len, const std::vector<int64_t>& shape, ONNXTensorElementDataType type) {
-  OrtValue* ret;
-  ORT_THROW_ON_ERROR(::OrtCreateTensorWithDataAsOrtValue(info, p_data, p_data_len, shape.data(), shape.size(), type, &ret));
-  return ret;
-}
-
-inline std::vector<int64_t> GetTensorShape(const OrtTensorTypeAndShapeInfo* info) {
-  size_t dims = OrtGetDimensionsCount(info);
-  std::vector<int64_t> ret(dims);
-  OrtGetDimensions(info, ret.data(), ret.size());
-  return ret;
-}
-
-}  // namespace onnxruntime
-#endif
-
 namespace Ort {
 struct CustomOpApi {
   CustomOpApi(const OrtCustomOpApi& api) : api_(api) {}
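Editor's note: since onnxruntime_cxx_api.h is the main public surface this patch reworks, here is a compact sketch of what calling code looks like after it. This is illustrative only — the model path, tensor shape, and node names are placeholders, a non-Windows build is assumed (so ORTCHAR_T is char), and the include path depends on the install layout.

// Sketch only: exercises the API shapes introduced by the diff above.
#include "onnxruntime_cxx_api.h"  // adjust to your include layout

#include <array>
#include <vector>

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "example");

  Ort::SessionOptions options;
  options.EnableProfiling("profile_prefix");  // new fluent setter from this diff

  Ort::Session session(env, "model.onnx", options);  // "model.onnx" is a placeholder

  // The typed factory takes an element count and derives the element type via
  // TypeToTensorType<float>; the untyped overload still takes a byte count.
  std::array<float, 4> values{1.f, 2.f, 3.f, 4.f};
  std::array<int64_t, 2> shape{1, 4};
  Ort::AllocatorInfo info = Ort::AllocatorInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
  Ort::Value input = Ort::Value::CreateTensor<float>(info, values.data(), values.size(),
                                                     shape.data(), shape.size());

  // Run now takes a const RunOptions& and returns the allocated outputs.
  const char* input_names[] = {"input"};    // placeholder node names
  const char* output_names[] = {"output"};
  std::vector<Ort::Value> outputs = session.Run(Ort::RunOptions{nullptr},
                                                input_names, &input, 1,
                                                output_names, 1);

  // GetElementCount is one of the accessors added in this diff.
  size_t n = outputs[0].GetTensorTypeAndShapeInfo().GetElementCount();
  return n == 4 ? 0 : 1;
}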
diff --git a/onnxruntime/core/common/task_thread_pool.h b/onnxruntime/core/common/task_thread_pool.h
index d8ea3e3476179..1cc0d64ecfd6b 100644
--- a/onnxruntime/core/common/task_thread_pool.h
+++ b/onnxruntime/core/common/task_thread_pool.h
@@ -66,7 +66,7 @@ class TaskThreadPool {
     std::packaged_task<void()> no_id;
     std::packaged_task<void(std::size_t)> with_id;
 
-    task_element_t(task_element_t&& other) {
+    task_element_t(task_element_t&& other) noexcept {
       run_with_id = other.run_with_id;
       no_id = std::move(other.no_id);
       with_id = std::move(other.with_id);
diff --git a/onnxruntime/core/framework/path_lib.h b/onnxruntime/core/framework/path_lib.h
index 3832fd5522f80..a7dd3878d60aa 100644
--- a/onnxruntime/core/framework/path_lib.h
+++ b/onnxruntime/core/framework/path_lib.h
@@ -236,7 +236,7 @@ void LoopDir(const std::string& dir_name, T func) {
       auto e = errno;
       char buf[1024];
       char* msg;
-#if defined(__GLIBC__) && defined(_GNU_SOURCE)
+#if defined(__GLIBC__) && defined(_GNU_SOURCE) && !defined(__ANDROID__)
       msg = strerror_r(e, buf, sizeof(buf));
 #else
       if (strerror_r(e, buf, sizeof(buf)) != 0) {
@@ -270,4 +270,4 @@ inline T ReplaceFilename(const T& input, const T& new_value) {
   return ConcatPathComponent(ret, new_value);
 }
 
-}  // namespace onnxruntime
\ No newline at end of file
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/optimizer/conv_add_fusion.cc b/onnxruntime/core/optimizer/conv_add_fusion.cc
index 59a8c010138a2..d0284e93ce2a7 100644
--- a/onnxruntime/core/optimizer/conv_add_fusion.cc
+++ b/onnxruntime/core/optimizer/conv_add_fusion.cc
@@ -9,7 +9,7 @@ using namespace ONNX_NAMESPACE;
 using namespace ::onnxruntime::common;
 namespace onnxruntime {
 
-Status ConvAddFusion::Apply(Graph& graph, Node& node, RewriteRuleEffect& modified) {
+Status ConvAddFusion::Apply(Graph& graph, Node& node, RewriteRuleEffect& modified) const {
   auto& conv_node = node;
   const auto& add_node = *conv_node.OutputNodesBegin();
   const auto& conv_inputs = conv_node.InputDefs();
@@ -107,7 +107,7 @@ Status ConvAddFusion::Apply(Graph& graph, Node& node, RewriteRuleEffect& modifie
   return Status::OK();
 }
 
-bool ConvAddFusion::SatisfyCondition(const Graph& graph, const Node& node) {
+bool ConvAddFusion::SatisfyCondition(const Graph& graph, const Node& node) const {
   if (!graph_utils::IsSupportedOptypeVersionAndDomain(node, "Conv", {1}) ||
       node.GetOutputEdgesCount() != 1) {
     return false;
diff --git a/onnxruntime/core/optimizer/conv_add_fusion.h b/onnxruntime/core/optimizer/conv_add_fusion.h
index 3fe4e92b5abcf..7763e249bd118 100644
--- a/onnxruntime/core/optimizer/conv_add_fusion.h
+++ b/onnxruntime/core/optimizer/conv_add_fusion.h
@@ -23,9 +23,9 @@ class ConvAddFusion : public RewriteRule {
   }
 
  private:
-  bool SatisfyCondition(const Graph& graph, const Node& node) override;
+  bool SatisfyCondition(const Graph& graph, const Node& node) const override;
 
-  Status Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect) override;
+  Status Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect) const override;
 };
 
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/optimizer/conv_bn_fusion.cc b/onnxruntime/core/optimizer/conv_bn_fusion.cc
index f13bf64eafa6e..1d22d7c02f93c 100644
--- a/onnxruntime/core/optimizer/conv_bn_fusion.cc
+++ b/onnxruntime/core/optimizer/conv_bn_fusion.cc
@@ -9,7 +9,7 @@ using namespace ONNX_NAMESPACE;
 using namespace ::onnxruntime::common;
 namespace onnxruntime {
 
-Status ConvBNFusion::Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect) {
+Status ConvBNFusion::Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect) const {
   auto& conv_node = node;
   const Node& bn_node = *conv_node.OutputNodesBegin();
 
@@ -142,7 +142,7 @@ Status ConvBNFusion::Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_eff
   return Status::OK();
 }
 
-bool ConvBNFusion::SatisfyCondition(const Graph& graph, const Node& node) {
+bool ConvBNFusion::SatisfyCondition(const Graph& graph, const Node& node) const {
   if (!graph_utils::IsSupportedOptypeVersionAndDomain(node, "Conv", {1}) ||
       node.GetOutputEdgesCount() != 1) {
     return false;
diff --git a/onnxruntime/core/optimizer/conv_bn_fusion.h b/onnxruntime/core/optimizer/conv_bn_fusion.h
index e23095bfdf49c..cdce82035f032 100644
--- a/onnxruntime/core/optimizer/conv_bn_fusion.h
+++ b/onnxruntime/core/optimizer/conv_bn_fusion.h
@@ -23,9 +23,9 @@ class ConvBNFusion : public RewriteRule {
   }
 
 private:
-  bool SatisfyCondition(const Graph& graph, const Node& node) override;
+  bool SatisfyCondition(const Graph& graph, const Node& node) const override;
 
-  Status Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect) override;
+  Status Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect) const override;
 };
 
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/optimizer/conv_mul_fusion.cc b/onnxruntime/core/optimizer/conv_mul_fusion.cc
index dd27f0357ff39..0e5cbfc5d583d 100644
--- a/onnxruntime/core/optimizer/conv_mul_fusion.cc
+++ b/onnxruntime/core/optimizer/conv_mul_fusion.cc
@@ -9,7 +9,7 @@ using namespace ONNX_NAMESPACE;
 using namespace ::onnxruntime::common;
 namespace onnxruntime {
 
-Status ConvMulFusion::Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect) {
+Status ConvMulFusion::Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect) const {
   auto& conv_node = node;
   const auto& mul_node = *conv_node.OutputNodesBegin();
   const auto& conv_inputs = conv_node.InputDefs();
@@ -105,7 +105,7 @@ Status ConvMulFusion::Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_ef
   return Status::OK();
 }
 
-bool ConvMulFusion::SatisfyCondition(const Graph& graph, const Node& node) {
+bool ConvMulFusion::SatisfyCondition(const Graph& graph, const Node& node) const {
   if (!graph_utils::IsSupportedOptypeVersionAndDomain(node, "Conv", {1}) ||
       node.GetOutputEdgesCount() != 1) {
     return false;
diff --git a/onnxruntime/core/optimizer/conv_mul_fusion.h b/onnxruntime/core/optimizer/conv_mul_fusion.h
index 62a39b624570a..bb6a35bf7f01a 100644
--- a/onnxruntime/core/optimizer/conv_mul_fusion.h
+++ b/onnxruntime/core/optimizer/conv_mul_fusion.h
@@ -22,9 +22,9 @@ class ConvMulFusion : public RewriteRule {
   }
 
 private:
-  bool SatisfyCondition(const Graph& graph, const Node& node) override;
+  bool SatisfyCondition(const Graph& graph, const Node& node) const override;
 
-  Status Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect) override;
+  Status Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect) const override;
 };
 
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/optimizer/dropout_elimination.cc b/onnxruntime/core/optimizer/dropout_elimination.cc
index f08643f0297bc..ff2f1a3477e18 100644
--- a/onnxruntime/core/optimizer/dropout_elimination.cc
+++ b/onnxruntime/core/optimizer/dropout_elimination.cc
@@ -10,7 +10,7 @@
 
 namespace onnxruntime {
 
-Status EliminateDropout::Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect) {
+Status EliminateDropout::Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect) const {
   if (graph_utils::RemoveNode(graph, node)) {
     rule_effect = RewriteRuleEffect::kRemovedCurrentNode;
   }
@@ -18,7 +18,7 @@ Status EliminateDropout::Apply(Graph& graph, Node& node, RewriteRuleEffect& rule
   return Status::OK();
 }
 
-bool EliminateDropout::SatisfyCondition(const Graph& graph, const Node& node) {
+bool EliminateDropout::SatisfyCondition(const Graph& graph, const Node& node) const {
   // We currently support elimination for Dropout operator v1, v6, v7, and v10.
   if (!graph_utils::IsSupportedOptypeVersionAndDomain(node, "Dropout", {1, 6, 7, 10})) {
     return false;
diff --git a/onnxruntime/core/optimizer/dropout_elimination.h b/onnxruntime/core/optimizer/dropout_elimination.h
index 2310eaa4366cb..e840767497e66 100644
--- a/onnxruntime/core/optimizer/dropout_elimination.h
+++ b/onnxruntime/core/optimizer/dropout_elimination.h
@@ -23,9 +23,9 @@ class EliminateDropout : public RewriteRule {
   }
 
 private:
-  bool SatisfyCondition(const Graph& graph, const Node& node) override;
+  bool SatisfyCondition(const Graph& graph, const Node& node) const override;
 
-  Status Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect) override;
+  Status Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect) const override;
 };
 
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc
index c8b37a2a2b71f..5c03fc3538137 100644
--- a/onnxruntime/core/optimizer/graph_transformer_utils.cc
+++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc
@@ -11,6 +11,8 @@
 #include "core/optimizer/conv_activation_fusion.h"
 #include "core/optimizer/gemm_activation_fusion.h"
 #include "core/optimizer/matmul_add_fusion.h"
+#include "core/optimizer/dropout_elimination.h"
+#include "core/optimizer/relu_clip_fusion.h"
 
 namespace onnxruntime {
 
@@ -27,6 +29,9 @@ std::vector<std::unique_ptr<RewriteRule>> GenerateRewriteRules(TransformerLevel
     case TransformerLevel::Level1:
       rules.push_back(std::make_unique<EliminateIdentity>());
       rules.push_back(std::make_unique<EliminateSlice>());
+      rules.push_back(std::make_unique<UnsqueezeElimination>());
+      rules.push_back(std::make_unique<EliminateDropout>());
+      rules.push_back(std::make_unique<FuseReluClip>());
       break;
 
     case TransformerLevel::Level2:
@@ -111,21 +116,21 @@ std::vector<std::unique_ptr<GraphTransformer>> GenerateTransformers(TransformerL
     }
     return transformers;
   }
-  std::vector<std::unique_ptr<GraphTransformer>> filtered_list;
-  // If the rule-based transformer is not empty, it should be included in the custom transformer list below.
-  if (rule_transformer != nullptr) {
-    filtered_list.emplace_back(std::move(rule_transformer));
-  }
-  // pick custom transformers enabled for this session
-  for (const auto& t_name : transformers_and_rules_to_enable) {
-    std::for_each(transformers.begin(), transformers.end(),
-                  [&](std::unique_ptr<GraphTransformer>& item) {
-                    if ((item != nullptr) && (item->Name() == t_name)) {
-                      filtered_list.push_back(std::move(item));
-                    }
-                  });
-  }
-  return filtered_list;
+  std::vector<std::unique_ptr<GraphTransformer>> filtered_list;
+  // If the rule-based transformer is not empty, it should be included in the custom transformer list below.
+  if (rule_transformer != nullptr) {
+    filtered_list.emplace_back(std::move(rule_transformer));
+  }
+  // pick custom transformers enabled for this session
+  for (const auto& t_name : transformers_and_rules_to_enable) {
+    std::for_each(transformers.begin(), transformers.end(),
+                  [&](std::unique_ptr<GraphTransformer>& item) {
+                    if ((item != nullptr) && (item->Name() == t_name)) {
+                      filtered_list.push_back(std::move(item));
+                    }
+                  });
+  }
+  return filtered_list;
 }
 
 }  // namespace transformer_utils
diff --git a/onnxruntime/core/optimizer/identity_elimination.cc b/onnxruntime/core/optimizer/identity_elimination.cc
index 236b98f5887d5..09df7fab42942 100644
--- a/onnxruntime/core/optimizer/identity_elimination.cc
+++ b/onnxruntime/core/optimizer/identity_elimination.cc
@@ -10,7 +10,7 @@
 
 namespace onnxruntime {
 
-Status EliminateIdentity::Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect) {
+Status EliminateIdentity::Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect) const {
   if (graph_utils::RemoveNode(graph, node)) {
     rule_effect = RewriteRuleEffect::kRemovedCurrentNode;
   }
@@ -18,7 +18,7 @@ Status EliminateIdentity::Apply(Graph& graph, Node& node, RewriteRuleEffect& rul
   return Status::OK();
 }
 
-bool EliminateIdentity::SatisfyCondition(const Graph& graph, const Node& node) {
+bool EliminateIdentity::SatisfyCondition(const Graph& graph, const Node& node) const {
   return graph_utils::IsSingleInSingleOutNode(node) &&
          !graph.IsNodeOutputsInGraphOutputs(node);
 }
diff --git a/onnxruntime/core/optimizer/identity_elimination.h b/onnxruntime/core/optimizer/identity_elimination.h
index b90d2164e01d8..55d8c2d8fa33f 100644
--- a/onnxruntime/core/optimizer/identity_elimination.h
+++ b/onnxruntime/core/optimizer/identity_elimination.h
@@ -23,9 +23,9 @@ class EliminateIdentity : public RewriteRule {
   }
 
 private:
-  bool SatisfyCondition(const Graph& graph, const Node& node) override;
+  bool SatisfyCondition(const Graph& graph, const Node& node) const override;
 
-  Status Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect) override;
+  Status Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect) const override;
 };  // namespace onnxruntime
 
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/optimizer/relu_clip_fusion.cc b/onnxruntime/core/optimizer/relu_clip_fusion.cc
new file mode 100644
index 0000000000000..4e82916d4ed3a
--- /dev/null
+++ b/onnxruntime/core/optimizer/relu_clip_fusion.cc
@@ -0,0 +1,53 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/optimizer/relu_clip_fusion.h"
+#include "core/graph/graph.h"
+#include "core/graph/graph_utils.h"
+#include "core/graph/op.h"
+
+namespace onnxruntime {
+
+Status FuseReluClip::Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect) const {
+  // get the following Clip node before we delete the Relu node
+  const auto& next_node = *node.OutputNodesBegin();
+
+  if (graph_utils::RemoveNode(graph, node)) {
+    // update the following Clip node if the 'min' is < 0.f to set it to 0.f
+    // this essentially fuses the Relu and Clip
+    // if the Clip 'min' is >= 0.f no change is required as Relu would have set the min to 0.f
+    if (graph_utils::GetNodeAttribute(next_node, "min")->f() < 0.f) {
+      auto* mutable_next_node = graph.GetNode(next_node.Index());
+      mutable_next_node->ClearAttribute("min");
+      mutable_next_node->AddAttribute("min", 0.f);
+    }
+
+    rule_effect = RewriteRuleEffect::kRemovedCurrentNode;
+  }
+
+  return Status::OK();
+}
+
+bool FuseReluClip::SatisfyCondition(const Graph& graph, const Node& node) const {
+  if (!graph_utils::IsSupportedOptypeVersionAndDomain(node, "Relu", {6})) {
+    return false;
+  }
+
+  if (!graph_utils::IsSingleInSingleOutNode(node) ||
+      graph.IsNodeOutputsInGraphOutputs(node)) {
+    return false;
+  }
+
+  // If the Relu is followed by a Clip node the Relu is redundant and can be removed
+  // as Clip will apply the minimum. If the Clip 'min' value is < 0 we need
+  // to update it to 0 to apply what the Relu would have done. We do that in Apply.
+  const auto& next_node = *node.OutputNodesBegin();
+  if (!graph_utils::IsSupportedOptypeVersionAndDomain(next_node, "Clip", {6}) ||
+      next_node.GetExecutionProviderType() != node.GetExecutionProviderType()) {
+    return false;
+  }
+
+  return true;
+}
+
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/optimizer/relu_clip_fusion.h b/onnxruntime/core/optimizer/relu_clip_fusion.h
new file mode 100644
index 0000000000000..2b90e9c3d85c5
--- /dev/null
+++ b/onnxruntime/core/optimizer/relu_clip_fusion.h
@@ -0,0 +1,29 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/optimizer/rewrite_rule.h"
+
+namespace onnxruntime {
+
+/**
+@Class FuseReluClip
+
+Rewrite rule that merges a Relu operator with a following Clip operator.
+*/
+class FuseReluClip : public RewriteRule {
+ public:
+  FuseReluClip() noexcept : RewriteRule("FuseReluClip") {}
+
+  std::vector<std::string> TargetOpTypes() const noexcept override {
+    return {"Relu"};
+  }
+
+ private:
+  bool SatisfyCondition(const Graph& graph, const Node& node) const override;
+
+  Status Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect) const override;
+};
+
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/optimizer/rule_based_graph_transformer.cc b/onnxruntime/core/optimizer/rule_based_graph_transformer.cc
index 7bafbed87b119..b25e4505a8b4e 100644
--- a/onnxruntime/core/optimizer/rule_based_graph_transformer.cc
+++ b/onnxruntime/core/optimizer/rule_based_graph_transformer.cc
@@ -13,19 +13,23 @@ Status RuleBasedGraphTransformer::Register(std::unique_ptr<RewriteRule> rule) {
   auto op_types = rule->TargetOpTypes();
   // If the target op types are empty, this rule will be evaluated for all op types.
   if (op_types.empty()) {
-    any_op_type_rules_.push_back(std::move(rule));
+    any_op_type_rules_.push_back(*rule);
   } else {
     std::for_each(op_types.cbegin(), op_types.cend(),
-                  [&](const auto& op_type) { op_type_to_rules_[op_type].push_back(std::move(rule)); });
+                  [&](const auto& op_type) { op_type_to_rules_[op_type].push_back(*rule); });
   }
+
+  // Save the unique pointer in the rules_ list.
+  rules_.push_back(std::move(rule));
+
   return Status::OK();
 }
 
 Status RuleBasedGraphTransformer::ApplyRulesOnNode(Graph& graph, Node& node,
-                                                   const std::vector<std::unique_ptr<RewriteRule>>& rules,
+                                                   const std::vector<std::reference_wrapper<const RewriteRule>>& rules,
                                                    RuleEffect& rule_effect) const {
-  for (const auto& rule : rules) {
-    ORT_RETURN_IF_ERROR(rule->CheckConditionAndApply(graph, node, rule_effect));
+  for (const RewriteRule& rule : rules) {
+    ORT_RETURN_IF_ERROR(rule.CheckConditionAndApply(graph, node, rule_effect));
     // If the current node was removed as a result of a rule, stop rule application for that node.
     if (rule_effect == RuleEffect::kRemovedCurrentNode) {
       break;
@@ -56,7 +60,7 @@ Status RuleBasedGraphTransformer::ApplyImpl(Graph& graph, bool& modified, int gr
     // First apply rewrite rules that are registered for the op type of the current node; then apply rules that are
     // registered to be applied regardless of the op type; then recursively apply rules to subgraphs (if any).
     // Stop further rule application for the current node, if the node gets removed by a rule.
-    const std::vector<std::unique_ptr<RewriteRule>>* rules = nullptr;
+    const std::vector<std::reference_wrapper<const RewriteRule>>* rules = nullptr;
 
     rules = GetRewriteRulesForOpType(node->OpType());
     if (rules) {
@@ -84,9 +88,7 @@ Status RuleBasedGraphTransformer::ApplyImpl(Graph& graph, bool& modified, int gr
 }
 
 size_t RuleBasedGraphTransformer::RulesCount() const {
-  return any_op_type_rules_.size() +
-         std::accumulate(op_type_to_rules_.cbegin(), op_type_to_rules_.cend(), size_t(0),
-                         [](size_t sum, const auto& rules) { return sum + rules.second.size(); });
+  return rules_.size();
 }
 
 }  // namespace onnxruntime
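Editor's note: to make the new ownership split concrete — Register now stores the unique_ptr exactly once in rules_ and hands out only references for dispatch, which is what lets one rule instance be registered under several op types and turns RulesCount into a plain rules_.size(). A small usage sketch under those assumptions, mirroring the test code later in this diff (the function name is invented):

#include <memory>

#include "core/optimizer/relu_clip_fusion.h"
#include "core/optimizer/rule_based_graph_transformer.h"

namespace onnxruntime {

Status RegisterExampleRules(RuleBasedGraphTransformer& transformer) {
  // The transformer takes ownership; op_type_to_rules_ / any_op_type_rules_
  // only hold reference_wrappers into the rule stored in rules_.
  ORT_RETURN_IF_ERROR(transformer.Register(std::make_unique<FuseReluClip>()));
  // The rule is counted once even though it could target several op types:
  // transformer.RulesCount() == 1 at this point.
  return Status::OK();
}

}  // namespace onnxruntime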
diff --git a/onnxruntime/core/optimizer/slice_elimination.cc b/onnxruntime/core/optimizer/slice_elimination.cc
index 3a42b6170d75a..65ebd81811218 100644
--- a/onnxruntime/core/optimizer/slice_elimination.cc
+++ b/onnxruntime/core/optimizer/slice_elimination.cc
@@ -8,7 +8,7 @@
 
 namespace onnxruntime {
 
-Status EliminateSlice::Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect) {
+Status EliminateSlice::Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect) const {
   if (graph_utils::RemoveNode(graph, node)) {
     rule_effect = RewriteRuleEffect::kRemovedCurrentNode;
   }
@@ -16,7 +16,7 @@ Status EliminateSlice::Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_e
   return Status::OK();
 }
 
-bool EliminateSlice::SatisfyCondition(const Graph& graph, const Node& node) {
+bool EliminateSlice::SatisfyCondition(const Graph& graph, const Node& node) const {
   // We currently support elimination for Slice operator v1.
   // TODO Extend to support Slice operator v10, which includes "steps" and all attributes are now given as inputs.
   if (!graph_utils::IsSupportedOptypeVersionAndDomain(node, "Slice", {1})) {
diff --git a/onnxruntime/core/optimizer/slice_elimination.h b/onnxruntime/core/optimizer/slice_elimination.h
index 28d689c558097..8a9ed2947417a 100644
--- a/onnxruntime/core/optimizer/slice_elimination.h
+++ b/onnxruntime/core/optimizer/slice_elimination.h
@@ -23,9 +23,9 @@ class EliminateSlice : public RewriteRule {
   }
 
 private:
-  bool SatisfyCondition(const Graph& graph, const Node& node) override;
+  bool SatisfyCondition(const Graph& graph, const Node& node) const override;
 
-  Status Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect) override;
+  Status Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect) const override;
 };
 
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/optimizer/unsqueeze_elimination.cc b/onnxruntime/core/optimizer/unsqueeze_elimination.cc
index 549d415bf244e..c35d078546fa6 100644
--- a/onnxruntime/core/optimizer/unsqueeze_elimination.cc
+++ b/onnxruntime/core/optimizer/unsqueeze_elimination.cc
@@ -10,7 +10,7 @@ using namespace ::onnxruntime::common;
 
 namespace onnxruntime {
 
-Status UnsqueezeElimination::Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect) {
+Status UnsqueezeElimination::Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect) const {
   // Get "axes" attribute.
   const ONNX_NAMESPACE::AttributeProto* attr = graph_utils::GetNodeAttribute(node, "axes");
   if (attr == nullptr || attr->type() != AttributeProto_AttributeType_INTS) {
@@ -74,7 +74,7 @@ Status UnsqueezeElimination::Apply(Graph& graph, Node& node, RewriteRuleEffect&
   return Status::OK();
 }  // namespace onnxruntime
 
-bool UnsqueezeElimination::SatisfyCondition(const Graph& graph, const Node& node) {
+bool UnsqueezeElimination::SatisfyCondition(const Graph& graph, const Node& node) const {
   // Attempt to remove an Unsqueeze operator only if it gets an initializer as input.
   return node.GetInputEdgesCount() == 0 &&
          !graph.IsNodeOutputsInGraphOutputs(node);
diff --git a/onnxruntime/core/optimizer/unsqueeze_elimination.h b/onnxruntime/core/optimizer/unsqueeze_elimination.h
index e8e4dad40057f..3150513c13642 100644
--- a/onnxruntime/core/optimizer/unsqueeze_elimination.h
+++ b/onnxruntime/core/optimizer/unsqueeze_elimination.h
@@ -23,9 +23,9 @@ class UnsqueezeElimination : public RewriteRule {
   }
 
 private:
-  bool SatisfyCondition(const Graph& graph, const Node& node) override;
+  bool SatisfyCondition(const Graph& graph, const Node& node) const override;
 
-  Status Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect) override;
+  Status Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect) const override;
 };
 
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/platform/posix/env.cc b/onnxruntime/core/platform/posix/env.cc
index 92375f6f31c46..2b7e50dee38aa 100644
--- a/onnxruntime/core/platform/posix/env.cc
+++ b/onnxruntime/core/platform/posix/env.cc
@@ -180,9 +180,10 @@ class PosixEnv : public Env {
     char buf[1024];
     const char* msg = "";
     if (e > 0) {
-#if defined(__GLIBC__) && defined(_GNU_SOURCE)
+#if defined(__GLIBC__) && defined(_GNU_SOURCE) && !defined(__ANDROID__)
       msg = strerror_r(e, buf, sizeof(buf));
 #else
+      // for Mac OS X and Android lower than API 23
       if (strerror_r(e, buf, sizeof(buf)) != 0) {
         buf[0] = '\0';
       }
diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc
index c984c8c2c306a..a6eeb5f369a7f 100644
--- a/onnxruntime/core/session/onnxruntime_c_api.cc
+++ b/onnxruntime/core/session/onnxruntime_c_api.cc
@@ -364,36 +364,36 @@ ORT_API_STATUS_IMPL(OrtAddCustomOpDomain, _In_ OrtSessionOptions* options, OrtCu
 }
 
 namespace {
-  template <typename Loader>
-  OrtStatus* CreateSessionImpl(_In_ OrtEnv* env, _In_ const OrtSessionOptions* options,
-                               Loader loader, _Out_ OrtSession** out) {
-    auto sess = std::make_unique<::onnxruntime::InferenceSession>(
-        options == nullptr ? onnxruntime::SessionOptions() : options->value, env->loggingManager);
-    Status status;
-    if (options != nullptr) {
-      if (!options->custom_op_domains_.empty()) {
-        status = sess->AddCustomOpDomains(options->custom_op_domains_);
-        if (!status.IsOK())
-          return ToOrtStatus(status);
-      }
-    }
-
-    if (options != nullptr)
-      for (auto& factory : options->provider_factories) {
-        auto provider = factory->CreateProvider();
-        if (provider)
-          sess->RegisterExecutionProvider(std::move(provider));
-      }
-    status = loader(*sess);
-    if (!status.IsOK())
-      return ToOrtStatus(status);
-    status = sess->Initialize();
-    if (!status.IsOK())
-      return ToOrtStatus(status);
-    *out = reinterpret_cast<OrtSession*>(sess.release());
-    return nullptr;
-  }
-}
+template <typename Loader>
+OrtStatus* CreateSessionImpl(_In_ OrtEnv* env, _In_ const OrtSessionOptions* options,
+                             Loader loader, _Out_ OrtSession** out) {
+  auto sess = std::make_unique<::onnxruntime::InferenceSession>(
+      options == nullptr ? onnxruntime::SessionOptions() : options->value, env->loggingManager);
+  Status status;
+  if (options != nullptr) {
+    if (!options->custom_op_domains_.empty()) {
+      status = sess->AddCustomOpDomains(options->custom_op_domains_);
+      if (!status.IsOK())
+        return ToOrtStatus(status);
+    }
+  }
+
+  if (options != nullptr)
+    for (auto& factory : options->provider_factories) {
+      auto provider = factory->CreateProvider();
+      if (provider)
+        sess->RegisterExecutionProvider(std::move(provider));
+    }
+  status = loader(*sess);
+  if (!status.IsOK())
+    return ToOrtStatus(status);
+  status = sess->Initialize();
+  if (!status.IsOK())
+    return ToOrtStatus(status);
+  *out = reinterpret_cast<OrtSession*>(sess.release());
+  return nullptr;
+}
+}  // namespace
 
 ORT_API_STATUS_IMPL(OrtCreateSession, _In_ OrtEnv* env, _In_ const ORTCHAR_T* model_path,
                     _In_ const OrtSessionOptions* options, _Out_ OrtSession** out) {
@@ -405,18 +405,18 @@ ORT_API_STATUS_IMPL(OrtCreateSession, _In_ OrtEnv* env, _In_ const ORTCHAR_T* mo
   API_IMPL_END
 }
 
-ORT_API_STATUS_IMPL(OrtCreateSessionFromArray, _In_ OrtEnv* env, _In_ const void* model_data, int model_data_len,
+ORT_API_STATUS_IMPL(OrtCreateSessionFromArray, _In_ OrtEnv* env, _In_ const void* model_data, size_t model_data_length,
                     _In_ const OrtSessionOptions* options, _Out_ OrtSession** out) {
   API_IMPL_BEGIN
-  const auto loader = [model_data, model_data_len](InferenceSession& sess) {
-    return sess.Load(model_data, model_data_len);
+  const auto loader = [model_data, model_data_length](InferenceSession& sess) {
+    return sess.Load(model_data, static_cast<int>(model_data_length));
   };
   return CreateSessionImpl(env, options, loader, out);
   API_IMPL_END
 }
 
 ORT_API_STATUS_IMPL(OrtRun, _In_ OrtSession* sess,
-                    _In_ OrtRunOptions* run_options,
+                    _In_ const OrtRunOptions* run_options,
                     _In_ const char* const* input_names, _In_ const OrtValue* const* input, size_t input_len,
                     _In_ const char* const* output_names1, size_t output_names_len, _Out_ OrtValue** output) {
   API_IMPL_BEGIN
diff --git a/onnxruntime/server/main.cc b/onnxruntime/server/main.cc
index f938a98c11fe4..2a3a105f7af43 100644
--- a/onnxruntime/server/main.cc
+++ b/onnxruntime/server/main.cc
@@ -6,11 +6,43 @@
 #include "predict_request_handler.h"
 #include "server_configuration.h"
 
+#define VALUE_TO_STRING(x) #x
+#define VALUE(x) VALUE_TO_STRING(x)
+#define VAR_NAME_VALUE(var) #var "=" VALUE(var)
+
+#define LOCAL_BUILD_VERSION "local_build"
+#if !defined(SRV_VERSION)
+#define SRV_VERSION LOCAL_BUILD_VERSION
+#endif
+#pragma message(VAR_NAME_VALUE(SRV_VERSION))
+
+#define DEFAULT_COMMIT_ID "default"
+#if !defined(LATEST_COMMIT_ID)
+#define LATEST_COMMIT_ID DEFAULT_COMMIT_ID
+#endif
+#pragma message(VAR_NAME_VALUE(LATEST_COMMIT_ID))
+
 namespace beast = boost::beast;
 namespace http = beast::http;
 namespace server = onnxruntime::server;
 
 int main(int argc, char* argv[]) {
+  // Print the version and latest commit id with std::cout, so that the
+  // information is available even if the logger has a problem.
+  std::string version = SRV_VERSION;
+  if (version.empty()) {
+    version = LOCAL_BUILD_VERSION;
+  }
+
+  std::string commit_id = LATEST_COMMIT_ID;
+  if (commit_id.empty()) {
+    commit_id = DEFAULT_COMMIT_ID;
+  }
+
+  std::cout << "Version: " << version << std::endl;
+  std::cout << "Commit ID: " << commit_id << std::endl;
+  std::cout << std::endl;
+
   server::ServerConfiguration config{};
   auto res = config.ParseInput(argc, argv);
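Editor's note: the macro dance above is easy to misread, so here is a standalone sketch of how the version string travels from the build system into the binary. The compile command and values are hypothetical; the point is that the CMake definition SRV_VERSION="${onnxruntime_SERVER_VERSION}" already delivers a quoted string literal, while VAR_NAME_VALUE only stringizes it for the compile-time #pragma message.

// Hypothetical build: g++ -DSRV_VERSION="\"0.4.0\"" -DLATEST_COMMIT_ID="\"abc1234\"" probe.cc
#include <iostream>
#include <string>

#if !defined(SRV_VERSION)
#define SRV_VERSION "local_build"  // same fallback the server's main.cc uses
#endif
#if !defined(LATEST_COMMIT_ID)
#define LATEST_COMMIT_ID "default"
#endif

int main() {
  // SRV_VERSION expands to a string literal, so it initializes std::string directly.
  std::string version = SRV_VERSION;
  std::string commit_id = LATEST_COMMIT_ID;
  std::cout << "Version: " << version << "\nCommit ID: " << commit_id << std::endl;
  return 0;
}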
diff --git a/onnxruntime/test/optimizer/dummy_graph_transformer.h b/onnxruntime/test/optimizer/dummy_graph_transformer.h
index 1bff4af37fcf3..8116d0ba5f18a 100644
--- a/onnxruntime/test/optimizer/dummy_graph_transformer.h
+++ b/onnxruntime/test/optimizer/dummy_graph_transformer.h
@@ -41,13 +41,13 @@ class DummyRewriteRule : public RewriteRule {
   }
 
 private:
-  bool rewrite_rule_invoked_;
+  mutable bool rewrite_rule_invoked_;
 
-  bool SatisfyCondition(const Graph& /*graph*/, const Node& /*node*/) override {
+  bool SatisfyCondition(const Graph& /*graph*/, const Node& /*node*/) const override {
     return true;
   }
 
-  Status Apply(Graph& /*graph*/, Node& /*node*/, RewriteRuleEffect& /*rule_effect*/) override {
+  Status Apply(Graph& /*graph*/, Node& /*node*/, RewriteRuleEffect& /*rule_effect*/) const override {
     rewrite_rule_invoked_ = true;
     return Status::OK();
   }
diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc
index 1d2facbbd7b30..e8a33e5fb6b6c 100644
--- a/onnxruntime/test/optimizer/graph_transform_test.cc
+++ b/onnxruntime/test/optimizer/graph_transform_test.cc
@@ -2,6 +2,7 @@
 // Licensed under the MIT License.
 
 #include "core/session/inference_session.h"
+#include "core/graph/graph_utils.h"
 #include "core/graph/graph_viewer.h"
 #include "core/graph/model.h"
 #include "core/optimizer/graph_transformer.h"
@@ -16,6 +17,7 @@
 #include "core/optimizer/conv_activation_fusion.h"
 #include "core/optimizer/matmul_add_fusion.h"
 #include "core/optimizer/gemm_activation_fusion.h"
+#include "core/optimizer/relu_clip_fusion.h"
 #include "core/framework/data_types.h"
 #include "core/framework/ml_value.h"
 #include "core/util/math.h"
@@ -418,5 +420,72 @@ TEST(GraphTransformationTests, FuseConvBnAddMulFloat16) {
   ASSERT_EQ(expected_values_prod, found);
 }
 
+TEST(GraphTransformationTests, ReluClipFusion) {
+  Model model("ReluClipFusion");
+  auto& graph = model.MainGraph();
+
+  std::vector<NodeArg*> inputs;
+  std::vector<NodeArg*> outputs;
+
+  TypeProto input_tensor_type;
+  input_tensor_type.mutable_tensor_type()->set_elem_type(TensorProto_DataType_FLOAT);
+  input_tensor_type.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1);
+
+  // 3 paths in the model, each with Relu followed by Clip
+  // One has a Clip with min of 0 (remove Relu)
+  // One has a Clip with a min > 1 (remove Relu)
+  // One has a Clip with min < 0 (remove Relu and update Clip 'min' to 0)
+  auto& input0 = graph.GetOrCreateNodeArg("input_0", &input_tensor_type);
+  auto& input1 = graph.GetOrCreateNodeArg("input_1", &input_tensor_type);
+  auto& input2 = graph.GetOrCreateNodeArg("input_2", &input_tensor_type);
+
+  auto& relu0_output = graph.GetOrCreateNodeArg("relu0_output", &input_tensor_type);
+  auto& relu1_output = graph.GetOrCreateNodeArg("relu1_output", &input_tensor_type);
+  auto& relu2_output = graph.GetOrCreateNodeArg("relu2_output", &input_tensor_type);
+
+  auto& clip0_output = graph.GetOrCreateNodeArg("clip0_output", &input_tensor_type);
+  auto& clip1_output = graph.GetOrCreateNodeArg("clip1_output", &input_tensor_type);
+  auto& clip2_output = graph.GetOrCreateNodeArg("clip2_output", &input_tensor_type);
+
+  graph.AddNode("relu0", "Relu", "Relu to eliminate", {&input0}, {&relu0_output});
+  graph.AddNode("relu1", "Relu", "Relu to not eliminate", {&input1}, {&relu1_output});
+  graph.AddNode("relu2", "Relu", "Relu to eliminate and update 'min' of following Clip", {&input2}, {&relu2_output});
+
+  auto& clip0 = graph.AddNode("clip0", "Clip", "Clip with min 0", {&relu0_output}, {&clip0_output});
+  clip0.AddAttribute("min", 0.f);
+  clip0.AddAttribute("max", 1.f);
+
+  auto& clip1 = graph.AddNode("clip1", "Clip", "Clip with min 1", {&relu1_output}, {&clip1_output});
+  clip1.AddAttribute("min", 1.f);
+  clip1.AddAttribute("max", 1.f);
+
+  auto& clip2 = graph.AddNode("clip2", "Clip", "Clip with min -1", {&relu2_output}, {&clip2_output});
+  clip2.AddAttribute("min", -1.f);
+  clip2.AddAttribute("max", 1.f);
+
+  auto status = graph.Resolve();
+  EXPECT_EQ(status, Status::OK());
+
+  std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
+  ASSERT_TRUE(op_to_count["Relu"] == 3);
+
+  auto rule_transformer_L1 = std::make_unique<RuleBasedGraphTransformer>("RuleTransformer1");
+  rule_transformer_L1->Register(std::make_unique<FuseReluClip>());
+  onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
+  graph_transformation_mgr.Register(std::move(rule_transformer_L1), TransformerLevel::Level1);
+  ASSERT_TRUE(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1).IsOK());
+
+  op_to_count = CountOpsInGraph(graph);
+  ASSERT_TRUE(op_to_count["Relu"] == 0);
+
+  // make sure the Clip nodes were updated to have a 'min' >= 0
+  for (auto& node : graph.Nodes()) {
+    if (node.OpType() == "Clip") {
+      auto* min = graph_utils::GetNodeAttribute(node, "min");
+      ASSERT_TRUE(min->f() >= 0.f);
+    }
+  }
+}
+
 }  // namespace test
 }  // namespace onnxruntime
diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc
index a0d6fb491a7bc..3d7d53ad26328 100644
--- a/onnxruntime/test/perftest/ort_test_session.cc
+++ b/onnxruntime/test/perftest/ort_test_session.cc
@@ -15,53 +15,49 @@
 std::chrono::duration<double> OnnxRuntimeTestSession::Run() {
   //Randomly pick one OrtValueArray from test_inputs_. (NOT ThreadSafe)
   const std::uniform_int_distribution<int>::param_type p(0, static_cast<int>(test_inputs_.size() - 1));
   const size_t id = static_cast<size_t>(dist_(rand_engine_, p));
-  OrtValueArray* const input = test_inputs_.at(id);
+  auto& input = test_inputs_.at(id);
   auto start = std::chrono::high_resolution_clock::now();
-  ORT_THROW_ON_ERROR(OrtRun(session_object_, nullptr, input_names_.data(), input->Data(), input_names_.size(),
-                            output_names_raw_ptr.data(), output_names_raw_ptr.size(), output_values_.data()));
+  auto output_values = session_.Run(Ort::RunOptions{nullptr}, input_names_.data(), input.data(), input_names_.size(),
+                                    output_names_raw_ptr.data(), output_names_raw_ptr.size());
   auto end = std::chrono::high_resolution_clock::now();
   std::chrono::duration<double> duration_seconds = end - start;
-  for (size_t i = 0; i != output_values_.size(); ++i) {
-    OrtReleaseValue(output_values_[i]);
-    output_values_[i] = nullptr;
-  }
   return duration_seconds;
 }
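In the rewritten Run() above, Ort::Session::Run returns a std::vector of Ort::Value objects that free themselves, which is why the manual OrtReleaseValue loop disappears. A self-contained sketch of the same timing pattern (function and parameter names are illustrative):

    #include <chrono>
    #include <vector>
    #include "core/session/onnxruntime_cxx_api.h"

    // Times one inference; the returned outputs clean themselves up when
    // 'outputs' goes out of scope, so no release loop is needed.
    std::chrono::duration<double> TimedRun(Ort::Session& session,
                                           const std::vector<const char*>& input_names,
                                           std::vector<Ort::Value>& inputs,
                                           const std::vector<const char*>& output_names) {
      auto start = std::chrono::high_resolution_clock::now();
      std::vector<Ort::Value> outputs = session.Run(Ort::RunOptions{nullptr},
                                                    input_names.data(), inputs.data(), inputs.size(),
                                                    output_names.data(), output_names.size());
      auto end = std::chrono::high_resolution_clock::now();
      return end - start;
    }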
-OnnxRuntimeTestSession::OnnxRuntimeTestSession(OrtEnv* env, std::random_device& rd,
+OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device& rd,
                                                const PerformanceTestConfig& performance_test_config,
                                                const TestModelInfo* m)
     : rand_engine_(rd()), input_names_(m->GetInputCount()), input_length_(m->GetInputCount()) {
-  SessionOptionsWrapper sf(env);
+  Ort::SessionOptions session_options;
   const std::string& provider_name = performance_test_config.machine_config.provider_type_name;
   if (provider_name == onnxruntime::kMklDnnExecutionProvider) {
 #ifdef USE_MKLDNN
-    ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_Mkldnn(sf, performance_test_config.run_config.enable_cpu_mem_arena ? 1 : 0));
+    ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_Mkldnn(session_options, performance_test_config.run_config.enable_cpu_mem_arena ? 1 : 0));
 #else
     ORT_THROW("MKL-DNN is not supported in this build\n");
 #endif
   } else if (provider_name == onnxruntime::kNGraphExecutionProvider) {
 #ifdef USE_NGRAPH
-    ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_NGraph(sf, "CPU"));
+    ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_NGraph(session_options, "CPU"));
 #else
     ORT_THROW("nGraph is not supported in this build");
 #endif
   } else if (provider_name == onnxruntime::kCudaExecutionProvider) {
 #ifdef USE_CUDA
-    ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_CUDA(sf, 0));
+    ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0));
 #else
     ORT_THROW("CUDA is not supported in this build\n");
 #endif
   } else if (provider_name == onnxruntime::kNupharExecutionProvider) {
 #ifdef USE_NUPHAR
-    ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_Nuphar(sf, /*allow_unaligned_buffers*/ 0, 0, ""));
+    ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_Nuphar(session_options, /*allow_unaligned_buffers*/ 0, 0, ""));
 #else
     ORT_THROW("Nuphar is not supported in this build\n");
 #endif
   } else if (provider_name == onnxruntime::kTensorrtExecutionProvider) {
 #ifdef USE_TENSORRT
-    ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_Tensorrt(sf));
-    ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_CUDA(sf, 0));
+    ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_Tensorrt(session_options));
+    ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0));
 #else
     ORT_THROW("TensorRT is not supported in this build\n");
 #endif
@@ -70,44 +66,41 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(OrtEnv* env, std::random_device&
   }

   if (performance_test_config.run_config.enable_cpu_mem_arena)
-    sf.EnableCpuMemArena();
+    session_options.EnableCpuMemArena();
   else
-    sf.DisableCpuMemArena();
+    session_options.DisableCpuMemArena();
   if (performance_test_config.run_config.enable_memory_pattern &&
       performance_test_config.run_config.enable_sequential_execution)
-    sf.EnableMemPattern();
+    session_options.EnableMemPattern();
   else
-    sf.DisableMemPattern();
+    session_options.DisableMemPattern();
   if (performance_test_config.run_config.enable_sequential_execution)
-    sf.EnableSequentialExecution();
+    session_options.EnableSequentialExecution();
   else
-    sf.DisableSequentialExecution();
+    session_options.DisableSequentialExecution();
   fprintf(stdout, "Setting thread pool size to %d\n", performance_test_config.run_config.session_thread_pool_size);
-  sf.SetSessionThreadPoolSize(performance_test_config.run_config.session_thread_pool_size);
+  // Don't set the thread pool size unless it has been changed from our zero default value (as zero will fail)
+  if (performance_test_config.run_config.session_thread_pool_size != 0)
+    session_options.SetThreadPoolSize(performance_test_config.run_config.session_thread_pool_size);

   // Set optimization level.
-  sf.SetSessionGraphOptimizationLevel(performance_test_config.run_config.optimization_level);
+  session_options.SetGraphOptimizationLevel(performance_test_config.run_config.optimization_level);
   if (!performance_test_config.run_config.profile_file.empty())
-    sf.EnableProfiling(performance_test_config.run_config.profile_file.c_str());
-  session_object_ = sf.OrtCreateSession(performance_test_config.model_info.model_file_path.c_str());
+    session_options.EnableProfiling(performance_test_config.run_config.profile_file.c_str());
+  session_ = Ort::Session(env, performance_test_config.model_info.model_file_path.c_str(), session_options);
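The constructor above replaces the internal SessionOptionsWrapper with the public Ort::SessionOptions, using only setters that appear in this diff. A condensed sketch of the same plumbing, with the zero-means-default guard called out; the function and parameter names are illustrative, and returning the options by value assumes the wrapper is movable (the diff's own assignment of session_ suggests the wrappers are):

    #include "core/session/onnxruntime_cxx_api.h"

    // Builds session options the way the rewritten constructor does.
    Ort::SessionOptions MakeOptions(bool use_arena, bool sequential, int thread_pool_size) {
      Ort::SessionOptions options;
      if (use_arena)
        options.EnableCpuMemArena();
      else
        options.DisableCpuMemArena();
      if (sequential)
        options.EnableSequentialExecution();
      else
        options.DisableSequentialExecution();
      // Zero is the "unset" default in the perf-test config; passing it through
      // would fail, hence the same guard the diff adds around SetThreadPoolSize.
      if (thread_pool_size != 0)
        options.SetThreadPoolSize(thread_pool_size);
      return options;
    }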

-  size_t output_count;
-  ORT_THROW_ON_ERROR(OrtSessionGetOutputCount(session_object_, &output_count));
+  size_t output_count = session_.GetOutputCount();
   output_names_.resize(output_count);
-  OrtAllocator* a;
-  ORT_THROW_ON_ERROR(OrtCreateDefaultAllocator(&a));
+  Ort::Allocator a = Ort::Allocator::CreateDefault();
   for (size_t i = 0; i != output_count; ++i) {
-    char* output_name = nullptr;
-    ORT_THROW_ON_ERROR(OrtSessionGetOutputName(session_object_, i, a, &output_name));
+    char* output_name = session_.GetOutputName(i, a);
     assert(output_name != nullptr);
     output_names_[i] = output_name;
-    a->Free(a, output_name);
+    a.Free(output_name);
   }
   output_names_raw_ptr.resize(output_count);
   for (size_t i = 0; i != output_count; ++i) {
     output_names_raw_ptr[i] = output_names_[i].c_str();
   }
-  OrtReleaseAllocator(a);
-  output_values_.resize(output_count);

   size_t input_count = static_cast<size_t>(m->GetInputCount());
   for (size_t i = 0; i != input_count; ++i) {
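The loop above is the general recipe for enumerating outputs with the wrapper types: the name buffer still comes from an OrtAllocator and must be handed back with Free. The same logic as a free function (a sketch; the helper name is assumed):

    #include <string>
    #include <vector>
    #include "core/session/onnxruntime_cxx_api.h"

    // Collects output names, copying each into a std::string and returning the
    // allocator-owned buffer, as the rewritten constructor does.
    std::vector<std::string> GetOutputNames(Ort::Session& session) {
      Ort::Allocator allocator = Ort::Allocator::CreateDefault();
      std::vector<std::string> names;
      size_t count = session.GetOutputCount();
      for (size_t i = 0; i != count; ++i) {
        char* name = session.GetOutputName(i, allocator);
        names.emplace_back(name);
        allocator.Free(name);  // the raw buffer came from the allocator, so hand it back
      }
      return names;
    }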
diff --git a/onnxruntime/test/perftest/ort_test_session.h b/onnxruntime/test/perftest/ort_test_session.h
index 3264a2a92d330..e71b5ad86f17e 100644
--- a/onnxruntime/test/perftest/ort_test_session.h
+++ b/onnxruntime/test/perftest/ort_test_session.h
@@ -11,21 +11,21 @@ namespace onnxruntime {
 namespace perftest {
 class OnnxRuntimeTestSession : public TestSession {
  public:
-  OnnxRuntimeTestSession(OrtEnv* env, std::random_device& rd, const PerformanceTestConfig& performance_test_config,
+  OnnxRuntimeTestSession(Ort::Env& env, std::random_device& rd, const PerformanceTestConfig& performance_test_config,
                          const TestModelInfo* m);

   void PreLoadTestData(size_t test_data_id, size_t input_id, OrtValue* value) override {
     if (test_inputs_.size() < test_data_id + 1) {
       test_inputs_.resize(test_data_id + 1);
     }
-    if (test_inputs_.at(test_data_id) == nullptr) {
-      test_inputs_[test_data_id] = new OrtValueArray(input_length_);
+    if (test_inputs_.at(test_data_id).size() == 0) {
+      for (int i = 0; i < input_length_; i++)
+        test_inputs_[test_data_id].emplace_back(nullptr);
     }
-    test_inputs_[test_data_id]->Set(input_id, value);
+    test_inputs_[test_data_id][input_id] = Ort::Value{value};
   }

   ~OnnxRuntimeTestSession() override {
-    if (session_object_ != nullptr) OrtReleaseSession(session_object_);
     for (char* p : input_names_) {
       free(p);
     }
@@ -35,18 +35,17 @@ class OnnxRuntimeTestSession : public TestSession {
   ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(OnnxRuntimeTestSession);

 private:
-  OrtSession* session_object_ = nullptr;
+  Ort::Session session_{nullptr};
   std::mt19937 rand_engine_;
   std::uniform_int_distribution<int> dist_;
-  std::vector<OrtValueArray*> test_inputs_;
+  std::vector<std::vector<Ort::Value>> test_inputs_;
   std::vector<std::string> output_names_;
   // The same size as output_names_.
   // TODO: implement a customized allocator, then we can remove output_names_ to simplify this code
   std::vector<const char*> output_names_raw_ptr;
-  std::vector<OrtValue*> output_values_;
   std::vector<char*> input_names_;
   const int input_length_;
 };

 }  // namespace perftest
-}  // namespace onnxruntime
\ No newline at end of file
+}  // namespace onnxruntime
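The interesting part of this header change is PreLoadTestData: wrapping the raw OrtValue* in Ort::Value transfers ownership, so the destructor no longer needs to release anything by hand. A minimal sketch of that handoff outside the class (names are illustrative):

    #include <vector>
    #include "core/session/onnxruntime_cxx_api.h"

    // Mirrors PreLoadTestData above: placeholder slots are filled with null
    // values, then the raw pointer is adopted by an Ort::Value, which will
    // call OrtReleaseValue for us when the vector is destroyed.
    void StoreInput(std::vector<std::vector<Ort::Value>>& all_inputs,
                    size_t test_data_id, size_t input_id, size_t input_count,
                    OrtValue* raw_value) {
      if (all_inputs.size() < test_data_id + 1)
        all_inputs.resize(test_data_id + 1);
      if (all_inputs[test_data_id].empty())
        for (size_t i = 0; i < input_count; ++i)
          all_inputs[test_data_id].emplace_back(nullptr);  // placeholder slots
      all_inputs[test_data_id][input_id] = Ort::Value{raw_value};  // takes ownership
    }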
diff --git a/onnxruntime/test/perftest/performance_runner.cc b/onnxruntime/test/perftest/performance_runner.cc
index 51adecc478a8f..d55f2d4f201b0 100644
--- a/onnxruntime/test/perftest/performance_runner.cc
+++ b/onnxruntime/test/perftest/performance_runner.cc
@@ -168,7 +168,7 @@ static TestModelInfo* CreateModelInfo(const PerformanceTestConfig& performance_t
   ORT_NOT_IMPLEMENTED(ToMBString(performance_test_config_.backend), " is not supported");
 }

-static TestSession* CreateSession(OrtEnv* env, std::random_device& rd,
+static TestSession* CreateSession(Ort::Env& env, std::random_device& rd,
                                   const PerformanceTestConfig& performance_test_config_,
                                   TestModelInfo* test_model_info) {
   if (CompareCString(performance_test_config_.backend.c_str(), ORT_TSTR("ort")) == 0) {
@@ -181,7 +181,7 @@ static TestSession* CreateSession(OrtEnv* env, std::random_device& rd,
 #endif
   ORT_NOT_IMPLEMENTED(ToMBString(performance_test_config_.backend), " is not supported");
 }
-PerformanceRunner::PerformanceRunner(OrtEnv* env, const PerformanceTestConfig& test_config, std::random_device& rd)
+PerformanceRunner::PerformanceRunner(Ort::Env& env, const PerformanceTestConfig& test_config, std::random_device& rd)
     : performance_test_config_(test_config),
       test_model_info_(CreateModelInfo(test_config)),
       session_(CreateSession(env, rd, test_config, test_model_info_)) {}
diff --git a/onnxruntime/test/perftest/performance_runner.h b/onnxruntime/test/perftest/performance_runner.h
index 258d4847e2777..d4abaceeea82d 100644
--- a/onnxruntime/test/perftest/performance_runner.h
+++ b/onnxruntime/test/perftest/performance_runner.h
@@ -73,7 +73,7 @@ struct PerformanceResult {
 class PerformanceRunner {
  public:
-  PerformanceRunner(OrtEnv* env, const PerformanceTestConfig& test_config, std::random_device& rd);
+  PerformanceRunner(Ort::Env& env, const PerformanceTestConfig& test_config, std::random_device& rd);
   ~PerformanceRunner();
   Status Run();
diff --git a/onnxruntime/test/providers/cpu/generator/random_test.cc b/onnxruntime/test/providers/cpu/generator/random_test.cc
index 350efbab230de..9e5a804054779 100644
--- a/onnxruntime/test/providers/cpu/generator/random_test.cc
+++ b/onnxruntime/test/providers/cpu/generator/random_test.cc
@@ -246,7 +246,7 @@ TEST(Random, MultinomialGoodCase) {
   const std::vector<int64_t> output_dims{batch_size, num_samples};
 #ifdef _WIN32
   const std::vector<int64_t> expected_output{2, 0, 0, 2, 2, 2, 0, 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 2, 0};
-#elif defined(__MACH__)
+#elif defined(__MACH__) || defined (__ANDROID__)
   const std::vector<int64_t> expected_output{1, 1, 2, 2, 0, 2, 2, 2, 0, 2, 1, 1, 2, 0, 2, 2, 0, 2, 1, 1};
 #else
   const std::vector<int64_t> expected_output{2, 0, 0, 1, 0, 1, 2, 0, 1, 0, 0, 1, 1, 0, 1, 0, 2, 0, 2, 0};
@@ -274,7 +274,7 @@ TEST(Random, MultinomialDefaultDType) {
   const std::vector<int64_t> output_dims{batch_size, num_samples};
 #ifdef _WIN32
   const std::vector<int32_t> expected_output{2, 0, 0, 2, 2, 2, 0, 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 2, 0};
-#elif defined(__MACH__)
+#elif defined(__MACH__) || defined (__ANDROID__)
   const std::vector<int32_t> expected_output{1, 1, 2, 2, 0, 2, 2, 2, 0, 2, 1, 1, 2, 0, 2, 2, 0, 2, 1, 1};
 #else
   const std::vector<int32_t> expected_output{2, 0, 0, 1, 0, 1, 2, 0, 1, 0, 0, 1, 1, 0, 1, 0, 2, 0, 2, 0};
diff --git a/onnxruntime/test/shared_lib/test_fixture.h b/onnxruntime/test/shared_lib/test_fixture.h
index ac01b4c924cec..465f7b0a6a254 100644
--- a/onnxruntime/test/shared_lib/test_fixture.h
+++ b/onnxruntime/test/shared_lib/test_fixture.h
@@ -15,23 +15,20 @@ typedef const char* PATH_TYPE;
 //empty
 static inline void ORT_API_CALL MyLoggingFunction(void*, OrtLoggingLevel, const char*, const char*, const char*, const char*) {
 }
+
 template <bool use_customer_logger>
 class CApiTestImpl : public ::testing::Test {
 protected:
-  OrtEnv* env = nullptr;
+  Ort::Env env_{nullptr};

   void SetUp() override {
     if (use_customer_logger) {
-      ORT_THROW_ON_ERROR(OrtCreateEnvWithCustomLogger(MyLoggingFunction, nullptr, ORT_LOGGING_LEVEL_INFO, "Default", &env));
+      env_ = Ort::Env(ORT_LOGGING_LEVEL_INFO, "Default", MyLoggingFunction, nullptr);
     } else {
-      ORT_THROW_ON_ERROR(OrtCreateEnv(ORT_LOGGING_LEVEL_INFO, "Default", &env));
+      env_ = Ort::Env(ORT_LOGGING_LEVEL_INFO, "Default");
     }
   }

-  void TearDown() override {
-    if (env) OrtReleaseEnv(env);
-  }
-
   // Objects declared here can be used by all tests in the test case for Foo.
 };
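test_fixture.h now shows both Ort::Env constructors: the plain one and the one taking an OrtLoggingFunction. A self-contained sketch of the custom-logger variant; the callback body is an assumption (the fixture's MyLoggingFunction is deliberately empty), and returning the Env by value assumes the wrapper is movable, as the fixture's own assignment implies:

    #include <cstdio>
    #include "core/session/onnxruntime_cxx_api.h"

    // A callback matching the OrtLoggingFunction signature used above;
    // this one just forwards the message text to stderr.
    static void ORT_API_CALL StderrLogger(void* /*param*/, OrtLoggingLevel /*severity*/,
                                          const char* /*category*/, const char* /*logid*/,
                                          const char* /*code_location*/, const char* message) {
      std::fprintf(stderr, "[ort] %s\n", message);
    }

    // Env construction with and without the custom logger, as in SetUp() above.
    Ort::Env MakeEnv(bool use_custom_logger) {
      if (use_custom_logger)
        return Ort::Env(ORT_LOGGING_LEVEL_INFO, "Default", StderrLogger, nullptr);
      return Ort::Env(ORT_LOGGING_LEVEL_INFO, "Default");
    }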
diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc
index 3ccf4bb83dc6d..be8ea0ae0cc30 100644
--- a/onnxruntime/test/shared_lib/test_inference.cc
+++ b/onnxruntime/test/shared_lib/test_inference.cc
@@ -11,7 +11,6 @@
 #include "test_allocator.h"
 #include "test_fixture.h"
 #include "onnx_protobuf.h"
-using namespace onnxruntime;

 struct Input {
   const char* name;
@@ -19,77 +18,65 @@ struct Input {
   std::vector<float> values;
 };

-void RunSession(OrtAllocator* env, OrtSession* session_object,
+void RunSession(OrtAllocator* allocator, Ort::Session& session_object,
                 const std::vector<Input>& inputs,
                 const char* output_name,
                 const std::vector<int64_t>& dims_y,
                 const std::vector<float>& values_y,
-                OrtValue* output_tensor) {
-  std::vector<OrtValue*> ort_inputs;
-  std::vector<std::unique_ptr<OrtValue, decltype(&OrtReleaseValue)>> ort_inputs_cleanup;
+                Ort::Value* output_tensor) {
+  std::vector<Ort::Value> ort_inputs;
   std::vector<const char*> input_names;
   for (size_t i = 0; i < inputs.size(); i++) {
     input_names.emplace_back(inputs[i].name);
-    ort_inputs.emplace_back(OrtCreateTensorWithDataAsOrtValue(env->Info(env), (void*)inputs[i].values.data(), inputs[i].values.size() * sizeof(inputs[i].values[0]), inputs[i].dims, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT));
-    ort_inputs_cleanup.emplace_back(ort_inputs.back(), OrtReleaseValue);
+    ort_inputs.emplace_back(Ort::Value::CreateTensor<float>(allocator->Info(allocator), const_cast<float*>(inputs[i].values.data()), inputs[i].values.size(), inputs[i].dims.data(), inputs[i].dims.size()));
   }

-  // const char* output_names[] = {"Y"};
-  bool is_output_allocated_by_ort = output_tensor == nullptr;
-  OrtValue* old_output_ptr = output_tensor;
-  ORT_THROW_ON_ERROR(OrtRun(session_object, NULL, input_names.data(), ort_inputs.data(), ort_inputs.size(), &output_name, 1, &output_tensor));
-  ASSERT_NE(output_tensor, nullptr);
-  if (!is_output_allocated_by_ort)
-    ASSERT_EQ(output_tensor, old_output_ptr);
-  std::unique_ptr<OrtTensorTypeAndShapeInfo> shape_info;
-  {
-    OrtTensorTypeAndShapeInfo* shape_info_ptr;
-    ORT_THROW_ON_ERROR(OrtGetTensorTypeAndShape(output_tensor, &shape_info_ptr));
-    shape_info.reset(shape_info_ptr);
-  }
-  size_t rtensor_dims = OrtGetDimensionsCount(shape_info.get());
-  std::vector<int64_t> shape_array(rtensor_dims);
-  OrtGetDimensions(shape_info.get(), shape_array.data(), shape_array.size());
-  ASSERT_EQ(shape_array, dims_y);
-  size_t total_len = 1;
-  for (size_t i = 0; i != rtensor_dims; ++i) {
-    total_len *= shape_array[i];
+  std::vector<Ort::Value> ort_outputs;
+  if (output_tensor)
+    session_object.Run(Ort::RunOptions{nullptr}, input_names.data(), ort_inputs.data(), ort_inputs.size(), &output_name, output_tensor, 1);
+  else {
+    ort_outputs = session_object.Run(Ort::RunOptions{nullptr}, input_names.data(), ort_inputs.data(), ort_inputs.size(), &output_name, 1);
+    ASSERT_EQ(ort_outputs.size(), 1);
+    output_tensor = &ort_outputs[0];
   }
+
+  auto type_info = output_tensor->GetTensorTypeAndShapeInfo();
+  ASSERT_EQ(type_info.GetShape(), dims_y);
+  size_t total_len = type_info.GetElementCount();
   ASSERT_EQ(values_y.size(), total_len);
-  float* f;
-  ORT_THROW_ON_ERROR(OrtGetTensorMutableData(output_tensor, (void**)&f));
+
+  float* f = output_tensor->GetTensorMutableData<float>();
   for (size_t i = 0; i != total_len; ++i) {
     ASSERT_EQ(values_y[i], f[i]);
   }
-  if (is_output_allocated_by_ort) OrtReleaseValue(output_tensor);
 }
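RunSession above exercises the two shapes of Ort::Session::Run: the overload returning a vector of ORT-allocated Ort::Value outputs, and the overload that fills caller-provided output tensors in place. Side by side (a sketch; the helper name is made up):

    #include <vector>
    #include "core/session/onnxruntime_cxx_api.h"

    void RunBothWays(Ort::Session& session,
                     const char* const* input_names, Ort::Value* inputs, size_t input_count,
                     const char* output_name, Ort::Value& preallocated_output) {
      // 1) ORT allocates the outputs and returns owning wrappers.
      std::vector<Ort::Value> outputs = session.Run(Ort::RunOptions{nullptr},
                                                    input_names, inputs, input_count,
                                                    &output_name, 1);
      // 2) ORT writes into the tensor the caller allocated up front.
      session.Run(Ort::RunOptions{nullptr}, input_names, inputs, input_count,
                  &output_name, &preallocated_output, 1);
    }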

 template <typename T>
-void TestInference(OrtEnv* env, T model_uri,
+void TestInference(Ort::Env& env, T model_uri,
                    const std::vector<Input>& inputs,
                    const char* output_name,
                    const std::vector<int64_t>& expected_dims_y,
                    const std::vector<float>& expected_values_y,
                    int provider_type, OrtCustomOpDomain* custom_op_domain_ptr) {
-  SessionOptionsWrapper sf(env);
+  Ort::SessionOptions session_options;

   if (provider_type == 1) {
 #ifdef USE_CUDA
-    ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_CUDA(sf, 0));
+    ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0));
     std::cout << "Running simple inference with cuda provider" << std::endl;
 #else
     return;
 #endif
   } else if (provider_type == 2) {
 #ifdef USE_MKLDNN
-    ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_Mkldnn(sf, 1));
+    ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_Mkldnn(session_options, 1));
     std::cout << "Running simple inference with mkldnn provider" << std::endl;
 #else
     return;
 #endif
   } else if (provider_type == 3) {
 #ifdef USE_NUPHAR
-    ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_Nuphar(sf, 1, 0, ""));
+    ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_Nuphar(session_options, 1, 0, ""));
     std::cout << "Running simple inference with nuphar provider" << std::endl;
 #else
     return;
@@ -98,43 +85,32 @@ void TestInference(OrtEnv* env, T model_uri,
     std::cout << "Running simple inference with default provider" << std::endl;
   }
   if (custom_op_domain_ptr) {
-    ORT_THROW_ON_ERROR(OrtAddCustomOpDomain(sf, custom_op_domain_ptr));
+    ORT_THROW_ON_ERROR(OrtAddCustomOpDomain(session_options, custom_op_domain_ptr));
   }
-  std::unique_ptr<OrtSession, decltype(&OrtReleaseSession)>
-      inference_session(sf.OrtCreateSession(model_uri), OrtReleaseSession);
-  std::unique_ptr<MockedOrtAllocator> default_allocator(std::make_unique<MockedOrtAllocator>());
+  Ort::Session session(env, model_uri, session_options);
+  auto default_allocator = std::make_unique<MockedOrtAllocator>();

   // Now run
   //without preallocated output tensor
   RunSession(default_allocator.get(),
-             inference_session.get(),
+             session,
              inputs,
              output_name,
              expected_dims_y,
              expected_values_y,
             nullptr);
   //with preallocated output tensor
-  std::unique_ptr<OrtValue, decltype(&OrtReleaseValue)> value_y(nullptr, OrtReleaseValue);
-  {
-    std::vector<OrtValue*> allocated_outputs(1);
-    std::vector<size_t> dims_y(expected_dims_y.size());
-    for (size_t i = 0; i != expected_dims_y.size(); ++i) {
-      dims_y[i] = expected_dims_y[i];
-    }
+  Ort::Value value_y = Ort::Value::CreateTensor<float>(default_allocator.get(), expected_dims_y.data(), expected_dims_y.size());

-    allocated_outputs[0] =
-        OrtCreateTensorAsOrtValue(default_allocator.get(), dims_y, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT);
-    value_y.reset(allocated_outputs[0]);
-  }
   //test it twice
   for (int i = 0; i != 2; ++i)
     RunSession(default_allocator.get(),
-               inference_session.get(),
+               session,
                inputs,
                output_name,
                expected_dims_y,
               expected_values_y,
-               value_y.get());
+               &value_y);
 }

 static constexpr PATH_TYPE MODEL_URI = TSTR("testdata/mul_1.pb");
@@ -158,7 +134,7 @@ TEST_P(CApiTestWithProvider, simple) {
   std::vector<int64_t> expected_dims_y = {3, 2};
   std::vector<float> expected_values_y = {1.0f, 4.0f, 9.0f, 16.0f, 25.0f, 36.0f};

-  TestInference(env, MODEL_URI, inputs, "Y", expected_dims_y, expected_values_y, GetParam(), nullptr);
+  TestInference(env_, MODEL_URI, inputs, "Y", expected_dims_y, expected_values_y, GetParam(), nullptr);
 }

 INSTANTIATE_TEST_CASE_P(CApiTestWithProviders,
@@ -235,65 +211,54 @@ TEST_F(CApiTest, custom_op_handler) {
   Ort::CustomOpDomain custom_op_domain("");
   custom_op_domain.Add(&custom_op);

-  TestInference(env, CUSTOM_OP_MODEL_URI, inputs, "Y", expected_dims_y, expected_values_y, 0, custom_op_domain);
+  TestInference(env_, CUSTOM_OP_MODEL_URI, inputs, "Y", expected_dims_y, expected_values_y, 0, custom_op_domain);
 }

 #ifdef ORT_RUN_EXTERNAL_ONNX_TESTS
 TEST_F(CApiTest, create_session_without_session_option) {
   constexpr PATH_TYPE model_uri = TSTR("../models/opset8/test_squeezenet/model.onnx");
-  OrtSession* ret;
-  ORT_THROW_ON_ERROR(::OrtCreateSession(env, model_uri, nullptr, &ret));
+  Ort::Session ret(env_, model_uri, Ort::SessionOptions{nullptr});
   ASSERT_NE(nullptr, ret);
-  OrtReleaseSession(ret);
 }
 #endif

 TEST_F(CApiTest, create_tensor) {
   const char* s[] = {"abc", "kmp"};
   int64_t expected_len = 2;
-  std::unique_ptr<MockedOrtAllocator> default_allocator(std::make_unique<MockedOrtAllocator>());
-  {
-    std::unique_ptr<OrtValue, decltype(&OrtReleaseValue)> tensor(
-        OrtCreateTensorAsOrtValue(default_allocator.get(), {expected_len}, ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING),
-        OrtReleaseValue);
-    ORT_THROW_ON_ERROR(OrtFillStringTensor(tensor.get(), s, expected_len));
-    std::unique_ptr<OrtTensorTypeAndShapeInfo> shape_info;
-    {
-      OrtTensorTypeAndShapeInfo* shape_info_ptr;
-      ORT_THROW_ON_ERROR(OrtGetTensorTypeAndShape(tensor.get(), &shape_info_ptr));
-      shape_info.reset(shape_info_ptr);
-    }
-    int64_t len = OrtGetTensorShapeElementCount(shape_info.get());
-    ASSERT_EQ(len, expected_len);
-    std::vector<int64_t> shape_array(len);
-
-    size_t data_len;
-    ORT_THROW_ON_ERROR(OrtGetStringTensorDataLength(tensor.get(), &data_len));
-    std::string result(data_len, '\0');
-    std::vector<size_t> offsets(len);
-    ORT_THROW_ON_ERROR(OrtGetStringTensorContent(tensor.get(), (void*)result.data(), data_len, offsets.data(),
-                                                 offsets.size()));
-  }
+  auto default_allocator = std::make_unique<MockedOrtAllocator>();
+
+  Ort::Value tensor = Ort::Value::CreateTensor(default_allocator.get(), &expected_len, 1, ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING);
+
+  ORT_THROW_ON_ERROR(OrtFillStringTensor(tensor, s, expected_len));
+  auto shape_info = tensor.GetTensorTypeAndShapeInfo();
+
+  int64_t len = shape_info.GetElementCount();
+  ASSERT_EQ(len, expected_len);
+  std::vector<int64_t> shape_array(len);
+
+  size_t data_len = tensor.GetStringTensorDataLength();
+  std::string result(data_len, '\0');
+  std::vector<size_t> offsets(len);
+  tensor.GetStringTensorContent((void*)result.data(), data_len, offsets.data(), offsets.size());
 }
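The rewritten create_tensor test mixes the wrapper with one remaining C call, OrtFillStringTensor, which works because Ort::Value converts implicitly to OrtValue*. A compact sketch of the same string-tensor round trip (the allocator is assumed to outlive the tensor):

    #include <string>
    #include <vector>
    #include "core/session/onnxruntime_cxx_api.h"

    // Writes two strings into a string tensor and reads them back,
    // following the rewritten test above.
    void StringTensorRoundTrip(OrtAllocator* allocator) {
      const char* s[] = {"abc", "kmp"};
      int64_t len = 2;
      Ort::Value tensor = Ort::Value::CreateTensor(allocator, &len, 1,
                                                   ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING);
      ORT_THROW_ON_ERROR(OrtFillStringTensor(tensor, s, 2));  // still a C API call here

      size_t data_len = tensor.GetStringTensorDataLength();
      std::string chars(data_len, '\0');
      std::vector<size_t> offsets(2);
      tensor.GetStringTensorContent((void*)chars.data(), data_len, offsets.data(), offsets.size());
      // 'chars' now holds the concatenated bytes; offsets[i] is where string i starts.
    }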

 TEST_F(CApiTest, create_tensor_with_data) {
   float values[] = {3.0f, 1.0f, 2.f, 0.f};
   constexpr size_t values_length = sizeof(values) / sizeof(values[0]);
-  OrtAllocatorInfo* info;
-  ORT_THROW_ON_ERROR(OrtCreateAllocatorInfo("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault, &info));
+
+  Ort::AllocatorInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault);
+
   std::vector<int64_t> dims = {4};
-  std::unique_ptr<OrtValue, decltype(&OrtReleaseValue)> tensor(
-      OrtCreateTensorWithDataAsOrtValue(info, values, values_length * sizeof(float), dims, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT), OrtReleaseValue);
-  OrtReleaseAllocatorInfo(info);
-  void* new_pointer;
-  ORT_THROW_ON_ERROR(OrtGetTensorMutableData(tensor.get(), &new_pointer));
+  Ort::Value tensor = Ort::Value::CreateTensor<float>(info, values, values_length, dims.data(), dims.size());
+
+  float* new_pointer = tensor.GetTensorMutableData<float>();
   ASSERT_EQ(new_pointer, values);
-  struct OrtTypeInfo* type_info;
-  ORT_THROW_ON_ERROR(OrtGetTypeInfo(tensor.get(), &type_info));
-  const struct OrtTensorTypeAndShapeInfo* tensor_info = OrtCastTypeInfoToTensorInfo(type_info);
+
+  auto type_info = tensor.GetTypeInfo();
+  auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
+
   ASSERT_NE(tensor_info, nullptr);
-  ASSERT_EQ(1, OrtGetDimensionsCount(tensor_info));
-  OrtReleaseTypeInfo(type_info);
+  ASSERT_EQ(1, tensor_info.GetDimensionsCount());
 }

 int main(int argc, char** argv) {
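create_tensor_with_data demonstrates the zero-copy path: CreateTensor over a caller-owned buffer, which is why the test can assert the data pointer is unchanged. The same idea in isolation (a sketch; the buffer must outlive the tensor):

    #include <vector>
    #include "core/session/onnxruntime_cxx_api.h"

    // Wraps a caller-owned float buffer as a tensor without copying,
    // then reads the shape back, as in the rewritten test above.
    void WrapBuffer() {
      float values[] = {3.0f, 1.0f, 2.0f, 0.0f};
      std::vector<int64_t> dims = {4};
      Ort::AllocatorInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault);
      Ort::Value tensor = Ort::Value::CreateTensor<float>(info, values, 4, dims.data(), dims.size());

      // The tensor aliases 'values': no copy was made, so p == values here.
      float* p = tensor.GetTensorMutableData<float>();

      auto shape_info = tensor.GetTypeInfo().GetTensorTypeAndShapeInfo();
      size_t rank = shape_info.GetDimensionsCount();  // 1
      (void)p; (void)rank;
    }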
diff --git a/onnxruntime/test/shared_lib/test_io_types.cc b/onnxruntime/test/shared_lib/test_io_types.cc
index 37651cd17cde6..f34bf111b764a 100644
--- a/onnxruntime/test/shared_lib/test_io_types.cc
+++ b/onnxruntime/test/shared_lib/test_io_types.cc
@@ -4,45 +4,34 @@
 #include "core/session/onnxruntime_cxx_api.h"
 #include "test_fixture.h"

-using namespace onnxruntime;
-
-static void TestModelInfo(const OrtSession* inference_session, bool is_input, const std::vector<int64_t>& dims) {
+static void TestModelInfo(const Ort::Session& session, bool is_input, const std::vector<int64_t>& dims) {
   size_t input_count;
   if (is_input) {
-    ORT_THROW_ON_ERROR(OrtSessionGetInputCount(inference_session, &input_count));
+    input_count = session.GetInputCount();
   } else {
-    ORT_THROW_ON_ERROR(OrtSessionGetOutputCount(inference_session, &input_count));
+    input_count = session.GetOutputCount();
   }
   ASSERT_EQ(1, input_count);
-  std::unique_ptr<OrtTypeInfo> input_type_info;
-  {
-    OrtTypeInfo* t;
-    if (is_input) {
-      ORT_THROW_ON_ERROR(OrtSessionGetInputTypeInfo(inference_session, 0, &t));
-    } else {
-      ORT_THROW_ON_ERROR(OrtSessionGetOutputTypeInfo(inference_session, 0, &t));
-    }
-    input_type_info.reset(t);
-  }
+  Ort::TypeInfo input_type_info = is_input ? session.GetInputTypeInfo(0) : session.GetOutputTypeInfo(0);
   ASSERT_NE(nullptr, input_type_info);
-  enum ONNXType otype = OrtOnnxTypeFromTypeInfo(input_type_info.get());
+
+  ONNXType otype = input_type_info.GetONNXType();
   ASSERT_EQ(ONNX_TYPE_TENSOR, otype);
-  const OrtTensorTypeAndShapeInfo* p = OrtCastTypeInfoToTensorInfo(input_type_info.get());
+  auto p = input_type_info.GetTensorTypeAndShapeInfo();
   ASSERT_NE(nullptr, p);
-  enum ONNXTensorElementDataType ele_type = OrtGetTensorElementType(p);
+  ONNXTensorElementDataType ele_type = p.GetElementType();
   ASSERT_EQ(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, ele_type);
-  ASSERT_EQ(dims.size(), OrtGetDimensionsCount(p));
-  std::vector<int64_t> real_dims(dims.size());
-  OrtGetDimensions(p, real_dims.data(), real_dims.size());
+  ASSERT_EQ(dims.size(), p.GetDimensionsCount());
+  std::vector<int64_t> real_dims = p.GetShape();
   ASSERT_EQ(real_dims, dims);
 }

 TEST_F(CApiTest, input_output_type_info) {
-  SessionOptionsWrapper sf(env);
   constexpr PATH_TYPE model_uri = TSTR("../models/opset8/test_squeezenet/model.onnx");
-  std::unique_ptr<OrtSession, decltype(&OrtReleaseSession)> inference_session(sf.OrtCreateSession(model_uri), OrtReleaseSession);
-  TestModelInfo(inference_session.get(), true, {1, 3, 224, 224});
-  TestModelInfo(inference_session.get(), false, {1, 1000, 1, 1});
+  Ort::SessionOptions session_options;
+  Ort::Session session(env_, model_uri, session_options);
+  TestModelInfo(session, true, {1, 3, 224, 224});
+  TestModelInfo(session, false, {1, 1000, 1, 1});
 }
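The TypeInfo chain above generalizes to any model input or output. A small helper in the same style (a sketch; it assumes the I/O at `index` is a tensor, as the test does):

    #include <vector>
    #include "core/session/onnxruntime_cxx_api.h"

    // Returns the shape of one model input, or an empty vector for non-tensors.
    std::vector<int64_t> GetInputShape(const Ort::Session& session, size_t index) {
      Ort::TypeInfo type_info = session.GetInputTypeInfo(index);
      if (type_info.GetONNXType() != ONNX_TYPE_TENSOR)
        return {};  // only tensors carry a shape here
      auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
      return tensor_info.GetShape();
    }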
diff --git a/onnxruntime/test/shared_lib/test_model_loading.cc b/onnxruntime/test/shared_lib/test_model_loading.cc
index 5537801649d35..4da741a7c627b 100644
--- a/onnxruntime/test/shared_lib/test_model_loading.cc
+++ b/onnxruntime/test/shared_lib/test_model_loading.cc
@@ -81,9 +81,9 @@ TEST_F(CApiTest, model_missing_data) {
   WriteStringToTempFile(test_data, model_url);
   std::unique_ptr<ORTCHAR_T, decltype(&DeleteFileFromDisk)> file_deleter(const_cast<ORTCHAR_T*>(model_url.c_str()), DeleteFileFromDisk);
-  std::unique_ptr<OrtSessionOptions> so(OrtCreateSessionOptions());
+  Ort::SessionOptions so;
   OrtSession* ret;
-  auto st = ::OrtCreateSession(env, model_url.c_str(), so.get(), &ret);
+  auto st = ::OrtCreateSession(env_, model_url.c_str(), so, &ret);
   ASSERT_NE(st, nullptr);
   OrtReleaseStatus(st);
 }
@@ -165,33 +165,25 @@ TEST_F(CApiTest, model_with_external_data) {
   WriteStringToTempFile(model_data.c_str(), model_url);
   std::unique_ptr<ORTCHAR_T, decltype(&DeleteFileFromDisk)> file_deleter(const_cast<ORTCHAR_T*>(model_url.c_str()), DeleteFileFromDisk);
-  std::unique_ptr<OrtSessionOptions> so(OrtCreateSessionOptions());
-  OrtSession* session;
-  auto st = ::OrtCreateSession(env, model_url.c_str(), so.get(), &session);
-  ASSERT_EQ(st, nullptr) << OrtGetErrorMessage(st);
-  OrtReleaseStatus(st);
-  ::OrtReleaseSession(session);
+  Ort::SessionOptions so;
+  Ort::Session session(env_, model_url.c_str(), so);
 }

 TEST_F(CApiTest, model_from_array) {
   const char* model_path = "testdata/matmul_1.pb";

   std::vector<char> buffer;
   {
-      std::ifstream file(model_path, std::ios::binary | std::ios::ate);
-      if (!file)
-        throw std::runtime_error("Error reading model");
-      buffer.resize(file.tellg());
-      file.seekg(0, std::ios::beg);
-      if (!file.read(buffer.data(), buffer.size()))
-        throw std::runtime_error("Error reading model");
+    std::ifstream file(model_path, std::ios::binary | std::ios::ate);
+    if (!file)
+      throw std::runtime_error("Error reading model");
+    buffer.resize(file.tellg());
+    file.seekg(0, std::ios::beg);
+    if (!file.read(buffer.data(), buffer.size()))
+      throw std::runtime_error("Error reading model");
   }

-  std::unique_ptr<OrtSessionOptions> so(OrtCreateSessionOptions());
-  OrtSession* session;
-  auto st = ::OrtCreateSessionFromArray(env, buffer.data(), static_cast<size_t>(buffer.size()), so.get(), &session);
-  ASSERT_EQ(st, nullptr) << OrtGetErrorMessage(st);
-  OrtReleaseStatus(st);
-  ::OrtReleaseSession(session);
+  Ort::SessionOptions so;
+  Ort::Session session(env_, buffer.data(), buffer.size(), so);
 }
 }  // namespace test
 }  // namespace onnxruntime
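model_from_array covers the buffer-loading constructor. The same flow as a reusable helper (a sketch; error handling mirrors the test, and the path is whatever the caller has on hand):

    #include <fstream>
    #include <stdexcept>
    #include <vector>
    #include "core/session/onnxruntime_cxx_api.h"

    // Reads a model file into memory and creates the session from the buffer,
    // as in the rewritten model_from_array test above.
    Ort::Session LoadFromArray(Ort::Env& env, const char* model_path) {
      std::ifstream file(model_path, std::ios::binary | std::ios::ate);
      if (!file)
        throw std::runtime_error("Error reading model");
      std::vector<char> buffer(static_cast<size_t>(file.tellg()));
      file.seekg(0, std::ios::beg);
      if (!file.read(buffer.data(), buffer.size()))
        throw std::runtime_error("Error reading model");

      Ort::SessionOptions so;
      return Ort::Session(env, buffer.data(), buffer.size(), so);
    }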
diff --git a/onnxruntime/test/shared_lib/test_nontensor_types.cc b/onnxruntime/test/shared_lib/test_nontensor_types.cc
index fd9e8c0357d89..7ec63abf25b54 100644
--- a/onnxruntime/test/shared_lib/test_nontensor_types.cc
+++ b/onnxruntime/test/shared_lib/test_nontensor_types.cc
@@ -8,8 +8,6 @@
 #include "test_allocator.h"
 #include

-using namespace onnxruntime;
-
 template <typename T>
 struct RelAllocations {
   RelAllocations(std::function<void(T*)> f) : relf(f) {}
diff --git a/onnxruntime/test/shared_lib/test_session_options.cc b/onnxruntime/test/shared_lib/test_session_options.cc
index 52b372d699636..597a96aaa8240 100644
--- a/onnxruntime/test/shared_lib/test_session_options.cc
+++ b/onnxruntime/test/shared_lib/test_session_options.cc
@@ -7,18 +7,13 @@
 #include "test_fixture.h"
 using namespace onnxruntime;

-TEST_F(CApiTest, session_options) {
-  std::unique_ptr<OrtSessionOptions> options(OrtCreateSessionOptions());
-  ASSERT_NE(options, nullptr);
-}
-
 TEST_F(CApiTest, session_options_graph_optimization_level) {
   // Test set optimization level succeeds when valid level is provided.
   uint32_t valid_optimization_level = static_cast<uint32_t>(TransformerLevel::Level2);
-  std::unique_ptr<OrtSessionOptions> options(OrtCreateSessionOptions());
-  ASSERT_EQ(OrtSetSessionGraphOptimizationLevel(options.get(), valid_optimization_level), 0);
+  Ort::SessionOptions options;
+  options.SetGraphOptimizationLevel(valid_optimization_level);

   // Test set optimization level fails when invalid level is provided.
   uint32_t invalid_level = static_cast<uint32_t>(TransformerLevel::MaxTransformerLevel);
-  ASSERT_EQ(OrtSetSessionGraphOptimizationLevel(options.get(), invalid_level), -1);
-}
\ No newline at end of file
+  ASSERT_EQ(OrtSetSessionGraphOptimizationLevel(options, invalid_level), -1);
+}
diff --git a/tools/ci_build/github/azure-pipelines/linux-ort-srv-nightly-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-ort-srv-nightly-pipeline.yml
index 66871a3aa34a5..9ae7351ac7897 100644
--- a/tools/ci_build/github/azure-pipelines/linux-ort-srv-nightly-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-ort-srv-nightly-pipeline.yml
@@ -30,7 +30,7 @@ jobs:
       pythonInterpreter: '/usr/bin/python3'
       workingDirectory: $(Build.BinariesDirectory)

-  - script: 'tools/ci_build/github/linux/run_dockerbuild.sh -o ubuntu16.04 -d cpu -r $(Build.BinariesDirectory) -x "--config Debug --build_server --use_openmp --use_full_protobuf --enable_server_model_tests"'
+  - script: 'tools/ci_build/github/linux/run_dockerbuild.sh -o ubuntu16.04 -d cpu -r $(Build.BinariesDirectory) -x "--config RelWithDebInfo --build_server --use_openmp --use_full_protobuf --enable_server_model_tests --cmake_extra_defines onnxruntime_SERVER_VERSION=$(cat ./VERSION_NUMBER)-$(Build.BuildNumber) onnxruntime_LATEST_COMMIT_ID=$(Build.SourceVersion)"'
     displayName: 'Run build script with model tests'

   - task: ms.vss-governance-buildtask.governance-build-task-component-detection.ComponentGovernanceComponentDetection@0
diff --git a/tools/ci_build/github/linux/run_build.sh b/tools/ci_build/github/linux/run_build.sh
index 97279f9336fff..d784ed265ae54 100755
--- a/tools/ci_build/github/linux/run_build.sh
+++ b/tools/ci_build/github/linux/run_build.sh
@@ -17,7 +17,7 @@ done
 if [ $BUILD_OS = "android" ]; then
     pushd /onnxruntime_src
     mkdir build-android && cd build-android
-    /opt/cmake/bin/cmake -DCMAKE_TOOLCHAIN_FILE=/android-ndk/build/cmake/android.toolchain.cmake -DANDROID_CPP_FEATURES=exceptions -DANDROID_PLATFORM=android-28 -DANDROID_ABI=arm64-v8a -DCMAKE_BUILD_TYPE=Release -Donnxruntime_CROSS_COMPILING=ON -Donnxruntime_BUILD_x86=OFF -DONNX_CUSTOM_PROTOC_EXECUTABLE=/usr/bin/protoc ../cmake
+    /opt/cmake/bin/cmake -DCMAKE_TOOLCHAIN_FILE=/android-ndk/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DONNX_CUSTOM_PROTOC_EXECUTABLE=/usr/bin/protoc ../cmake
     /opt/cmake/bin/cmake --build . -- -j$(nproc)
 else
     COMMON_BUILD_ARGS="--skip_submodule_sync --enable_onnx_tests --parallel --build_shared_lib --use_openmp"