diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json index 044588c080072..46f8c8891dda5 100644 --- a/cgmanifests/generated/cgmanifest.json +++ b/cgmanifests/generated/cgmanifest.json @@ -346,7 +346,7 @@ "component": { "type": "git", "git": { - "commitHash": "12a3b24c456cebd9fd11f23ac0164f78129b00c6", + "commitHash": "b9b4a37041dec3dd62ac92014a6cc1aece48d9f3", "repositoryUrl": "https://github.com/google/dawn.git" }, "comments": "dawn" diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 304236743fd42..b332583035890 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -93,6 +93,7 @@ option(onnxruntime_USE_OPENVINO "Build with OpenVINO support" OFF) option(onnxruntime_USE_COREML "Build with CoreML support" OFF) option(onnxruntime_USE_NNAPI_BUILTIN "Build with builtin NNAPI lib for Android NNAPI support" OFF) option(onnxruntime_USE_QNN "Build with QNN support" OFF) +option(onnxruntime_BUILD_QNN_EP_STATIC_LIB "Build with QNN EP as a static library" OFF) option(onnxruntime_USE_SNPE "Build with SNPE support" OFF) option(onnxruntime_USE_RKNPU "Build with RKNPU support" OFF) option(onnxruntime_USE_DNNL "Build with DNNL support" OFF) diff --git a/cmake/deps.txt b/cmake/deps.txt index d1a528bd6b4be..c73d9a4e3532f 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -58,5 +58,5 @@ extensions;https://github.com/microsoft/onnxruntime-extensions/archive/f3f6caa6e composable_kernel;https://github.com/ROCmSoftwarePlatform/composable_kernel/archive/204da9c522cebec5220bba52cd3542ebcaf99e7a.zip;1827348efd47831c13074245274d41b7cae8a557 directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e cudnn_frontend;https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.7.0.zip;d0753d8d5b39947ca0729d7773cb84653a129eb1 -dawn;https://github.com/google/dawn/archive/12a3b24c456cebd9fd11f23ac0164f78129b00c6.zip;ad428f6dc16f1336d584f7bad5714e1097dafc43 +dawn;https://github.com/google/dawn/archive/b9b4a37041dec3dd62ac92014a6cc1aece48d9f3.zip;e8b8c2ebabdedb7c57d931fc4a19ae22146d31e1 kleidiai;https://gitlab.arm.com/kleidi/kleidiai/-/archive/d15722976120710080ca098fe8ddabf4556cb40f/kleidiai-d15722976120710080ca098fe8ddabf4556cb40f.zip;d6c840d00c3b05aedf06e957ddaece1013d1f40b diff --git a/cmake/external/abseil-cpp.cmake b/cmake/external/abseil-cpp.cmake index 7b6e2141eeb1b..6b4404a124926 100644 --- a/cmake/external/abseil-cpp.cmake +++ b/cmake/external/abseil-cpp.cmake @@ -32,7 +32,13 @@ FetchContent_Declare( onnxruntime_fetchcontent_makeavailable(abseil_cpp) FetchContent_GetProperties(abseil_cpp) -set(ABSEIL_SOURCE_DIR ${abseil_cpp_SOURCE_DIR}) +if(abseil_cpp_SOURCE_DIR) + set(ABSEIL_SOURCE_DIR ${abseil_cpp_SOURCE_DIR}) + if(onnxruntime_USE_WEBGPU) + set(DAWN_ABSEIL_DIR ${abseil_cpp_SOURCE_DIR}) + endif() +endif() + # abseil_cpp_SOURCE_DIR is non-empty if we build it from source message(STATUS "Abseil source dir:" ${ABSEIL_SOURCE_DIR}) # abseil_cpp_VERSION is non-empty if we find a preinstalled ABSL diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index b8e90026b4f9a..e95656969866f 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -222,6 +222,11 @@ onnxruntime_fetchcontent_makeavailable(Protobuf) if(Protobuf_FOUND) message(STATUS "Protobuf version: ${Protobuf_VERSION}") else() + if(protobuf_SOURCE_DIR) + if(onnxruntime_USE_WEBGPU) + set(DAWN_PROTOBUF_DIR 
${protobuf_SOURCE_DIR}) + endif() + endif() # Adjust warning flags if (TARGET libprotoc) if (NOT MSVC) @@ -626,7 +631,7 @@ if (onnxruntime_USE_WEBGPU) URL_HASH SHA1=${DEP_SHA1_dawn} # All previous patches are merged into the upstream dawn project. We don't need to apply any patches right now. # if we need to apply patches in the future, we can uncomment the following line. - PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/dawn/dawn.patch + # PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/dawn/dawn.patch ) endif() diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake index d72b61a0859b2..78edb4179fafd 100644 --- a/cmake/onnxruntime.cmake +++ b/cmake/onnxruntime.cmake @@ -199,17 +199,12 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Android" AND onnxruntime_BUILD_JAVA) endforeach() endif() -# This list is a reversed topological ordering of library dependencies. -# Earlier entries may depend on later ones. Later ones should not depend on earlier ones. -set(onnxruntime_INTERNAL_LIBRARIES - onnxruntime_session - ${onnxruntime_libs} +set(onnxruntime_INTERNAL_PROVIDER_LIBRARIES ${PROVIDERS_ACL} ${PROVIDERS_ARMNN} ${PROVIDERS_COREML} ${PROVIDERS_DML} ${PROVIDERS_NNAPI} - ${PROVIDERS_QNN} ${PROVIDERS_SNPE} ${PROVIDERS_RKNPU} ${PROVIDERS_VSINPU} @@ -218,6 +213,18 @@ set(onnxruntime_INTERNAL_LIBRARIES ${PROVIDERS_WEBNN} ${PROVIDERS_AZURE} ${PROVIDERS_INTERNAL_TESTING} +) + +if (onnxruntime_BUILD_QNN_EP_STATIC_LIB) + list(APPEND onnxruntime_INTERNAL_PROVIDER_LIBRARIES onnxruntime_providers_qnn) +endif() + +# This list is a reversed topological ordering of library dependencies. +# Earlier entries may depend on later ones. Later ones should not depend on earlier ones. 
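A minimal illustration of why this ordering matters, using hypothetical targets libA and libB (not taken from this diff): with static archives, single-pass linkers such as GNU ld resolve symbols strictly left to right, so a library must be listed before the archives it pulls symbols from.

  add_executable(app main.cc)
  # libA calls into libB, so libA must come first:
  target_link_libraries(app PRIVATE libA libB)

Listing libB first can fail with undefined references to libB's symbols, which is why this list keeps dependents ahead of their dependencies.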
+set(onnxruntime_INTERNAL_LIBRARIES + onnxruntime_session + ${onnxruntime_libs} + ${onnxruntime_INTERNAL_PROVIDER_LIBRARIES} ${onnxruntime_winml} onnxruntime_optimizer onnxruntime_providers diff --git a/cmake/onnxruntime_java.cmake b/cmake/onnxruntime_java.cmake index b15b9632e9e24..1227264e595ed 100644 --- a/cmake/onnxruntime_java.cmake +++ b/cmake/onnxruntime_java.cmake @@ -148,7 +148,7 @@ if (WIN32) if(NOT onnxruntime_ENABLE_STATIC_ANALYSIS) add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${JAVA_PACKAGE_LIB_DIR}/$) add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${JAVA_PACKAGE_JNI_DIR}/$) - if (onnxruntime_USE_CUDA OR onnxruntime_USE_DNNL OR onnxruntime_USE_OPENVINO OR onnxruntime_USE_TENSORRT) + if (onnxruntime_USE_CUDA OR onnxruntime_USE_DNNL OR onnxruntime_USE_OPENVINO OR onnxruntime_USE_TENSORRT OR (onnxruntime_USE_QNN AND NOT onnxruntime_BUILD_QNN_EP_STATIC_LIB)) add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${JAVA_PACKAGE_LIB_DIR}/$) endif() if (onnxruntime_USE_CUDA) @@ -163,11 +163,14 @@ if (WIN32) if (onnxruntime_USE_TENSORRT) add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${JAVA_PACKAGE_LIB_DIR}/$) endif() + if (onnxruntime_USE_QNN AND NOT onnxruntime_BUILD_QNN_EP_STATIC_LIB) + add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${JAVA_PACKAGE_LIB_DIR}/$) + endif() endif() else() add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${JAVA_PACKAGE_LIB_DIR}/$) add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${JAVA_PACKAGE_JNI_DIR}/$) - if (onnxruntime_USE_CUDA OR onnxruntime_USE_DNNL OR onnxruntime_USE_OPENVINO OR onnxruntime_USE_TENSORRT) + if (onnxruntime_USE_CUDA OR onnxruntime_USE_DNNL OR onnxruntime_USE_OPENVINO OR onnxruntime_USE_TENSORRT OR (onnxruntime_USE_QNN AND NOT onnxruntime_BUILD_QNN_EP_STATIC_LIB)) add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${JAVA_PACKAGE_LIB_DIR}/$) endif() if (onnxruntime_USE_CUDA) @@ -182,6 +185,9 @@ else() if (onnxruntime_USE_TENSORRT) add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${JAVA_PACKAGE_LIB_DIR}/$) endif() + if (onnxruntime_USE_QNN AND NOT onnxruntime_BUILD_QNN_EP_STATIC_LIB) + add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${JAVA_PACKAGE_LIB_DIR}/$) + endif() endif() # run the build process (this copies the results back into CMAKE_CURRENT_BINARY_DIR) diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake index 5124262ec0004..ed3ad89247975 100644 --- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake @@ -95,6 +95,8 @@ function(setup_mlas_source_for_windows) ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon.h ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon.cpp ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon_fp16.cpp + ${MLAS_SRC_DIR}/hgemm_kernel_neon.cpp + ${MLAS_SRC_DIR}/halfgemm_kernel_neon_fp16.cpp ) set(mlas_platform_preprocess_srcs @@ -374,6 +376,7 @@ else() ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8.cpp ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon.h ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon.cpp + ${MLAS_SRC_DIR}/hgemm_kernel_neon.cpp ) 
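The new half-precision GEMM kernels added above follow the file-scoped ISA pattern used immediately below: the -march extension is attached to a single translation unit via set_source_files_properties rather than to the whole target, so the rest of MLAS still builds for the baseline architecture and the fp16 path is only entered behind a runtime CPU-feature check. The shape of the pattern, with a hypothetical file name:

  set_source_files_properties(${MLAS_SRC_DIR}/example_kernel_fp16.cpp
                              PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")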
set_source_files_properties(${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+dotprod") @@ -394,6 +397,7 @@ else() ${MLAS_SRC_DIR}/cast_kernel_neon.cpp ${MLAS_SRC_DIR}/hqnbitgemm_kernel_neon_fp16.cpp ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon_fp16.cpp + ${MLAS_SRC_DIR}/halfgemm_kernel_neon_fp16.cpp ) set_source_files_properties(${MLAS_SRC_DIR}/aarch64/HalfGemmKernelNeon.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ") set_source_files_properties(${MLAS_SRC_DIR}/aarch64/QgemmS8S8KernelSmmla.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+i8mm ") @@ -406,6 +410,7 @@ else() set_source_files_properties(${MLAS_SRC_DIR}/cast_kernel_neon.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ") set_source_files_properties(${MLAS_SRC_DIR}/hqnbitgemm_kernel_neon_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ") set_source_files_properties(${MLAS_SRC_DIR}/rotary_embedding_kernel_neon_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ") + set_source_files_properties(${MLAS_SRC_DIR}/halfgemm_kernel_neon_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ") endif() if(ONNXRUNTIME_MLAS_MULTI_ARCH) diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake index 582491de9503d..67fa48b28278d 100644 --- a/cmake/onnxruntime_providers.cmake +++ b/cmake/onnxruntime_providers.cmake @@ -74,9 +74,6 @@ endif() if(onnxruntime_USE_JSEP) set(PROVIDERS_JS onnxruntime_providers_js) endif() -if(onnxruntime_USE_QNN) - set(PROVIDERS_QNN onnxruntime_providers_qnn) -endif() if(onnxruntime_USE_RKNPU) set(PROVIDERS_RKNPU onnxruntime_providers_rknpu) endif() diff --git a/cmake/onnxruntime_providers_coreml.cmake b/cmake/onnxruntime_providers_coreml.cmake index ec7bc7a98969e..18048c8cdce2f 100644 --- a/cmake/onnxruntime_providers_coreml.cmake +++ b/cmake/onnxruntime_providers_coreml.cmake @@ -8,25 +8,18 @@ endif() add_compile_definitions(USE_COREML=1) # Check if we can build the coremltools code for creating an mlpackage with an mlprogram. -# The coremltools source requires std::filesystem::path which is only available from iOS 13 on. -set(_enable_ML_PROGRAM ON) -if (IOS AND CMAKE_OSX_DEPLOYMENT_TARGET VERSION_LESS 13.0) - message(WARNING "CoreML ML Program is not supported on iOS < 13.0. Excluding ML Program support from build.") - set(_enable_ML_PROGRAM OFF) -elseif(LINUX) - # uuid-dev is required. we don't bother installing on CIs as it's really for manual developer testing. +if(LINUX) find_library(LibUUID_LIBRARY NAMES uuid) find_path(LibUUID_INCLUDE_DIR NAMES uuid/uuid.h) if (NOT LibUUID_INCLUDE_DIR) - message(STATUS "uuid/uuid.h was not found as is required for ML Program support. " + message(FATAL_ERROR "uuid/uuid.h was not found as is required for ML Program support. " "Run `sudo apt install uuid-dev` if you need to test ML Program related CoreML EP code. ") - set(_enable_ML_PROGRAM OFF) endif() endif() -if (_enable_ML_PROGRAM) - add_compile_definitions(COREML_ENABLE_MLPROGRAM=1) -endif() + +add_compile_definitions(COREML_ENABLE_MLPROGRAM=1) + # Compile CoreML proto definition to ${CMAKE_CURRENT_BINARY_DIR}/coreml_proto set(COREML_PROTO_ROOT ${coremltools_SOURCE_DIR}/mlmodel/format) @@ -93,10 +86,10 @@ file(GLOB_RECURSE "${ONNXRUNTIME_ROOT}/core/providers/coreml/builders/*.cc" ) -if(_enable_ML_PROGRAM) + # Add helpers to create mlpackage weights. limit to just the files we need to minimize the changes to make them # build on Windows and Linux. 
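One detail in the globs that follow: the ".?pp" suffix is a CMake glob in which "?" matches exactly one character, so a single pattern picks up a header/source pair. A sketch with a hypothetical variable name:

  file(GLOB example_filewriter_srcs CONFIGURE_DEPENDS
       "${coremltools_SOURCE_DIR}/mlmodel/src/MILBlob/Blob/FileWriter.?pp")
  # matches both FileWriter.cpp and FileWriter.hpp

CONFIGURE_DEPENDS additionally tells CMake to re-check the glob at build time and reconfigure when the matched file set changes.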
- file(GLOB +file(GLOB onnxruntime_providers_coreml_milblob_cc_srcs CONFIGURE_DEPENDS "${coremltools_SOURCE_DIR}/mlmodel/src/MILBlob/*.hpp" "${coremltools_SOURCE_DIR}/mlmodel/src/MILBlob/*.cpp" @@ -105,22 +98,22 @@ if(_enable_ML_PROGRAM) "${coremltools_SOURCE_DIR}/mlmodel/src/MILBlob/Blob/StorageFormat.hpp" "${coremltools_SOURCE_DIR}/mlmodel/src/MILBlob/Blob/FileWriter.?pp" "${coremltools_SOURCE_DIR}/mlmodel/src/MILBlob/Blob/StorageWriter.?pp" - ) +) - # Add helpers to create mlpackage - file(GLOB +# Add helpers to create mlpackage +file(GLOB onnxruntime_providers_coreml_modelpackage_cc_srcs CONFIGURE_DEPENDS "${coremltools_SOURCE_DIR}/modelpackage/src/ModelPackage.?pp" "${coremltools_SOURCE_DIR}/modelpackage/src/utils/JsonMap.?pp" - ) +) - set(coremltools_srcs +set(coremltools_srcs ${onnxruntime_providers_coreml_milblob_cc_srcs} ${onnxruntime_providers_coreml_modelpackage_cc_srcs} - ) +) + +source_group(TREE ${coremltools_SOURCE_DIR} PREFIX coremltools FILES ${coremltools_srcs}) - source_group(TREE ${coremltools_SOURCE_DIR} PREFIX coremltools FILES ${coremltools_srcs}) -endif() # Add CoreML objective c++ source code if (APPLE) @@ -174,34 +167,34 @@ if (APPLE) target_compile_definitions(onnxruntime_providers_coreml PRIVATE __APPLE__) endif() -if (_enable_ML_PROGRAM) - # Setup coremltools fp16 and json dependencies for creating an mlpackage. - # - # fp16 depends on psimd - FetchContent_Declare(psimd URL ${DEP_URL_psimd} URL_HASH SHA1=${DEP_SHA1_psimd}) - onnxruntime_fetchcontent_makeavailable(psimd) - set(PSIMD_SOURCE_DIR ${psimd_SOURCE_DIR}) - FetchContent_Declare(fp16 URL ${DEP_URL_fp16} URL_HASH SHA1=${DEP_SHA1_fp16}) - set(FP16_BUILD_TESTS OFF CACHE INTERNAL "") - set(FP16_BUILD_BENCHMARKS OFF CACHE INTERNAL "") - onnxruntime_fetchcontent_makeavailable(fp16) - - # need to tweak the include paths to match what the coreml source code expects - target_include_directories(onnxruntime_providers_coreml PRIVATE - ${fp16_SOURCE_DIR}/include - ${nlohmann_json_SOURCE_DIR}/single_include/nlohmann - ${coremltools_SOURCE_DIR} - ${coremltools_SOURCE_DIR}/mlmodel/src/ - ${coremltools_SOURCE_DIR}/modelpackage/src/ - ) - add_dependencies(onnxruntime_providers_coreml nlohmann_json::nlohmann_json fp16) +# Setup coremltools fp16 and json dependencies for creating an mlpackage. 
+# +# fp16 depends on psimd +FetchContent_Declare(psimd URL ${DEP_URL_psimd} URL_HASH SHA1=${DEP_SHA1_psimd}) +onnxruntime_fetchcontent_makeavailable(psimd) +set(PSIMD_SOURCE_DIR ${psimd_SOURCE_DIR}) +FetchContent_Declare(fp16 URL ${DEP_URL_fp16} URL_HASH SHA1=${DEP_SHA1_fp16}) +set(FP16_BUILD_TESTS OFF CACHE INTERNAL "") +set(FP16_BUILD_BENCHMARKS OFF CACHE INTERNAL "") +onnxruntime_fetchcontent_makeavailable(fp16) + +# need to tweak the include paths to match what the coreml source code expects +target_include_directories(onnxruntime_providers_coreml PRIVATE + ${fp16_SOURCE_DIR}/include + ${nlohmann_json_SOURCE_DIR}/single_include/nlohmann + ${coremltools_SOURCE_DIR} + ${coremltools_SOURCE_DIR}/mlmodel/src/ + ${coremltools_SOURCE_DIR}/modelpackage/src/ +) - if (LINUX) - target_link_libraries(onnxruntime_providers_coreml PRIVATE uuid) - endif() +add_dependencies(onnxruntime_providers_coreml nlohmann_json::nlohmann_json fp16) + +if (LINUX) + target_link_libraries(onnxruntime_providers_coreml PRIVATE uuid) endif() + if (APPLE) target_link_libraries(onnxruntime_providers_coreml PRIVATE "-framework Foundation" "-framework CoreML") endif() diff --git a/cmake/onnxruntime_providers_cpu.cmake b/cmake/onnxruntime_providers_cpu.cmake index 91a2b13002ec9..4ae89a392278f 100644 --- a/cmake/onnxruntime_providers_cpu.cmake +++ b/cmake/onnxruntime_providers_cpu.cmake @@ -239,7 +239,9 @@ if (NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_EXTENDED_MINIMAL_BUILD set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker -exported_symbols_list ${ONNXRUNTIME_ROOT}/core/providers/shared/exported_symbols.lst") elseif(UNIX) if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") - set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/shared/version_script.lds -Xlinker --gc-sections") + target_link_options(onnxruntime_providers_shared PRIVATE + "LINKER:--version-script=${ONNXRUNTIME_ROOT}/core/providers/shared/version_script.lds" + "LINKER:--gc-sections") endif() elseif(WIN32) set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/shared/symbols.def") diff --git a/cmake/onnxruntime_providers_qnn.cmake b/cmake/onnxruntime_providers_qnn.cmake index b68d84c23bb32..303020145889b 100644 --- a/cmake/onnxruntime_providers_qnn.cmake +++ b/cmake/onnxruntime_providers_qnn.cmake @@ -3,41 +3,89 @@ add_compile_definitions(USE_QNN=1) - # These are shared utils, - # TODO, move to a separate lib when used by EPs other than QNN, NNAPI and CoreML - file(GLOB onnxruntime_providers_shared_utils_cc_srcs CONFIGURE_DEPENDS - "${ONNXRUNTIME_ROOT}/core/providers/shared/utils/utils.h" - "${ONNXRUNTIME_ROOT}/core/providers/shared/utils/utils.cc" - ) + if(onnxruntime_BUILD_QNN_EP_STATIC_LIB) + add_compile_definitions(BUILD_QNN_EP_STATIC_LIB=1) + endif() file(GLOB_RECURSE - onnxruntime_providers_qnn_ep_cc_srcs CONFIGURE_DEPENDS - "${ONNXRUNTIME_ROOT}/core/providers/qnn/*.h" - "${ONNXRUNTIME_ROOT}/core/providers/qnn/*.cc" + onnxruntime_providers_qnn_ep_srcs CONFIGURE_DEPENDS + "${ONNXRUNTIME_ROOT}/core/providers/qnn/*.h" + "${ONNXRUNTIME_ROOT}/core/providers/qnn/*.cc" ) - file(GLOB_RECURSE - onnxruntime_providers_qnn_builder_cc_srcs CONFIGURE_DEPENDS - "${ONNXRUNTIME_ROOT}/core/providers/qnn/builder/*.h" - "${ONNXRUNTIME_ROOT}/core/providers/qnn/builder/*.cc" - ) + if(onnxruntime_BUILD_QNN_EP_STATIC_LIB) + # + # Build QNN EP as a static library + # + 
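For orientation: this first branch archives the EP directly into the main onnxruntime library, while the shared branch further down builds a module that the ONNX Runtime core locates and loads at runtime (the EP links onnxruntime_providers_shared for the common provider-bridge glue). Assuming the repo's onnxruntime_add_static_library and onnxruntime_add_shared_library_module helpers reduce to the stock CMake calls (an assumption, with placeholder names):

  add_library(example_ep STATIC ${example_srcs})         # linked into the main library
  add_library(example_ep_module MODULE ${example_srcs})  # loaded with dlopen()/LoadLibrary(); nothing links against it

MODULE rather than SHARED is the idiomatic CMake choice for the runtime-loaded case, since no build-time consumer needs an import library for it.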
set(onnxruntime_providers_qnn_srcs ${onnxruntime_providers_qnn_ep_srcs}) + source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_qnn_srcs}) + onnxruntime_add_static_library(onnxruntime_providers_qnn ${onnxruntime_providers_qnn_srcs}) + onnxruntime_add_include_to_target(onnxruntime_providers_qnn onnxruntime_common onnxruntime_framework onnx + onnx_proto protobuf::libprotobuf-lite + flatbuffers::flatbuffers Boost::mp11) + add_dependencies(onnxruntime_providers_qnn onnx ${onnxruntime_EXTERNAL_DEPENDENCIES}) + set_target_properties(onnxruntime_providers_qnn PROPERTIES CXX_STANDARD_REQUIRED ON) + set_target_properties(onnxruntime_providers_qnn PROPERTIES FOLDER "ONNXRuntime") + target_include_directories(onnxruntime_providers_qnn PRIVATE ${ONNXRUNTIME_ROOT} + ${onnxruntime_QNN_HOME}/include/QNN + ${onnxruntime_QNN_HOME}/include) + set_target_properties(onnxruntime_providers_qnn PROPERTIES LINKER_LANGUAGE CXX) - set(onnxruntime_providers_qnn_cc_srcs - ${onnxruntime_providers_shared_utils_cc_srcs} - ${onnxruntime_providers_qnn_ep_cc_srcs} - ${onnxruntime_providers_qnn_builder_cc_srcs} - ) + # ignore the warning unknown-pragmas on "pragma region" + if(NOT MSVC) + target_compile_options(onnxruntime_providers_qnn PRIVATE "-Wno-unknown-pragmas") + endif() + else() + # + # Build QNN EP as a shared library + # + file(GLOB_RECURSE + onnxruntime_providers_qnn_shared_lib_srcs CONFIGURE_DEPENDS + "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.h" + "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.cc" + ) + set(onnxruntime_providers_qnn_srcs ${onnxruntime_providers_qnn_ep_srcs} + ${onnxruntime_providers_qnn_shared_lib_srcs}) + + source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_qnn_srcs}) + onnxruntime_add_shared_library_module(onnxruntime_providers_qnn ${onnxruntime_providers_qnn_srcs}) + onnxruntime_add_include_to_target(onnxruntime_providers_qnn ${ONNXRUNTIME_PROVIDERS_SHARED} ${GSL_TARGET} onnx + onnxruntime_common Boost::mp11 safeint_interface) + target_link_libraries(onnxruntime_providers_qnn PRIVATE ${ONNXRUNTIME_PROVIDERS_SHARED} ${ABSEIL_LIBS} ${CMAKE_DL_LIBS}) + add_dependencies(onnxruntime_providers_qnn onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES}) + target_include_directories(onnxruntime_providers_qnn PRIVATE ${ONNXRUNTIME_ROOT} + ${CMAKE_CURRENT_BINARY_DIR} + ${onnxruntime_QNN_HOME}/include/QNN + ${onnxruntime_QNN_HOME}/include) + + # Set linker flags for function(s) exported by EP DLL + if(UNIX) + target_link_options(onnxruntime_providers_qnn PRIVATE + "LINKER:--version-script=${ONNXRUNTIME_ROOT}/core/providers/qnn/version_script.lds" + "LINKER:--gc-sections" + "LINKER:-rpath=\$ORIGIN" + ) + elseif(WIN32) + set_property(TARGET onnxruntime_providers_qnn APPEND_STRING PROPERTY LINK_FLAGS + "-DEF:${ONNXRUNTIME_ROOT}/core/providers/qnn/symbols.def") + else() + message(FATAL_ERROR "onnxruntime_providers_qnn unknown platform, need to specify shared library exports for it") + endif() + + # Set compile options + if(MSVC) + target_compile_options(onnxruntime_providers_qnn PUBLIC /wd4099 /wd4005) + else() + # ignore the warning unknown-pragmas on "pragma region" + target_compile_options(onnxruntime_providers_qnn PRIVATE "-Wno-unknown-pragmas") + endif() + + set_target_properties(onnxruntime_providers_qnn PROPERTIES LINKER_LANGUAGE CXX) + set_target_properties(onnxruntime_providers_qnn PROPERTIES CXX_STANDARD_REQUIRED ON) + set_target_properties(onnxruntime_providers_qnn PROPERTIES FOLDER "ONNXRuntime") - source_group(TREE 
${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_qnn_cc_srcs}) - onnxruntime_add_static_library(onnxruntime_providers_qnn ${onnxruntime_providers_qnn_cc_srcs}) - onnxruntime_add_include_to_target(onnxruntime_providers_qnn onnxruntime_common onnxruntime_framework onnx onnx_proto protobuf::libprotobuf-lite flatbuffers::flatbuffers Boost::mp11) - target_link_libraries(onnxruntime_providers_qnn) - add_dependencies(onnxruntime_providers_qnn onnx ${onnxruntime_EXTERNAL_DEPENDENCIES}) - set_target_properties(onnxruntime_providers_qnn PROPERTIES CXX_STANDARD_REQUIRED ON) - set_target_properties(onnxruntime_providers_qnn PROPERTIES FOLDER "ONNXRuntime") - target_include_directories(onnxruntime_providers_qnn PRIVATE ${ONNXRUNTIME_ROOT} ${onnxruntime_QNN_HOME}/include/QNN ${onnxruntime_QNN_HOME}/include) - set_target_properties(onnxruntime_providers_qnn PROPERTIES LINKER_LANGUAGE CXX) - # ignore the warning unknown-pragmas on "pragma region" - if(NOT MSVC) - target_compile_options(onnxruntime_providers_qnn PRIVATE "-Wno-unknown-pragmas") + install(TARGETS onnxruntime_providers_qnn + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) endif() diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake index 5b29d1093aa5c..15a2862cede0c 100644 --- a/cmake/onnxruntime_python.cmake +++ b/cmake/onnxruntime_python.cmake @@ -169,9 +169,7 @@ if (onnxruntime_ENABLE_LAZY_TENSOR) endif() endif() -target_link_libraries(onnxruntime_pybind11_state PRIVATE - onnxruntime_session - ${onnxruntime_libs} +set(onnxruntime_pybind11_state_static_providers ${PROVIDERS_NNAPI} ${PROVIDERS_VSINPU} ${PROVIDERS_XNNPACK} @@ -183,7 +181,16 @@ target_link_libraries(onnxruntime_pybind11_state PRIVATE ${PROVIDERS_XNNPACK} ${PROVIDERS_WEBGPU} ${PROVIDERS_AZURE} - ${PROVIDERS_QNN} +) + +if(onnxruntime_BUILD_QNN_EP_STATIC_LIB) + list(APPEND onnxruntime_pybind11_state_static_providers PRIVATE onnxruntime_providers_qnn) +endif() + +target_link_libraries(onnxruntime_pybind11_state PRIVATE + onnxruntime_session + ${onnxruntime_libs} + ${onnxruntime_pybind11_state_static_providers} onnxruntime_optimizer onnxruntime_providers onnxruntime_util @@ -1000,6 +1007,16 @@ if (onnxruntime_USE_COREML) endif() if (onnxruntime_USE_QNN) + if(NOT onnxruntime_BUILD_QNN_EP_STATIC_LIB) + add_custom_command( + TARGET onnxruntime_pybind11_state POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + $ + $ + $/onnxruntime/capi/ + ) + endif() + add_custom_command( TARGET onnxruntime_pybind11_state POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 7c1b9ddc1548f..c727f4b7e381b 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -619,16 +619,13 @@ if(onnxruntime_USE_ARMNN) list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_armnn) endif() -set(ONNXRUNTIME_TEST_LIBS - onnxruntime_session - ${ONNXRUNTIME_INTEROP_TEST_LIBS} - ${onnxruntime_libs} - # CUDA, ROCM, TENSORRT, MIGRAPHX, DNNL, and OpenVINO are dynamically loaded at runtime +set(ONNXRUNTIME_TEST_STATIC_PROVIDER_LIBS + # CUDA, ROCM, TENSORRT, MIGRAPHX, DNNL, and OpenVINO are dynamically loaded at runtime. + # QNN EP can be built as either a dynamic or a static library. 
${PROVIDERS_NNAPI} ${PROVIDERS_VSINPU} ${PROVIDERS_JS} ${PROVIDERS_WEBGPU} - ${PROVIDERS_QNN} ${PROVIDERS_SNPE} ${PROVIDERS_RKNPU} ${PROVIDERS_DML} @@ -637,6 +634,17 @@ set(ONNXRUNTIME_TEST_LIBS ${PROVIDERS_COREML} ${PROVIDERS_XNNPACK} ${PROVIDERS_AZURE} +) + +if (onnxruntime_BUILD_QNN_EP_STATIC_LIB) + list(APPEND ONNXRUNTIME_TEST_STATIC_PROVIDER_LIBS onnxruntime_providers_qnn) +endif() + +set(ONNXRUNTIME_TEST_LIBS + onnxruntime_session + ${ONNXRUNTIME_INTEROP_TEST_LIBS} + ${onnxruntime_libs} + ${ONNXRUNTIME_TEST_STATIC_PROVIDER_LIBS} onnxruntime_optimizer onnxruntime_providers onnxruntime_util @@ -700,7 +708,9 @@ if(onnxruntime_USE_QNN AND NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_RED list(APPEND onnxruntime_test_framework_src_patterns ${TEST_SRC_DIR}/providers/qnn/*) list(APPEND onnxruntime_test_framework_libs onnxruntime_providers_qnn) list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_qnn) - list(APPEND onnxruntime_test_providers_libs onnxruntime_providers_qnn) + if(NOT onnxruntime_BUILD_QNN_EP_STATIC_LIB) + list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_shared) + endif() endif() if(onnxruntime_USE_SNPE) diff --git a/cmake/patches/coremltools/crossplatformbuild.patch b/cmake/patches/coremltools/crossplatformbuild.patch index 7f2268f50c82e..832191b366d4d 100644 --- a/cmake/patches/coremltools/crossplatformbuild.patch +++ b/cmake/patches/coremltools/crossplatformbuild.patch @@ -3,7 +3,7 @@ index adc7bfcf..7b2bf9cc 100644 --- a/mlmodel/src/MILBlob/Blob/FileWriter.cpp +++ b/mlmodel/src/MILBlob/Blob/FileWriter.cpp @@ -8,8 +8,12 @@ - + #include #include + @@ -12,17 +12,31 @@ index adc7bfcf..7b2bf9cc 100644 #include #include +#endif - + using namespace MILBlob; using namespace MILBlob::Blob; +diff --git a/mlmodel/src/MILBlob/Blob/FileWriter.hpp b/mlmodel/src/MILBlob/Blob/FileWriter.hpp +index 2bc99403..49239513 100644 +--- a/mlmodel/src/MILBlob/Blob/FileWriter.hpp ++++ b/mlmodel/src/MILBlob/Blob/FileWriter.hpp +@@ -6,7 +6,8 @@ + #pragma once + + #include "MILBlob/Util/Span.hpp" +- ++// ORT_EDIT: add missing header ++#include + #include + #include + #include diff --git a/mlmodel/src/MILBlob/Fp16.cpp b/mlmodel/src/MILBlob/Fp16.cpp index ae1e71a1..77a7161f 100644 --- a/mlmodel/src/MILBlob/Fp16.cpp +++ b/mlmodel/src/MILBlob/Fp16.cpp @@ -5,6 +5,8 @@ - + #include "MILBlob/Fp16.hpp" - + +// ORT_EDIT: Exclude clang specific pragmas from other builds +#if defined(__clang__) // fp16 lib code has some conversion warnings we don't want to globally ignore @@ -35,11 +49,11 @@ index ae1e71a1..77a7161f 100644 +#else +#include "fp16/fp16.h" +#endif - + using namespace MILBlob; - + diff --git a/modelpackage/src/ModelPackage.cpp b/modelpackage/src/ModelPackage.cpp -index 8fee56b9..99e0d8d6 100644 +index 8fee56b9..5508e316 100644 --- a/modelpackage/src/ModelPackage.cpp +++ b/modelpackage/src/ModelPackage.cpp @@ -26,7 +26,14 @@ namespace std { @@ -55,22 +69,22 @@ index 8fee56b9..99e0d8d6 100644 #include +#endif #include - + #if defined(__cplusplus) @@ -187,7 +194,10 @@ public: ModelPackageItemInfo createFile(const std::string& name, const std::string& author, const std::string& description); }; - + +// ORT_EDIT: pragma only available on APPLE platforms +#if defined(__APPLE__) #pragma mark ModelPackageImpl +#endif - + ModelPackageImpl::ModelPackageImpl(const std::filesystem::path& path, bool createIfNecessary, bool readOnly) : m_packagePath(path), @@ -372,6 +382,20 @@ std::filesystem::path ModelPackageImpl::getItemPath(const std::string& name, con } - + 
std::string ModelPackageImpl::generateIdentifier() const { +// ORT_EDIT: Use built-in UUID generation on Windows +#if defined(_WIN32) @@ -87,20 +101,20 @@ index 8fee56b9..99e0d8d6 100644 + return uuidStrCpp; +#else uuid_t uuid; - + // uuid_unparse generates a 36-character null-terminated string (37 bytes). @@ -383,6 +407,7 @@ std::string ModelPackageImpl::generateIdentifier() const { uuid_unparse(uuid, buf); - + return std::string(buf); +#endif } - + ModelPackageItemInfo ModelPackageImpl::createFile(const std::string& name, const std::string& author, const std::string& description) { -@@ -468,7 +493,13 @@ std::shared_ptr ModelPackageImpl::findItem(const std::stri +@@ -468,7 +493,14 @@ std::shared_ptr ModelPackageImpl::findItem(const std::stri auto author = itemInfoEntry->getString(kModelPackageItemInfoAuthorKey); auto description = itemInfoEntry->getString(kModelPackageItemInfoDescriptionKey); - + +// ORT_EDIT: need to use path.string() on Windows +#if defined(_WIN32) + return std::make_shared(std::make_shared(identifier, path.string(), name, author, description)); @@ -108,12 +122,13 @@ index 8fee56b9..99e0d8d6 100644 +#else return std::make_shared(std::make_shared(identifier, path, name, author, description)); +#endif ++ } - + std::shared_ptr ModelPackageImpl::findItem(const std::string& name, const std::string& author) const -@@ -514,7 +545,9 @@ void ModelPackageImpl::removeItem(const std::string& identifier) +@@ -514,7 +546,9 @@ void ModelPackageImpl::removeItem(const std::string& identifier) } - + auto path = m_packageDataDirPath / itemInfoEntry->getString(kModelPackageItemInfoPathKey); - if (0 != std::remove(path.c_str())) { + // ORT_EDIT: std::remove doesn't work on Windows. Use std::filesystem::remove instead. @@ -121,8 +136,8 @@ index 8fee56b9..99e0d8d6 100644 + if (!std::filesystem::remove(path)) { throw std::runtime_error("Failed to remove file at path: " + path.string()); } - -@@ -525,13 +558,16 @@ bool ModelPackageImpl::isValid(const std::filesystem::path& path) + +@@ -525,13 +559,16 @@ bool ModelPackageImpl::isValid(const std::filesystem::path& path) { try { ModelPackageImpl(path, false, true); @@ -132,16 +147,16 @@ index 8fee56b9..99e0d8d6 100644 } return true; } - + +// ORT_EDIT: pragma only available on APPLE platforms +#if defined(__APPLE__) #pragma mark ModelPackage +#endif - + ModelPackage::ModelPackage(const std::string& packagePath, bool createIfNecessary, bool readOnly) : m_modelPackageImpl(std::make_shared(packagePath, createIfNecessary, readOnly)) -@@ -544,7 +580,12 @@ ModelPackage::~ModelPackage() - +@@ -544,7 +581,12 @@ ModelPackage::~ModelPackage() + std::string ModelPackage::path() const { +// ORT_EDIT: Windows doesn't automatically convert to std::string as the native format could be char or wchar. 
@@ -151,5 +166,19 @@ index 8fee56b9..99e0d8d6 100644 return m_modelPackageImpl->path(); +#endif } - + std::string ModelPackage::setRootModel(const std::string& path, const std::string& name, const std::string& author, const std::string& description) +diff --git a/modelpackage/src/utils/JsonMap.hpp b/modelpackage/src/utils/JsonMap.hpp +index 0d7dc3f4..b700cfd5 100644 +--- a/modelpackage/src/utils/JsonMap.hpp ++++ b/modelpackage/src/utils/JsonMap.hpp +@@ -10,7 +10,8 @@ + #include + #include + #include +- ++// ORT_EDIT: add missing header ++#include + class JsonMapImpl; + + class JsonMap { diff --git a/cmake/patches/dawn/dawn.patch b/cmake/patches/dawn/dawn.patch deleted file mode 100644 index 868db0c43e3a5..0000000000000 --- a/cmake/patches/dawn/dawn.patch +++ /dev/null @@ -1,118 +0,0 @@ -diff --git a/src/emdawnwebgpu/CMakeLists.txt b/src/emdawnwebgpu/CMakeLists.txt -index de673537d3..c98dc46de7 100644 ---- a/src/emdawnwebgpu/CMakeLists.txt -+++ b/src/emdawnwebgpu/CMakeLists.txt -@@ -78,6 +78,7 @@ if (${DAWN_ENABLE_EMSCRIPTEN}) - endif() - - set(ARGS -+ ${Python3_EXECUTABLE} - "${DAWN_EMSCRIPTEN_TOOLCHAIN}/tools/maint/gen_struct_info.py" - -q - "${EM_BUILD_GEN_DIR}/struct_info_webgpu.json" -diff --git a/third_party/emdawnwebgpu/library_webgpu.js b/third_party/emdawnwebgpu/library_webgpu.js -index d1835cb090..df03ea2f94 100644 ---- a/third_party/emdawnwebgpu/library_webgpu.js -+++ b/third_party/emdawnwebgpu/library_webgpu.js -@@ -16,10 +16,19 @@ - throw new Error("To use Dawn's library_webgpu.js, disable -sUSE_WEBGPU and first include Dawn's library_webgpu_struct_info.js and library_webgpu_enum_tables.js (before library_webgpu.js)"); - } - -+ if (MEMORY64) { -+ throw new Error("The current implementation of Dawn's library_webgpu.js does not support MEMORY64 yet"); -+ } -+ - // Helper functions for code generation - globalThis.gpu = { -- convertSentinelToUndefined: function(name) { -- return `if (${name} == -1) ${name} = undefined;`; -+ convertSentinelToUndefined: function(name, isPtr = false) { -+ // When `CAN_ADDRESS_2GB` is true, value `-1` is normalized to `0xFFFFFFFF` for pointer. 
-+ if (CAN_ADDRESS_2GB && isPtr) { -+ return `if (${name} == 0xFFFFFFFF) ${name} = undefined;`; -+ } else { -+ return `if (${name} == -1) ${name} = undefined;`; -+ } - }, - - makeGetBool: function(struct, offset) { -@@ -700,6 +709,7 @@ var LibraryWebGPU = { - {{{ makeSetValue('info', C_STRUCTS.WGPUAdapterInfo.adapterType, 'adapterType', 'i32') }}}; - {{{ makeSetValue('info', C_STRUCTS.WGPUAdapterInfo.vendorID, '0', 'i32') }}}; - {{{ makeSetValue('info', C_STRUCTS.WGPUAdapterInfo.deviceID, '0', 'i32') }}}; -+ return 1; - }, - - wgpuAdapterGetLimits: (adapterPtr, limitsOutPtr) => { -@@ -882,7 +892,7 @@ var LibraryWebGPU = { - - if (size === 0) warnOnce('getMappedRange size=0 no longer means WGPU_WHOLE_MAP_SIZE'); - -- {{{ gpu.convertSentinelToUndefined('size') }}} -+ {{{ gpu.convertSentinelToUndefined('size', true) }}} - - var mapped; - try { -@@ -909,7 +919,7 @@ var LibraryWebGPU = { - - if (size === 0) warnOnce('getMappedRange size=0 no longer means WGPU_WHOLE_MAP_SIZE'); - -- {{{ gpu.convertSentinelToUndefined('size') }}} -+ {{{ gpu.convertSentinelToUndefined('size', true) }}} - - var mapped; - try { -@@ -950,7 +960,7 @@ var LibraryWebGPU = { - var buffer = WebGPU.getJsObject(bufferPtr); - WebGPU.Internals.bufferOnUnmaps[bufferPtr] = []; - -- {{{ gpu.convertSentinelToUndefined('size') }}} -+ {{{ gpu.convertSentinelToUndefined('size', true) }}} - - {{{ runtimeKeepalivePush() }}} - WebGPU.Internals.futureInsert(futureId, buffer.mapAsync(mode, offset, size).then(() => { -@@ -1145,7 +1155,7 @@ var LibraryWebGPU = { - - wgpuCommandEncoderClearBuffer: (encoderPtr, bufferPtr, offset, size) => { - var commandEncoder = WebGPU.getJsObject(encoderPtr); -- {{{ gpu.convertSentinelToUndefined('size') }}} -+ {{{ gpu.convertSentinelToUndefined('size', true) }}} - - var buffer = WebGPU.getJsObject(bufferPtr); - commandEncoder.clearBuffer(buffer, offset, size); -@@ -2103,7 +2113,7 @@ var LibraryWebGPU = { - wgpuRenderBundleEncoderSetIndexBuffer: (passPtr, bufferPtr, format, offset, size) => { - var pass = WebGPU.getJsObject(passPtr); - var buffer = WebGPU.getJsObject(bufferPtr); -- {{{ gpu.convertSentinelToUndefined('size') }}} -+ {{{ gpu.convertSentinelToUndefined('size', true) }}} - pass.setIndexBuffer(buffer, WebGPU.IndexFormat[format], offset, size); - }, - -@@ -2116,7 +2126,7 @@ var LibraryWebGPU = { - wgpuRenderBundleEncoderSetVertexBuffer: (passPtr, slot, bufferPtr, offset, size) => { - var pass = WebGPU.getJsObject(passPtr); - var buffer = WebGPU.getJsObject(bufferPtr); -- {{{ gpu.convertSentinelToUndefined('size') }}} -+ {{{ gpu.convertSentinelToUndefined('size', true) }}} - pass.setVertexBuffer(slot, buffer, offset, size); - }, - -@@ -2211,7 +2221,7 @@ var LibraryWebGPU = { - wgpuRenderPassEncoderSetIndexBuffer: (passPtr, bufferPtr, format, offset, size) => { - var pass = WebGPU.getJsObject(passPtr); - var buffer = WebGPU.getJsObject(bufferPtr); -- {{{ gpu.convertSentinelToUndefined('size') }}} -+ {{{ gpu.convertSentinelToUndefined('size', true) }}} - pass.setIndexBuffer(buffer, WebGPU.IndexFormat[format], offset, size); - }, - -@@ -2234,7 +2244,7 @@ var LibraryWebGPU = { - wgpuRenderPassEncoderSetVertexBuffer: (passPtr, slot, bufferPtr, offset, size) => { - var pass = WebGPU.getJsObject(passPtr); - var buffer = WebGPU.getJsObject(bufferPtr); -- {{{ gpu.convertSentinelToUndefined('size') }}} -+ {{{ gpu.convertSentinelToUndefined('size', true) }}} - pass.setVertexBuffer(slot, buffer, offset, size); - }, - diff --git a/cmake/vcpkg-ports/cpuinfo/portfile.cmake 
b/cmake/vcpkg-ports/cpuinfo/portfile.cmake new file mode 100644 index 0000000000000..e61308bf643b4 --- /dev/null +++ b/cmake/vcpkg-ports/cpuinfo/portfile.cmake @@ -0,0 +1,63 @@ +# On Windows, we can get a cpuinfo.dll, but it exports no symbols. +if(VCPKG_TARGET_IS_WINDOWS) + vcpkg_check_linkage(ONLY_STATIC_LIBRARY) +endif() + +vcpkg_from_github( + OUT_SOURCE_PATH SOURCE_PATH + REPO pytorch/cpuinfo + REF 8a1772a0c5c447df2d18edf33ec4603a8c9c04a6 + SHA512 b94ccbfa886221d6bb16513d074675af0a72928a9dd9485dcacdc1124a8a60aacbbe91913a1579e766dfb024f0be1d52eeead40342004ff0238a8b94a095ed08 + HEAD_REF master +) + +vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS + FEATURES + tools CPUINFO_BUILD_TOOLS +) + +set(LINK_OPTIONS "") +if(VCPKG_LIBRARY_LINKAGE STREQUAL "dynamic") + list(APPEND LINK_OPTIONS -DCPUINFO_LIBRARY_TYPE=shared) +else() + list(APPEND LINK_OPTIONS -DCPUINFO_LIBRARY_TYPE=static) +endif() + +if(VCPKG_CRT_LINKAGE STREQUAL "dynamic") + list(APPEND LINK_OPTIONS -DCPUINFO_RUNTIME_TYPE=shared) +else() + list(APPEND LINK_OPTIONS -DCPUINFO_RUNTIME_TYPE=static) +endif() + +vcpkg_cmake_configure( + SOURCE_PATH "${SOURCE_PATH}" + OPTIONS + ${FEATURE_OPTIONS} + ${LINK_OPTIONS} + -DCPUINFO_BUILD_UNIT_TESTS=OFF + -DCPUINFO_BUILD_MOCK_TESTS=OFF + -DCPUINFO_BUILD_BENCHMARKS=OFF + OPTIONS_DEBUG + -DCPUINFO_LOG_LEVEL=debug + OPTIONS_RELEASE + -DCPUINFO_LOG_LEVEL=default +) +vcpkg_cmake_install() +vcpkg_cmake_config_fixup() +vcpkg_copy_pdbs() +vcpkg_fixup_pkgconfig() # pkg_check_modules(libcpuinfo) + +file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/include") + +if("tools" IN_LIST FEATURES) + set(additional_tools "") + if(EXISTS "${CURRENT_PACKAGES_DIR}/bin/cpuid-dump${VCPKG_TARGET_EXECUTABLE_SUFFIX}") + list(APPEND additional_tools "cpuid-dump") + endif() + vcpkg_copy_tools( + TOOL_NAMES cache-info cpu-info isa-info ${additional_tools} + AUTO_CLEAN + ) +endif() + +file(INSTALL "${SOURCE_PATH}/LICENSE" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}" RENAME copyright) diff --git a/cmake/vcpkg-ports/cpuinfo/vcpkg.json b/cmake/vcpkg-ports/cpuinfo/vcpkg.json new file mode 100644 index 0000000000000..ce93591dba5ac --- /dev/null +++ b/cmake/vcpkg-ports/cpuinfo/vcpkg.json @@ -0,0 +1,25 @@ +{ + "name": "cpuinfo", + "version-date": "2024-12-09", + "port-version": 3, + "description": "CPU INFOrmation library (x86/x86-64/ARM/ARM64, Linux/Windows/Android/macOS/iOS)", + "homepage": "https://github.com/pytorch/cpuinfo", + "license": "BSD-2-Clause", + "supports": "!(uwp & arm32)", + "dependencies": [ + { + "name": "vcpkg-cmake", + "host": true + }, + { + "name": "vcpkg-cmake-config", + "host": true + } + ], + "features": { + "tools": { + "description": "Build cpuinfo command-line tools", + "supports": "!uwp" + } + } +} diff --git a/cmake/vcpkg-ports/onnx/fix-cmakelists.patch b/cmake/vcpkg-ports/onnx/fix-cmakelists.patch new file mode 100644 index 0000000000000..f8d300103ab20 --- /dev/null +++ b/cmake/vcpkg-ports/onnx/fix-cmakelists.patch @@ -0,0 +1,67 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 4dd56b6..2ff3e29 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -65,6 +65,27 @@ endif() + + include(GNUInstallDirs) + ++# install protobuf files ++install(FILES ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx-data.proto ++ ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx-data.proto3 ++ ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx-ml.proto ++ ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx-ml.proto3 ++ ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx-operators-ml.proto ++ ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx-operators-ml.proto3 ++ DESTINATION 
${CMAKE_INSTALL_INCLUDEDIR}/onnx ++) ++# install python files ++if(BUILD_ONNX_PYTHON) ++ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx_data_pb.py ++ ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx_data_pb2.py ++ ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx_ml_pb2.py ++ ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx_operators_ml_pb2.py ++ ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx_operators_pb.py ++ ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx_pb.py ++ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnx ++ ) ++endif() ++ + set(ONNX_ROOT ${PROJECT_SOURCE_DIR}) + + # Read ONNX version +@@ -116,7 +137,8 @@ endif() + # find_package Python has replaced PythonInterp and PythonLibs since cmake 3.12 + # Use the following command in the future; now this is only compatible with the latest pybind11 + # find_package(Python ${PY_VERSION} COMPONENTS Interpreter Development REQUIRED) +-find_package(PythonInterp ${PY_VERSION} REQUIRED) ++find_package(Python3 ${PY_VERSION} COMPONENTS Interpreter REQUIRED) ++set(PYTHON_EXECUTABLE ${Python3_EXECUTABLE}) + if(BUILD_ONNX_PYTHON) + find_package(PythonLibs ${PY_VERSION}) + endif() +@@ -434,6 +456,7 @@ target_link_libraries(onnx PUBLIC onnx_proto) + add_onnx_global_defines(onnx) + + if(BUILD_ONNX_PYTHON) ++ find_package(Python3 ${PY_VERSION} COMPONENTS Development REQUIRED) + if("${PY_EXT_SUFFIX}" STREQUAL "") + if(MSVC) + set(PY_EXT_SUFFIX ".pyd") +@@ -452,10 +475,14 @@ if(BUILD_ONNX_PYTHON) + target_include_directories(onnx_cpp2py_export PRIVATE + $ + $ +- $) ++ ${Python3_INCLUDE_DIRS}) ++ target_link_directories(onnx_cpp2py_export PRIVATE ++ ${Python3_LIBRARY_DIRS}) ++ target_link_libraries(onnx_cpp2py_export PRIVATE ++ ${Python3_LIBRARIES}) + + # pybind11 is a header only lib +- find_package(pybind11 2.2 CONFIG) ++ find_package(pybind11 2.2 CONFIG REQUIRED) + if(NOT pybind11_FOUND) + if(EXISTS "${ONNX_ROOT}/third_party/pybind11/include/pybind11/pybind11.h") + add_subdirectory("${ONNX_ROOT}/third_party/pybind11") diff --git a/cmake/vcpkg-ports/onnx/fix-dependency-protobuf.patch b/cmake/vcpkg-ports/onnx/fix-dependency-protobuf.patch new file mode 100644 index 0000000000000..c435922d0103d --- /dev/null +++ b/cmake/vcpkg-ports/onnx/fix-dependency-protobuf.patch @@ -0,0 +1,28 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index d81ac1d..9f97998 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -149,6 +149,7 @@ if(ONNX_BUILD_TESTS) + set(googletest_STATIC_LIBRARIES GTest::gtest) + endif() + ++find_package(protobuf CONFIG REQUIRED) + if((ONNX_USE_LITE_PROTO AND TARGET protobuf::libprotobuf-lite) OR ((NOT ONNX_USE_LITE_PROTO) AND TARGET protobuf::libprotobuf)) + # Sometimes we need to use protoc compiled for host architecture while linking + # libprotobuf against target architecture. 
See https://github.com/caffe2/caffe +diff --git a/cmake/ONNXConfig.cmake.in b/cmake/ONNXConfig.cmake.in +index d588f8a..dbd4398 100644 +--- a/cmake/ONNXConfig.cmake.in ++++ b/cmake/ONNXConfig.cmake.in +@@ -6,9 +6,8 @@ + # library version information + set(ONNX_VERSION "@ONNX_VERSION@") + +-list(APPEND CMAKE_PREFIX_PATH "@PROTOBUF_DIR@") +-set(Protobuf_INCLUDE_DIR "@PROTOBUF_INCLUDE_DIR@") +-find_package(Protobuf REQUIRED) ++include(CMakeFindDependencyMacro) ++find_dependency(protobuf CONFIG) + + # import targets + include ("${CMAKE_CURRENT_LIST_DIR}/ONNXTargets.cmake") diff --git a/cmake/vcpkg-ports/onnx/portfile.cmake b/cmake/vcpkg-ports/onnx/portfile.cmake new file mode 100644 index 0000000000000..a0c997803870d --- /dev/null +++ b/cmake/vcpkg-ports/onnx/portfile.cmake @@ -0,0 +1,83 @@ +vcpkg_check_linkage(ONLY_STATIC_LIBRARY) + +vcpkg_from_github( + OUT_SOURCE_PATH SOURCE_PATH + REPO onnx/onnx + REF "v${VERSION}" + SHA512 5a18e2b19ec9c18c8b115fb7e12ed98eddaa581c95f15c4dd420cd6c86e7caa04f9a393da589e76b89cf9b3544abd3749a8c77c2446782f37502eb74e9b1f661 + PATCHES + fix-cmakelists.patch + fix-dependency-protobuf.patch +) + +string(COMPARE EQUAL "${VCPKG_CRT_LINKAGE}" "static" USE_STATIC_RUNTIME) + +# ONNX_USE_PROTOBUF_SHARED_LIBS: find the library and check its file extension +find_library(PROTOBUF_LIBPATH NAMES protobuf PATHS "${CURRENT_INSTALLED_DIR}/bin" "${CURRENT_INSTALLED_DIR}/lib" REQUIRED) +get_filename_component(PROTOBUF_LIBNAME "${PROTOBUF_LIBPATH}" NAME) + +set(USE_PROTOBUF_SHARED OFF) + + + +# Like protoc, python is required for codegen. +vcpkg_find_acquire_program(PYTHON3) + +# PATH for .bat scripts so it can find 'python' +get_filename_component(PYTHON_DIR "${PYTHON3}" PATH) +vcpkg_add_to_path(PREPEND "${PYTHON_DIR}") + +vcpkg_cmake_configure( + SOURCE_PATH "${SOURCE_PATH}" + OPTIONS + ${FEATURE_OPTIONS} + -DPython3_EXECUTABLE=${PYTHON3} + -DONNX_ML=ON + -DONNX_GEN_PB_TYPE_STUBS=ON + -DONNX_USE_PROTOBUF_SHARED_LIBS=${USE_PROTOBUF_SHARED} + -DONNX_USE_LITE_PROTO=OFF + -DONNX_USE_MSVC_STATIC_RUNTIME=${USE_STATIC_RUNTIME} + -DONNX_BUILD_TESTS=OFF + -DONNX_BUILD_BENCHMARKS=OFF + -DONNX_DISABLE_STATIC_REGISTRATION=ON + MAYBE_UNUSED_VARIABLES + ONNX_USE_MSVC_STATIC_RUNTIME +) + +vcpkg_cmake_install() +vcpkg_cmake_config_fixup(CONFIG_PATH lib/cmake/ONNX) + +vcpkg_install_copyright(FILE_LIST "${SOURCE_PATH}/LICENSE") + +file(REMOVE_RECURSE + "${CURRENT_PACKAGES_DIR}/debug/include" + "${CURRENT_PACKAGES_DIR}/debug/share" + # the others are empty + "${CURRENT_PACKAGES_DIR}/include/onnx/backend" + "${CURRENT_PACKAGES_DIR}/include/onnx/bin" + "${CURRENT_PACKAGES_DIR}/include/onnx/defs/controlflow" + "${CURRENT_PACKAGES_DIR}/include/onnx/defs/generator" + "${CURRENT_PACKAGES_DIR}/include/onnx/defs/image" + "${CURRENT_PACKAGES_DIR}/include/onnx/defs/logical" + "${CURRENT_PACKAGES_DIR}/include/onnx/defs/math" + "${CURRENT_PACKAGES_DIR}/include/onnx/defs/nn" + "${CURRENT_PACKAGES_DIR}/include/onnx/defs/object_detection" + "${CURRENT_PACKAGES_DIR}/include/onnx/defs/optional" + "${CURRENT_PACKAGES_DIR}/include/onnx/defs/quantization" + "${CURRENT_PACKAGES_DIR}/include/onnx/defs/reduction" + "${CURRENT_PACKAGES_DIR}/include/onnx/defs/rnn" + "${CURRENT_PACKAGES_DIR}/include/onnx/defs/sequence" + "${CURRENT_PACKAGES_DIR}/include/onnx/defs/text" + "${CURRENT_PACKAGES_DIR}/include/onnx/defs/traditionalml" + "${CURRENT_PACKAGES_DIR}/include/onnx/defs/training" + "${CURRENT_PACKAGES_DIR}/include/onnx/examples" + "${CURRENT_PACKAGES_DIR}/include/onnx/frontend" + 
"${CURRENT_PACKAGES_DIR}/include/onnx/onnx_cpp2py_export" + "${CURRENT_PACKAGES_DIR}/include/onnx/test" + "${CURRENT_PACKAGES_DIR}/include/onnx/tools" + "${CURRENT_PACKAGES_DIR}/include/onnx/onnx_ml" + "${CURRENT_PACKAGES_DIR}/include/onnx/onnx_data" + "${CURRENT_PACKAGES_DIR}/include/onnx/onnx_operators_ml" + "${CURRENT_PACKAGES_DIR}/include/onnx/reference/ops" + "${CURRENT_PACKAGES_DIR}/include/onnx/reference" +) diff --git a/cmake/vcpkg-ports/onnx/vcpkg.json b/cmake/vcpkg-ports/onnx/vcpkg.json new file mode 100644 index 0000000000000..7d2bbd84c05b3 --- /dev/null +++ b/cmake/vcpkg-ports/onnx/vcpkg.json @@ -0,0 +1,23 @@ +{ + "name": "onnx", + "version-semver": "1.17.0", + "description": "Open standard for machine learning interoperability", + "homepage": "https://onnx.ai", + "license": "Apache-2.0", + "supports": "!uwp", + "dependencies": [ + "protobuf", + { + "name": "protobuf", + "host": true + }, + { + "name": "vcpkg-cmake", + "host": true + }, + { + "name": "vcpkg-cmake-config", + "host": true + } + ] +} diff --git a/cmake/vcpkg-ports/pthreadpool/fix-cmakelists.patch b/cmake/vcpkg-ports/pthreadpool/fix-cmakelists.patch new file mode 100644 index 0000000000000..97fd1ac7a2bb1 --- /dev/null +++ b/cmake/vcpkg-ports/pthreadpool/fix-cmakelists.patch @@ -0,0 +1,82 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index f06aada..3c6c6e2 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -31,8 +31,6 @@ IF(CCACHE_BINARY) + ENDIF() + + # ---[ Options. +-SET(PTHREADPOOL_LIBRARY_TYPE "default" CACHE STRING "Type of library (shared, static, or default) to build") +-SET_PROPERTY(CACHE PTHREADPOOL_LIBRARY_TYPE PROPERTY STRINGS default static shared) + OPTION(PTHREADPOOL_ALLOW_DEPRECATED_API "Enable deprecated API functions" ON) + SET(PTHREADPOOL_SYNC_PRIMITIVE "default" CACHE STRING "Synchronization primitive (condvar, futex, gcd, event, or default) for worker threads") + SET_PROPERTY(CACHE PTHREADPOOL_SYNC_PRIMITIVE PROPERTY STRINGS default condvar futex gcd event) +@@ -41,7 +39,7 @@ IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^(i[3-6]86|AMD64|x86(_64)?)$") + ELSE() + OPTION(PTHREADPOOL_ENABLE_FASTPATH "Enable fast path using atomic decrement instead of atomic compare-and-swap" OFF) + ENDIF() +-IF("${CMAKE_SOURCE_DIR}" STREQUAL "${PROJECT_SOURCE_DIR}") ++IF(FALSE) + OPTION(PTHREADPOOL_BUILD_TESTS "Build pthreadpool unit tests" ON) + OPTION(PTHREADPOOL_BUILD_BENCHMARKS "Build pthreadpool micro-benchmarks" ON) + ELSE() +@@ -67,7 +65,8 @@ MACRO(PTHREADPOOL_TARGET_ENABLE_CXX11 target) + ENDMACRO() + + # ---[ Download deps +-IF(NOT DEFINED FXDIV_SOURCE_DIR) ++find_path(FXDIV_INCLUDE_DIRS "fxdiv.h") ++IF(FALSE) + MESSAGE(STATUS "Downloading FXdiv to ${CMAKE_BINARY_DIR}/FXdiv-source (define FXDIV_SOURCE_DIR to avoid it)") + CONFIGURE_FILE(cmake/DownloadFXdiv.cmake "${CMAKE_BINARY_DIR}/FXdiv-download/CMakeLists.txt") + EXECUTE_PROCESS(COMMAND "${CMAKE_COMMAND}" -G "${CMAKE_GENERATOR}" . 
+@@ -118,21 +117,13 @@ ELSE() + ENDIF() + + ADD_LIBRARY(pthreadpool_interface INTERFACE) +-TARGET_INCLUDE_DIRECTORIES(pthreadpool_interface INTERFACE include) ++TARGET_INCLUDE_DIRECTORIES(pthreadpool_interface INTERFACE $ $) + IF(NOT PTHREADPOOL_ALLOW_DEPRECATED_API) + TARGET_COMPILE_DEFINITIONS(pthreadpool_interface INTERFACE PTHREADPOOL_NO_DEPRECATED_API=1) + ENDIF() + INSTALL(FILES include/pthreadpool.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + +-IF(PTHREADPOOL_LIBRARY_TYPE STREQUAL "default") +- ADD_LIBRARY(pthreadpool ${PTHREADPOOL_SRCS}) +-ELSEIF(PTHREADPOOL_LIBRARY_TYPE STREQUAL "shared") +- ADD_LIBRARY(pthreadpool SHARED ${PTHREADPOOL_SRCS}) +-ELSEIF(PTHREADPOOL_LIBRARY_TYPE STREQUAL "static") +- ADD_LIBRARY(pthreadpool STATIC ${PTHREADPOOL_SRCS}) +-ELSE() +- MESSAGE(FATAL_ERROR "Unsupported library type ${PTHREADPOOL_LIBRARY_TYPE}") +-ENDIF() ++ADD_LIBRARY(pthreadpool ${PTHREADPOOL_SRCS}) + + IF(PTHREADPOOL_SYNC_PRIMITIVE STREQUAL "condvar") + TARGET_COMPILE_DEFINITIONS(pthreadpool PRIVATE PTHREADPOOL_USE_FUTEX=0) +@@ -181,18 +172,22 @@ IF(CMAKE_SYSTEM_NAME STREQUAL "Linux") + ENDIF() + + # ---[ Configure FXdiv +-IF(NOT TARGET fxdiv) ++IF(FALSE) + SET(FXDIV_BUILD_TESTS OFF CACHE BOOL "") + SET(FXDIV_BUILD_BENCHMARKS OFF CACHE BOOL "") + ADD_SUBDIRECTORY( + "${FXDIV_SOURCE_DIR}" + "${CMAKE_BINARY_DIR}/FXdiv") + ENDIF() +-TARGET_LINK_LIBRARIES(pthreadpool PRIVATE fxdiv) ++TARGET_INCLUDE_DIRECTORIES(pthreadpool PRIVATE ${FXDIV_INCLUDE_DIRS}) + +-INSTALL(TARGETS pthreadpool ++INSTALL(TARGETS pthreadpool pthreadpool_interface ++ EXPORT unofficial-pthreadpool-config ++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) ++install(EXPORT unofficial-pthreadpool-config NAMESPACE unofficial:: ++ DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/unofficial-${PROJECT_NAME}) # share/unofficial-pthreadpool + + IF(PTHREADPOOL_BUILD_TESTS) + # ---[ Build google test diff --git a/cmake/vcpkg-ports/pthreadpool/portfile.cmake b/cmake/vcpkg-ports/pthreadpool/portfile.cmake new file mode 100644 index 0000000000000..9400e5e886639 --- /dev/null +++ b/cmake/vcpkg-ports/pthreadpool/portfile.cmake @@ -0,0 +1,25 @@ +if(VCPKG_TARGET_IS_WINDOWS) + vcpkg_check_linkage(ONLY_STATIC_LIBRARY) +endif() + +vcpkg_from_github( + OUT_SOURCE_PATH SOURCE_PATH + REPO google/pthreadpool + REF 4e80ca24521aa0fb3a746f9ea9c3eaa20e9afbb0 + SHA512 776017cc5d2aa94337292f2f4fbd54d099ef29abf736ab8147f07f98f12b7654cbd2fe38d34646a479a519c261ac253bbaf19c6dcbb0ec4cc0859de70f7e6472 + PATCHES + fix-cmakelists.patch +) + +vcpkg_cmake_configure( + SOURCE_PATH "${SOURCE_PATH}" + OPTIONS + -DPTHREADPOOL_BUILD_TESTS=OFF + -DPTHREADPOOL_BUILD_BENCHMARKS=OFF +) +vcpkg_cmake_install() +vcpkg_copy_pdbs() +vcpkg_cmake_config_fixup(PACKAGE_NAME unofficial-${PORT}) + +#file(INSTALL "${SOURCE_PATH}/LICENSE" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}" RENAME copyright) +file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/include") diff --git a/cmake/vcpkg-ports/pthreadpool/vcpkg.json b/cmake/vcpkg-ports/pthreadpool/vcpkg.json new file mode 100644 index 0000000000000..16c0bea5b712c --- /dev/null +++ b/cmake/vcpkg-ports/pthreadpool/vcpkg.json @@ -0,0 +1,17 @@ +{ + "name": "pthreadpool", + "version-date": "2024-12-17", + "description": "Portable (POSIX/Windows/Emscripten) thread pool for C/C++", + "homepage": "https://github.com/google/pthreadpool", + "dependencies": [ + "fxdiv", + { + "name": "vcpkg-cmake", + "host": true + }, + { + "name": "vcpkg-cmake-config", + 
"host": true + } + ] +} diff --git a/cmake/vcpkg-ports/xnnpack/disable_gcc_warning.patch b/cmake/vcpkg-ports/xnnpack/disable_gcc_warning.patch new file mode 100644 index 0000000000000..a7c5e0e254aa1 --- /dev/null +++ b/cmake/vcpkg-ports/xnnpack/disable_gcc_warning.patch @@ -0,0 +1,12 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 4a9fad59a..2713cded3 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -334,6 +334,7 @@ ENDIF() + IF(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + # Disable "note: parameter passing for argument of type ... changed/will change in ..." + ADD_COMPILE_OPTIONS("-Wno-psabi") ++ ADD_COMPILE_OPTIONS("-Wno-incompatible-pointer-types") + ENDIF() + + # ---[ Build flags diff --git a/cmake/vcpkg-ports/xnnpack/fix-build.patch b/cmake/vcpkg-ports/xnnpack/fix-build.patch new file mode 100644 index 0000000000000..b867377d2ff9e --- /dev/null +++ b/cmake/vcpkg-ports/xnnpack/fix-build.patch @@ -0,0 +1,71 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index f0b3410ae..ba54c3bfe 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -1047,9 +1047,11 @@ ENDIF() + IF(XNNPACK_BUILD_ALL_MICROKERNELS) + TARGET_INCLUDE_DIRECTORIES(microkernels-all PRIVATE include src) + ENDIF() ++ + TARGET_INCLUDE_DIRECTORIES(datatype PRIVATE include src) + TARGET_INCLUDE_DIRECTORIES(microkernels-prod PRIVATE include src) +-TARGET_INCLUDE_DIRECTORIES(hardware-config PRIVATE include src ${CPUINFO_SOURCE_DIR}/include) ++TARGET_INCLUDE_DIRECTORIES(hardware-config PRIVATE include src) ++ + TARGET_INCLUDE_DIRECTORIES(indirection PRIVATE include src) + TARGET_INCLUDE_DIRECTORIES(microparams-init PRIVATE include src) + TARGET_INCLUDE_DIRECTORIES(normalization PRIVATE include src) +@@ -1104,14 +1106,9 @@ IF(NOT TARGET cpuinfo) + "${CPUINFO_SOURCE_DIR}" + "${CMAKE_BINARY_DIR}/cpuinfo") + ELSE() +- ADD_LIBRARY(cpuinfo SHARED IMPORTED) +- FIND_LIBRARY(CPUINFO_LIBRARY cpuinfo PATHS "${CPUINFO_SOURCE_DIR}/lib") +- IF(NOT CPUINFO_LIBRARY) +- MESSAGE(FATAL_ERROR "Cannot find cpuinfo") +- ENDIF() +- TARGET_INCLUDE_DIRECTORIES(cpuinfo INTERFACE "${CPUINFO_SOURCE_DIR}/include") +- SET_PROPERTY(TARGET cpuinfo PROPERTY IMPORTED_LOCATION "${CPUINFO_LIBRARY}") +- SET_PROPERTY(TARGET cpuinfo PROPERTY IMPORTED_IMPLIB "${CPUINFO_LIBRARY}") ++ ADD_LIBRARY(cpuinfo INTERFACE) ++ FIND_PACKAGE(cpuinfo CONFIG REQUIRED) ++ TARGET_LINK_LIBRARIES(cpuinfo INTERFACE cpuinfo::cpuinfo) + ENDIF() + ENDIF() + IF(XNNPACK_BUILD_LIBRARY) +@@ -1129,16 +1126,12 @@ IF(NOT TARGET pthreadpool) + "${PTHREADPOOL_SOURCE_DIR}" + "${CMAKE_BINARY_DIR}/pthreadpool") + ELSE() ++ find_package(unofficial-pthreadpool CONFIG REQUIRED) + ADD_LIBRARY(pthreadpool SHARED IMPORTED) +- FIND_LIBRARY(PTHREADPOOL_LIBRARY pthreadpool PATHS "${PTHREADPOOL_SOURCE_DIR}/lib") +- IF(NOT PTHREADPOOL_LIBRARY) +- MESSAGE(FATAL_ERROR "Cannot find pthreadpool") +- ENDIF() ++ FIND_LIBRARY(PTHREADPOOL_LIBRARY NAMES pthreadpool REQUIRED) + FIND_PACKAGE(Threads REQUIRED) +- TARGET_INCLUDE_DIRECTORIES(pthreadpool INTERFACE "${PTHREADPOOL_SOURCE_DIR}/include") +- TARGET_LINK_LIBRARIES(pthreadpool INTERFACE Threads::Threads) ++ TARGET_LINK_LIBRARIES(pthreadpool INTERFACE Threads::Threads unofficial::pthreadpool unofficial::pthreadpool_interface) + SET_PROPERTY(TARGET pthreadpool PROPERTY IMPORTED_LOCATION "${PTHREADPOOL_LIBRARY}") +- SET_PROPERTY(TARGET pthreadpool PROPERTY IMPORTED_IMPLIB "${PTHREADPOOL_LIBRARY}") + ENDIF() + ENDIF() + TARGET_LINK_LIBRARIES(xnnpack-base INTERFACE pthreadpool) +@@ -1152,12 +1145,12 @@ IF(NOT TARGET fxdiv) + "${FXDIV_SOURCE_DIR}" + 
"${CMAKE_BINARY_DIR}/FXdiv") + ELSE() +- FIND_FILE(FXDIV_HDR fxdiv.h PATH_SUFFIXES include PATHS "${FXDIV_SOURCE_DIR}") ++ FIND_PATH(FXDIV_HDR fxdiv.h PATH_SUFFIXES include) + IF(NOT FXDIV_HDR) + MESSAGE(FATAL_ERROR "Cannot find fxdiv") + ENDIF() +- ADD_LIBRARY(fxdiv STATIC "${FXDIV_HDR}") +- TARGET_INCLUDE_DIRECTORIES(fxdiv INTERFACE "${FXDIV_SOURCE_DIR}/include") ++ ADD_LIBRARY(fxdiv INTERFACE IMPORTED) ++ target_include_directories(fxdiv INTERFACE "${FXDIV_HDR}") + SET_PROPERTY(TARGET fxdiv PROPERTY LINKER_LANGUAGE C) + ENDIF() + ENDIF() diff --git a/cmake/vcpkg-ports/xnnpack/portfile.cmake b/cmake/vcpkg-ports/xnnpack/portfile.cmake new file mode 100644 index 0000000000000..b07da3186b4b4 --- /dev/null +++ b/cmake/vcpkg-ports/xnnpack/portfile.cmake @@ -0,0 +1,39 @@ +if(VCPKG_TARGET_IS_WINDOWS) + vcpkg_check_linkage(ONLY_STATIC_LIBRARY) +endif() + +vcpkg_from_github( + OUT_SOURCE_PATH SOURCE_PATH + REPO google/XNNPACK + REF 854b343f9cad36bd596e4390959ca3648208e048 + SHA512 f37384b43022cb74bf87bd99c2e82e51d48fe4e0e4642611fcbc10cbb86ff2468b67964027f13f82a715dc7201c490d88d5020fb565ad236187b9dd219f3f644 + HEAD_REF master + PATCHES + fix-build.patch + disable_gcc_warning.patch +) +vcpkg_find_acquire_program(PYTHON3) + +vcpkg_cmake_configure( + SOURCE_PATH "${SOURCE_PATH}" + WINDOWS_USE_MSBUILD + OPTIONS + "-DPython3_EXECUTABLE=${PYTHON3}" + "-DPython_EXECUTABLE=${PYTHON3}" + -DXNNPACK_USE_SYSTEM_LIBS=ON + -DXNNPACK_ENABLE_AVXVNNI=OFF + -DXNNPACK_ENABLE_ASSEMBLY=ON + -DXNNPACK_ENABLE_MEMOPT=ON + -DXNNPACK_ENABLE_SPARSE=ON + -DXNNPACK_ENABLE_KLEIDIAI=OFF + -DXNNPACK_BUILD_TESTS=OFF + -DXNNPACK_BUILD_BENCHMARKS=OFF +) +vcpkg_cmake_install() +vcpkg_copy_pdbs() + +file(INSTALL "${SOURCE_PATH}/LICENSE" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}" RENAME copyright) +file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/include" + "${CURRENT_PACKAGES_DIR}/debug/bin" + "${CURRENT_PACKAGES_DIR}/debug/share" +) diff --git a/cmake/vcpkg-ports/xnnpack/vcpkg.json b/cmake/vcpkg-ports/xnnpack/vcpkg.json new file mode 100644 index 0000000000000..5e383c0b37810 --- /dev/null +++ b/cmake/vcpkg-ports/xnnpack/vcpkg.json @@ -0,0 +1,17 @@ +{ + "name": "xnnpack", + "version-date": "2025-01-17", + "description": "High-efficiency floating-point neural network inference operators for mobile, server, and Web", + "homepage": "https://github.com/google/XNNPACK", + "license": "BSD-3-Clause", + "supports": "!(arm & windows) & !uwp & !arm32", + "dependencies": [ + "cpuinfo", + "fxdiv", + "pthreadpool", + { + "name": "vcpkg-cmake", + "host": true + } + ] +} diff --git a/cmake/vcpkg-triplets/asan/arm64-linux.cmake b/cmake/vcpkg-triplets/asan/arm64-linux.cmake index 6875a03064bfa..9f5c9997daedb 100644 --- a/cmake/vcpkg-triplets/asan/arm64-linux.cmake +++ b/cmake/vcpkg-triplets/asan/arm64-linux.cmake @@ -3,12 +3,15 @@ set(VCPKG_TARGET_ARCHITECTURE arm64) set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) -set(VCPKG_C_FLAGS "-fsanitize=address") -set(VCPKG_CXX_FLAGS "-fsanitize=address") +set(VCPKG_C_FLAGS "-g -fsanitize=address") +set(VCPKG_CXX_FLAGS "-g -fsanitize=address") +set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3") +set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3") set(VCPKG_CMAKE_SYSTEM_NAME Linux) set(CMAKE_POSITION_INDEPENDENT_CODE ON) list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF) -set(VCPKG_LINKER_FLAGS "-fsanitize=address") +set(VCPKG_LINKER_FLAGS "-fsanitize=address -g") +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES 
"onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/asan/arm64-osx.cmake b/cmake/vcpkg-triplets/asan/arm64-osx.cmake index 4ac6bd8097de6..ba56684949836 100644 --- a/cmake/vcpkg-triplets/asan/arm64-osx.cmake +++ b/cmake/vcpkg-triplets/asan/arm64-osx.cmake @@ -3,13 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE arm64) set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) -set(VCPKG_C_FLAGS "-fsanitize=address") -set(VCPKG_CXX_FLAGS "-fsanitize=address") +set(VCPKG_C_FLAGS "-g -fsanitize=address") +set(VCPKG_CXX_FLAGS "-g -fsanitize=address") +set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3") +set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3") set(VCPKG_CMAKE_SYSTEM_NAME Darwin) set(VCPKG_OSX_ARCHITECTURES "arm64") set(CMAKE_POSITION_INDEPENDENT_CODE ON) list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF) -set(VCPKG_LINKER_FLAGS "-fsanitize=address") +set(VCPKG_LINKER_FLAGS "-fsanitize=address -g") +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/asan/arm64-windows-static-md.cmake b/cmake/vcpkg-triplets/asan/arm64-windows-static-md.cmake index c03c9e718fc80..79e10ad9e4436 100644 --- a/cmake/vcpkg-triplets/asan/arm64-windows-static-md.cmake +++ b/cmake/vcpkg-triplets/asan/arm64-windows-static-md.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/asan/arm64-windows-static.cmake b/cmake/vcpkg-triplets/asan/arm64-windows-static.cmake index 184001d4238b0..d0a3305b1f74a 100644 --- a/cmake/vcpkg-triplets/asan/arm64-windows-static.cmake +++ b/cmake/vcpkg-triplets/asan/arm64-windows-static.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/asan/arm64ec-windows-static-md.cmake b/cmake/vcpkg-triplets/asan/arm64ec-windows-static-md.cmake index 36176fe04033e..05a9718835ffb 100644 --- a/cmake/vcpkg-triplets/asan/arm64ec-windows-static-md.cmake +++ b/cmake/vcpkg-triplets/asan/arm64ec-windows-static-md.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS 
diff --git a/cmake/vcpkg-triplets/asan/arm64ec-windows-static-md.cmake b/cmake/vcpkg-triplets/asan/arm64ec-windows-static-md.cmake
index 36176fe04033e..05a9718835ffb 100644
--- a/cmake/vcpkg-triplets/asan/arm64ec-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/asan/arm64ec-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan/arm64ec-windows-static.cmake b/cmake/vcpkg-triplets/asan/arm64ec-windows-static.cmake
index aa086c1220dfb..e0f4b2e1e4183 100644
--- a/cmake/vcpkg-triplets/asan/arm64ec-windows-static.cmake
+++ b/cmake/vcpkg-triplets/asan/arm64ec-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan/universal2-osx.cmake b/cmake/vcpkg-triplets/asan/universal2-osx.cmake
index de2c8cee48ed5..d74494d578cd9 100644
--- a/cmake/vcpkg-triplets/asan/universal2-osx.cmake
+++ b/cmake/vcpkg-triplets/asan/universal2-osx.cmake
@@ -3,13 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-fsanitize=address")
-set(VCPKG_CXX_FLAGS "-fsanitize=address")
+set(VCPKG_C_FLAGS "-g -fsanitize=address")
+set(VCPKG_CXX_FLAGS "-g -fsanitize=address")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "x86_64;arm64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-fsanitize=address")
+set(VCPKG_LINKER_FLAGS "-fsanitize=address -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan/x64-linux.cmake b/cmake/vcpkg-triplets/asan/x64-linux.cmake
index dd1d066eb373a..64ba6b2216394 100644
--- a/cmake/vcpkg-triplets/asan/x64-linux.cmake
+++ b/cmake/vcpkg-triplets/asan/x64-linux.cmake
@@ -3,12 +3,15 @@ set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-fsanitize=address")
-set(VCPKG_CXX_FLAGS "-fsanitize=address")
+set(VCPKG_C_FLAGS "-g -fsanitize=address")
+set(VCPKG_CXX_FLAGS "-g -fsanitize=address")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-fsanitize=address")
+set(VCPKG_LINKER_FLAGS "-fsanitize=address -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan/x64-osx.cmake b/cmake/vcpkg-triplets/asan/x64-osx.cmake
index 5f1442c1d5c4e..bbcaff4c39209 100644
--- a/cmake/vcpkg-triplets/asan/x64-osx.cmake
+++ b/cmake/vcpkg-triplets/asan/x64-osx.cmake
@@ -3,13 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-fsanitize=address")
-set(VCPKG_CXX_FLAGS "-fsanitize=address")
+set(VCPKG_C_FLAGS "-g -fsanitize=address")
+set(VCPKG_CXX_FLAGS "-g -fsanitize=address")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "x86_64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-fsanitize=address")
+set(VCPKG_LINKER_FLAGS "-fsanitize=address -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan/x64-windows-static-md.cmake b/cmake/vcpkg-triplets/asan/x64-windows-static-md.cmake
index 27f7a0190a33c..c0edb9ca31cb6 100644
--- a/cmake/vcpkg-triplets/asan/x64-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/asan/x64-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan/x64-windows-static.cmake b/cmake/vcpkg-triplets/asan/x64-windows-static.cmake
index 23b8082fbd5a3..3370987c55a12 100644
--- a/cmake/vcpkg-triplets/asan/x64-windows-static.cmake
+++ b/cmake/vcpkg-triplets/asan/x64-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan/x86-windows-static-md.cmake b/cmake/vcpkg-triplets/asan/x86-windows-static-md.cmake
index cb9c639049936..429a4ac7cea36 100644
--- a/cmake/vcpkg-triplets/asan/x86-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/asan/x86-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan/x86-windows-static.cmake b/cmake/vcpkg-triplets/asan/x86-windows-static.cmake
index 0667f5f0ea61e..404cb3fbd07fb 100644
--- a/cmake/vcpkg-triplets/asan/x86-windows-static.cmake
+++ b/cmake/vcpkg-triplets/asan/x86-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
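Every triplet in the asan family above receives the same three changes: -g so debug symbols are always emitted, explicit -DNDEBUG -O3 release flags (presumably to keep release optimization once custom per-configuration flags are in play), and a pinned C++ standard (17 on Linux and Windows, 20 on macOS). Assuming the overlay layout added by this change, a single port can be built under one of these triplets with an invocation along these lines (illustrative, run from the repository root):

    vcpkg install xnnpack --triplet x64-linux \
        --overlay-ports=cmake/vcpkg-ports \
        --overlay-triplets=cmake/vcpkg-triplets/asan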
"-DNDEBUG -O3") +set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3") set(VCPKG_CMAKE_SYSTEM_NAME Darwin) set(VCPKG_OSX_ARCHITECTURES "arm64") set(CMAKE_POSITION_INDEPENDENT_CODE ON) list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF) -set(VCPKG_LINKER_FLAGS "-fsanitize=address") +set(VCPKG_LINKER_FLAGS "-fsanitize=address -g") +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/asan_nortti/arm64-windows-static-md.cmake b/cmake/vcpkg-triplets/asan_nortti/arm64-windows-static-md.cmake index cb0957791f432..c4ba82b7cac2a 100644 --- a/cmake/vcpkg-triplets/asan_nortti/arm64-windows-static-md.cmake +++ b/cmake/vcpkg-triplets/asan_nortti/arm64-windows-static-md.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus /GR- /we4541") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/asan_nortti/arm64-windows-static.cmake b/cmake/vcpkg-triplets/asan_nortti/arm64-windows-static.cmake index 2d38883062bb1..3b028c4e40bcc 100644 --- a/cmake/vcpkg-triplets/asan_nortti/arm64-windows-static.cmake +++ b/cmake/vcpkg-triplets/asan_nortti/arm64-windows-static.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus /GR- /we4541") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/asan_nortti/arm64ec-windows-static-md.cmake b/cmake/vcpkg-triplets/asan_nortti/arm64ec-windows-static-md.cmake index 4cc7102bf3b1c..d2d4bda334e38 100644 --- a/cmake/vcpkg-triplets/asan_nortti/arm64ec-windows-static-md.cmake +++ b/cmake/vcpkg-triplets/asan_nortti/arm64ec-windows-static-md.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus /GR- /we4541") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS 
"-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/asan_nortti/arm64ec-windows-static.cmake b/cmake/vcpkg-triplets/asan_nortti/arm64ec-windows-static.cmake index d84533c8de35c..8e986eb139862 100644 --- a/cmake/vcpkg-triplets/asan_nortti/arm64ec-windows-static.cmake +++ b/cmake/vcpkg-triplets/asan_nortti/arm64ec-windows-static.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus /GR- /we4541") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/asan_nortti/universal2-osx.cmake b/cmake/vcpkg-triplets/asan_nortti/universal2-osx.cmake index cacbfa751677d..6181e6d1c161b 100644 --- a/cmake/vcpkg-triplets/asan_nortti/universal2-osx.cmake +++ b/cmake/vcpkg-triplets/asan_nortti/universal2-osx.cmake @@ -3,13 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE x64) set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) -set(VCPKG_C_FLAGS "-fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") -set(VCPKG_CXX_FLAGS "-fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti") +set(VCPKG_C_FLAGS "-g -fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") +set(VCPKG_CXX_FLAGS "-g -fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti") +set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3") +set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3") set(VCPKG_CMAKE_SYSTEM_NAME Darwin) set(VCPKG_OSX_ARCHITECTURES "x86_64;arm64") set(CMAKE_POSITION_INDEPENDENT_CODE ON) list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF) -set(VCPKG_LINKER_FLAGS "-fsanitize=address") +set(VCPKG_LINKER_FLAGS "-fsanitize=address -g") +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/asan_nortti/x64-linux.cmake b/cmake/vcpkg-triplets/asan_nortti/x64-linux.cmake index b53e668a64c09..d7103ff2508bf 100644 --- a/cmake/vcpkg-triplets/asan_nortti/x64-linux.cmake +++ b/cmake/vcpkg-triplets/asan_nortti/x64-linux.cmake @@ -3,12 +3,15 @@ set(VCPKG_TARGET_ARCHITECTURE x64) set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) -set(VCPKG_C_FLAGS "-fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") -set(VCPKG_CXX_FLAGS "-fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti") +set(VCPKG_C_FLAGS "-g -fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") +set(VCPKG_CXX_FLAGS "-g -fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti") +set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3") +set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3") set(VCPKG_CMAKE_SYSTEM_NAME Linux) set(CMAKE_POSITION_INDEPENDENT_CODE ON) list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF) -set(VCPKG_LINKER_FLAGS "-fsanitize=address") +set(VCPKG_LINKER_FLAGS "-fsanitize=address -g") +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") 
diff --git a/cmake/vcpkg-triplets/asan_nortti/universal2-osx.cmake b/cmake/vcpkg-triplets/asan_nortti/universal2-osx.cmake
index cacbfa751677d..6181e6d1c161b 100644
--- a/cmake/vcpkg-triplets/asan_nortti/universal2-osx.cmake
+++ b/cmake/vcpkg-triplets/asan_nortti/universal2-osx.cmake
@@ -3,13 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
-set(VCPKG_CXX_FLAGS "-fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS "-g -fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
+set(VCPKG_CXX_FLAGS "-g -fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "x86_64;arm64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-fsanitize=address")
+set(VCPKG_LINKER_FLAGS "-fsanitize=address -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan_nortti/x64-linux.cmake b/cmake/vcpkg-triplets/asan_nortti/x64-linux.cmake
index b53e668a64c09..d7103ff2508bf 100644
--- a/cmake/vcpkg-triplets/asan_nortti/x64-linux.cmake
+++ b/cmake/vcpkg-triplets/asan_nortti/x64-linux.cmake
@@ -3,12 +3,15 @@ set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
-set(VCPKG_CXX_FLAGS "-fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS "-g -fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
+set(VCPKG_CXX_FLAGS "-g -fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-fsanitize=address")
+set(VCPKG_LINKER_FLAGS "-fsanitize=address -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan_nortti/x64-osx.cmake b/cmake/vcpkg-triplets/asan_nortti/x64-osx.cmake
index 9f4adb513edd4..191dfb3d35d10 100644
--- a/cmake/vcpkg-triplets/asan_nortti/x64-osx.cmake
+++ b/cmake/vcpkg-triplets/asan_nortti/x64-osx.cmake
@@ -3,13 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
-set(VCPKG_CXX_FLAGS "-fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS "-g -fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
+set(VCPKG_CXX_FLAGS "-g -fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "x86_64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-fsanitize=address")
+set(VCPKG_LINKER_FLAGS "-fsanitize=address -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan_nortti/x64-windows-static-md.cmake b/cmake/vcpkg-triplets/asan_nortti/x64-windows-static-md.cmake
index 2812ed9419e43..ae3f00b851145 100644
--- a/cmake/vcpkg-triplets/asan_nortti/x64-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/asan_nortti/x64-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan_nortti/x64-windows-static.cmake b/cmake/vcpkg-triplets/asan_nortti/x64-windows-static.cmake
index ccdb919b3e3ee..d64f20d3ce7f6 100644
--- a/cmake/vcpkg-triplets/asan_nortti/x64-windows-static.cmake
+++ b/cmake/vcpkg-triplets/asan_nortti/x64-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan_nortti/x86-windows-static-md.cmake b/cmake/vcpkg-triplets/asan_nortti/x86-windows-static-md.cmake
index 7a6b45666a924..24ddfa43c0f59 100644
--- a/cmake/vcpkg-triplets/asan_nortti/x86-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/asan_nortti/x86-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan_nortti/x86-windows-static.cmake b/cmake/vcpkg-triplets/asan_nortti/x86-windows-static.cmake
index 96b2a2ad749b8..53fcb44313c26 100644
--- a/cmake/vcpkg-triplets/asan_nortti/x86-windows-static.cmake
+++ b/cmake/vcpkg-triplets/asan_nortti/x86-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
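The asan_nortti variants differ from the plain asan triplets only in how they disable RTTI: POSIX builds add -fno-rtti to the C++ flags (with EMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 defined for code that keys off it), while MSVC builds use /GR- together with /we4541, which promotes warning C4541 ("'typeid' used on polymorphic class with /GR-") to a hard error instead of allowing unpredictable runtime behavior. A consumer target linking these static libraries should follow the same convention; a minimal sketch (my_target is a placeholder, not from this change):

    if(MSVC)
      target_compile_options(my_target PRIVATE /GR- /we4541)  # no RTTI; typeid misuse is an error
    else()
      target_compile_options(my_target PRIVATE -fno-rtti)
    endif()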
diff --git a/cmake/vcpkg-triplets/binskim/arm64-linux.cmake b/cmake/vcpkg-triplets/binskim/arm64-linux.cmake
index 4b738553e0fbc..8a3cf645d7f5f 100644
--- a/cmake/vcpkg-triplets/binskim/arm64-linux.cmake
+++ b/cmake/vcpkg-triplets/binskim/arm64-linux.cmake
@@ -3,12 +3,15 @@ set(VCPKG_TARGET_ARCHITECTURE arm64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
-set(VCPKG_CXX_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
+set(VCPKG_C_FLAGS "-g")
+set(VCPKG_CXX_FLAGS "-g")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
 set(VCPKG_CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack")
+set(VCPKG_LINKER_FLAGS "-Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/binskim/arm64-osx.cmake b/cmake/vcpkg-triplets/binskim/arm64-osx.cmake
index 4b6999874b111..9892a3eac80e8 100644
--- a/cmake/vcpkg-triplets/binskim/arm64-osx.cmake
+++ b/cmake/vcpkg-triplets/binskim/arm64-osx.cmake
@@ -3,12 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE arm64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
-set(VCPKG_CXX_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
+set(VCPKG_C_FLAGS "-g")
+set(VCPKG_CXX_FLAGS "-g")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "arm64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
+set(VCPKG_LINKER_FLAGS "-g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/binskim/arm64-windows-static-md.cmake b/cmake/vcpkg-triplets/binskim/arm64-windows-static-md.cmake
index 89dfae4bcbf26..3818356b5c0ce 100644
--- a/cmake/vcpkg-triplets/binskim/arm64-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/binskim/arm64-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim/arm64-windows-static.cmake b/cmake/vcpkg-triplets/binskim/arm64-windows-static.cmake
index 28ef65c4d1227..ab38e9f9a9f18 100644
--- a/cmake/vcpkg-triplets/binskim/arm64-windows-static.cmake
+++ b/cmake/vcpkg-triplets/binskim/arm64-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim/arm64ec-windows-static-md.cmake b/cmake/vcpkg-triplets/binskim/arm64ec-windows-static-md.cmake
index 0c087aa1b59f7..6937aea847a8a 100644
--- a/cmake/vcpkg-triplets/binskim/arm64ec-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/binskim/arm64ec-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim/arm64ec-windows-static.cmake b/cmake/vcpkg-triplets/binskim/arm64ec-windows-static.cmake
index 8c7de3b8a97f9..84c0531033699 100644
--- a/cmake/vcpkg-triplets/binskim/arm64ec-windows-static.cmake
+++ b/cmake/vcpkg-triplets/binskim/arm64ec-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim/universal2-osx.cmake b/cmake/vcpkg-triplets/binskim/universal2-osx.cmake
index 60826f1ede770..da4c6abb39000 100644
--- a/cmake/vcpkg-triplets/binskim/universal2-osx.cmake
+++ b/cmake/vcpkg-triplets/binskim/universal2-osx.cmake
@@ -3,12 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
-set(VCPKG_CXX_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
+set(VCPKG_C_FLAGS "-g")
+set(VCPKG_CXX_FLAGS "-g")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "x86_64;arm64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
+set(VCPKG_LINKER_FLAGS "-g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/binskim/x64-linux.cmake b/cmake/vcpkg-triplets/binskim/x64-linux.cmake
index 8d7aeb2342e26..e3d4d34326409 100644
--- a/cmake/vcpkg-triplets/binskim/x64-linux.cmake
+++ b/cmake/vcpkg-triplets/binskim/x64-linux.cmake
@@ -3,12 +3,15 @@ set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
-set(VCPKG_CXX_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
+set(VCPKG_C_FLAGS "-g")
+set(VCPKG_CXX_FLAGS "-g")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
 set(VCPKG_CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack")
+set(VCPKG_LINKER_FLAGS "-Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/binskim/x64-osx.cmake b/cmake/vcpkg-triplets/binskim/x64-osx.cmake
index e391ab9eaee6d..426a35e33f747 100644
--- a/cmake/vcpkg-triplets/binskim/x64-osx.cmake
+++ b/cmake/vcpkg-triplets/binskim/x64-osx.cmake
@@ -3,12 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
-set(VCPKG_CXX_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
+set(VCPKG_C_FLAGS "-g")
+set(VCPKG_CXX_FLAGS "-g")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "x86_64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
+set(VCPKG_LINKER_FLAGS "-g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/binskim/x64-windows-static-md.cmake b/cmake/vcpkg-triplets/binskim/x64-windows-static-md.cmake
index ef67223cd0cd3..0f600d7931076 100644
--- a/cmake/vcpkg-triplets/binskim/x64-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/binskim/x64-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim/x64-windows-static.cmake b/cmake/vcpkg-triplets/binskim/x64-windows-static.cmake
index 62948a156c911..17d41775c9d06 100644
--- a/cmake/vcpkg-triplets/binskim/x64-windows-static.cmake
+++ b/cmake/vcpkg-triplets/binskim/x64-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim/x86-windows-static-md.cmake b/cmake/vcpkg-triplets/binskim/x86-windows-static-md.cmake
index 8ac022c7eee4c..cb981c264a2f1 100644
--- a/cmake/vcpkg-triplets/binskim/x86-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/binskim/x86-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim/x86-windows-static.cmake b/cmake/vcpkg-triplets/binskim/x86-windows-static.cmake
index 8fd2d29dc3d99..53342263d5ada 100644
--- a/cmake/vcpkg-triplets/binskim/x86-windows-static.cmake
+++ b/cmake/vcpkg-triplets/binskim/x86-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
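Across the binskim family the BinSkim-oriented hardening options (_FORTIFY_SOURCE=2, _GLIBCXX_ASSERTIONS, -fstack-protector-strong, plus -fstack-clash-protection -fcf-protection on x64) move from the all-configuration flags into the release-only flags. That split matches how _FORTIFY_SOURCE behaves: glibc only activates it when compiling with optimization, so it belongs next to the explicit -O3, while unoptimized debug builds simply keep -g. Assuming vcpkg concatenates the base and per-configuration flags (an inference from the triplet layout, not stated in the diff), a release compile under binskim/x64-linux would effectively see:

    -g -DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS \
        -fstack-protector-strong -fstack-clash-protection -fcf-protection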
diff --git a/cmake/vcpkg-triplets/binskim_nortti/arm64-linux.cmake b/cmake/vcpkg-triplets/binskim_nortti/arm64-linux.cmake
index c9787f460b78d..203c85fa3a59e 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/arm64-linux.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/arm64-linux.cmake
@@ -3,12 +3,15 @@ set(VCPKG_TARGET_ARCHITECTURE arm64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
-set(VCPKG_CXX_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
+set(VCPKG_CXX_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
 set(VCPKG_CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack")
+set(VCPKG_LINKER_FLAGS "-Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/binskim_nortti/arm64-osx.cmake b/cmake/vcpkg-triplets/binskim_nortti/arm64-osx.cmake
index f5866d6863cb7..c57a2401e4c0f 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/arm64-osx.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/arm64-osx.cmake
@@ -3,12 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE arm64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
-set(VCPKG_CXX_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
+set(VCPKG_CXX_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "arm64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
+set(VCPKG_LINKER_FLAGS "-g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/binskim_nortti/arm64-windows-static-md.cmake b/cmake/vcpkg-triplets/binskim_nortti/arm64-windows-static-md.cmake
index 927b110c98d45..9963cfb66f4b1 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/arm64-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/arm64-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim_nortti/arm64-windows-static.cmake b/cmake/vcpkg-triplets/binskim_nortti/arm64-windows-static.cmake
index b0419c9a0d0e0..0f4948ff076f1 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/arm64-windows-static.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/arm64-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf
/Qspectre") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus /GR- /we4541") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE") if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS diff --git a/cmake/vcpkg-triplets/binskim_nortti/arm64ec-windows-static-md.cmake b/cmake/vcpkg-triplets/binskim_nortti/arm64ec-windows-static-md.cmake index aa8b7a5f0e96b..6a5c8b9f1058a 100644 --- a/cmake/vcpkg-triplets/binskim_nortti/arm64ec-windows-static-md.cmake +++ b/cmake/vcpkg-triplets/binskim_nortti/arm64ec-windows-static-md.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus /GR- /we4541") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE") if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS diff --git a/cmake/vcpkg-triplets/binskim_nortti/arm64ec-windows-static.cmake b/cmake/vcpkg-triplets/binskim_nortti/arm64ec-windows-static.cmake index 96da5d9b1372f..668d4fb4dc420 100644 --- a/cmake/vcpkg-triplets/binskim_nortti/arm64ec-windows-static.cmake +++ b/cmake/vcpkg-triplets/binskim_nortti/arm64ec-windows-static.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus /GR- /we4541") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE") if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS diff --git a/cmake/vcpkg-triplets/binskim_nortti/universal2-osx.cmake b/cmake/vcpkg-triplets/binskim_nortti/universal2-osx.cmake index f4ef6f0c659d8..1956daf30e6d9 100644 --- a/cmake/vcpkg-triplets/binskim_nortti/universal2-osx.cmake +++ b/cmake/vcpkg-triplets/binskim_nortti/universal2-osx.cmake @@ -3,12 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE x64) set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) -set(VCPKG_C_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") -set(VCPKG_CXX_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti") +set(VCPKG_C_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") +set(VCPKG_CXX_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti") +set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong") +set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3 
diff --git a/cmake/vcpkg-triplets/binskim_nortti/universal2-osx.cmake b/cmake/vcpkg-triplets/binskim_nortti/universal2-osx.cmake
index f4ef6f0c659d8..1956daf30e6d9 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/universal2-osx.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/universal2-osx.cmake
@@ -3,12 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
-set(VCPKG_CXX_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
+set(VCPKG_CXX_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "x86_64;arm64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
+set(VCPKG_LINKER_FLAGS "-g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/binskim_nortti/x64-linux.cmake b/cmake/vcpkg-triplets/binskim_nortti/x64-linux.cmake
index 8fe977fb86e56..da17e0073980f 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/x64-linux.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/x64-linux.cmake
@@ -3,12 +3,15 @@ set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
-set(VCPKG_CXX_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
+set(VCPKG_CXX_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
 set(VCPKG_CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack")
+set(VCPKG_LINKER_FLAGS "-Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/binskim_nortti/x64-osx.cmake b/cmake/vcpkg-triplets/binskim_nortti/x64-osx.cmake
index 196018d7cf442..c74e60bc7c206 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/x64-osx.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/x64-osx.cmake
@@ -3,12 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
-set(VCPKG_CXX_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
+set(VCPKG_CXX_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "x86_64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
+set(VCPKG_LINKER_FLAGS "-g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/binskim_nortti/x64-windows-static-md.cmake b/cmake/vcpkg-triplets/binskim_nortti/x64-windows-static-md.cmake
index 38b5cbdde2d65..6491d31ae469b 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/x64-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/x64-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim_nortti/x64-windows-static.cmake b/cmake/vcpkg-triplets/binskim_nortti/x64-windows-static.cmake
index bea970b669f4f..011999df2ac99 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/x64-windows-static.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/x64-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim_nortti/x86-windows-static-md.cmake b/cmake/vcpkg-triplets/binskim_nortti/x86-windows-static-md.cmake
index e75d0c645c6a1..bf843c3e950e2 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/x86-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/x86-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim_nortti/x86-windows-static.cmake b/cmake/vcpkg-triplets/binskim_nortti/x86-windows-static.cmake
index 6de6f80d9705c..21e0858066ab8 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/x86-windows-static.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/x86-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
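binskim_nortti combines both conventions above: the release-only hardening flags of binskim plus the -fno-rtti (or /GR- /we4541) treatment of the nortti variants. One detail shared by every family is the standard pin passed to each port's configure step: -DCMAKE_CXX_STANDARD=17 on Linux and Windows, =20 on macOS, so all vcpkg-built dependencies on a platform compile against a single C++ standard. A triplet can be inspected outside of vcpkg by evaluating it in CMake script mode; a throwaway sketch (illustrative, assumes the repository root as working directory):

    # inspect-triplet.cmake -- not part of this change
    set(PORT "onnx")  # simulate the port name vcpkg defines when evaluating a triplet
    include(cmake/vcpkg-triplets/binskim_nortti/x64-linux.cmake)
    message(STATUS "release C++ flags: ${VCPKG_CXX_FLAGS_RELEASE}")
    message(STATUS "configure options: ${VCPKG_CMAKE_CONFIGURE_OPTIONS}")

Run it with `cmake -P inspect-triplet.cmake`.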
diff --git a/cmake/vcpkg-triplets/default/arm64-linux.cmake b/cmake/vcpkg-triplets/default/arm64-linux.cmake
index 581367931ba5e..120865a5b0b57 100644
--- a/cmake/vcpkg-triplets/default/arm64-linux.cmake
+++ b/cmake/vcpkg-triplets/default/arm64-linux.cmake
@@ -3,11 +3,15 @@ set(VCPKG_TARGET_ARCHITECTURE arm64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "")
-set(VCPKG_CXX_FLAGS "")
+set(VCPKG_C_FLAGS "-g")
+set(VCPKG_CXX_FLAGS "-g")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
+set(VCPKG_LINKER_FLAGS "-g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/default/arm64-osx.cmake b/cmake/vcpkg-triplets/default/arm64-osx.cmake
index 4d74306ba4e6a..02e8a3430475f 100644
--- a/cmake/vcpkg-triplets/default/arm64-osx.cmake
+++ b/cmake/vcpkg-triplets/default/arm64-osx.cmake
@@ -3,12 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE arm64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "")
-set(VCPKG_CXX_FLAGS "")
+set(VCPKG_C_FLAGS "-g")
+set(VCPKG_CXX_FLAGS "-g")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "arm64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
+set(VCPKG_LINKER_FLAGS "-g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/default/arm64-windows-static-md.cmake b/cmake/vcpkg-triplets/default/arm64-windows-static-md.cmake
index 135dc6ed6f3b5..6d5cf67665f4f 100644
--- a/cmake/vcpkg-triplets/default/arm64-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/default/arm64-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND
VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/default/arm64-windows-static.cmake b/cmake/vcpkg-triplets/default/arm64-windows-static.cmake index 56e1aebfe620b..19ca6f16cd890 100644 --- a/cmake/vcpkg-triplets/default/arm64-windows-static.cmake +++ b/cmake/vcpkg-triplets/default/arm64-windows-static.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/default/arm64ec-windows-static-md.cmake b/cmake/vcpkg-triplets/default/arm64ec-windows-static-md.cmake index 9256f07f5451d..d7982158f390c 100644 --- a/cmake/vcpkg-triplets/default/arm64ec-windows-static-md.cmake +++ b/cmake/vcpkg-triplets/default/arm64ec-windows-static-md.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/default/arm64ec-windows-static.cmake b/cmake/vcpkg-triplets/default/arm64ec-windows-static.cmake index bbdfed06fb2bc..fb14ad71c1d91 100644 --- a/cmake/vcpkg-triplets/default/arm64ec-windows-static.cmake +++ b/cmake/vcpkg-triplets/default/arm64ec-windows-static.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/default/universal2-osx.cmake b/cmake/vcpkg-triplets/default/universal2-osx.cmake index 64b19451dd64d..57386c423cf99 100644 --- a/cmake/vcpkg-triplets/default/universal2-osx.cmake +++ b/cmake/vcpkg-triplets/default/universal2-osx.cmake @@ -3,12 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE x64) set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) -set(VCPKG_C_FLAGS "") -set(VCPKG_CXX_FLAGS "") +set(VCPKG_C_FLAGS "-g") +set(VCPKG_CXX_FLAGS "-g") +set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3") 
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3") set(VCPKG_CMAKE_SYSTEM_NAME Darwin) set(VCPKG_OSX_ARCHITECTURES "x86_64;arm64") set(CMAKE_POSITION_INDEPENDENT_CODE ON) list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF) +set(VCPKG_LINKER_FLAGS "-g") +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/default/x64-linux.cmake b/cmake/vcpkg-triplets/default/x64-linux.cmake index 57114dd5fcb76..30c7b1b786302 100644 --- a/cmake/vcpkg-triplets/default/x64-linux.cmake +++ b/cmake/vcpkg-triplets/default/x64-linux.cmake @@ -3,11 +3,15 @@ set(VCPKG_TARGET_ARCHITECTURE x64) set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) -set(VCPKG_C_FLAGS "") -set(VCPKG_CXX_FLAGS "") +set(VCPKG_C_FLAGS "-g") +set(VCPKG_CXX_FLAGS "-g") +set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3") +set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3") set(VCPKG_CMAKE_SYSTEM_NAME Linux) set(CMAKE_POSITION_INDEPENDENT_CODE ON) list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF) +set(VCPKG_LINKER_FLAGS "-g") +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/default/x64-osx.cmake b/cmake/vcpkg-triplets/default/x64-osx.cmake index dd50e622677b7..7af622e1354b9 100644 --- a/cmake/vcpkg-triplets/default/x64-osx.cmake +++ b/cmake/vcpkg-triplets/default/x64-osx.cmake @@ -3,12 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE x64) set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) -set(VCPKG_C_FLAGS "") -set(VCPKG_CXX_FLAGS "") +set(VCPKG_C_FLAGS "-g") +set(VCPKG_CXX_FLAGS "-g") +set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3") +set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3") set(VCPKG_CMAKE_SYSTEM_NAME Darwin) set(VCPKG_OSX_ARCHITECTURES "x86_64") set(CMAKE_POSITION_INDEPENDENT_CODE ON) list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF) +set(VCPKG_LINKER_FLAGS "-g") +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/default/x64-windows-static-md.cmake b/cmake/vcpkg-triplets/default/x64-windows-static-md.cmake index 5339a033715bb..bec5f2724da13 100644 --- a/cmake/vcpkg-triplets/default/x64-windows-static-md.cmake +++ b/cmake/vcpkg-triplets/default/x64-windows-static-md.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/default/x64-windows-static.cmake b/cmake/vcpkg-triplets/default/x64-windows-static.cmake index 579740efb6ab7..3f62418071583 100644 --- a/cmake/vcpkg-triplets/default/x64-windows-static.cmake +++ 
b/cmake/vcpkg-triplets/default/x64-windows-static.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/default/x86-windows-static-md.cmake b/cmake/vcpkg-triplets/default/x86-windows-static-md.cmake index 34223c67e8f44..d93d87b3289f3 100644 --- a/cmake/vcpkg-triplets/default/x86-windows-static-md.cmake +++ b/cmake/vcpkg-triplets/default/x86-windows-static-md.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/default/x86-windows-static.cmake b/cmake/vcpkg-triplets/default/x86-windows-static.cmake index fc95d409f890e..727b35cd1f7cc 100644 --- a/cmake/vcpkg-triplets/default/x86-windows-static.cmake +++ b/cmake/vcpkg-triplets/default/x86-windows-static.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/gen.py b/cmake/vcpkg-triplets/gen.py index 615ca66fc8f64..bec1a87a0a20e 100644 --- a/cmake/vcpkg-triplets/gen.py +++ b/cmake/vcpkg-triplets/gen.py @@ -88,9 +88,11 @@ def add_copyright_header(f): # Disable RTTI and turn usage of dynamic_cast and typeid into errors cxxflags += ["/GR-", "/we4541"] # TODO: should it be a cmake list separated by semicolons? 
- f.write('set(VCPKG_C_FLAGS "{}")\n'.format(" ".join(cflags))) - f.write('set(VCPKG_CXX_FLAGS "{}")\n'.format(" ".join(cxxflags))) - f.write("list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)\n") + if len(cflags) >= 1: + f.write('set(VCPKG_C_FLAGS "{}")\n'.format(" ".join(cflags))) + if len(cxxflags) >= 1: + f.write('set(VCPKG_CXX_FLAGS "{}")\n'.format(" ".join(cxxflags))) + f.write("list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)\n") if ldflags: f.write('set(VCPKG_LINKER_FLAGS "{}")\n'.format(" ".join(ldflags))) add_port_configs(f) @@ -135,27 +137,34 @@ def add_copyright_header(f): f.write(f"set(VCPKG_TARGET_ARCHITECTURE {target_abi})\n") f.write(f"set(VCPKG_CRT_LINKAGE {crt_linkage})\n") f.write("set(VCPKG_LIBRARY_LINKAGE static)\n") + + if enable_binskim and os_name == "linux": ldflags = [ "-Wl,-Bsymbolic-functions", "-Wl,-z,relro", "-Wl,-z,now", - "-Wl,-z,noexecstack", + "-Wl,-z,noexecstack" ] else: ldflags = [] - cflags = [] + # Enable debug info for all build configs + cflags = ["-g"] + cflags_release = ["-DNDEBUG", "-O3"] if enable_binskim: - cflags += [ + # A warning may be generated from include/features.h if the _FORTIFY_SOURCE flag was used in a debug build + cflags_release += [ "-Wp,-D_FORTIFY_SOURCE=2", "-Wp,-D_GLIBCXX_ASSERTIONS", "-fstack-protector-strong", ] if target_abi == "x64": - cflags += ["-fstack-clash-protection", "-fcf-protection"] + cflags_release += ["-fstack-clash-protection", "-fcf-protection"] elif enable_asan: cflags += ["-fsanitize=address"] ldflags += ["-fsanitize=address"] + # Enable debug info for all build configs + ldflags.append('-g') # Avoid unboundTypeError for WebNN EP since unbound type names are illegal with RTTI disabled # in Embind API, relevant issue: https://github.com/emscripten-core/emscripten/issues/7001 if not enable_rtti: @@ -163,8 +172,13 @@ def add_copyright_header(f): cxxflags = cflags.copy() if not enable_rtti: cxxflags.append("-fno-rtti") - f.write('set(VCPKG_C_FLAGS "{}")\n'.format(" ".join(cflags))) - f.write('set(VCPKG_CXX_FLAGS "{}")\n'.format(" ".join(cxxflags))) + if len(cflags) >= 1: + f.write('set(VCPKG_C_FLAGS "{}")\n'.format(" ".join(cflags))) + if len(cxxflags) >= 1: + f.write('set(VCPKG_CXX_FLAGS "{}")\n'.format(" ".join(cxxflags))) + if len(cflags_release) >= 1: + f.write('set(VCPKG_C_FLAGS_RELEASE "{}")\n'.format(" ".join(cflags_release))) + f.write('set(VCPKG_CXX_FLAGS_RELEASE "{}")\n'.format(" ".join(cflags_release))) if os_name == "linux": f.write("set(VCPKG_CMAKE_SYSTEM_NAME Linux)\n") else: @@ -184,4 +198,8 @@ def add_copyright_header(f): if ldflags: f.write('set(VCPKG_LINKER_FLAGS "{}")\n'.format(" ".join(ldflags))) + if os_name == 'osx': + f.write('list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)\n') + else: + f.write('list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17)\n') add_port_configs(f) diff --git a/cmake/vcpkg-triplets/nortti/arm64-linux.cmake b/cmake/vcpkg-triplets/nortti/arm64-linux.cmake index 4bd974a112125..f9035fc299ce5 100644 --- a/cmake/vcpkg-triplets/nortti/arm64-linux.cmake +++ b/cmake/vcpkg-triplets/nortti/arm64-linux.cmake @@ -3,11 +3,15 @@ set(VCPKG_TARGET_ARCHITECTURE arm64) set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) -set(VCPKG_C_FLAGS "-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") -set(VCPKG_CXX_FLAGS "-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti") +set(VCPKG_C_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") +set(VCPKG_CXX_FLAGS "-g 
-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti") +set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3") +set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3") set(VCPKG_CMAKE_SYSTEM_NAME Linux) set(CMAKE_POSITION_INDEPENDENT_CODE ON) list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF) +set(VCPKG_LINKER_FLAGS "-g") +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/nortti/arm64-osx.cmake b/cmake/vcpkg-triplets/nortti/arm64-osx.cmake index fd8be60b7158a..d8971e8122f9d 100644 --- a/cmake/vcpkg-triplets/nortti/arm64-osx.cmake +++ b/cmake/vcpkg-triplets/nortti/arm64-osx.cmake @@ -3,12 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE arm64) set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) -set(VCPKG_C_FLAGS "-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") -set(VCPKG_CXX_FLAGS "-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti") +set(VCPKG_C_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") +set(VCPKG_CXX_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti") +set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3") +set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3") set(VCPKG_CMAKE_SYSTEM_NAME Darwin) set(VCPKG_OSX_ARCHITECTURES "arm64") set(CMAKE_POSITION_INDEPENDENT_CODE ON) list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF) +set(VCPKG_LINKER_FLAGS "-g") +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/nortti/arm64-windows-static-md.cmake b/cmake/vcpkg-triplets/nortti/arm64-windows-static-md.cmake index 45b24baa2c94b..9d3c86ce644d0 100644 --- a/cmake/vcpkg-triplets/nortti/arm64-windows-static-md.cmake +++ b/cmake/vcpkg-triplets/nortti/arm64-windows-static-md.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus /GR- /we4541") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/nortti/arm64-windows-static.cmake b/cmake/vcpkg-triplets/nortti/arm64-windows-static.cmake index 947fe9b61bd6c..238f7405ec492 100644 --- a/cmake/vcpkg-triplets/nortti/arm64-windows-static.cmake +++ b/cmake/vcpkg-triplets/nortti/arm64-windows-static.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus /GR- /we4541") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff 
--git a/cmake/vcpkg-triplets/nortti/arm64ec-windows-static-md.cmake b/cmake/vcpkg-triplets/nortti/arm64ec-windows-static-md.cmake index ea5741fa42fbe..da314824ca7cc 100644 --- a/cmake/vcpkg-triplets/nortti/arm64ec-windows-static-md.cmake +++ b/cmake/vcpkg-triplets/nortti/arm64ec-windows-static-md.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus /GR- /we4541") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/nortti/arm64ec-windows-static.cmake b/cmake/vcpkg-triplets/nortti/arm64ec-windows-static.cmake index 2b354ba511303..0c7fb60401f1d 100644 --- a/cmake/vcpkg-triplets/nortti/arm64ec-windows-static.cmake +++ b/cmake/vcpkg-triplets/nortti/arm64ec-windows-static.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus /GR- /we4541") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/nortti/universal2-osx.cmake b/cmake/vcpkg-triplets/nortti/universal2-osx.cmake index 7111bb87c910b..febc002c0488e 100644 --- a/cmake/vcpkg-triplets/nortti/universal2-osx.cmake +++ b/cmake/vcpkg-triplets/nortti/universal2-osx.cmake @@ -3,12 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE x64) set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) -set(VCPKG_C_FLAGS "-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") -set(VCPKG_CXX_FLAGS "-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti") +set(VCPKG_C_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") +set(VCPKG_CXX_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti") +set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3") +set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3") set(VCPKG_CMAKE_SYSTEM_NAME Darwin) set(VCPKG_OSX_ARCHITECTURES "x86_64;arm64") set(CMAKE_POSITION_INDEPENDENT_CODE ON) list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF) +set(VCPKG_LINKER_FLAGS "-g") +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/nortti/x64-linux.cmake b/cmake/vcpkg-triplets/nortti/x64-linux.cmake index 34fcc968e6c0e..c1dac19d33f2c 100644 --- a/cmake/vcpkg-triplets/nortti/x64-linux.cmake +++ b/cmake/vcpkg-triplets/nortti/x64-linux.cmake @@ -3,11 +3,15 @@ set(VCPKG_TARGET_ARCHITECTURE x64) set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) -set(VCPKG_C_FLAGS "-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") -set(VCPKG_CXX_FLAGS "-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 
-fno-rtti") +set(VCPKG_C_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") +set(VCPKG_CXX_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti") +set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3") +set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3") set(VCPKG_CMAKE_SYSTEM_NAME Linux) set(CMAKE_POSITION_INDEPENDENT_CODE ON) list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF) +set(VCPKG_LINKER_FLAGS "-g") +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/nortti/x64-osx.cmake b/cmake/vcpkg-triplets/nortti/x64-osx.cmake index 0a7fcc08e0c69..242d34a358170 100644 --- a/cmake/vcpkg-triplets/nortti/x64-osx.cmake +++ b/cmake/vcpkg-triplets/nortti/x64-osx.cmake @@ -3,12 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE x64) set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) -set(VCPKG_C_FLAGS "-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") -set(VCPKG_CXX_FLAGS "-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti") +set(VCPKG_C_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") +set(VCPKG_CXX_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti") +set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3") +set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3") set(VCPKG_CMAKE_SYSTEM_NAME Darwin) set(VCPKG_OSX_ARCHITECTURES "x86_64") set(CMAKE_POSITION_INDEPENDENT_CODE ON) list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF) +set(VCPKG_LINKER_FLAGS "-g") +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/nortti/x64-windows-static-md.cmake b/cmake/vcpkg-triplets/nortti/x64-windows-static-md.cmake index 5c62c4263f8eb..a8d2441583d0f 100644 --- a/cmake/vcpkg-triplets/nortti/x64-windows-static-md.cmake +++ b/cmake/vcpkg-triplets/nortti/x64-windows-static-md.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus /GR- /we4541") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/nortti/x64-windows-static.cmake b/cmake/vcpkg-triplets/nortti/x64-windows-static.cmake index deceefcd95910..688ed230fd17c 100644 --- a/cmake/vcpkg-triplets/nortti/x64-windows-static.cmake +++ b/cmake/vcpkg-triplets/nortti/x64-windows-static.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus /GR- /we4541") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND 
VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/nortti/x86-windows-static-md.cmake b/cmake/vcpkg-triplets/nortti/x86-windows-static-md.cmake index cb1b3cd887932..1d3de9c1420c4 100644 --- a/cmake/vcpkg-triplets/nortti/x86-windows-static-md.cmake +++ b/cmake/vcpkg-triplets/nortti/x86-windows-static-md.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus /GR- /we4541") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/nortti/x86-windows-static.cmake b/cmake/vcpkg-triplets/nortti/x86-windows-static.cmake index 032021745099a..3a856c26797a4 100644 --- a/cmake/vcpkg-triplets/nortti/x86-windows-static.cmake +++ b/cmake/vcpkg-triplets/nortti/x86-windows-static.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus /GR- /we4541") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj index 63131d05c03d5..be0e8d2ee58a4 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj +++ b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj @@ -1,4 +1,4 @@ - + Microsoft.ML.OnnxRuntime @@ -127,6 +127,11 @@ $(OrtConstants);__ENABLE_TRAINING_APIS__ + + + true + + @@ -184,6 +189,10 @@ + + + + diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/OrtValue.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/OrtValue.shared.cs index d38748c2f97cc..7a5c3aaa19eac 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/OrtValue.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/OrtValue.shared.cs @@ -9,6 +9,14 @@ using System.Runtime.InteropServices; using System.Text; +#if NET8_0_OR_GREATER +using System.Diagnostics.CodeAnalysis; +using System.Reflection; +using System.Runtime.CompilerServices; +using SystemNumericsTensors = System.Numerics.Tensors; +using TensorPrimitives = System.Numerics.Tensors.TensorPrimitives; +#endif + namespace Microsoft.ML.OnnxRuntime { /// @@ -205,6 +213,33 @@ public ReadOnlySpan GetTensorDataAsSpan() where T : unmanaged return MemoryMarshal.Cast(byteSpan); } +#if NET8_0_OR_GREATER + /// + /// Returns a ReadOnlyTensorSpan over tensor native buffer that + /// provides a read-only view. + /// + /// Note, that the memory may be device allocated and, therefore, not accessible from the CPU. + /// To get memory descriptor use GetTensorMemoryInfo(). 
+    ///
+    /// OrtValue must contain a non-string tensor.
+    /// The span is valid as long as the OrtValue instance is alive (not disposed).
+    /// </summary>
+    /// <typeparam name="T"></typeparam>
+    /// <returns>ReadOnlyTensorSpan<T></returns>
+    [Experimental("SYSLIB5001")]
+    public SystemNumericsTensors.ReadOnlyTensorSpan<T> GetTensorDataAsTensorSpan<T>() where T : unmanaged
+    {
+        var byteSpan = GetTensorBufferRawData(typeof(T));
+
+        var typeSpan = MemoryMarshal.Cast<byte, T>(byteSpan);
+        var shape = GetTypeInfo().TensorTypeAndShapeInfo.Shape;
+        nint[] nArray = Array.ConvertAll(shape, new Converter<long, nint>(x => (nint)x));
+
+        return new SystemNumericsTensors.ReadOnlyTensorSpan<T>(typeSpan, nArray, []);
+    }
+#endif
+
     /// <summary>
     /// Returns a Span over tensor native buffer.
     /// This enables you to safely and efficiently modify the underlying
     ///
@@ -225,6 +260,32 @@ public Span<T> GetTensorMutableDataAsSpan<T>() where T : unmanaged
         return MemoryMarshal.Cast<byte, T>(byteSpan);
     }
 
+#if NET8_0_OR_GREATER
+    /// <summary>
+    /// Returns a TensorSpan over the tensor's native buffer.
+    ///
+    /// Note that the memory may be device allocated and, therefore, not accessible from the CPU.
+    /// To get the memory descriptor, use GetTensorMemoryInfo().
+    ///
+    /// OrtValue must contain a non-string tensor.
+    /// The span is valid as long as the OrtValue instance is alive (not disposed).
+    /// </summary>
+    /// <typeparam name="T"></typeparam>
+    /// <returns>TensorSpan<T></returns>
+    [Experimental("SYSLIB5001")]
+    public SystemNumericsTensors.TensorSpan<T> GetTensorMutableDataAsTensorSpan<T>() where T : unmanaged
+    {
+        var byteSpan = GetTensorBufferRawData(typeof(T));
+
+        var typeSpan = MemoryMarshal.Cast<byte, T>(byteSpan);
+        var shape = GetTypeInfo().TensorTypeAndShapeInfo.Shape;
+        nint[] nArray = Array.ConvertAll(shape, new Converter<long, nint>(x => (nint)x));
+
+        return new SystemNumericsTensors.TensorSpan<T>(typeSpan, nArray, []);
+    }
+#endif
+
     /// <summary>
     /// Provides mutable raw native buffer access.
     /// </summary>
@@ -234,6 +295,23 @@ public Span<byte> GetTensorMutableRawData()
         return GetTensorBufferRawData(typeof(byte));
     }
 
+#if NET8_0_OR_GREATER
+    /// <summary>
+    /// Provides mutable raw native buffer access.
+    /// </summary>
+    /// <returns>TensorSpan over the native buffer bytes</returns>
+    [Experimental("SYSLIB5001")]
+    public SystemNumericsTensors.TensorSpan<byte> GetTensorSpanMutableRawData<T>() where T : unmanaged
+    {
+        var byteSpan = GetTensorBufferRawData(typeof(T));
+
+        var shape = GetTypeInfo().TensorTypeAndShapeInfo.Shape;
+        nint[] nArray = Array.ConvertAll(shape, new Converter<long, nint>(x => (nint)x));
+
+        return new SystemNumericsTensors.TensorSpan<byte>(byteSpan, nArray, []);
+    }
+#endif
+
     /// <summary>
     /// Fetch string tensor element buffer pointer at the specified index,
     /// convert/copy to UTF-16 char[] and return a ReadOnlyMemory{char} instance.
@@ -605,6 +683,80 @@ public static OrtValue CreateTensorValueFromMemory<T>(T[] data, long[] shape) wh
         return OrtValue.CreateTensorValueFromMemory(OrtMemoryInfo.DefaultInstance, new Memory<T>(data), shape);
     }
 
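To make the new span-based API concrete, here is a minimal usage sketch (an illustrative example, not part of the patch; the payload values, the 2x3 shape, and the variable names are hypothetical). It wraps a System.Numerics.Tensors.Tensor<float> in an OrtValue via the factory method added just below, then reads the data back through the ReadOnlyTensorSpan getter added above:

// Minimal usage sketch (hypothetical values; requires net8.0+).
#pragma warning disable SYSLIB5001 // System.Numerics.Tensors is still experimental
float[] payload = { 1f, 2f, 3f, 4f, 5f, 6f };
var tensor = System.Numerics.Tensors.Tensor.Create(payload, new nint[] { 2, 3 });

// Dense layout: Lengths = [2, 3] implies Strides = [3, 1], so the density check
// below passes and the managed buffer is pinned without copying. A strided view
// (e.g. Strides = [6, 2]) would first be copied into a dense tensor.
using OrtValue ortValue = OrtValue.CreateTensorValueFromSystemNumericsTensorObject(tensor);

// Read-only, copy-free view over the same native buffer.
var view = ortValue.GetTensorDataAsTensorSpan<float>();
System.Console.WriteLine($"rank={view.Rank}, elements={view.FlattenedLength}");
#pragma warning restore SYSLIB5001

The density check is what makes the zero-copy path safe: the native OrtCreateTensorWithDataAsOrtValue call sees only a flat buffer plus a shape, so a non-contiguous Tensor view must be compacted before pinning.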
+#if NET8_0_OR_GREATER
+    /// <summary>
+    /// This is a factory method that creates a native Onnxruntime OrtValue containing a tensor on top of the existing tensor's managed memory.
+    /// The method will attempt to pin managed memory so no copying occurs when data is passed down
+    /// to native code.
+    /// </summary>
+    /// <param name="tensor">Tensor object</param>
+    /// <typeparam name="T">discovered tensor element type</typeparam>
+    /// <returns>An instance of OrtValue constructed on top of the object</returns>
+    [Experimental("SYSLIB5001")]
+    public static OrtValue CreateTensorValueFromSystemNumericsTensorObject<T>(SystemNumericsTensors.Tensor<T> tensor) where T : unmanaged
+    {
+        if (!IsContiguousAndDense(tensor))
+        {
+            var newTensor = SystemNumericsTensors.Tensor.Create<T>(tensor.Lengths);
+            tensor.CopyTo(newTensor);
+            tensor = newTensor;
+        }
+        unsafe
+        {
+            // Pin the managed backing array (obtained via reflection from the private
+            // "_values" field) so the native tensor can reference it without copying.
+            var backingData = (T[])tensor.GetType().GetField("_values", BindingFlags.Instance | BindingFlags.NonPublic).GetValue(tensor);
+            GCHandle handle = GCHandle.Alloc(backingData, GCHandleType.Pinned);
+            var memHandle = new MemoryHandle(Unsafe.AsPointer(ref tensor.GetPinnableReference()), handle);
+
+            try
+            {
+                IntPtr dataBufferPointer = IntPtr.Zero;
+                unsafe
+                {
+                    dataBufferPointer = (IntPtr)memHandle.Pointer;
+                }
+
+                var bufferLengthInBytes = tensor.FlattenedLength * sizeof(T);
+                long[] shape = Array.ConvertAll(tensor.Lengths.ToArray(), new Converter<nint, long>(x => (long)x));
+
+                var typeInfo = TensorBase.GetTypeInfo(typeof(T)) ??
+                    throw new OnnxRuntimeException(ErrorCode.InvalidArgument, $"Tensor of type: {typeof(T)} is not supported");
+
+                NativeApiStatus.VerifySuccess(NativeMethods.OrtCreateTensorWithDataAsOrtValue(
+                    OrtMemoryInfo.DefaultInstance.Pointer,
+                    dataBufferPointer,
+                    (UIntPtr)(bufferLengthInBytes),
+                    shape,
+                    (UIntPtr)tensor.Rank,
+                    typeInfo.ElementType,
+                    out IntPtr nativeValue));
+
+                return new OrtValue(nativeValue, memHandle);
+            }
+            catch (Exception)
+            {
+                memHandle.Dispose();
+                throw;
+            }
+        }
+    }
+
+    [Experimental("SYSLIB5001")]
+    private static bool IsContiguousAndDense<T>(SystemNumericsTensors.Tensor<T> tensor) where T : unmanaged
+    {
+        // The stride of the right-most dimension must be 1 for a dense tensor.
+        if (tensor.Strides[^1] != 1)
+            return false;
+
+        // For the other dimensions, the stride must equal the product of the dimensions to the right.
+        for (int i = tensor.Rank - 2; i >= 0; i--)
+        {
+            if (tensor.Strides[i] != TensorPrimitives.Product(tensor.Lengths.Slice(i + 1, tensor.Lengths.Length - i - 1)))
+                return false;
+        }
+        return true;
+    }
+#endif
+
     /// <summary>
     /// The factory API creates an OrtValue with memory allocated using the given allocator
     /// according to the specified shape and element type.
The memory will be released when OrtValue diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs index ff5fd2de54197..816511150a137 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs @@ -7,6 +7,10 @@ using System.Text.RegularExpressions; using Xunit; +#if NET8_0_OR_GREATER +using SystemNumericsTensors = System.Numerics.Tensors; +#endif + namespace Microsoft.ML.OnnxRuntime.Tests { /// @@ -67,6 +71,194 @@ public void CanCreateAndDisposeSessionWithModelPath() } } +#if NET8_0_OR_GREATER +#pragma warning disable SYSLIB5001 // System.Numerics.Tensors is only in preview so we can continue receiving API feedback + [Theory] + [InlineData(GraphOptimizationLevel.ORT_DISABLE_ALL, true)] + [InlineData(GraphOptimizationLevel.ORT_DISABLE_ALL, false)] + [InlineData(GraphOptimizationLevel.ORT_ENABLE_EXTENDED, true)] + [InlineData(GraphOptimizationLevel.ORT_ENABLE_EXTENDED, false)] + private void CanRunInferenceOnAModelDotnetTensors(GraphOptimizationLevel graphOptimizationLevel, bool enableParallelExecution) + { + var model = TestDataLoader.LoadModelFromEmbeddedResource("squeezenet.onnx"); + + using (var cleanUp = new DisposableListTest()) + { + // Set the graph optimization level for this session. + SessionOptions options = new SessionOptions(); + cleanUp.Add(options); + options.GraphOptimizationLevel = graphOptimizationLevel; + + var session = new InferenceSession(model, options); + cleanUp.Add(session); + + using var runOptions = new RunOptions(); + var inputMeta = session.InputMetadata; + var outputMeta = session.OutputMetadata; + + float[] expectedOutput = TestDataLoader.LoadTensorFromEmbeddedResource("bench.expected_out"); + long[] expectedDimensions = { 1, 1000, 1, 1 }; // hardcoded for now for the test data + ReadOnlySpan expectedOutputDimensions = expectedDimensions; + + float[] inputData = TestDataLoader.LoadTensorFromEmbeddedResource("bench.in"); // this is the data for only one input tensor for this model + + using var inputOrtValues = new DisposableListTest>(session.InputMetadata.Count); + + foreach (var name in inputMeta.Keys) + { + Assert.Equal(typeof(float), inputMeta[name].ElementType); + Assert.True(inputMeta[name].IsTensor); + var tensor = SystemNumericsTensors.Tensor.Create(inputData, inputMeta[name].Dimensions.Select(x => (nint)x).ToArray()); + inputOrtValues.Add(new DisposableTestPair(name, OrtValue.CreateTensorValueFromSystemNumericsTensorObject(tensor))); + + } + + runOptions.LogId = "CsharpTest"; + runOptions.Terminate = false; // TODO: Test terminate = true, it currently crashes + runOptions.LogSeverityLevel = OrtLoggingLevel.ORT_LOGGING_LEVEL_ERROR; + // Run inference with named inputs and outputs created with in Run() + using (var results = session.Run(runOptions, inputOrtValues.Select(x => x.Key).ToList(), inputOrtValues.Select(x => x.Value).ToList(), new List(["softmaxout_1"]))) // results is an IDisposableReadOnlyCollection container + { + // validate the results + foreach (var r in results) + { + Assert.Single(results); + + ValidateRunResult(r, expectedOutput, expectedDimensions); + } + } + } + } + + [Fact] + public void InferenceSessionDisposedDotnetTensors() + { + var model = TestDataLoader.LoadModelFromEmbeddedResource("squeezenet.onnx"); + + // Set the graph optimization level for this session. 
+ using (SessionOptions options = new SessionOptions()) + { + options.ProfileOutputPathPrefix = "Ort_P_"; + options.EnableProfiling = true; + using (var session = new InferenceSession(model, options)) + { + var inputMeta = session.InputMetadata; + var container = new List(); + + float[] inputData = TestDataLoader.LoadTensorFromEmbeddedResource("bench.in"); // this is the data for only one input tensor for this model + + using (var runOptions = new RunOptions()) + using (var inputOrtValues = new DisposableListTest>(session.InputMetadata.Count)) + using (var outputOrtValues = new DisposableListTest>(session.OutputMetadata.Count)) + { + + foreach (var name in inputMeta.Keys) + { + Assert.Equal(typeof(float), inputMeta[name].ElementType); + Assert.True(inputMeta[name].IsTensor); + var tensor = SystemNumericsTensors.Tensor.Create(inputData, inputMeta[name].Dimensions.Select(x => (nint) x).ToArray()); + inputOrtValues.Add(new DisposableTestPair(name, OrtValue.CreateTensorValueFromSystemNumericsTensorObject(tensor))); + } + + // Run inference with named inputs and outputs created with in Run() + using (var results = session.Run(runOptions, inputOrtValues.Select(x => x.Key).ToList(), inputOrtValues.Select(x => x.Value).ToList(), new List(["softmaxout_1"]))) // results is an IDisposableReadOnlyCollection container + { + // validate the results + foreach (var r in results) + { + Assert.Single(results); + + float[] expectedOutput = TestDataLoader.LoadTensorFromEmbeddedResource("bench.expected_out"); + long[] expectedDimensions = { 1, 1000, 1, 1 }; // hardcoded for now for the test data + ValidateRunResult(r, expectedOutput, expectedDimensions); + } + } + } + + string profile_file = session.EndProfiling(); + + // Profile file should have the output path prefix in it + Assert.Contains("Ort_P_", profile_file); + } + } + } + + [Fact] + private void ThrowWrongOutputNameDotnetTensors() + { + var tuple = OpenSessionSqueezeNet(); + var session = tuple.Item1; + var inputData = tuple.Item2; + var inputTensor = tuple.Item3; + + using (var runOptions = new RunOptions()) + using (var inputOrtValues = new DisposableListTest>(session.InputMetadata.Count)) + using (var outputOrtValues = new DisposableListTest>(session.OutputMetadata.Count)) + { + var tensor = SystemNumericsTensors.Tensor.Create(inputData, Array.ConvertAll(inputTensor.Dimensions.ToArray(), x => (nint)x)); + + inputOrtValues.Add(new DisposableTestPair("data_0", OrtValue.CreateTensorValueFromSystemNumericsTensorObject(tensor))); + outputOrtValues.Add(new DisposableTestPair("bad_output_name", OrtValue.CreateTensorValueFromSystemNumericsTensorObject(tensor))); + + var ex = Assert.Throws(() => session.Run(runOptions, ["data_0"], [inputOrtValues[0].Value], ["bad_output_name"], [outputOrtValues[0].Value])); + Assert.Contains("Output name: 'bad_output_name' is not in the metadata", ex.Message); + } + + session.Dispose(); + } + + [Fact] + private void ThrowWrongOutputDimensionDotnetTensors() + { + var tuple = OpenSessionSqueezeNet(); + var session = tuple.Item1; + var inputData = tuple.Item2; + var inputTensor = tuple.Item3; + var outputTensor = SystemNumericsTensors.Tensor.Create([1, 1001, 1, 1]); + + using (var runOptions = new RunOptions()) + using (var inputOrtValues = new DisposableListTest>(session.InputMetadata.Count)) + using (var outputOrtValues = new DisposableListTest>(session.OutputMetadata.Count)) + { + var tensor = SystemNumericsTensors.Tensor.Create(inputData, Array.ConvertAll(inputTensor.Dimensions.ToArray(), x => (nint)x)); + + 
inputOrtValues.Add(new DisposableTestPair("data_0", OrtValue.CreateTensorValueFromSystemNumericsTensorObject(tensor))); + outputOrtValues.Add(new DisposableTestPair("softmaxout_1", OrtValue.CreateTensorValueFromSystemNumericsTensorObject(outputTensor))); + + var ex = Assert.Throws(() => session.Run(runOptions, ["data_0"], [inputOrtValues[0].Value], ["softmaxout_1"], [outputOrtValues[0].Value])); + } + + session.Dispose(); + } + + [Fact] + private void ThrowInconsistentPinnedOutputsDotnetTensors() + { + var tuple = OpenSessionSqueezeNet(); + using var cleanUp = new DisposableListTest(); + cleanUp.Add(tuple.Item1); + var session = tuple.Item1; + var inputData = tuple.Item2; + var inputTensor = tuple.Item3; + var outputTensor = SystemNumericsTensors.Tensor.Create([1, 1001, 1, 1], [4]); + + using (var runOptions = new RunOptions()) + using (var inputOrtValues = new DisposableListTest>(session.InputMetadata.Count)) + using (var outputOrtValues = new DisposableListTest>(session.OutputMetadata.Count)) + { + var tensor = SystemNumericsTensors.Tensor.Create(inputData, Array.ConvertAll(inputTensor.Dimensions.ToArray(), x => (nint)x)); + + inputOrtValues.Add(new DisposableTestPair("data_0", OrtValue.CreateTensorValueFromSystemNumericsTensorObject(tensor))); + outputOrtValues.Add(new DisposableTestPair("softmaxout_1", OrtValue.CreateTensorValueFromSystemNumericsTensorObject(outputTensor))); + OrtValue[] outputs = []; + var ex = Assert.Throws(() => session.Run(runOptions, ["data_0"], [inputOrtValues[0].Value], ["softmaxout_1"], outputs)); + Assert.StartsWith("Length of outputNames (1) must match that of outputValues (0).", ex.Message); + } + } +#pragma warning restore SYSLIB5001 // System.Numerics.Tensors is only in preview so we can continue receiving API feedback +#endif + + #if USE_CUDA [Fact(DisplayName = "TestCUDAProviderOptions")] private void TestCUDAProviderOptions() @@ -1416,6 +1608,25 @@ private void VerifyNativeMethodsExist() } } +#if NET8_0_OR_GREATER +#pragma warning disable SYSLIB5001 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed. + private void ValidateRunResultData(SystemNumericsTensors.Tensor resultTensor, float[] expectedOutput, int[] expectedDimensions) + { + Assert.Equal(expectedDimensions.Length, resultTensor.Rank); + + var resultDimensions = resultTensor.Lengths; + for (int i = 0; i < expectedDimensions.Length; i++) + { + Assert.Equal(expectedDimensions[i], resultDimensions[i]); + } + + var resultArray = resultTensor.ToArray(); + Assert.Equal(expectedOutput.Length, resultArray.Length); + Assert.Equal(expectedOutput, resultArray, new FloatComparer()); + } +#pragma warning restore SYSLIB5001 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed. +#endif + static string GetTestModelsDir() { // get build directory, append downloaded models location diff --git a/csharp/tools/linux_pack/LinuxPackNativeNuget.csproj b/csharp/tools/linux_pack/LinuxPackNativeNuget.csproj index 098078d2e3683..b814f99b05ae1 100644 --- a/csharp/tools/linux_pack/LinuxPackNativeNuget.csproj +++ b/csharp/tools/linux_pack/LinuxPackNativeNuget.csproj @@ -7,7 +7,7 @@ If you need a more sophisticated package for testing, you can run the production packaging pipeline against your branch and download the resulting nuget package from the build artifacts. 
--> - + netstandard2.0 $(OnnxRuntimeBuildDirectory)/NativeNuget.nuspec diff --git a/include/onnxruntime/core/graph/node_arg.h b/include/onnxruntime/core/graph/node_arg.h index 921bff59fb6d4..0ddf1a2b9d3de 100644 --- a/include/onnxruntime/core/graph/node_arg.h +++ b/include/onnxruntime/core/graph/node_arg.h @@ -3,7 +3,7 @@ #pragma once -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "core/graph/basic_types.h" #include "core/common/status.h" diff --git a/onnxruntime/core/graph/onnx_protobuf.h b/include/onnxruntime/core/graph/onnx_protobuf.h similarity index 100% rename from onnxruntime/core/graph/onnx_protobuf.h rename to include/onnxruntime/core/graph/onnx_protobuf.h diff --git a/java/src/main/java/ai/onnxruntime/OnnxRuntime.java b/java/src/main/java/ai/onnxruntime/OnnxRuntime.java index b80debdde47c4..c28c79f1e723e 100644 --- a/java/src/main/java/ai/onnxruntime/OnnxRuntime.java +++ b/java/src/main/java/ai/onnxruntime/OnnxRuntime.java @@ -76,6 +76,9 @@ final class OnnxRuntime { /** The short name of the ONNX runtime TensorRT provider library */ static final String ONNXRUNTIME_LIBRARY_TENSORRT_NAME = "onnxruntime_providers_tensorrt"; + /** The short name of the ONNX runtime QNN provider library */ + static final String ONNXRUNTIME_LIBRARY_QNN_NAME = "onnxruntime_providers_qnn"; + /** The OS & CPU architecture string */ private static final String OS_ARCH_STR = initOsArch(); @@ -159,8 +162,11 @@ static synchronized void init() throws IOException { // the ONNX Runtime native library will load it extractProviderLibrary(ONNXRUNTIME_LIBRARY_SHARED_NAME); - load(ONNXRUNTIME_LIBRARY_NAME); + if (!isAndroid()) { + load(ONNXRUNTIME_LIBRARY_NAME); + } load(ONNXRUNTIME_JNI_LIBRARY_NAME); + ortApiHandle = initialiseAPIBase(ORT_API_VERSION_14); if (ortApiHandle == 0L) { throw new IllegalStateException( @@ -252,6 +258,16 @@ static boolean extractTensorRT() { return extractProviderLibrary(ONNXRUNTIME_LIBRARY_TENSORRT_NAME); } + /** + * Extracts the QNN provider library from the classpath resources if present, or checks to see if + * the QNN provider library is in the directory specified by {@link #ONNXRUNTIME_NATIVE_PATH}. + * + * @return True if the QNN provider library is ready for loading, false otherwise. + */ + static boolean extractQNN() { + return extractProviderLibrary(ONNXRUNTIME_LIBRARY_QNN_NAME); + } + /** * Extracts a shared provider library from the classpath resources if present, or checks to see if * that library is in the directory specified by {@link #ONNXRUNTIME_NATIVE_PATH}. @@ -260,7 +276,7 @@ static boolean extractTensorRT() { * @return True if the library is ready for loading by ORT's native code, false otherwise. */ static synchronized boolean extractProviderLibrary(String libraryName) { - // Android does not need to extract library and it has no shared provider library + // Android does not need to extract provider libraries. 
if (isAndroid()) { return false; } @@ -312,7 +328,7 @@ static boolean isAndroid() { private static void load(String library) throws IOException { // On Android, we simply use System.loadLibrary if (isAndroid()) { - System.loadLibrary("onnxruntime4j_jni"); + System.loadLibrary(library); return; } diff --git a/java/src/main/java/ai/onnxruntime/OrtSession.java b/java/src/main/java/ai/onnxruntime/OrtSession.java index 32dc9d9f84aaa..bd988e2bb7468 100644 --- a/java/src/main/java/ai/onnxruntime/OrtSession.java +++ b/java/src/main/java/ai/onnxruntime/OrtSession.java @@ -1320,6 +1320,10 @@ public void addXnnpack(Map providerOptions) throws OrtException */ public void addQnn(Map providerOptions) throws OrtException { String qnnProviderName = "QNN"; + + // QNN can either be built as a shared or static library. extractQNN() will extract the + // (lib)onnxruntime_providers_qnn(.so/.dll) from classpath resources if present. + OnnxRuntime.extractQNN(); addExecutionProvider(qnnProviderName, providerOptions); } diff --git a/js/web/docs/webnn-operators.md b/js/web/docs/webnn-operators.md index 5ad2311ef80de..a6a2ecdf6f467 100644 --- a/js/web/docs/webnn-operators.md +++ b/js/web/docs/webnn-operators.md @@ -6,108 +6,110 @@ operators and the supported opset domain/versions in **WebNN EP** by ONNX Runtim (**Note**: ONNX Runtime only *guarantees* support for models stamped with opset version 7 or above for opset domain 'ai.onnx'.) -[WebNN API](https://webmachinelearning.github.io/webnn) provides two device types `cpu` and `gpu` to leverage different on-device accelerators. WebNN API implementation in Chromium uses TFLite XNNPack delegate backend for `cpu` device type and DirectML backend for `gpu` device type. [The op support status](https://webmachinelearning.github.io/webnn-status/) behind these two backends is inconsistent. +The [WebNN API](https://webmachinelearning.github.io/webnn) is available in the latest versions of Chrome and Edge on Windows, +Linux, macOS, Android, and ChromeOS behind an "Enables WebNN API" flag. The operator support status may vary across these +platforms. Check the [WebNN status](https://webmachinelearning.github.io/webnn-status/) for the latest implementation details. -| Operator | Opset | WebNN API | WebNN CPU | WebNN GPU | Comments | -|:------:|:------:|:------:|:-:|:-:|:------| -| Abs | ai.onnx(7-12, 13+) | abs | ✓ | ✓ | | -| Add | ai.onnx(7-12, 13, 14+) | add | ✓ | ✓ | | -| And | ai.onnx(7+) | logicalAnd | ✗ | ✓ | | -| ArgMax | ai.onnx(7-10, 11, 12, 13+) | argMax | ✓ | ✓ | | -| ArgMin | ai.onnx(7-10, 11, 12, 13+) | argMin | ✓ | ✓ | | -| AveragePool | ai.onnx(7-9, 10, 11, 12-18, 19+) | averagePool2d | ✓ | ✓ | Only supports 4-D input, 2-D 'kernel_shape', 'count_include_pad' value is 0 | -| BatchNormalization | ai.onnx(7-8, 9-13, 14, 15+) | batchNormalization | ✓ | ✓ | Only supports 'training_mode' value is 0, one output | -| Cast | ai.onnx(7-8, 9-12, 13-18, 19-20, 21+) | cast | ✓ | ✓ | WebNN CPU backend doesn't support casting to uint64 data type | -| Ceil | ai.onnx(7-12, 13+) | ceil | ✓ | ✓ | | -| Clip | ai.onnx(7-10, 11, 12, 13+) | clamp | ✓ | ✓ | WebNN CPU backend only supports 3 specific ranges: [0.0, infinity], [-1.0, 1.0], [0.0, 6.0] (Chromium issue: https://issues.chromium.org/issues/326156496) | -| Concat | ai.onnx(7-10, 11-12, 13+) | concat | ✓ | ✓ | | -| Conv | ai.onnx(7-10, 11+) | conv2d | ✓ | ✓ | Only supports 3-D or 4-D input and 'W' (weight) | -| ConvTranspose | ai.onnx(7-10, 11+) | convTranspose2d | ✓ | ✓ | Only supports 3-D or 4-D input and 'W' (weight). 
WebNN CPU backend only supports default dilations and group | -| Cos | ai.onnx(7+) | cos | ✓ | ✓ | | -| CumSum | ai.onnx(11-13, 14+) | cumulativeSum | ✓ | ✓ | 'axis' input should be a constant | -| Div | ai.onnx(7-12, 13, 14+) | div | ✓ | ✓ | | -| DequantizeLinear | ai.onnx(10-12, 13-18, 19-20, 21-22, 23+) | dequantizeLinear | ✓ | ✓ | The shape of x_scale should be a subsample of the shape of input | -| Dropout | ai.onnx(7-9, 10-11, 12, 13-21, 22+) | identity | ✓ | ✓ | Only supports test mode | -| Einsum | ai.onnx(12+) | reshape, transpose, matmul, reduceSum, mul, triangular | ✓ | ✓ | | -| Elu | ai.onnx(7+) | elu | ✓ | ✓ | WebNN CPU backend only supports 'alpha' value is 1.0 | -| Equal | ai.onnx(7-10, 11-12, 13-18, 19+) | equal | ✓ | ✓ | | -| Erf | ai.onnx(7-9, 10-12, 13+) | erf | ✓ | ✓ | | -| Exp | ai.onnx(7-12, 13+) | exp | ✓ | ✓ | | -| Expand | ai.onnx(8-12, 13+) | expand | ✓ | ✓ | 'shape' input should be a constant | -| Flatten | ai.onnx(7-8, 9-10, 11-12, 13-20, 21+) | reshape | ✓ | ✓ | | -| Floor | ai.onnx(7-12, 13+) | floor | ✓ | ✓ | | -| Gather | ai.onnx(7-10, 11-12, 13+) | gather | ✓ | ✓ | | -| GatherElements | ai.onnx(11-12, 13+) | gatherElements | ✗ | ✓ | | -| GatherND | ai.onnx(11, 12, 13+) | gatherND | ✓ | ✓ | Only supports 'batch_dims' == 0 | -| Gelu | ai.onnx(20+) | gelu | ✓ | ✓ | | -| Gemm | ai.onnx(7-8, 9-10, 11-12, 13+) | gemm | ✓ | ✓ | Only supports 1-D 'C' input | -| GlobalAveragePool | ai.onnx(7+) | averagePool2d | ✓ | ✓ | Only supports 4-D input | -| GlobalMaxPool | ai.onnx(7+) | maxPool2d | ✓ | ✓ | Only supports 4-D input | -| GlobalLpPool| ai.onnx(7+) | l2Pool2d | ✗ | ✓ | Only supports 4-D input, 'p' value is 2 | -| Greater | ai.onnx(7-8, 9-12, 13+) | greater | ✓ | ✓ | | -| GreaterOrEqual | ai.onnx(12-15, 16+) | greaterOrEqual | ✓ | ✓ | | -| GRU | ai.onnx(7-13, 14-21, 22+) | gru | ✓ | ✓ | Only supports 'layout' == 0. 'clip' is not supported. The activation functions in 'activations' must be one of 'Relu', 'Tanh', 'Sigmoid'. Forward and backward activations must be the same if bidirectional. 'sequence_lens' if present should be constant with values equal to the first dimension length of input 'X' | -| HardSigmoid | ai.onnx(7+) | hardSigmoid | ✓ | ✓ | | -| HardSwish | ai.onnx(14+) | hardSwish | ✓ | ✓ | | -| Identity | ai.onnx(7-13, 14-15, 16-18, 19-20, 21+) | identity | ✓ | ✓ | | -| InstanceNormalization | ai.onnx(7+) | instanceNormalization | ✓ | ✓ | | -| LayerNormalization | ai.onnx(7-16, 17+) | layerNormalization | ✓ | ✓ | | -| LeakyRelu | ai.onnx(7-15, 16+) | leakyRelu | ✓ | ✓ | | -| Less | ai.onnx(7-8, 9-12, 13+) | lesser | ✓ | ✓ | | -| LessOrEqual | ai.onnx(12-15, 16+) | lesserOrEqual | ✓ | ✓ | | -| Log | ai.onnx(7-12, 13+) | log | ✓ | ✓ | | -| LpPool | ai.onnx(7-10, 11-17, 18+) | l2Pool2d | ✗ | ✓ | Only supports 4-D input, 2-D 'kernel_shape', 'p' value is 2 | -| LRN | ai.onnx(7-12, 13+) | pad, averagePool2d, transpose, add, mul, pow, div | ✓ | ✓ | | -| LSTM | ai.onnx(7-13, 14-21, 22+) | lstm | ✓ | ✓ | Only supports 'layout' == 0, 'input_forget' == 0. 'clip' is not supported. The activation functions in 'activations' must be one of 'Relu', 'Tanh', 'Sigmoid'. Forward and backward activations must be the same if bidirectional. 
'sequence_lens' if present should be constant with values equal to the first dimension length of input 'X' | -| MatMul | ai.onnx(7-8, 9-12, 13+) | matmul | ✓ | ✓ | | -| Max | ai.onnx(7, 8-11, 12, 13+) | max | ✓ | ✓ | | -| MaxPool | ai.onnx(7, 8-9, 10, 11, 12+) | maxPool2d | ✓ | ✓ | Only supports 4-D input, 2-D 'kernel_shape', 'storage_order' != 1, one output | -| Min | ai.onnx(7, 8-11, 12, 13+) | min | ✓ | ✓ | | -| Mul | ai.onnx(7-12, 13, 14+) | mul | ✓ | ✓ | | -| Neg | ai.onnx(7-12, 13+) | neg | ✓ | ✓ | | -| Not | ai.onnx(7+) | logicalNot | ✓ | ✓ | | -| Or | ai.onnx(7+) | logicalOr | ✗ | ✓ | | -| Pad | ai.onnx(7-10, 11-12, 13-17, 18, 19-20, 21+) | pad | ✓ | ✓ | modes == 'wrap' is not supported | -| Pow | ai.onnx(7-11, 12, 13-14, 15+) | pow | ✓ | ✓ | | -| PRelu | ai.onnx(7-8, 9-15, 16+) | prelu | ✓ | ✓ | WebNN CPU backend restricts the last dimension of input and slope to be same (Chromium issue: https://issues.chromium.org/issues/335517470) | -| QuantizeLinear | ai.onnx(10-12, 13-18, 19-20, 21-22, 23+) | quantizeLinear | ✓ | ✓ | The shape of x_scale should be a subsample of the shape of input | -| Reciprocal | ai.onnx(7-12, 13+) | reciprocal | ✓ | ✓ | | -| ReduceL1 | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceL1 | ✓ | ✓ | Input 'axes' if present should be a constant | -| ReduceL2 | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceL2 | ✓ | ✓ | Input 'axes' if present should be a constant | -| ReduceLogSum| ai.onnx(7-10, 11-12, 13-17, 18+) | reduceLogSum| ✓ | ✓ | Input 'axes' if present should be a constant | -| ReduceLogSumExp | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceLogSumExp | ✓ | ✓ | Input 'axes' if present should be a constant | -| ReduceMax | ai.onnx(7-10, 11, 12, 13-17, 18-19, 20+) | reduceMax | ✓ | ✓ | Input 'axes' if present should be a constant | -| ReduceMean | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceMean | ✓ | ✓ | Input 'axes' if present should be a constant | -| ReduceMin | ai.onnx(7-10, 11, 12, 13-17, 18-19, 20+) | reduceMin | ✓ | ✓ | Input 'axes' if present should be a constant | -| ReduceProd | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceProduct | ✓ | ✓ | Input 'axes' if present should be a constant | -| ReduceSum | ai.onnx(7-10, 11-12, 13+) | reduceSum | ✓ | ✓ | Input 'axes' if present should be a constant | -| ReduceSumSquare | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceSumSquare | ✓ | ✓ | Input 'axes' if present should be a constant | -| Relu | ai.onnx(7-12, 13, 14+) | relu | ✓ | ✓ | | -| Reshape | ai.onnx(7-12, 13, 14-18, 19-20, 21+) | reshape | ✓ | ✓ | Input 'shape' should be a constant, 0 dimension value in 'shape' is not supported | -| Resize | ai.onnx(11-12, 13-17, 18, 19+) | resample2d | ✓ | ✓ | Only supports 4-D input, antialias == 0, exclude_outside == 0, keep_aspect_ratio_policy == 'stretch', 'linear' and 'nearest' modes, input 'scales' and 'sizes' if present must be a constant | -| RotaryEmbedding | com.microsoft(1+) | add, concat, gather, mul, reshape, split | ✓ | ✓ | | -| ScatterElements | ai.onnx(11-12, 13-15, 16-17, 18+) | scatterElements | ✗ | ✓ | Only supports 'reduction' == 'none' | -| ScatterND | ai.onnx(11-12, 13-15, 16-17, 18+) | scatterND | ✗ | ✓ | Only supports 'reduction' == 'none' | -| Shape | ai.onnx(7-12, 13-14, 15-18, 19-20, 21+) | slice | ✓ | ✓ | | -| SimplifiedLayerNormalization | ai.onnx(1+) | pow, reduceMean, add, sqrt, div, mul | ✓ | ✓ | | -| Sigmoid | ai.onnx(7-12, 13+) | sigmoid | ✓ | ✓ | | -| Sign | ai.onnx(9-12, 13+) | sign | ✓ | ✓ | | -| SkipSimplifiedLayerNormalization | com.microsoft(1+) | pow, reduceMean, add, sqrt, div, mul | ✓ | ✓ | | -| 
Softplus | ai.onnx(7+) | softplus | ✓ | ✓ | | -| Softsign | ai.onnx(7+) | softsign | ✓ | ✓ | | -| Sin | ai.onnx(7+) | sin | ✓ | ✓ | | -| Slice | ai.onnx(7-9, 10, 11-12, 13+) | slice, reverse | ✓ | ✓ | Input 'starts', 'ends', 'axes', and 'steps' if present must be a constant | -| Softmax | ai.onnx(7-10, 11-12, 13+) | softmax | ✓ | ✓ | | -| Split | ai.onnx(7-10, 11-12, 13-17, 18+) | split | ✓ | ✓ | Input 'split' if present should be a constant | -| Sqrt | ai.onnx(7-12, 13+) | sqrt | ✓ | ✓ | | -| Squeeze | ai.onnx(7-10, 11-12, 13-20, 21+) | reshape | ✓ | ✓ | Input 'axes' if present should be a constant | -| Sub | ai.onnx(7-12, 13, 14+) | sub | ✓ | ✓ | | -| Tan | ai.onnx(7+) | tan | ✓ | ✓ | | -| Tanh | ai.onnx(7-12, 13+) | tanh | ✓ | ✓ | | -| Tile | ai.onnx(7-12, 13+) | tile | ✗ | ✓ | Input 'repeats' should be a constant | -| Transpose | ai.onnx(7-12, 13-20, 21+) | transpose | ✓ | ✓ | | -| Trilu | ai.onnx(14+) | triangular | ✓ | ✓ | Input 'k' (option 'diagonal' for WebNN) if present should be a constant | -| Unsqueeze | ai.onnx(7-10, 11-12, 13-20, 21+) | reshape | ✓ | ✓ | | -| Where | ai.onnx(7-8, 9-15, 16+) | where | ✓ | ✓ | | -| Xor | ai.onnx(7+) | logicalXor | ✗ | ✓ | | +| Operator | Opset | WebNN API | Comments | +|:------:|:------:|:------:|:------| +| Abs | ai.onnx(7-12, 13+) | abs | | +| Add | ai.onnx(7-12, 13, 14+) | add | | +| And | ai.onnx(7+) | logicalAnd | | +| ArgMax | ai.onnx(7-10, 11, 12, 13+) | argMax | | +| ArgMin | ai.onnx(7-10, 11, 12, 13+) | argMin | | +| AveragePool | ai.onnx(7-9, 10, 11, 12-18, 19+) | averagePool2d | Only supports 4-D input, 2-D 'kernel_shape', 'count_include_pad' value is 0 | +| BatchNormalization | ai.onnx(7-8, 9-13, 14, 15+) | batchNormalization | Only supports 'training_mode' value is 0, one output | +| Cast | ai.onnx(7-8, 9-12, 13-18, 19-20, 21+) | cast | | +| Ceil | ai.onnx(7-12, 13+) | ceil | | +| Clip | ai.onnx(7-10, 11, 12, 13+) | clamp | | +| Concat | ai.onnx(7-10, 11-12, 13+) | concat | | +| Conv | ai.onnx(7-10, 11+) | conv2d | Only supports 3-D or 4-D input and 'W' (weight) | +| ConvTranspose | ai.onnx(7-10, 11+) | convTranspose2d | Only supports 3-D or 4-D input and 'W' (weight) | +| Cos | ai.onnx(7+) | cos | | +| CumSum | ai.onnx(11-13, 14+) | cumulativeSum | 'axis' input should be a constant | +| Div | ai.onnx(7-12, 13, 14+) | div | | +| DequantizeLinear | ai.onnx(10-12, 13-18, 19-20, 21-22, 23+) | dequantizeLinear | The shape of x_scale should be a subsample of the shape of input | +| Dropout | ai.onnx(7-9, 10-11, 12, 13-21, 22+) | identity | Only supports test mode | +| Einsum | ai.onnx(12+) | reshape, transpose, matmul, reduceSum, mul, triangular | | +| Elu | ai.onnx(7+) | elu | | +| Equal | ai.onnx(7-10, 11-12, 13-18, 19+) | equal | | +| Erf | ai.onnx(7-9, 10-12, 13+) | erf | | +| Exp | ai.onnx(7-12, 13+) | exp | | +| Expand | ai.onnx(8-12, 13+) | expand | 'shape' input should be a constant | +| Flatten | ai.onnx(7-8, 9-10, 11-12, 13-20, 21+) | reshape | | +| Floor | ai.onnx(7-12, 13+) | floor | | +| Gather | ai.onnx(7-10, 11-12, 13+) | gather | | +| GatherElements | ai.onnx(11-12, 13+) | gatherElements | | +| GatherND | ai.onnx(11, 12, 13+) | gatherND | Only supports 'batch_dims' == 0 | +| Gelu | ai.onnx(20+) | gelu | | +| Gemm | ai.onnx(7-8, 9-10, 11-12, 13+) | gemm | Only supports 1-D 'C' input | +| GlobalAveragePool | ai.onnx(7+) | averagePool2d | Only supports 4-D input | +| GlobalMaxPool | ai.onnx(7+) | maxPool2d | Only supports 4-D input | +| GlobalLpPool| ai.onnx(7+) | l2Pool2d | Only supports 4-D input, 'p' value is 2 | +| 
Greater | ai.onnx(7-8, 9-12, 13+) | greater | | +| GreaterOrEqual | ai.onnx(12-15, 16+) | greaterOrEqual | | +| GRU | ai.onnx(7-13, 14-21, 22+) | gru | Only supports 'layout' == 0. 'clip' is not supported. The activation functions in 'activations' must be one of 'Relu', 'Tanh', 'Sigmoid'. Forward and backward activations must be the same if bidirectional. 'sequence_lens' if present should be constant with values equal to the first dimension length of input 'X' | +| HardSigmoid | ai.onnx(7+) | hardSigmoid | | +| HardSwish | ai.onnx(14+) | hardSwish | | +| Identity | ai.onnx(7-13, 14-15, 16-18, 19-20, 21+) | identity | | +| InstanceNormalization | ai.onnx(7+) | instanceNormalization | | +| LayerNormalization | ai.onnx(7-16, 17+) | layerNormalization | | +| LeakyRelu | ai.onnx(7-15, 16+) | leakyRelu | | +| Less | ai.onnx(7-8, 9-12, 13+) | lesser | | +| LessOrEqual | ai.onnx(12-15, 16+) | lesserOrEqual | | +| Log | ai.onnx(7-12, 13+) | log | | +| LpPool | ai.onnx(7-10, 11-17, 18+) | l2Pool2d | Only supports 4-D input, 2-D 'kernel_shape', 'p' value is 2 | +| LRN | ai.onnx(7-12, 13+) | pad, averagePool2d, transpose, add, mul, pow, div | | +| LSTM | ai.onnx(7-13, 14-21, 22+) | lstm | Only supports 'layout' == 0, 'input_forget' == 0. 'clip' is not supported. The activation functions in 'activations' must be one of 'Relu', 'Tanh', 'Sigmoid'. Forward and backward activations must be the same if bidirectional. 'sequence_lens' if present should be constant with values equal to the first dimension length of input 'X' | +| MatMul | ai.onnx(7-8, 9-12, 13+) | matmul | | +| Max | ai.onnx(7, 8-11, 12, 13+) | max | | +| MaxPool | ai.onnx(7, 8-9, 10, 11, 12+) | maxPool2d | Only supports 4-D input, 2-D 'kernel_shape', 'storage_order' != 1, one output | +| Min | ai.onnx(7, 8-11, 12, 13+) | min | | +| Mul | ai.onnx(7-12, 13, 14+) | mul | | +| Neg | ai.onnx(7-12, 13+) | neg | | +| Not | ai.onnx(7+) | logicalNot | | +| Or | ai.onnx(7+) | logicalOr | | +| Pad | ai.onnx(7-10, 11-12, 13-17, 18, 19-20, 21+) | pad | modes == 'wrap' is not supported | +| Pow | ai.onnx(7-11, 12, 13-14, 15+) | pow | | +| PRelu | ai.onnx(7-8, 9-15, 16+) | prelu | | +| QuantizeLinear | ai.onnx(10-12, 13-18, 19-20, 21-22, 23+) | quantizeLinear | The shape of x_scale should be a subsample of the shape of input | +| Reciprocal | ai.onnx(7-12, 13+) | reciprocal | | +| ReduceL1 | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceL1 | Input 'axes' if present should be a constant | +| ReduceL2 | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceL2 | Input 'axes' if present should be a constant | +| ReduceLogSum| ai.onnx(7-10, 11-12, 13-17, 18+) | reduceLogSum | Input 'axes' if present should be a constant | +| ReduceLogSumExp | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceLogSumExp | Input 'axes' if present should be a constant | +| ReduceMax | ai.onnx(7-10, 11, 12, 13-17, 18-19, 20+) | reduceMax | Input 'axes' if present should be a constant | +| ReduceMean | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceMean | Input 'axes' if present should be a constant | +| ReduceMin | ai.onnx(7-10, 11, 12, 13-17, 18-19, 20+) | reduceMin | Input 'axes' if present should be a constant | +| ReduceProd | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceProduct | Input 'axes' if present should be a constant | +| ReduceSum | ai.onnx(7-10, 11-12, 13+) | reduceSum | Input 'axes' if present should be a constant | +| ReduceSumSquare | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceSumSquare | Input 'axes' if present should be a constant | +| Relu | ai.onnx(7-12, 13, 14+) | relu | | +| Reshape | ai.onnx(7-12, 
13, 14-18, 19-20, 21+) | reshape | Input 'shape' should be a constant, 0 dimension value in 'shape' is not supported | +| Resize | ai.onnx(11-12, 13-17, 18, 19+) | resample2d | Only supports 4-D input, antialias == 0, exclude_outside == 0, keep_aspect_ratio_policy == 'stretch', 'linear' and 'nearest' modes, input 'scales' and 'sizes' if present must be a constant | +| RotaryEmbedding | com.microsoft(1+) | add, concat, gather, mul, reshape, split | | +| ScatterElements | ai.onnx(11-12, 13-15, 16-17, 18+) | scatterElements | Only supports 'reduction' == 'none' | +| ScatterND | ai.onnx(11-12, 13-15, 16-17, 18+) | scatterND | Only supports 'reduction' == 'none' | +| Shape | ai.onnx(7-12, 13-14, 15-18, 19-20, 21+) | slice | | +| SimplifiedLayerNormalization | ai.onnx(1+) | pow, reduceMean, add, sqrt, div, mul | | +| Sigmoid | ai.onnx(7-12, 13+) | sigmoid | | +| Sign | ai.onnx(9-12, 13+) | sign | | +| SkipSimplifiedLayerNormalization | com.microsoft(1+) | pow, reduceMean, add, sqrt, div, mul | | +| Softplus | ai.onnx(7+) | softplus | | +| Softsign | ai.onnx(7+) | softsign | | +| Sin | ai.onnx(7+) | sin | | +| Slice | ai.onnx(7-9, 10, 11-12, 13+) | slice, reverse | Input 'starts', 'ends', 'axes', and 'steps' if present must be a constant | +| Softmax | ai.onnx(7-10, 11-12, 13+) | softmax | | +| Split | ai.onnx(7-10, 11-12, 13-17, 18+) | split | Input 'split' if present should be a constant | +| Sqrt | ai.onnx(7-12, 13+) | sqrt | | +| Squeeze | ai.onnx(7-10, 11-12, 13-20, 21+) | reshape | Input 'axes' if present should be a constant | +| Sub | ai.onnx(7-12, 13, 14+) | sub | | +| Tan | ai.onnx(7+) | tan | | +| Tanh | ai.onnx(7-12, 13+) | tanh | | +| Tile | ai.onnx(7-12, 13+) | tile | Input 'repeats' should be a constant | +| Transpose | ai.onnx(7-12, 13-20, 21+) | transpose | | +| Trilu | ai.onnx(14+) | triangular | Input 'k' (option 'diagonal' for WebNN) if present should be a constant | +| Unsqueeze | ai.onnx(7-10, 11-12, 13-20, 21+) | reshape | | +| Where | ai.onnx(7-8, 9-15, 16+) | where | | +| Xor | ai.onnx(7+) | logicalXor | | diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts index 0aa3ad6c4c267..097e2552569c8 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts @@ -46,6 +46,11 @@ export const createConvTranspose2DProgramInfo = ( const inputChannelsPerGroup = wShape[2] / group; const outputChannelsPerGroup = wShape[3]; const aComponents = isChannelsLast ? getMaxComponents(inputChannelsPerGroup) : 1; + const packInputAs4 = isChannelsLast && outputChannelsPerGroup === 1; + const inputChannelsPerGroupInt = packInputAs4 + ? Math.floor(inputChannelsPerGroup / 4) * 4 + : Math.floor(inputChannelsPerGroup / aComponents) * aComponents; + const inputChannelsRemainder = inputChannelsPerGroup - inputChannelsPerGroupInt; const components = isChannelsLast ? getMaxComponents(outputChannelsPerGroup) : 1; const bComponents = isChannelsLast ? (outputChannelsPerGroup === 1 ? 
aComponents : components) : 1; const outputSize = ShapeUtil.size(outputShape) / components; @@ -78,7 +83,7 @@ export const createConvTranspose2DProgramInfo = ( { type: DataType.uint32, data: dilations }, { type: DataType.uint32, data: effectiveFilterDims }, { type: DataType.int32, data: pads }, - { type: DataType.uint32, data: inputChannelsPerGroup }, + { type: DataType.uint32, data: inputChannelsPerGroupInt }, { type: DataType.uint32, data: outputChannelsPerGroup }, ...createTensorShapeVariables(inputs[0].dims, inputs[1].dims), ]; @@ -114,16 +119,40 @@ export const createConvTranspose2DProgramInfo = ( const calculateResult = (): string => { let calcStr = ''; - if (aComponents === 1) { - calcStr += ` - let w_offset = ${w.indicesToOffset(`${w.type.indices}(u32(wRPerm), u32(wCPerm), inputChannel, wOutChannel)`)}; - let wValue = ${w.getByOffset(`w_offset / ${bComponents}`)}; - dotProd = dotProd + xValue * wValue;`; + if (packInputAs4) { + if (aComponents === 4) { + calcStr += ` + let xValue = ${dy.getByOffset('x_offset')}; + let wValue = ${w.getByOffset('w_offset')}; + dotProd = dotProd + dot(xValue, wValue); + x_offset += 1u; + w_offset += 1u;`; + } else if (aComponents === 2) { + calcStr += ` + dotProd = dotProd + dot(vec4<${dataType}>(${dy.getByOffset('x_offset')}, ${dy.getByOffset('x_offset + 1u')}), vec4<${dataType}>(${w.getByOffset('w_offset')}, ${w.getByOffset('w_offset + 1u')})); + x_offset += 2u; + w_offset += 2u;`; + } else if (aComponents === 1) { + calcStr += ` + dotProd = dotProd + dot(vec4<${dataType}>(${dy.getByOffset('x_offset')}, ${dy.getByOffset('x_offset + 1u')}, ${dy.getByOffset('x_offset + 2u')}, ${dy.getByOffset('x_offset + 3u')}), vec4<${dataType}>(${w.getByOffset('w_offset')}, ${w.getByOffset('w_offset + 1u')}, ${w.getByOffset('w_offset + 2u')}, ${w.getByOffset('w_offset + 3u')})); + x_offset += 4u; + w_offset += 4u;`; + } } else { - if (outputChannelsPerGroup === 1) { + calcStr += ` + let xValue = ${ + isChannelsLast + ? 
dy.getByOffset( + `${dy.indicesToOffset(`${dy.type.indices}(batch, idyR, idyC, inputChannel)`)} / ${aComponents}`, + ) + : dy.get('batch', 'inputChannel', 'idyR', 'idyC') + }; + `; + if (aComponents === 1) { calcStr += ` - let wValue = ${w.getByOffset(`${w.indicesToOffset(`${w.type.indices}(u32(wRPerm), u32(wCPerm), inputChannel, wOutChannel)`)} / ${bComponents}`)}; - dotProd = dotProd + dot(xValue, wValue);`; + let w_offset = ${w.indicesToOffset(`${w.type.indices}(u32(wRPerm), u32(wCPerm), inputChannel, wOutChannel)`)}; + let wValue = ${w.getByOffset(`w_offset / ${bComponents}`)}; + dotProd = dotProd + xValue * wValue;`; } else { for (let c = 0; c < aComponents; c++) { calcStr += ` @@ -134,6 +163,32 @@ export const createConvTranspose2DProgramInfo = ( } return calcStr; }; + const calculateRemainder = (): string => { + if (inputChannelsRemainder === 0) { + return ''; + } + if (!packInputAs4) { + throw new Error(`packInputAs4 ${packInputAs4} is not true.`); + } + let calcStr = ''; + if (aComponents === 1) { + calcStr += 'dotProd = dotProd'; + for (let i = 0; i < inputChannelsRemainder; i++) { + calcStr += ` + + ${dy.getByOffset(`x_offset + ${i}`)} * ${w.getByOffset(`w_offset + ${i}`)}`; + } + calcStr += ';'; + } else if (aComponents === 2) { + if (inputChannelsRemainder !== 2) { + throw new Error(`Invalid inputChannelsRemainder ${inputChannelsRemainder}.`); + } + calcStr += ` + let xValue = ${dy.getByOffset('x_offset')}; + let wValue = ${w.getByOffset('w_offset')}; + dotProd = dotProd + dot(xValue, wValue);`; + } + return calcStr; + }; const codeSnippet = ` let outputIndices = ${output.offsetToIndices(`global_idx * ${components}`)}; let batch = ${output.indicesGet('outputIndices', 0)}; @@ -148,7 +203,12 @@ export const createConvTranspose2DProgramInfo = ( // Convolve dy(?, ?, d2) with w(:, :, d1, d2) to compute dx(xR, xC, d1). // ? = to be determined. : = across all values in that axis. var dotProd = ${output.type.value}(0.0); - for (var wR: u32 = 0; wR < uniforms.effective_filter_dims.x; wR = wR + 1) { + var wR: u32 = 0; + if (uniforms.dilations.x == 1) { + // Minimum wR >= 0 that satisfies (dyRCorner + wR) % (uniforms.strides.x) == 0 + wR = u32(((dyRCorner + i32(uniforms.strides.x) - 1) / i32(uniforms.strides.x)) * i32(uniforms.strides.x) - dyRCorner); + } + for (; wR < uniforms.effective_filter_dims.x; wR = wR + 1) { if (wR % uniforms.dilations.x != 0) { continue; } @@ -158,10 +218,13 @@ export const createConvTranspose2DProgramInfo = ( wRPerm < 0) { continue; } - wR = wR + uniforms.strides[0] - 1; let idyR: u32 = u32(dyR); - - for (var wC: u32 = 0; wC < uniforms.effective_filter_dims.y; wC = wC + 1) { + var wC: u32 = 0; + if (uniforms.dilations.y == 1) { + // Minimum wC >= 0 that satisfies (dyCCorner + wC) % (uniforms.strides.y) == 0 + wC = u32(((dyCCorner + i32(uniforms.strides.y) - 1) / i32(uniforms.strides.y)) * i32(uniforms.strides.y) - dyCCorner); + } + for (; wC < uniforms.effective_filter_dims.y; wC = wC + 1) { if (wC % uniforms.dilations.y != 0) { continue; } @@ -171,21 +234,24 @@ export const createConvTranspose2DProgramInfo = ( fract(dyC) > 0.0 || wCPerm < 0) { continue; } - wC = wC + uniforms.strides.y - 1; let idyC: u32 = u32(dyC); var inputChannel = groupId * uniforms.input_channels_per_group; - for (var d2: u32 = 0; d2 < uniforms.input_channels_per_group; d2 = d2 + ${aComponents}) { - let xValue = ${ - isChannelsLast - ? 
dy.getByOffset( - `${dy.indicesToOffset(`${dy.type.indices}(batch, idyR, idyC, inputChannel)`)} / ${aComponents}`, - ) - : dy.get('batch', 'inputChannel', 'idyR', 'idyC') - }; + ${ + packInputAs4 + ? ` + var x_offset = ${dy.indicesToOffset(`${dy.type.indices}(batch, idyR, idyC, inputChannel)`)} / ${aComponents}; + var w_offset = ${w.indicesToOffset(`${w.type.indices}(wRPerm, wCPerm, inputChannel, wOutChannel)`)} / ${bComponents}; + ` + : '' + } + for (var d2: u32 = 0; d2 < uniforms.input_channels_per_group; d2 = d2 + ${packInputAs4 ? 4 : aComponents}) { ${calculateResult()} - inputChannel = inputChannel + ${aComponents}; + inputChannel = inputChannel + ${packInputAs4 ? 4 : aComponents}; } + ${calculateRemainder()} + wC = wC + uniforms.strides.y - 1; } + wR = wR + uniforms.strides[0] - 1; } let value = dotProd${hasBias ? ` + bias[d1 / ${components}]` : ''}; ${output.setByOffset('global_idx', 'value')}; @@ -201,7 +267,7 @@ export const createConvTranspose2DProgramInfo = ( return { name: 'ConvTranspose2D', shaderCache: { - hint: `${attributes.cacheKey};${aComponents}${bComponents}${components}${outputChannelsPerGroup === 1}`, + hint: `${attributes.cacheKey};${aComponents}${bComponents}${components}${outputChannelsPerGroup === 1}${inputChannelsRemainder}`, inputDependencies, }, getRunData: () => ({ diff --git a/js/web/test/data/ops/conv-transpose.jsonc b/js/web/test/data/ops/conv-transpose.jsonc index f827601b3a89c..a6a799dccee86 100644 --- a/js/web/test/data/ops/conv-transpose.jsonc +++ b/js/web/test/data/ops/conv-transpose.jsonc @@ -458,6 +458,152 @@ } ] }, + { + "name": "ConvTranspose with output channels = 1", + "operator": "ConvTranspose", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [ + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "strides", "data": [2, 2], "type": "ints" } + ], + "cases": [ + { + "name": "inChannels = 5", + "inputs": [ + { + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45 + ], + "dims": [1, 5, 3, 3], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8], + "dims": [5, 1, 2, 2], + "type": "float32" + }, + { + "data": [2], + "dims": [1], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 437, 532, 458, 558, 479, 584, 627, 722, 658, 758, 689, 794, 500, 610, 521, 636, 542, 662, 720, 830, 751, + 866, 782, 902, 563, 688, 584, 714, 605, 740, 813, 938, 844, 974, 875, 1010 + ], + "dims": [1, 1, 6, 6], + "type": "float32" + } + ] + }, + { + "name": "inChannels = 6", + "inputs": [ + { + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 1, 2, 3, 4, 5, 6, 7, 8, 9 + ], + "dims": [1, 6, 3, 3], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4], + "dims": [6, 1, 2, 2], + "type": "float32" + }, + { + "data": [2], + "dims": [1], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 438, 534, 460, 562, 482, 590, 630, 726, 664, 766, 698, 806, 504, 618, 526, 646, 548, 674, 732, 846, 766, + 886, 800, 926, 570, 702, 592, 730, 614, 758, 834, 966, 868, 1006, 902, 1046 + ], + "dims": [1, 1, 6, 6], + "type": "float32" + } + ] + }, + { + "name": "inChannels = 7", + "inputs": [ + { + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18 + ], + "dims": [1, 7, 3, 3], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8], + "dims": [7, 1, 2, 2], + "type": "float32" + }, + { + "data": [2], + "dims": [1], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 488, 594, 515, 628, 542, 662, 700, 806, 741, 854, 782, 902, 569, 696, 596, 730, 623, 764, 823, 950, 864, + 998, 905, 1046, 650, 798, 677, 832, 704, 866, 946, 1094, 987, 1142, 1028, 1190 + ], + "dims": [1, 1, 6, 6], + "type": "float32" + } + ] + }, + { + "name": "inChannels = 8", + "inputs": [ + { + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 1, 2, 3, 4, 5, 6, 7, 8, 9 + ], + "dims": [1, 8, 3, 3], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4], + "dims": [8, 1, 2, 2], + "type": "float32" + }, + { + "data": [2], + "dims": [1], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 489, 596, 517, 632, 545, 668, 703, 810, 747, 862, 791, 914, 573, 704, 601, 740, 629, 776, 835, 966, 879, + 1018, 923, 1070, 657, 812, 685, 848, 713, 884, 967, 1122, 1011, 1174, 1055, 1226 + ], + "dims": [1, 1, 6, 6], + "type": "float32" + } + ] + } + ] + }, { "name": "ConvTranspose without bias addition C", "operator": "ConvTranspose", diff --git a/js/web/test/e2e/exports/testcases/vite-default/package-lock.json b/js/web/test/e2e/exports/testcases/vite-default/package-lock.json index 96c19af9479e4..891b40710ff99 100644 --- a/js/web/test/e2e/exports/testcases/vite-default/package-lock.json +++ b/js/web/test/e2e/exports/testcases/vite-default/package-lock.json @@ -12,7 +12,7 @@ }, "devDependencies": { "@vitejs/plugin-vue": "^5.2.1", - "vite": "^6.0.5" + "vite": "^6.0.11" } }, "node_modules/@babel/helper-string-parser": { @@ -1069,9 +1069,9 @@ } }, "node_modules/vite": { - "version": "6.0.7", - "resolved": "https://registry.npmjs.org/vite/-/vite-6.0.7.tgz", - "integrity": "sha512-RDt8r/7qx9940f8FcOIAH9PTViRrghKaK2K1jY3RaAURrEUbm9Du1mJ72G+jlhtG3WwodnfzY8ORQZbBavZEAQ==", + "version": "6.0.11", + "resolved": "https://registry.npmjs.org/vite/-/vite-6.0.11.tgz", + "integrity": "sha512-4VL9mQPKoHy4+FE0NnRE/kbY51TOfaknxAjt3fJbGJxhIpBZiqVzlZDEesWWsuREXHwNdAoOFZ9MkPEVXczHwg==", "dev": true, "license": "MIT", "dependencies": { diff --git a/js/web/test/e2e/exports/testcases/vite-default/package.json b/js/web/test/e2e/exports/testcases/vite-default/package.json index 7a1f370885bf4..9e204875a1d01 100644 --- a/js/web/test/e2e/exports/testcases/vite-default/package.json +++ b/js/web/test/e2e/exports/testcases/vite-default/package.json @@ -13,6 +13,6 @@ }, "devDependencies": { "@vitejs/plugin-vue": "^5.2.1", - "vite": "^6.0.5" + "vite": "^6.0.11" } } diff --git a/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h b/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h index ccaeb6654e286..abb24e20a6178 100644 --- a/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h +++ b/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h @@ -75,6 +75,7 @@ class GQAAttentionBase { int seqlen_present_kv_cache = 
static_cast<int>(present_key->Shape().GetDims()[2]);
 
   // Compute the attention score.
+  // TODO(fajin): type depends on kernel supportability
   size_t bytes = SafeInt<size_t>(batch_size) * num_heads_ * sequence_length * seqlen_present_kv_cache * sizeof(float);
   auto attention_probs = allocator->Alloc(bytes);
   BufferUniquePtr scratch_buffer(attention_probs, BufferDeleter(allocator));
@@ -198,6 +199,11 @@ class GQAAttentionBase {
         math::GemmEx<float, ThreadPool>(CblasNoTrans, CblasTrans, sequence_length, total_seqlen, head_size, alpha, q,
                                         static_cast<int>(head_size), k, static_cast<int>(head_size), 0.0f /*beta*/, output,
                                         static_cast<int>(present_buffer_sequence_length), nullptr);
+        // TODO(fajin): update later
+        // } else if (MlasHGemmSupported(CblasNoTrans, CblasTrans)) {
+        //   MlasGemm(CblasNoTrans, CblasTrans, sequence_length, total_seqlen, head_size,
+        //            q, static_cast<size_t>(head_size), k, static_cast<size_t>(head_size), output,
+        //            static_cast<size_t>(present_buffer_sequence_length), alpha, 0.0f /*beta*/, nullptr);
       } else {
         size_t bytes = head_size * (sequence_length + total_seqlen) * sizeof(float);
         auto q_k_fp32 = allocator->Alloc(bytes);
diff --git a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc
index 28e4ccec09b32..90e6516ff45d1 100644
--- a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc
+++ b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc
@@ -530,6 +530,222 @@ Status MatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const {
   return Status::OK();
 }
 
+Status DP4AMatMulQuantizeProgram::GenerateShaderCode(ShaderHelper& shader) const {
+  shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias);
+  shader.AddOutput("output", ShaderUsage::UseUniform);
+  shader.AddOutput("scales", ShaderUsage::UseUniform);
+
+  shader.AdditionalImplementation() << R"ADDNL_FN(
+    var<workgroup> max_values : array<input_a_element_t, 4>;
+  )ADDNL_FN";
+
+  shader.MainFunctionBody() << R"MAIN_FN(
+  var local_a = input_a[global_idx];
+  var max_val = subgroupMax(abs(local_a));
+  var max_temp = max(max_val.xy, max_val.zw);
+  var scale = max(max_temp[0], max_temp[1]);
+  if (local_idx % sg_size == 0) {
+    max_values[local_idx / sg_size] = scale;
+  }
+  workgroupBarrier();
+
+  if (sg_size == 8)
+  {
+    scale = max(max_values[0], max_values[1]);
+    scale = max(scale, max_values[2]);
+    scale = max(scale, max_values[3]);
+  }
+  else if (sg_size == 16)
+  {
+    scale = max(max_values[0], max_values[1]);
+  }
+  else
+  {
+    scale = max_values[0];
+  }
+
+  var norm_a = local_a/scale;
+  output[global_idx] = pack4x8snorm(vec4<f32>(norm_a));
+  if (local_idx == 0)
+  {
+    // 127 is the max value of signed int8 [-127,127] used by pack4x8snorm for 1.0f.
+    scales[workgroup_idx] = scale/127;
+  }
+)MAIN_FN";
+  return Status::OK();
+}
+
+Status DP4AMatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const {
+  shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias);
+  shader.AddInput("scales_a", ShaderUsage::UseUniform);
+  shader.AddInput("input_b", ShaderUsage::UseUniform);
+  shader.AddInput("scales_b", ShaderUsage::UseUniform);
+  shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseElementTypeAlias);
+
+  // This shader implements co-operative matrix multiply. The key idea here is to
+  // assume there is a medium-size matrix multiply primitive that a subgroup can perform,
+  // using all its lanes and pooling all its registers to keep the values in registers.
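For reference, the quantize pass above reduces to a symmetric per-block int8 scheme: scale = max(|x|)/127 over a block, q = round(x/scale). A minimal scalar sketch (the function name and block handling are illustrative; the shader performs the max-reduction with subgroupMax and packs with pack4x8snorm):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Scalar model of the per-block symmetric int8 quantization of A.
    // Dequantization is q[i] * scale, matching the shader's scale/127 output.
    void QuantizeBlockInt8(const float* x, std::size_t block_size,
                           std::vector<std::int8_t>& q, float& scale) {
      float max_abs = 0.0f;
      for (std::size_t i = 0; i < block_size; ++i) {
        max_abs = std::max(max_abs, std::fabs(x[i]));
      }
      scale = max_abs / 127.0f;  // 127 is the max magnitude of signed int8 used here
      q.resize(block_size);
      for (std::size_t i = 0; i < block_size; ++i) {
        const float v = scale > 0.0f ? x[i] / scale : 0.0f;
        q[i] = static_cast<std::int8_t>(std::lrintf(std::clamp(v, -127.0f, 127.0f)));
      }
    }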
+  //
+  // The entire workgroup, which has N subgroups, first loads a tile into shared memory.
+  // Then each subgroup loads a subtile from shared memory into registers and uses
+  // the medium-size matrix multiply primitive to perform the math.
+  // The values for tile/subtile size are chosen to conform to the resource limits
+  // of an Alder Lake/Tiger Lake GPU. A tile is 64x64, a workgroup is 256 threads -
+  // therefore there are 16 subgroups and 16 lanes in each subgroup.
+  // K, the hidden dimension, is paged in from RAM at the K tile size, which is 64.
+  // All this puts the shared memory requirement slightly above 16KB.
+  // The WebGPU limit is 16KB, so the output is kept in registers instead of SHM to make
+  // everything fit in shared memory.
+  //
+  // Each subgroup performs a 16 x 64 x 16 multiply, which is implemented with
+  // subgroup shuffle as a placeholder for the day the medium matrix mul primitive
+  // becomes available in WGSL. The register requirement is ~2KB per subgroup; on
+  // Alder Lake/Tiger Lake a subgroup has 8KB of register space, pooling the
+  // 512B of registers from each lane.
+  //
+  // The medium-size matmul is implemented using dot4I8Packed, so the inputs for
+  // this shader require A to be int8 quantized with block size 64. B is regular
+  // matmulnbits input with block size 32.
+
+  shader.AdditionalImplementation() << R"ADDNL_FN(
+  const tile_size = 64;
+  const subtile_size = 16;
+  const tile_size_k = 32;
+  const vec_factor = 4;
+  const u32_factor = 4;
+  const tile_size_k_vec = 4;
+  const block_size = 32;
+
+  // Shared memory
+  var<workgroup> tile_A : array<array<vec2<u32>, tile_size_k_vec>, tile_size>;  // 64 x 32
+  var<workgroup> scale_A : array<output_element_t, tile_size>;                  // 64 x 1
+  var<workgroup> tile_B : array<array<vec2<u32>, tile_size_k_vec>, tile_size>;  // 64 x 32
+  var<workgroup> scale_B : array<output_element_t, tile_size>;                  // 64 x 1
+
+  // Private memory
+  var<private> lane_output: array<output_element_t, 16>;
+
+  fn loadSHMA(a_global_base:u32, kidx_v:u32, row: u32, col: u32)
+  {
+    let a_global = a_global_base + row;
+    if (a_global >= uniforms.M)
+    {
+      return;
+    }
+    tile_A[row][col] = input_a[a_global*uniforms.K8+kidx_v+col];
+    if (col == 0)
+    {
+      // kidx_v - covers 8 values of k
+      scale_A[row] = scales_a[a_global*(uniforms.K/128) + kidx_v/16];
+    }
+  }
+
+  fn loadSHMB(b_global_base:u32, kidx_v:u32, row: u32, col: u32)
+  {
+    let b_global = b_global_base + row;
+    if (b_global >= uniforms.N)
+    {
+      return;
+    }
+
+    let b_value = input_b[b_global*uniforms.K8+kidx_v+col];
+    var b_value_lower = vec4<i32>(unpack4xU8(b_value & 0x0F0F0F0Fu)) - vec4<i32>(8);
+    var b_value_upper = vec4<i32>(unpack4xU8((b_value >> 4) & 0x0F0F0F0Fu)) - vec4<i32>(8);
+    tile_B[row][col][0] = pack4xI8(vec4<i32>(b_value_lower[0], b_value_upper[0], b_value_lower[1], b_value_upper[1]));
+    tile_B[row][col][1] = pack4xI8(vec4<i32>(b_value_lower[2], b_value_upper[2], b_value_lower[3], b_value_upper[3]));
+    if (col == 0)
+    {
+      // kidx_v - each kidx_v covers 8 values of k
+      scale_B[row] = scales_b[b_global*(uniforms.K/32) + kidx_v/4];
+    }
+  }
+
+  fn DP4AI(a : vec4<u32>, b : vec4<u32>) -> i32
+  {
+    var local_sum = dot4I8Packed(a[0], b[0]);
+    local_sum += dot4I8Packed(a[1], b[1]);
+    local_sum += dot4I8Packed(a[2], b[2]);
+    local_sum += dot4I8Packed(a[3], b[3]);
+    return local_sum;
+  }
+
+)ADDNL_FN";
+
+  shader.MainFunctionBody() << R"MAIN_FN(
+  // During the load phase we use all 256 threads to load 64 rows of A/B.
+  // For each row we load 4 vectorized elements, which are 32 elements of K.
+ let a_global_base = workgroup_id.x * tile_size; + let b_global_base = workgroup_id.y * tile_size; + let load_row = u32(local_idx/4); + let load_col = u32(local_idx%4); + + // During the compute phase, we have the 64x64 tile split into + // subtiles of 16x16. We have a grid of 4x4 subtiles. + let subtile_id = u32(local_idx / subtile_size); + let subtile_idx = u32(subtile_id / 4); + let subtile_idy = u32(subtile_id % 4); + let base_A = subtile_idx * 16; + let base_B = subtile_idy * 16; + // For each subtile we have 16 threads assigned. + let a_idx = u32(local_idx % subtile_size); + + // K's vectrorization is 8 items per index. See input_a/input_b. + // tile_size_k_vec - is the k tile size in vectorized k units/space (1/8). + for (var kidx_v:u32 = 0; kidx_v < uniforms.K8; kidx_v+=tile_size_k_vec) + { + // Populate shared memory for the workgroup + loadSHMA(a_global_base, kidx_v, load_row, load_col); + loadSHMB(b_global_base, kidx_v, load_row, load_col); + workgroupBarrier(); + + var own_a0: vec4 = vec4(tile_A[base_A + a_idx][0], tile_A[base_A + a_idx][1]); + var own_a1: vec4 = vec4(tile_A[base_A + a_idx][2], tile_A[base_A + a_idx][3]); + var own_scale_a = scale_A[base_A + a_idx]; + if (sg_size == 16) + { + var own_b0: vec4 = vec4(tile_B[base_B + sg_id][0], tile_B[base_B + sg_id][1]); + var own_b1: vec4 = vec4(tile_B[base_B + sg_id][2], tile_B[base_B + sg_id][3]); + var own_scale_b = scale_B[base_B + sg_id]; + for (var col:u32 = 0; col < 16; col++) + { + var local_scale_b = subgroupShuffle(own_scale_b, col); + local_scale_b = local_scale_b * own_scale_a; + var local_sum = DP4AI(own_a0, subgroupShuffle(own_b0, col)); + local_sum += DP4AI(own_a1, subgroupShuffle(own_b1, col)); + lane_output[col] += (output_element_t(local_sum) * local_scale_b); + } + } + else + { + for (var col:u32 = 0; col < 16; col++) + { + var b0: vec4 = vec4(tile_B[base_B + col][0], tile_B[base_B + col][1]); + var b1: vec4 = vec4(tile_B[base_B + col][2], tile_B[base_B + col][3]); + var local_sum = DP4AI(own_a0, b0); + local_sum += DP4AI(own_a1, b1); + lane_output[col] += (output_element_t(local_sum) * own_scale_a * scale_B[base_B + col]); + } + } + workgroupBarrier(); + } + + let a_global = a_global_base + base_A + a_idx; + let b_global = b_global_base + base_B; + let output_idx = ((a_global) * uniforms.N + b_global)/4; + // This creates a shader requirement that uniforms.N % 16 == 0 + if (a_global < uniforms.M && b_global < uniforms.N) + { + for (var i:u32 = 0; i < 4; i++) + { + let lidx = i * 4; + output[output_idx+i] = vec4(lane_output[lidx], lane_output[lidx+1] , lane_output[lidx+2], lane_output[lidx+3]); + } + } +)MAIN_FN"; + + return Status::OK(); +} + Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) const { const Tensor* a = context.Input(0); const Tensor* b = context.Input(1); @@ -565,11 +781,54 @@ Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context uint32_t components = GetMaxComponents(N); const bool has_zero_points = zero_points != nullptr; + const bool has_subgroup = context.Device().HasFeature(wgpu::FeatureName::Subgroups); + // macOS - Avoid using dp4a on Metal, as it does not appear to have native dp4a support. 
+ // https://github.com/gpuweb/gpuweb/issues/2677#issuecomment-1713292226 + const bool use_dp4a = has_subgroup && context.AdapterInfo().backendType != wgpu::BackendType::Metal; + if (accuracy_level_ == 4 && block_size == 32 && + batch_count == 1 && components_a == 4 && K % 64 == 0 && N % 16 == 0 && + !has_zero_points && use_dp4a && M >= kMinMForTileOptimization) { + constexpr uint32_t kVec4Components = 4; + constexpr uint32_t kVec2Components = 2; + constexpr uint32_t kU32Components = 4; + + constexpr uint32_t kBlockSizeA = 128; + DP4AMatMulQuantizeProgram quantize_program; + quantize_program.SetWorkgroupSize(32); + quantize_program.SetDispatchGroupSize(M * K / kBlockSizeA, 1, 1); + TensorShape a_quant_shape{1, M, K / kU32Components}; + Tensor a_quant = context.CreateGPUTensor(DataTypeImpl::GetType(), a_quant_shape); + TensorShapeVector a_scales_dims({1, 1, M, K / kBlockSizeA}); + Tensor a_scale = context.CreateGPUTensor(a->DataType(), a_scales_dims); + quantize_program.AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow(kVec4Components)}}) + .AddOutputs({{&a_quant, ProgramTensorMetadataDependency::Rank, a_quant.Shape(), gsl::narrow(1)}, + {&a_scale, ProgramTensorMetadataDependency::Rank, a_scale.Shape(), gsl::narrow(1)}}); + ORT_RETURN_IF_ERROR(context.RunProgram(quantize_program)); + + constexpr uint32_t kTileSize = 64; + TensorShape reshaped_y_shape{1, M, N / kVec4Components}; + DP4AMatMulNBitsProgram mul_program; + mul_program.SetWorkgroupSize(256); + mul_program.SetDispatchGroupSize( + (M + kTileSize - 1) / kTileSize, + (N + kTileSize - 1) / kTileSize, 1); + mul_program.AddInputs({{&a_quant, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow(kVec2Components)}, + {&a_scale, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow(1)}, + {b, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow(kU32Components)}, + {scales, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow(1)}}) + .AddUniformVariables({{static_cast(M)}, + {static_cast(N)}, + {static_cast(K)}, + {static_cast(K / 8)}, + {static_cast(K / 16)}}) + .AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank, reshaped_y_shape, gsl::narrow(kVec4Components)}); + return context.RunProgram(mul_program); + } // TODO: Support output_number > 1. Some cases are failed when output_number > 1. constexpr uint32_t output_number = 1; const uint32_t tile_m = M > kMinMForTileOptimization ? 
4 : 1; - const bool use_subgroup = context.Device().HasFeature(wgpu::FeatureName::Subgroups) && context.AdapterInfo().vendor == std::string_view{"intel"} && components_a == 4 && block_size == 32; + const bool use_subgroup = has_subgroup && context.AdapterInfo().vendor == std::string_view{"intel"} && components_a == 4 && block_size == 32; MatMulNBitsProgram program{output_number, block_size, tile_m, gsl::narrow(components_b), has_zero_points, use_subgroup}; if (M > kMinMForTileOptimization && block_size == 32) { components = 1; diff --git a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.h b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.h index ca3c485566d50..a2470d9268907 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.h +++ b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.h @@ -35,6 +35,24 @@ class MatMulNBitsProgram final : public Program { bool use_subgroup_; }; +class DP4AMatMulQuantizeProgram final : public Program { + public: + DP4AMatMulQuantizeProgram() : Program{"DP4AMatMulQuantize"} {} + Status GenerateShaderCode(ShaderHelper& sh) const override; +}; + +class DP4AMatMulNBitsProgram final : public Program { + public: + DP4AMatMulNBitsProgram() : Program{"DP4AMatMulNBits"} {} + Status GenerateShaderCode(ShaderHelper& sh) const override; + WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES( + {"M", ProgramUniformVariableDataType::Uint32}, + {"N", ProgramUniformVariableDataType::Uint32}, + {"K", ProgramUniformVariableDataType::Uint32}, + {"K8", ProgramUniformVariableDataType::Uint32}, + {"K16", ProgramUniformVariableDataType::Uint32}); +}; + class MatMulNBits final : public WebGpuKernel { public: MatMulNBits(const OpKernelInfo& info) : WebGpuKernel(info) { @@ -42,6 +60,7 @@ class MatMulNBits final : public WebGpuKernel { N_ = info.GetAttr("N"); block_size_ = info.GetAttr("block_size"); int64_t bits = info.GetAttr("bits"); + accuracy_level_ = info.GetAttrOrDefault("accuracy_level", 4); ORT_ENFORCE(bits == 4, "Only 4b quantization is supported for MatMulNBits op, additional bits support is planned."); } @@ -52,6 +71,7 @@ class MatMulNBits final : public WebGpuKernel { int64_t K_; int64_t N_; int64_t block_size_; + int64_t accuracy_level_; }; } // namespace webgpu diff --git a/onnxruntime/core/graph/function_template.h b/onnxruntime/core/graph/function_template.h index 978174d943f14..0d3fee18d5d59 100644 --- a/onnxruntime/core/graph/function_template.h +++ b/onnxruntime/core/graph/function_template.h @@ -2,7 +2,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" namespace onnxruntime { diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h index 207c058d899b4..7e0335cc66ef0 100644 --- a/onnxruntime/core/mlas/inc/mlas.h +++ b/onnxruntime/core/mlas/inc/mlas.h @@ -1458,7 +1458,107 @@ MlasRotaryEmbedOneRow( T* output ); - /** +/** + * @brief Supply matrices data information to half precision gemm functions + */ +struct MLAS_HGEMM_DATA_PARAMS { + const MLAS_FP16* A; /**< Supplies the address of matrix A */ + size_t lda; /**< Supplies the first dimension of matrix A. */ + const MLAS_FP16* B; /**< Supplies the address of matrix B */ + size_t ldb; /**< Supplies the first dimension of matrix B. */ + MLAS_FP16* C; /**< Supplies the address of matrix C */ + size_t ldc; /**< Supplies the first dimension of matrix C. */ + uint16_t alpha; /**< Supplies the scalar alpha multiplier (see GEMM definition). FP16 encoding. 
*/ + uint16_t beta; /**< Supplies the scalar beta multiplier (see GEMM definition). FP16 encoding. */ +}; + +/** + * @brief Check whether current CPU supports half precision gemm. + */ +bool +MLASCALL +MlasHGemmSupported( + CBLAS_TRANSPOSE TransA, + CBLAS_TRANSPOSE TransB + ); + +/** + * @brief Batched half precision matrix/matrix multiply operation (HGEMM) + * + * @param TransA Supplies the transpose operation for matrix A. + * @param TransB Supplies the transpose operation for matrix B. + * @param M Supplies the number of rows of matrix A and matrix C. + * @param N Supplies the number of columns of matrix B and matrix C. + * @param K Supplies the number of columns of matrix A and the number of rows of matrix B. + * @param Data A array of matrices data parameters + * @param BatchSize Supplies number of multiplications in this batch + * @param ThreadPool Supplies the thread pool object to use, else nullptr if the + base library threading support should be used. + */ +void +MLASCALL +MlasGemmBatch( + CBLAS_TRANSPOSE TransA, + CBLAS_TRANSPOSE TransB, + size_t M, + size_t N, + size_t K, + const MLAS_HGEMM_DATA_PARAMS* Data, + size_t BatchSize, + MLAS_THREADPOOL* ThreadPool + ); + +/** + * @brief half precision matrix/matrix multiply operation (HGEMM) + * C = alpha * op(A) * op(B) + beta * C + * + * @param TransA Supplies the transpose operation for matrix A. Currently only support CblasNoTrans. + * @param TransB Supplies the transpose operation for matrix B. Currently only support CblasTrans. + * @param M Supplies the number of rows of matrix A and matrix C. + * @param N Supplies the number of columns of matrix B and matrix C. + * @param K Supplies the number of columns of matrix A and the number of rows of matrix B. + * @param A Supplies the address of matrix A + * @param lda Supplies the first dimension of matrix A. + * @param B Supplies the address of matrix B + * @param ldb Supplies the first dimension of matrix B. + * @param C Supplies the address of matrix C + * @param ldc Supplies the first dimension of matrix C. + * @param alpha Supplies the scalar alpha multiplier (see GEMM definition) + * @param beta Supplies the scalar beta multiplier (see GEMM definition) + * @param ThreadPool Supplies the thread pool object to use, else nullptr if the base library threading support + * should be used. + */ +inline +void +MlasGemm( + CBLAS_TRANSPOSE TransA, + CBLAS_TRANSPOSE TransB, + size_t M, + size_t N, + size_t K, + const MLAS_FP16* A, + size_t lda, + const MLAS_FP16* B, + size_t ldb, + MLAS_FP16* C, + size_t ldc, + uint16_t alpha, + uint16_t beta, + MLAS_THREADPOOL* ThreadPool +) { + MLAS_HGEMM_DATA_PARAMS Data; + Data.A = A; + Data.lda = lda; + Data.B = B; + Data.ldb = ldb; + Data.C = C; + Data.ldc = ldc; + Data.alpha = alpha; + Data.beta = beta; + MlasGemmBatch(TransA, TransB, M, N, K, &Data, 1, ThreadPool); +} + +/** * @brief Whether current CPU supports FP16 acceleration. 
*/ bool MLASCALL diff --git a/onnxruntime/core/mlas/lib/fp16_common.h b/onnxruntime/core/mlas/lib/fp16_common.h index f4c49905ebbd7..acee567162b9d 100644 --- a/onnxruntime/core/mlas/lib/fp16_common.h +++ b/onnxruntime/core/mlas/lib/fp16_common.h @@ -349,4 +349,103 @@ MlasBitwiseSelectFloat16x4(MLAS_UINT16X4 select, MLAS_FLOAT16X4 ones, MLAS_FLOAT return vbsl_f16(select, ones, zeros); } +MLAS_FORCEINLINE +void +Transpose8x8(MLAS_FLOAT16X8& v0, MLAS_FLOAT16X8& v1, MLAS_FLOAT16X8& v2, MLAS_FLOAT16X8& v3, + MLAS_FLOAT16X8& v4, MLAS_FLOAT16X8& v5, MLAS_FLOAT16X8& v6, MLAS_FLOAT16X8& v7) +{ + // |v00|v01|v02|v03|v04|v05|v06|v07| + // |v10|v11|v12|v13|v14|v15|v16|v17| + // |v20|v21|v22|v23|v24|v25|v26|v27| + // |v30|v31|v32|v33|v34|v35|v36|v37| + // |v40|v41|v42|v43|v44|v45|v46|v47| + // |v50|v51|v52|v53|v54|v55|v56|v57| + // |v60|v61|v62|v63|v64|v65|v66|v67| + // |v70|v71|v72|v73|v74|v75|v76|v77| + float16x8x2_t t01 = vtrnq_f16(v0, v1); + float16x8x2_t t23 = vtrnq_f16(v2, v3); + float16x8x2_t t45 = vtrnq_f16(v4, v5); + float16x8x2_t t67 = vtrnq_f16(v6, v7); + // |v00|v10|v02|v12|v04|v14|v06|v16| + // |v01|v11|v03|v13|v05|v15|v07|v17| + // |v20|v30|v22|v32|v24|v34|v26|v36| + // |v21|v31|v23|v33|v25|v35|v27|v37| + // |v40|v50|v42|v52|v44|v54|v46|v56| + // |v41|v51|v43|v53|v45|v55|v47|v57| + // |v60|v70|v62|v72|v64|v74|v66|v76| + // |v61|v71|v63|v73|v65|v75|v67|v77| + float32x4x2_t t02 = vtrnq_f32(vreinterpretq_f32_f16(t01.val[0]), vreinterpretq_f32_f16(t23.val[0])); + float32x4x2_t t13 = vtrnq_f32(vreinterpretq_f32_f16(t01.val[1]), vreinterpretq_f32_f16(t23.val[1])); + float32x4x2_t t46 = vtrnq_f32(vreinterpretq_f32_f16(t45.val[0]), vreinterpretq_f32_f16(t67.val[0])); + float32x4x2_t t57 = vtrnq_f32(vreinterpretq_f32_f16(t45.val[1]), vreinterpretq_f32_f16(t67.val[1])); + // |v00|v10|v20|v30|v04|v14|v24|v34| + // |v01|v11|v21|v31|v05|v15|v25|v35| + // |v02|v12|v22|v32|v06|v16|v26|v36| + // |v03|v13|v23|v33|v07|v17|v27|v37| + // |v40|v50|v60|v70|v44|v54|v64|v74| + // |v41|v51|v61|v71|v45|v55|v65|v75| + // |v42|v52|v62|v72|v46|v56|v66|v76| + // |v43|v53|v63|v73|v47|v57|v67|v77| + v0 = vreinterpretq_f16_f64(vtrn1q_f64(vreinterpretq_f64_f32(t02.val[0]), vreinterpretq_f64_f32(t46.val[0]))); + v4 = vreinterpretq_f16_f64(vtrn2q_f64(vreinterpretq_f64_f32(t02.val[0]), vreinterpretq_f64_f32(t46.val[0]))); + v2 = vreinterpretq_f16_f64(vtrn1q_f64(vreinterpretq_f64_f32(t02.val[1]), vreinterpretq_f64_f32(t46.val[1]))); + v6 = vreinterpretq_f16_f64(vtrn2q_f64(vreinterpretq_f64_f32(t02.val[1]), vreinterpretq_f64_f32(t46.val[1]))); + v1 = vreinterpretq_f16_f64(vtrn1q_f64(vreinterpretq_f64_f32(t13.val[0]), vreinterpretq_f64_f32(t57.val[0]))); + v5 = vreinterpretq_f16_f64(vtrn2q_f64(vreinterpretq_f64_f32(t13.val[0]), vreinterpretq_f64_f32(t57.val[0]))); + v3 = vreinterpretq_f16_f64(vtrn1q_f64(vreinterpretq_f64_f32(t13.val[1]), vreinterpretq_f64_f32(t57.val[1]))); + v7 = vreinterpretq_f16_f64(vtrn2q_f64(vreinterpretq_f64_f32(t13.val[1]), vreinterpretq_f64_f32(t57.val[1]))); + // |v00|v10|v20|v30|v40|v50|v60|v70| + // |v01|v11|v21|v31|v41|v51|v61|v71| + // |v02|v12|v22|v32|v42|v52|v62|v72| + // |v03|v13|v23|v33|v43|v53|v63|v73| + // |v04|v14|v24|v34|v44|v54|v64|v74| + // |v05|v15|v25|v35|v45|v55|v65|v75| + // |v06|v16|v26|v36|v46|v56|v66|v76| + // |v07|v17|v27|v37|v47|v57|v67|v77| +} + +MLAS_FORCEINLINE +void +Transpose4x8(MLAS_FLOAT16X8& v0, MLAS_FLOAT16X8& v1, MLAS_FLOAT16X8& v2, MLAS_FLOAT16X8& v3) +{ + // |v00|v01|v02|v03|v04|v05|v06|v07| + // |v10|v11|v12|v13|v14|v15|v16|v17| + // |v20|v21|v22|v23|v24|v25|v26|v27| 
+ // |v30|v31|v32|v33|v34|v35|v36|v37| + // => + // |v00|v10|v20|v30|v04|v14|v24|v34| + // |v01|v11|v21|v31|v05|v15|v25|v35| + // |v02|v12|v22|v32|v06|v16|v26|v36| + // |v03|v13|v23|v33|v07|v17|v27|v37| + float16x8x2_t t01 = vtrnq_f16(v0, v1); + float16x8x2_t t23 = vtrnq_f16(v2, v3); + + v0 = vreinterpretq_f16_f32(vtrn1q_f32(vreinterpretq_f32_f16(t01.val[0]), vreinterpretq_f32_f16(t23.val[0]))); + v2 = vreinterpretq_f16_f32(vtrn2q_f32(vreinterpretq_f32_f16(t01.val[0]), vreinterpretq_f32_f16(t23.val[0]))); + v1 = vreinterpretq_f16_f32(vtrn1q_f32(vreinterpretq_f32_f16(t01.val[1]), vreinterpretq_f32_f16(t23.val[1]))); + v3 = vreinterpretq_f16_f32(vtrn2q_f32(vreinterpretq_f32_f16(t01.val[1]), vreinterpretq_f32_f16(t23.val[1]))); +} + +MLAS_FORCEINLINE +void +Transpose4x4(MLAS_FLOAT16X4& v0, MLAS_FLOAT16X4& v1, MLAS_FLOAT16X4& v2, MLAS_FLOAT16X4& v3) +{ + // |v00|v01|v02|v03| + // |v10|v11|v12|v13| + // |v20|v21|v22|v23| + // |v30|v31|v32|v33| + // => + // |v00|v10|v20|v30| + // |v01|v11|v21|v31| + // |v02|v12|v22|v32| + // |v03|v13|v23|v33| + float16x4x2_t t01 = vtrn_f16(v0, v1); + float16x4x2_t t23 = vtrn_f16(v2, v3); + + v0 = vreinterpret_f16_f32(vtrn1_f32(vreinterpret_f32_f16(t01.val[0]), vreinterpret_f32_f16(t23.val[0]))); + v1 = vreinterpret_f16_f32(vtrn1_f32(vreinterpret_f32_f16(t01.val[1]), vreinterpret_f32_f16(t23.val[1]))); + v2 = vreinterpret_f16_f32(vtrn2_f32(vreinterpret_f32_f16(t01.val[0]), vreinterpret_f32_f16(t23.val[0]))); + v3 = vreinterpret_f16_f32(vtrn2_f32(vreinterpret_f32_f16(t01.val[1]), vreinterpret_f32_f16(t23.val[1]))); +} + #endif // fp16 vector intrinsic supported diff --git a/onnxruntime/core/mlas/lib/halfgemm.cpp b/onnxruntime/core/mlas/lib/halfgemm.cpp index 49387d2fc998f..65ab0e9ce4630 100644 --- a/onnxruntime/core/mlas/lib/halfgemm.cpp +++ b/onnxruntime/core/mlas/lib/halfgemm.cpp @@ -324,6 +324,176 @@ MlasHalfGemmKernel( } } +bool +MLASCALL +MlasHGemmSupported( + CBLAS_TRANSPOSE TransA, + CBLAS_TRANSPOSE TransB +) { + auto* dispatch = GetMlasPlatform().HGemmDispatch; + if (TransA == CblasNoTrans && TransB == CblasTrans) { + return dispatch && + dispatch->HGemmKernel_TransposedB && + dispatch->HPackBKernel_TransposedB && + dispatch->HGemmKernel_TransposedPackedB; + } + + return false; +} + +void +HGemmOperation( + CBLAS_TRANSPOSE TransA, + CBLAS_TRANSPOSE TransB, + size_t K, // full K slice + const MLAS_HGEMM_DATA_PARAMS* DataParams, + const size_t RangeStartM, + const size_t RangeCountM, + const size_t RangeStartN, + const size_t RangeCountN +) { + const size_t lda = DataParams->lda; + const size_t ldb = DataParams->ldb; + const size_t ldc = DataParams->ldc; + const _mlas_fp16_ alpha = DataParams->alpha; + const _mlas_fp16_ beta = DataParams->beta; + auto* dispatch = GetMlasPlatform().HGemmDispatch; + constexpr size_t StrideM = 2; + const auto beta_add = MLAS_FP16(1.0f); + constexpr size_t buffer_size = MLAS_HGEMM_STRIDEN * MLAS_HGEMM_STRIDEK; + MLAS_DECLSPEC_ALIGN(MLAS_FP16 PackedB[buffer_size], 16 * sizeof(_mlas_fp16_)); + + if (TransA == CblasNoTrans && TransB == CblasTrans) { + const auto* A = DataParams->A + RangeStartM * lda; + const auto* B = DataParams->B + RangeStartN * ldb; + auto* C = DataParams->C + RangeStartM * ldc + RangeStartN; + + if (RangeCountM <= StrideM) { + if (!dispatch || !dispatch->HGemmKernel_TransposedB) { + MLAS_THROW_EX(std::runtime_error, "hgemm does not have A x Transposed(B) kernels"); + } + // When M is small, B is visited once. The overhead of Pack(B') exceeds the benefits + // from A x Pack(B'). 
Therefore directly calculate A x B'. + // Without PackB, to utilize memory locality, iterate full K. + constexpr size_t StrideN = 16; + for (size_t n = 0, countN; n < RangeCountN; n += countN) { + countN = std::min(StrideN, RangeCountN - n); + dispatch->HGemmKernel_TransposedB(A, B, C, RangeCountM, countN, K, lda, ldb, ldc, alpha, beta); + B += countN * ldb; + C += countN; + } + } else { + if (!dispatch || !dispatch->HPackBKernel_TransposedB || !dispatch->HGemmKernel_TransposedPackedB) { + MLAS_THROW_EX(std::runtime_error, "hgemm does not have A x Transposed(B) kernels"); + } + // 16N is the smallest pack unit. + const size_t StrideK = std::min(K, size_t(MLAS_HGEMM_STRIDEK)); + const size_t StrideN = buffer_size/StrideK & (~15); // >= MLAS_HGEMM_STRIDEN + for (size_t n = 0, countN; n < RangeCountN; n += countN) { + countN = std::min(StrideN, RangeCountN - n); + const MLAS_FP16* a = A; + const MLAS_FP16* b = B; + MLAS_FP16* c = C; + for (size_t k = 0, countK; k < K; k += countK) { + countK = std::min(StrideK, K - k); + dispatch->HPackBKernel_TransposedB(b, PackedB, countN, countK, ldb); + const MLAS_FP16* aa = a; + MLAS_FP16* cc = c; + for (size_t m = 0, countM; m < RangeCountM; m += countM) { + countM = std::min(StrideM, RangeCountM - m); + // First K iteration, beta is applied to the whole C. In rest K iterations, use add mode. + dispatch->HGemmKernel_TransposedPackedB( + aa, PackedB, cc, countM, countN, countK, lda, ldc, alpha, k == 0 ? beta : beta_add.val); + aa += countM * lda; + cc += countM * ldc; + } + a += countK; + b += countK; + } + B += countN * ldb; + C += countN; + } + } + } else { + MLAS_THROW_EX(std::runtime_error, "hgemm currently only support A x Transpoe(B)"); + } +} + +void +MLASCALL +MlasGemmBatch( + CBLAS_TRANSPOSE TransA, + CBLAS_TRANSPOSE TransB, + size_t M, + size_t N, + size_t K, + const MLAS_HGEMM_DATA_PARAMS* Data, + size_t BatchSize, + MLAS_THREADPOOL* ThreadPool +) { + if (!ThreadPool) { + for (size_t gemm_i = 0; gemm_i < BatchSize; gemm_i++) { + HGemmOperation(TransA, TransB, K, &Data[gemm_i], 0, M, 0, N); + } + return; + } + + const double Complexity = double(M) * double(N) * double(K) * double(BatchSize); + ptrdiff_t TargetThreadCount; + + if (Complexity < double(MLAS_HGEMM_THREAD_COMPLEXITY) * GetMlasPlatform().MaximumThreadCount) { + TargetThreadCount = ptrdiff_t(Complexity / double(MLAS_HGEMM_THREAD_COMPLEXITY)) + 1; + } else { + TargetThreadCount = GetMlasPlatform().MaximumThreadCount; + } + + ptrdiff_t MaximumThreadCount = MlasGetMaximumThreadCount(ThreadPool); + if (TargetThreadCount >= MaximumThreadCount) { + TargetThreadCount = MaximumThreadCount; + } + + // Segment the operation across multiple threads. 
+ + ptrdiff_t ThreadsPerGemm = TargetThreadCount / BatchSize; + if (ThreadsPerGemm < 1) { + ThreadsPerGemm = 1; + } + + constexpr size_t StrideM = 128; + + size_t nc = N; + if (ThreadsPerGemm > 1) { + // more than one thread per GEMM + + const size_t BlockedM = MlasDivRoundup(M, StrideM); + const size_t max_nc = MlasDivRoundup(N * BlockedM, ThreadsPerGemm); + if (max_nc < nc) { + nc = std::min( + nc, MlasDivRoundup(max_nc, MLAS_HGEMM_STRIDEN_THREAD_ALIGN) * MLAS_HGEMM_STRIDEN_THREAD_ALIGN); + } + } + const size_t StrideN = nc; + + const size_t ThreadCountM = MlasDivRoundup(M, StrideM); + const size_t ThreadCountN = MlasDivRoundup(N, StrideN); + ThreadsPerGemm = ThreadCountM * ThreadCountN; + + MlasTrySimpleParallel(ThreadPool, ThreadsPerGemm * static_cast(BatchSize), [&](ptrdiff_t tid) { + const auto gemm_i = tid / ThreadsPerGemm; + const auto blk_i = tid % ThreadsPerGemm; + + const ptrdiff_t ThreadIdN = blk_i / ThreadCountM; + const ptrdiff_t ThreadIdM = blk_i % ThreadCountM; + + const size_t RangeStartM = ThreadIdM * StrideM; + const size_t RangeCountM = std::min(M - RangeStartM, (size_t)StrideM); + + const size_t RangeStartN = ThreadIdN * StrideN; + const size_t RangeCountN = std::min(N - RangeStartN, (size_t)StrideN); + + HGemmOperation(TransA, TransB, K, &Data[gemm_i], RangeStartM, RangeCountM, RangeStartN, RangeCountN); + }); +} const MLAS_HALFGEMM_DISPATCH MlasHalfGemmDispatchDefault = { MlasHalfGemmOperation, diff --git a/onnxruntime/core/mlas/lib/halfgemm.h b/onnxruntime/core/mlas/lib/halfgemm.h index 61e2fbb0afc6a..e280e6d40973f 100644 --- a/onnxruntime/core/mlas/lib/halfgemm.h +++ b/onnxruntime/core/mlas/lib/halfgemm.h @@ -513,3 +513,125 @@ MlasHalfGemmGetDispatch() return &MlasHalfGemmDispatchDefault; #endif } + +namespace hgemm_neon { + +void HPackB_TransposedB_Kernel( + const MLAS_FP16* B, + MLAS_FP16* PackedB, + size_t CountN, + size_t CountK, + size_t ldb +); + +void HGemm_TransposedB_Kernel( + const MLAS_FP16* A, + const MLAS_FP16* B, + MLAS_FP16* C, + size_t CountM, + size_t CountN, + size_t CountK, + size_t lda, + size_t ldb, + size_t ldc, + _mlas_fp16_ alpha, + _mlas_fp16_ beta +); + +void HGemm_TransposedPackedB_Kernel( + const MLAS_FP16* A, + const MLAS_FP16* PackedB, + MLAS_FP16* C, + size_t CountM, + size_t CountN, + size_t CountK, + size_t lda, + size_t ldc, + _mlas_fp16_ alpha, + _mlas_fp16_ beta +); + +} // namespace hgemm_neon + +struct MLAS_HGEMM_DISPATCH { + /** + * @brief Pack the B matrix segment. B is column-major. Elements from CountK rows x N columns are packed + * continuously in row-major. + * First pack CountK rows x 16 columns, then pack CountK rows x 8 columns. + * If there are < 8 columns left, pad the columns with 0. + * @param B the first element of the B matrix segment. Column major. + * @param[out] PackedB the first element of the packed B matrix segment. + * @param CountN the number of columns of B chunk. + * @param CountK the number of rows of B chunk. + */ + typedef void(HPackBKernel_TransposedB_Fn) ( + const MLAS_FP16* B, + MLAS_FP16* PackedB, + size_t CountN, + size_t CountK, + size_t ldb + ); + + HPackBKernel_TransposedB_Fn* HPackBKernel_TransposedB = nullptr; + + /** + * @brief C = alpha * A * Transpose(B) + beta * C. CountM <= 2. B is not packed. Used when M is small. + * + * @param A first row of the A matrix segment. Row major. + * @param B first column of the B matrix segment. Column major. + * @param[out] C first element of the output matrix segment. Row major. + * @param CountM the number of rows of A chunk. 
+ * @param CountN the number of columns of B chunk. + * @param CountK the number of columns of A chunk and the number of rows of B chunk. + * @param lda the leading dimension of A. + * @param ldb the leading dimension of B. + * @param ldc the leading dimension of C. + * @param alpha the alpha scalar value. + * @param beta the beta scalar value. + */ + typedef void(HGemmKernel_TransposedB_Fn)( + const MLAS_FP16* A, + const MLAS_FP16* B, + MLAS_FP16* C, + size_t CountM, + size_t CountN, + size_t CountK, + size_t lda, + size_t ldb, + size_t ldc, + _mlas_fp16_ alpha, + _mlas_fp16_ beta + ); + + HGemmKernel_TransposedB_Fn* HGemmKernel_TransposedB = nullptr; + + /** + * @brief C = alpha * A * Transpose(B) + beta * C. CountM <= 2. B has been packed using HPackBKernel_TransposedB_Fn. + * Use when M is large. + * + * @param A first row of the A matrix segment. Row major. + * @param PackedB first element of the packed B buffer. + * @param[out] C first element of the output matrix segment. Row major. + * @param CountM the number of rows of A chunk. + * @param CountN the number of columns of B chunk. + * @param CountK the number of columns of A chunk and the number of rows of B chunk. + * @param lda the leading dimension of A. + * @param ldc the leading dimension of C. + * @param alpha the alpha scalar value. + * @param beta the beta scalar value. + */ + typedef void(HGemmKernel_TransposedPackedB_Fn)( + const MLAS_FP16* A, + const MLAS_FP16* PackedB, + MLAS_FP16* C, + size_t CountM, + size_t CountN, + size_t CountK, + size_t lda, + size_t ldc, + _mlas_fp16_ alpha, + _mlas_fp16_ beta + ); + + HGemmKernel_TransposedPackedB_Fn* HGemmKernel_TransposedPackedB = nullptr; +}; diff --git a/onnxruntime/core/mlas/lib/halfgemm_kernel_neon_fp16.cpp b/onnxruntime/core/mlas/lib/halfgemm_kernel_neon_fp16.cpp new file mode 100644 index 0000000000000..02ce38fcb21d6 --- /dev/null +++ b/onnxruntime/core/mlas/lib/halfgemm_kernel_neon_fp16.cpp @@ -0,0 +1,1572 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + halfgemm_kernel_neon_fp16.cpp + +Abstract: + + This module implements half precision GEMM kernel for neon. 
+ +--*/ + +#include + +#include "halfgemm.h" +#include "fp16_common.h" + +namespace hgemm_neon { + +void HPackB_TransposedB_Kernel( + const MLAS_FP16* B, + MLAS_FP16* PackedB, + size_t CountN, + size_t CountK, + size_t ldb +) { + const _mlas_fp16_* B_data = reinterpret_cast(B); + _mlas_fp16_* PackedB_data = reinterpret_cast<_mlas_fp16_*>(PackedB); + + for (; CountN >= 16; CountN -= 16, B_data += 16 * ldb) { + const _mlas_fp16_* b = B_data; + size_t k = CountK; + constexpr size_t step = 8 * 16; // pack 8 * 16 + for (; k >= 8; k -= 8, b += 8, PackedB_data += step) { + float16x8_t v0 = MlasLoadFloat16x8(b); + float16x8_t v1 = MlasLoadFloat16x8(b + ldb); + float16x8_t v2 = MlasLoadFloat16x8(b + 2 * ldb); + float16x8_t v3 = MlasLoadFloat16x8(b + 3 * ldb); + float16x8_t v4 = MlasLoadFloat16x8(b + 4 * ldb); + float16x8_t v5 = MlasLoadFloat16x8(b + 5 * ldb); + float16x8_t v6 = MlasLoadFloat16x8(b + 6 * ldb); + float16x8_t v7 = MlasLoadFloat16x8(b + 7 * ldb); + float16x8_t v8 = MlasLoadFloat16x8(b + 8 * ldb); + float16x8_t v9 = MlasLoadFloat16x8(b + 9 * ldb); + float16x8_t vA = MlasLoadFloat16x8(b + 10 * ldb); + float16x8_t vB = MlasLoadFloat16x8(b + 11 * ldb); + float16x8_t vC = MlasLoadFloat16x8(b + 12 * ldb); + float16x8_t vD = MlasLoadFloat16x8(b + 13 * ldb); + float16x8_t vE = MlasLoadFloat16x8(b + 14 * ldb); + float16x8_t vF = MlasLoadFloat16x8(b + 15 * ldb); + Transpose8x8(v0, v1, v2, v3, v4, v5, v6, v7); + Transpose8x8(v8, v9, vA, vB, vC, vD, vE, vF); + + MlasStoreFloat16x8(PackedB_data, v0); + MlasStoreFloat16x8(PackedB_data + 8, v8); + MlasStoreFloat16x8(PackedB_data + 16, v1); + MlasStoreFloat16x8(PackedB_data + 24, v9); + MlasStoreFloat16x8(PackedB_data + 32, v2); + MlasStoreFloat16x8(PackedB_data + 40, vA); + MlasStoreFloat16x8(PackedB_data + 48, v3); + MlasStoreFloat16x8(PackedB_data + 56, vB); + MlasStoreFloat16x8(PackedB_data + 64, v4); + MlasStoreFloat16x8(PackedB_data + 72, vC); + MlasStoreFloat16x8(PackedB_data + 80, v5); + MlasStoreFloat16x8(PackedB_data + 88, vD); + MlasStoreFloat16x8(PackedB_data + 96, v6); + MlasStoreFloat16x8(PackedB_data + 104, vE); + MlasStoreFloat16x8(PackedB_data + 112, v7); + MlasStoreFloat16x8(PackedB_data + 120, vF); + } + + if (k & 4) { + float16x4_t v0 = MlasLoadFloat16x4(b); + float16x4_t v1 = MlasLoadFloat16x4(b + ldb); + float16x4_t v2 = MlasLoadFloat16x4(b + 2 * ldb); + float16x4_t v3 = MlasLoadFloat16x4(b + 3 * ldb); + float16x4_t v4 = MlasLoadFloat16x4(b + 4 * ldb); + float16x4_t v5 = MlasLoadFloat16x4(b + 5 * ldb); + float16x4_t v6 = MlasLoadFloat16x4(b + 6 * ldb); + float16x4_t v7 = MlasLoadFloat16x4(b + 7 * ldb); + float16x4_t v8 = MlasLoadFloat16x4(b + 8 * ldb); + float16x4_t v9 = MlasLoadFloat16x4(b + 9 * ldb); + float16x4_t vA = MlasLoadFloat16x4(b + 10 * ldb); + float16x4_t vB = MlasLoadFloat16x4(b + 11 * ldb); + float16x4_t vC = MlasLoadFloat16x4(b + 12 * ldb); + float16x4_t vD = MlasLoadFloat16x4(b + 13 * ldb); + float16x4_t vE = MlasLoadFloat16x4(b + 14 * ldb); + float16x4_t vF = MlasLoadFloat16x4(b + 15 * ldb); + Transpose4x4(v0, v1, v2, v3); + Transpose4x4(v4, v5, v6, v7); + Transpose4x4(v8, v9, vA, vB); + Transpose4x4(vC, vD, vE, vF); + MlasStoreFloat16x4(PackedB_data, v0); + MlasStoreFloat16x4(PackedB_data + 4, v4); + MlasStoreFloat16x4(PackedB_data + 8, v8); + MlasStoreFloat16x4(PackedB_data + 12, vC); + MlasStoreFloat16x4(PackedB_data + 16, v1); + MlasStoreFloat16x4(PackedB_data + 20, v5); + MlasStoreFloat16x4(PackedB_data + 24, v9); + MlasStoreFloat16x4(PackedB_data + 28, vD); + MlasStoreFloat16x4(PackedB_data + 32, v2); + 
MlasStoreFloat16x4(PackedB_data + 36, v6); + MlasStoreFloat16x4(PackedB_data + 40, vA); + MlasStoreFloat16x4(PackedB_data + 44, vE); + MlasStoreFloat16x4(PackedB_data + 48, v3); + MlasStoreFloat16x4(PackedB_data + 52, v7); + MlasStoreFloat16x4(PackedB_data + 56, vB); + MlasStoreFloat16x4(PackedB_data + 60, vF); + + k -= 4, b += 4, PackedB_data += 4 * 16; + } + + if (k > 0) { + float16x4_t v0 = MlasLoadPartialFloat16x4(b, k); + float16x4_t v1 = MlasLoadPartialFloat16x4(b + ldb, k); + float16x4_t v2 = MlasLoadPartialFloat16x4(b + 2 * ldb, k); + float16x4_t v3 = MlasLoadPartialFloat16x4(b + 3 * ldb, k); + float16x4_t v4 = MlasLoadPartialFloat16x4(b + 4 * ldb, k); + float16x4_t v5 = MlasLoadPartialFloat16x4(b + 5 * ldb, k); + float16x4_t v6 = MlasLoadPartialFloat16x4(b + 6 * ldb, k); + float16x4_t v7 = MlasLoadPartialFloat16x4(b + 7 * ldb, k); + float16x4_t v8 = MlasLoadPartialFloat16x4(b + 8 * ldb, k); + float16x4_t v9 = MlasLoadPartialFloat16x4(b + 9 * ldb, k); + float16x4_t vA = MlasLoadPartialFloat16x4(b + 10 * ldb, k); + float16x4_t vB = MlasLoadPartialFloat16x4(b + 11 * ldb, k); + float16x4_t vC = MlasLoadPartialFloat16x4(b + 12 * ldb, k); + float16x4_t vD = MlasLoadPartialFloat16x4(b + 13 * ldb, k); + float16x4_t vE = MlasLoadPartialFloat16x4(b + 14 * ldb, k); + float16x4_t vF = MlasLoadPartialFloat16x4(b + 15 * ldb, k); + Transpose4x4(v0, v1, v2, v3); + Transpose4x4(v4, v5, v6, v7); + Transpose4x4(v8, v9, vA, vB); + Transpose4x4(vC, vD, vE, vF); + MlasStoreFloat16x4(PackedB_data, v0); + MlasStoreFloat16x4(PackedB_data + 4, v4); + MlasStoreFloat16x4(PackedB_data + 8, v8); + MlasStoreFloat16x4(PackedB_data + 12, vC); + if (k > 1) { + MlasStoreFloat16x4(PackedB_data + 16, v1); + MlasStoreFloat16x4(PackedB_data + 20, v5); + MlasStoreFloat16x4(PackedB_data + 24, v9); + MlasStoreFloat16x4(PackedB_data + 28, vD); + } + if (k > 2) { + MlasStoreFloat16x4(PackedB_data + 32, v2); + MlasStoreFloat16x4(PackedB_data + 36, v6); + MlasStoreFloat16x4(PackedB_data + 40, vA); + MlasStoreFloat16x4(PackedB_data + 44, vE); + } + + PackedB_data += k * 16; + } + } + + if (CountN & 8) { + const _mlas_fp16_* b = B_data; + size_t k = CountK; + constexpr size_t step = 8 * 8; // pack 8 * 8 + for (; k >= 8; k -= 8, b += 8, PackedB_data += step) { + float16x8_t v0 = MlasLoadFloat16x8(b); + float16x8_t v1 = MlasLoadFloat16x8(b + ldb); + float16x8_t v2 = MlasLoadFloat16x8(b + 2 * ldb); + float16x8_t v3 = MlasLoadFloat16x8(b + 3 * ldb); + float16x8_t v4 = MlasLoadFloat16x8(b + 4 * ldb); + float16x8_t v5 = MlasLoadFloat16x8(b + 5 * ldb); + float16x8_t v6 = MlasLoadFloat16x8(b + 6 * ldb); + float16x8_t v7 = MlasLoadFloat16x8(b + 7 * ldb); + Transpose8x8(v0, v1, v2, v3, v4, v5, v6, v7); + + MlasStoreFloat16x8(PackedB_data, v0); + MlasStoreFloat16x8(PackedB_data + 8, v1); + MlasStoreFloat16x8(PackedB_data + 16, v2); + MlasStoreFloat16x8(PackedB_data + 24, v3); + MlasStoreFloat16x8(PackedB_data + 32, v4); + MlasStoreFloat16x8(PackedB_data + 40, v5); + MlasStoreFloat16x8(PackedB_data + 48, v6); + MlasStoreFloat16x8(PackedB_data + 56, v7); + } + + if (k & 4) { + float16x4_t v0 = MlasLoadFloat16x4(b); + float16x4_t v1 = MlasLoadFloat16x4(b + ldb); + float16x4_t v2 = MlasLoadFloat16x4(b + 2 * ldb); + float16x4_t v3 = MlasLoadFloat16x4(b + 3 * ldb); + float16x4_t v4 = MlasLoadFloat16x4(b + 4 * ldb); + float16x4_t v5 = MlasLoadFloat16x4(b + 5 * ldb); + float16x4_t v6 = MlasLoadFloat16x4(b + 6 * ldb); + float16x4_t v7 = MlasLoadFloat16x4(b + 7 * ldb); + Transpose4x4(v0, v1, v2, v3); + Transpose4x4(v4, v5, v6, v7); + 
MlasStoreFloat16x4(PackedB_data, v0); + MlasStoreFloat16x4(PackedB_data + 4, v4); + MlasStoreFloat16x4(PackedB_data + 8, v1); + MlasStoreFloat16x4(PackedB_data + 12, v5); + MlasStoreFloat16x4(PackedB_data + 16, v2); + MlasStoreFloat16x4(PackedB_data + 20, v6); + MlasStoreFloat16x4(PackedB_data + 24, v3); + MlasStoreFloat16x4(PackedB_data + 28, v7); + k -= 4, b += 4, PackedB_data += 4 * 8; + } + + if (k > 0) { + float16x4_t v0 = MlasLoadPartialFloat16x4(b, k); + float16x4_t v1 = MlasLoadPartialFloat16x4(b + ldb, k); + float16x4_t v2 = MlasLoadPartialFloat16x4(b + 2 * ldb, k); + float16x4_t v3 = MlasLoadPartialFloat16x4(b + 3 * ldb, k); + float16x4_t v4 = MlasLoadPartialFloat16x4(b + 4 * ldb, k); + float16x4_t v5 = MlasLoadPartialFloat16x4(b + 5 * ldb, k); + float16x4_t v6 = MlasLoadPartialFloat16x4(b + 6 * ldb, k); + float16x4_t v7 = MlasLoadPartialFloat16x4(b + 7 * ldb, k); + Transpose4x4(v0, v1, v2, v3); + Transpose4x4(v4, v5, v6, v7); + MlasStoreFloat16x4(PackedB_data, v0); + MlasStoreFloat16x4(PackedB_data + 4, v4); + if (k > 1) { + MlasStoreFloat16x4(PackedB_data + 8, v1); + MlasStoreFloat16x4(PackedB_data + 12, v5); + } + if (k > 2) { + MlasStoreFloat16x4(PackedB_data + 16, v2); + MlasStoreFloat16x4(PackedB_data + 20, v6); + } + + PackedB_data += k * 8; + } + + B_data += 8 * ldb; + CountN -= 8; + } + + if (CountN > 0) { + const _mlas_fp16_* b = B_data; + size_t k = CountK; + constexpr size_t step = 8 * 8; // pack extended 8 * 8 + for (; k >= 8; k -= 8, b += 8, PackedB_data += step) { + float16x8_t v[8]; + size_t i = 0; + for (; i < CountN; ++i) { + v[i] = MlasLoadFloat16x8(b + i * ldb); + } + for (; i < 8; ++i) { + v[i] = MlasZeroFloat16x8(); + } + Transpose8x8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]); + MlasStoreFloat16x8(PackedB_data, v[0]); + MlasStoreFloat16x8(PackedB_data + 8, v[1]); + MlasStoreFloat16x8(PackedB_data + 16, v[2]); + MlasStoreFloat16x8(PackedB_data + 24, v[3]); + MlasStoreFloat16x8(PackedB_data + 32, v[4]); + MlasStoreFloat16x8(PackedB_data + 40, v[5]); + MlasStoreFloat16x8(PackedB_data + 48, v[6]); + MlasStoreFloat16x8(PackedB_data + 56, v[7]); + } + + if (k & 4) { + float16x4_t v[8]; + size_t i = 0; + for (; i < CountN; ++i) { + v[i] = MlasLoadFloat16x4(b + i * ldb); + } + for (; i < 8; ++i) { + v[i] = MlasZeroFloat16x4(); + } + Transpose4x4(v[0], v[1], v[2], v[3]); + Transpose4x4(v[4], v[5], v[6], v[7]); + MlasStoreFloat16x4(PackedB_data, v[0]); + MlasStoreFloat16x4(PackedB_data + 4, v[4]); + MlasStoreFloat16x4(PackedB_data + 8, v[1]); + MlasStoreFloat16x4(PackedB_data + 12, v[5]); + MlasStoreFloat16x4(PackedB_data + 16, v[2]); + MlasStoreFloat16x4(PackedB_data + 20, v[6]); + MlasStoreFloat16x4(PackedB_data + 24, v[3]); + MlasStoreFloat16x4(PackedB_data + 28, v[7]); + k -= 4, b += 4, PackedB_data += 4 * 8; + } + + if (k > 0) { + float16x4_t v[8]; + size_t i = 0; + for (; i < CountN; ++i) { + v[i] = MlasLoadPartialFloat16x4(b + i * ldb, k); + } + for (; i < 8; ++i) { + v[i] = MlasZeroFloat16x4(); + } + Transpose4x4(v[0], v[1], v[2], v[3]); + Transpose4x4(v[4], v[5], v[6], v[7]); + MlasStoreFloat16x4(PackedB_data, v[0]); + MlasStoreFloat16x4(PackedB_data + 4, v[4]); + if (k > 1) { + MlasStoreFloat16x4(PackedB_data + 8, v[1]); + MlasStoreFloat16x4(PackedB_data + 12, v[5]); + } + if (k > 2) { + MlasStoreFloat16x4(PackedB_data + 16, v[2]); + MlasStoreFloat16x4(PackedB_data + 20, v[6]); + } + } + } +} + +MLAS_FORCEINLINE +float16x8_t addq_f16x4(float16x8_t v0, float16x8_t v1, float16x8_t v2, float16x8_t v3) { + v0 = vaddq_f16(v0, v1); + v2 = vaddq_f16(v2, v3); 
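+ // The two pair sums above are independent and can issue in parallel; the add below combines them.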
+ v0 = vaddq_f16(v0, v2); + return v0; +} + +MLAS_FORCEINLINE +float16x8_t addq_f16x8(float16x8_t v0, float16x8_t v1, float16x8_t v2, float16x8_t v3, + float16x8_t v4, float16x8_t v5, float16x8_t v6, float16x8_t v7) { + return vaddq_f16(addq_f16x4(v0, v1, v2, v3), addq_f16x4(v4, v5, v6, v7)); +} + +MLAS_FORCEINLINE +float16x8_t maq_lane_f16_accu(float16x8_t accu0, float16x8_t v0, float16x8_t v1, float16x8_t v2, float16x8_t v3, + float16x4_t a0) { + accu0 = vfmaq_lane_f16(accu0, v0, a0, 0); + accu0 = vfmaq_lane_f16(accu0, v1, a0, 1); + accu0 = vfmaq_lane_f16(accu0, v2, a0, 2); + accu0 = vfmaq_lane_f16(accu0, v3, a0, 3); + return accu0; +} + +MLAS_FORCEINLINE +float16x8_t maq_laneq_f16_accu(float16x8_t accu0, float16x8_t v0, float16x8_t v1, float16x8_t v2, float16x8_t v3, + float16x8_t v4, float16x8_t v5, float16x8_t v6, float16x8_t v7, float16x8_t a0) { + accu0 = vfmaq_laneq_f16(accu0, v0, a0, 0); + accu0 = vfmaq_laneq_f16(accu0, v1, a0, 1); + accu0 = vfmaq_laneq_f16(accu0, v2, a0, 2); + accu0 = vfmaq_laneq_f16(accu0, v3, a0, 3); + accu0 = vfmaq_laneq_f16(accu0, v4, a0, 4); + accu0 = vfmaq_laneq_f16(accu0, v5, a0, 5); + accu0 = vfmaq_laneq_f16(accu0, v6, a0, 6); + accu0 = vfmaq_laneq_f16(accu0, v7, a0, 7); + return accu0; +} + +MLAS_FORCEINLINE +float16x4_t ma_lane_f16_accu(float16x4_t accu, float16x4_t v0, float16x4_t v1, float16x4_t v2, float16x4_t v3, + float16x4_t a0) { + accu = vfma_lane_f16(accu, v0, a0, 0); + accu = vfma_lane_f16(accu, v1, a0, 1); + accu = vfma_lane_f16(accu, v2, a0, 2); + accu = vfma_lane_f16(accu, v3, a0, 3); + return accu; +} + +template <int beta_behavior> // 0: beta == 0.0f16, 1: beta == 1.0f16, 2: beta != 0.0f16 && beta != 1.0f16 +void HGemm_TransposedB_Kernel_M1( + const _mlas_fp16_* A_data, + const _mlas_fp16_* B_data, + _mlas_fp16_* C_data, + size_t CountN, + size_t CountK, + size_t ldb, + _mlas_fp16_ alpha, + _mlas_fp16_ beta +) { + for (; CountN >= 8; CountN -= 8, B_data += 8 * ldb, C_data += 8) { + const auto* a = A_data; + const auto* b = B_data; + size_t k = CountK; + float16x8_t accu0 = MlasZeroFloat16x8(); + float16x8_t accu1 = MlasZeroFloat16x8(); + float16x8_t accu2 = MlasZeroFloat16x8(); + float16x8_t accu3 = MlasZeroFloat16x8(); + float16x8_t accu4 = MlasZeroFloat16x8(); + float16x8_t accu5 = MlasZeroFloat16x8(); + float16x8_t accu6 = MlasZeroFloat16x8(); + float16x8_t accu7 = MlasZeroFloat16x8(); + for (; k >= 8; k -= 8, a += 8, b += 8) { + float16x8_t b0 = MlasLoadFloat16x8(b); + float16x8_t b1 = MlasLoadFloat16x8(b + ldb); + float16x8_t b2 = MlasLoadFloat16x8(b + 2 * ldb); + float16x8_t b3 = MlasLoadFloat16x8(b + 3 * ldb); + float16x8_t b4 = MlasLoadFloat16x8(b + 4 * ldb); + float16x8_t b5 = MlasLoadFloat16x8(b + 5 * ldb); + float16x8_t b6 = MlasLoadFloat16x8(b + 6 * ldb); + float16x8_t b7 = MlasLoadFloat16x8(b + 7 * ldb); + float16x8_t a0 = MlasLoadFloat16x8(a); + accu0 = vfmaq_f16(accu0, b0, a0); + accu1 = vfmaq_f16(accu1, b1, a0); + accu2 = vfmaq_f16(accu2, b2, a0); + accu3 = vfmaq_f16(accu3, b3, a0); + accu4 = vfmaq_f16(accu4, b4, a0); + accu5 = vfmaq_f16(accu5, b5, a0); + accu6 = vfmaq_f16(accu6, b6, a0); + accu7 = vfmaq_f16(accu7, b7, a0); + } + Transpose8x8(accu0, accu1, accu2, accu3, accu4, accu5, accu6, accu7); + accu0 = addq_f16x8(accu0, accu1, accu2, accu3, accu4, accu5, accu6, accu7); // accumulator of 8 columns + + if (k & 4) { + float16x4_t b0 = MlasLoadFloat16x4(b); + float16x4_t b1 = MlasLoadFloat16x4(b + ldb); + float16x4_t b2 = MlasLoadFloat16x4(b + 2 * ldb); + float16x4_t b3 = MlasLoadFloat16x4(b + 3 * ldb); + float16x4_t b4 = 
MlasLoadFloat16x4(b + 4 * ldb); + float16x4_t b5 = MlasLoadFloat16x4(b + 5 * ldb); + float16x4_t b6 = MlasLoadFloat16x4(b + 6 * ldb); + float16x4_t b7 = MlasLoadFloat16x4(b + 7 * ldb); + Transpose4x4(b0, b1, b2, b3); + Transpose4x4(b4, b5, b6, b7); + float16x8_t v0 = vcombine_f16(b0, b4); + float16x8_t v1 = vcombine_f16(b1, b5); + float16x8_t v2 = vcombine_f16(b2, b6); + float16x8_t v3 = vcombine_f16(b3, b7); + float16x4_t a0 = MlasLoadFloat16x4(a); + accu0 = maq_lane_f16_accu(accu0, v0, v1, v2, v3, a0); + k -= 4, a += 4, b += 4; + } + + if (k > 0) { + float16x4_t b0 = MlasLoadPartialFloat16x4(b, k); + float16x4_t b1 = MlasLoadPartialFloat16x4(b + ldb, k); + float16x4_t b2 = MlasLoadPartialFloat16x4(b + 2 * ldb, k); + float16x4_t b3 = MlasLoadPartialFloat16x4(b + 3 * ldb, k); + float16x4_t b4 = MlasLoadPartialFloat16x4(b + 4 * ldb, k); + float16x4_t b5 = MlasLoadPartialFloat16x4(b + 5 * ldb, k); + float16x4_t b6 = MlasLoadPartialFloat16x4(b + 6 * ldb, k); + float16x4_t b7 = MlasLoadPartialFloat16x4(b + 7 * ldb, k); + Transpose4x4(b0, b1, b2, b3); + Transpose4x4(b4, b5, b6, b7); + float16x8_t v0 = vcombine_f16(b0, b4), v1, v2; + float16x4_t a0 = MlasLoadPartialFloat16x4(a, k); + accu0 = vfmaq_lane_f16(accu0, v0, a0, 0); + if (k > 1) { + v1 = vcombine_f16(b1, b5); + accu0 = vfmaq_lane_f16(accu0, v1, a0, 1); + } + if (k > 2) { + v2 = vcombine_f16(b2, b6); + accu0 = vfmaq_lane_f16(accu0, v2, a0, 2); + } + } + + if constexpr (beta_behavior == 1) { + float16x8_t c = MlasLoadFloat16x8(C_data); + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + accu0 = vfmaq_f16(c, accu0, alpha_v); + MlasStoreFloat16x8(C_data, accu0); + } else if constexpr (beta_behavior == 2) { + float16x8_t c = MlasLoadFloat16x8(C_data); + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + float16x8_t beta_v = MlasBroadcastFloat16x8(beta); + accu0 = vfmaq_f16(vmulq_f16(c, beta_v), accu0, alpha_v); + MlasStoreFloat16x8(C_data, accu0); + } else { + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + accu0 = vmulq_f16(accu0, alpha_v); + MlasStoreFloat16x8(C_data, accu0); + } + } + + if (CountN & 4) { + const auto* a = A_data; + const auto* b = B_data; + size_t k = CountK; + float16x8_t accu0 = MlasZeroFloat16x8(); + float16x8_t accu1 = MlasZeroFloat16x8(); + float16x8_t accu2 = MlasZeroFloat16x8(); + float16x8_t accu3 = MlasZeroFloat16x8(); + for (; k >= 8; k -= 8, a += 8, b += 8) { + float16x8_t b0 = MlasLoadFloat16x8(b); + float16x8_t b1 = MlasLoadFloat16x8(b + ldb); + float16x8_t b2 = MlasLoadFloat16x8(b + 2 * ldb); + float16x8_t b3 = MlasLoadFloat16x8(b + 3 * ldb); + float16x8_t a0 = MlasLoadFloat16x8(a); + accu0 = vfmaq_f16(accu0, b0, a0); + accu1 = vfmaq_f16(accu1, b1, a0); + accu2 = vfmaq_f16(accu2, b2, a0); + accu3 = vfmaq_f16(accu3, b3, a0); + } + Transpose4x8(accu0, accu1, accu2, accu3); + accu0 = addq_f16x4(accu0, accu1, accu2, accu3); // accumulator of 4 columns + float16x4_t accu = vadd_f16(vget_low_f16(accu0), vget_high_f16(accu0)); + + if (k & 4) { + float16x4_t b0 = MlasLoadFloat16x4(b); + float16x4_t b1 = MlasLoadFloat16x4(b + ldb); + float16x4_t b2 = MlasLoadFloat16x4(b + 2 * ldb); + float16x4_t b3 = MlasLoadFloat16x4(b + 3 * ldb); + Transpose4x4(b0, b1, b2, b3); + float16x4_t a0 = MlasLoadFloat16x4(a); + accu = ma_lane_f16_accu(accu, b0, b1, b2, b3, a0); + k -= 4, a += 4, b += 4; + } + + if (k > 0) { + float16x4_t b0 = MlasLoadPartialFloat16x4(b, k); + float16x4_t b1 = MlasLoadPartialFloat16x4(b + ldb, k); + float16x4_t b2 = MlasLoadPartialFloat16x4(b + 2 * ldb, k); + float16x4_t b3 = 
MlasLoadPartialFloat16x4(b + 3 * ldb, k); + Transpose4x4(b0, b1, b2, b3); + float16x4_t a0 = MlasLoadPartialFloat16x4(a, k); + accu = vfma_lane_f16(accu, b0, a0, 0); + if (k > 1) { + accu = vfma_lane_f16(accu, b1, a0, 1); + } + if (k > 2) { + accu = vfma_lane_f16(accu, b2, a0, 2); + } + } + + if constexpr (beta_behavior == 1) { + float16x4_t c = MlasLoadFloat16x4(C_data); + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + accu = vfma_f16(c, accu, alpha_v); + MlasStoreFloat16x4(C_data, accu); + } else if constexpr (beta_behavior == 2) { + float16x4_t c = MlasLoadFloat16x4(C_data); + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + float16x4_t beta_v = MlasBroadcastFloat16x4(beta); + accu = vfma_f16(vmul_f16(c, beta_v), accu, alpha_v); + MlasStoreFloat16x4(C_data, accu); + } else { + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + accu = vmul_f16(accu, alpha_v); + MlasStoreFloat16x4(C_data, accu); + } + + CountN -= 4, B_data += 4 * ldb, C_data += 4; + } + + if (CountN > 0) { + const auto* a = A_data; + const auto* b = B_data; + size_t k = CountK; + float16x8_t accus[4]; + size_t i = 0; + for (i = 0; i < 4; ++i) { + accus[i] = MlasZeroFloat16x8(); + } + for (; k >= 8; k -= 8, a += 8, b += 8) { + float16x8_t a0 = MlasLoadFloat16x8(a); + for (i = 0; i < CountN; ++i) { + accus[i] = vfmaq_f16(accus[i], MlasLoadFloat16x8(b + i * ldb), a0); + } + } + Transpose4x8(accus[0], accus[1], accus[2], accus[3]); + float16x8_t accu0 = addq_f16x4(accus[0], accus[1], accus[2], accus[3]); // accumulator of 4 columns + float16x4_t accu = vadd_f16(vget_low_f16(accu0), vget_high_f16(accu0)); + + if (k & 4) { + float16x4_t bs[4]; + for (i = 0; i < CountN; ++i) { + bs[i] = MlasLoadFloat16x4(b + i * ldb); + } + for (; i < 4; ++i) { + bs[i] = MlasZeroFloat16x4(); + } + Transpose4x4(bs[0], bs[1], bs[2], bs[3]); + float16x4_t a0 = MlasLoadFloat16x4(a); + accu = ma_lane_f16_accu(accu, bs[0], bs[1], bs[2], bs[3], a0); + k -= 4, a += 4, b += 4; + } + + if (k > 0) { + float16x4_t bs[4]; + for (i = 0; i < CountN; ++i) { + bs[i] = MlasLoadPartialFloat16x4(b + i * ldb, k); + } + for (; i < 4; ++i) { + bs[i] = MlasZeroFloat16x4(); + } + Transpose4x4(bs[0], bs[1], bs[2], bs[3]); + float16x4_t a0 = MlasLoadPartialFloat16x4(a, k); + accu = vfma_lane_f16(accu, bs[0], a0, 0); + if (k > 1) { + accu = vfma_lane_f16(accu, bs[1], a0, 1); + } + if (k > 2) { + accu = vfma_lane_f16(accu, bs[2], a0, 2); + } + } + + if constexpr (beta_behavior == 1) { + float16x4_t c = MlasLoadPartialFloat16x4(C_data, CountN); + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + accu = vfma_f16(c, accu, alpha_v); + MlasStorePartialFloat16x4(C_data, accu, CountN); + } else if constexpr (beta_behavior == 2) { + float16x4_t c = MlasLoadPartialFloat16x4(C_data, CountN); + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + float16x4_t beta_v = MlasBroadcastFloat16x4(beta); + accu = vfma_f16(vmul_f16(c, beta_v), accu, alpha_v); + MlasStorePartialFloat16x4(C_data, accu, CountN); + } else { + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + accu = vmul_f16(accu, alpha_v); + MlasStorePartialFloat16x4(C_data, accu, CountN); + } + } +} + +template <int beta_behavior> // 0: beta == 0.0f16, 1: beta == 1.0f16, 2: beta != 0.0f16 && beta != 1.0f16 +void HGemm_TransposedB_Kernel_M2( + const _mlas_fp16_* A_data, + const _mlas_fp16_* B_data, + _mlas_fp16_* C_data, + size_t CountN, + size_t CountK, + size_t lda, + size_t ldb, + size_t ldc, + _mlas_fp16_ alpha, + _mlas_fp16_ beta +) { + for (; CountN >= 8; CountN -= 8, B_data += 8 * ldb, C_data += 8) { + const auto* 
a = A_data; + const auto* b = B_data; + size_t k = CountK; + float16x8_t accu00 = MlasZeroFloat16x8(); + float16x8_t accu01 = MlasZeroFloat16x8(); + float16x8_t accu02 = MlasZeroFloat16x8(); + float16x8_t accu03 = MlasZeroFloat16x8(); + float16x8_t accu04 = MlasZeroFloat16x8(); + float16x8_t accu05 = MlasZeroFloat16x8(); + float16x8_t accu06 = MlasZeroFloat16x8(); + float16x8_t accu07 = MlasZeroFloat16x8(); + float16x8_t accu10 = MlasZeroFloat16x8(); + float16x8_t accu11 = MlasZeroFloat16x8(); + float16x8_t accu12 = MlasZeroFloat16x8(); + float16x8_t accu13 = MlasZeroFloat16x8(); + float16x8_t accu14 = MlasZeroFloat16x8(); + float16x8_t accu15 = MlasZeroFloat16x8(); + float16x8_t accu16 = MlasZeroFloat16x8(); + float16x8_t accu17 = MlasZeroFloat16x8(); + for (; k >= 8; k -= 8, a += 8, b += 8) { + float16x8_t b0 = MlasLoadFloat16x8(b); + float16x8_t b1 = MlasLoadFloat16x8(b + ldb); + float16x8_t b2 = MlasLoadFloat16x8(b + 2 * ldb); + float16x8_t b3 = MlasLoadFloat16x8(b + 3 * ldb); + float16x8_t b4 = MlasLoadFloat16x8(b + 4 * ldb); + float16x8_t b5 = MlasLoadFloat16x8(b + 5 * ldb); + float16x8_t b6 = MlasLoadFloat16x8(b + 6 * ldb); + float16x8_t b7 = MlasLoadFloat16x8(b + 7 * ldb); + float16x8_t a0 = MlasLoadFloat16x8(a); + float16x8_t a1 = MlasLoadFloat16x8(a + lda); + accu00 = vfmaq_f16(accu00, b0, a0); + accu01 = vfmaq_f16(accu01, b1, a0); + accu02 = vfmaq_f16(accu02, b2, a0); + accu03 = vfmaq_f16(accu03, b3, a0); + accu04 = vfmaq_f16(accu04, b4, a0); + accu05 = vfmaq_f16(accu05, b5, a0); + accu06 = vfmaq_f16(accu06, b6, a0); + accu07 = vfmaq_f16(accu07, b7, a0); + accu10 = vfmaq_f16(accu10, b0, a1); + accu11 = vfmaq_f16(accu11, b1, a1); + accu12 = vfmaq_f16(accu12, b2, a1); + accu13 = vfmaq_f16(accu13, b3, a1); + accu14 = vfmaq_f16(accu14, b4, a1); + accu15 = vfmaq_f16(accu15, b5, a1); + accu16 = vfmaq_f16(accu16, b6, a1); + accu17 = vfmaq_f16(accu17, b7, a1); + } + Transpose8x8(accu00, accu01, accu02, accu03, accu04, accu05, accu06, accu07); + Transpose8x8(accu10, accu11, accu12, accu13, accu14, accu15, accu16, accu17); + accu00 = addq_f16x8(accu00, accu01, accu02, accu03, accu04, accu05, accu06, accu07); + accu10 = addq_f16x8(accu10, accu11, accu12, accu13, accu14, accu15, accu16, accu17); + + if (k & 4) { + float16x4_t b0 = MlasLoadFloat16x4(b); + float16x4_t b1 = MlasLoadFloat16x4(b + ldb); + float16x4_t b2 = MlasLoadFloat16x4(b + 2 * ldb); + float16x4_t b3 = MlasLoadFloat16x4(b + 3 * ldb); + float16x4_t b4 = MlasLoadFloat16x4(b + 4 * ldb); + float16x4_t b5 = MlasLoadFloat16x4(b + 5 * ldb); + float16x4_t b6 = MlasLoadFloat16x4(b + 6 * ldb); + float16x4_t b7 = MlasLoadFloat16x4(b + 7 * ldb); + Transpose4x4(b0, b1, b2, b3); + Transpose4x4(b4, b5, b6, b7); + float16x8_t v0 = vcombine_f16(b0, b4); + float16x8_t v1 = vcombine_f16(b1, b5); + float16x8_t v2 = vcombine_f16(b2, b6); + float16x8_t v3 = vcombine_f16(b3, b7); + float16x4_t a0 = MlasLoadFloat16x4(a); + float16x4_t a1 = MlasLoadFloat16x4(a + lda); + accu00 = maq_lane_f16_accu(accu00, v0, v1, v2, v3, a0); + accu10 = maq_lane_f16_accu(accu10, v0, v1, v2, v3, a1); + k -= 4, a += 4, b += 4; + } + + if (k > 0) { + float16x4_t b0 = MlasLoadPartialFloat16x4(b, k); + float16x4_t b1 = MlasLoadPartialFloat16x4(b + ldb, k); + float16x4_t b2 = MlasLoadPartialFloat16x4(b + 2 * ldb, k); + float16x4_t b3 = MlasLoadPartialFloat16x4(b + 3 * ldb, k); + float16x4_t b4 = MlasLoadPartialFloat16x4(b + 4 * ldb, k); + float16x4_t b5 = MlasLoadPartialFloat16x4(b + 5 * ldb, k); + float16x4_t b6 = MlasLoadPartialFloat16x4(b + 6 * ldb, k); + float16x4_t b7 
= MlasLoadPartialFloat16x4(b + 7 * ldb, k); + Transpose4x4(b0, b1, b2, b3); + Transpose4x4(b4, b5, b6, b7); + float16x8_t v0 = vcombine_f16(b0, b4); + float16x4_t a0 = MlasLoadPartialFloat16x4(a, k); + float16x4_t a1 = MlasLoadPartialFloat16x4(a + lda, k); + accu00 = vfmaq_lane_f16(accu00, v0, a0, 0); + accu10 = vfmaq_lane_f16(accu10, v0, a1, 0); + if (k > 1) { + float16x8_t v1 = vcombine_f16(b1, b5); + accu00 = vfmaq_lane_f16(accu00, v1, a0, 1); + accu10 = vfmaq_lane_f16(accu10, v1, a1, 1); + } + if (k > 2) { + float16x8_t v2 = vcombine_f16(b2, b6); + accu00 = vfmaq_lane_f16(accu00, v2, a0, 2); + accu10 = vfmaq_lane_f16(accu10, v2, a1, 2); + } + } + + if constexpr (beta_behavior == 1) { + float16x8_t c0 = MlasLoadFloat16x8(C_data); + float16x8_t c1 = MlasLoadFloat16x8(C_data + ldc); + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + accu00 = vfmaq_f16(c0, accu00, alpha_v); + accu10 = vfmaq_f16(c1, accu10, alpha_v); + MlasStoreFloat16x8(C_data, accu00); + MlasStoreFloat16x8(C_data + ldc, accu10); + } else if constexpr (beta_behavior == 2) { + float16x8_t c0 = MlasLoadFloat16x8(C_data); + float16x8_t c1 = MlasLoadFloat16x8(C_data + ldc); + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + float16x8_t beta_v = MlasBroadcastFloat16x8(beta); + accu00 = vfmaq_f16(vmulq_f16(c0, beta_v), accu00, alpha_v); + accu10 = vfmaq_f16(vmulq_f16(c1, beta_v), accu10, alpha_v); + MlasStoreFloat16x8(C_data, accu00); + MlasStoreFloat16x8(C_data + ldc, accu10); + } else { + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + accu00 = vmulq_f16(accu00, alpha_v); + accu10 = vmulq_f16(accu10, alpha_v); + MlasStoreFloat16x8(C_data, accu00); + MlasStoreFloat16x8(C_data + ldc, accu10); + } + } + + if (CountN & 4) { + const auto* a = A_data; + const auto* b = B_data; + size_t k = CountK; + float16x8_t accu00 = MlasZeroFloat16x8(); + float16x8_t accu01 = MlasZeroFloat16x8(); + float16x8_t accu02 = MlasZeroFloat16x8(); + float16x8_t accu03 = MlasZeroFloat16x8(); + float16x8_t accu10 = MlasZeroFloat16x8(); + float16x8_t accu11 = MlasZeroFloat16x8(); + float16x8_t accu12 = MlasZeroFloat16x8(); + float16x8_t accu13 = MlasZeroFloat16x8(); + for (; k >= 8; k -= 8, a += 8, b += 8) { + float16x8_t b0 = MlasLoadFloat16x8(b); + float16x8_t b1 = MlasLoadFloat16x8(b + ldb); + float16x8_t b2 = MlasLoadFloat16x8(b + 2 * ldb); + float16x8_t b3 = MlasLoadFloat16x8(b + 3 * ldb); + float16x8_t a0 = MlasLoadFloat16x8(a); + float16x8_t a1 = MlasLoadFloat16x8(a + lda); + accu00 = vfmaq_f16(accu00, b0, a0); + accu01 = vfmaq_f16(accu01, b1, a0); + accu02 = vfmaq_f16(accu02, b2, a0); + accu03 = vfmaq_f16(accu03, b3, a0); + accu10 = vfmaq_f16(accu10, b0, a1); + accu11 = vfmaq_f16(accu11, b1, a1); + accu12 = vfmaq_f16(accu12, b2, a1); + accu13 = vfmaq_f16(accu13, b3, a1); + } + Transpose4x8(accu00, accu01, accu02, accu03); + Transpose4x8(accu10, accu11, accu12, accu13); + accu00 = addq_f16x4(accu00, accu01, accu02, accu03); + accu10 = addq_f16x4(accu10, accu11, accu12, accu13); + float16x4_t accu0 = vadd_f16(vget_low_f16(accu00), vget_high_f16(accu00)); + float16x4_t accu1 = vadd_f16(vget_low_f16(accu10), vget_high_f16(accu10)); + + if (k & 4) { + float16x4_t b0 = MlasLoadFloat16x4(b); + float16x4_t b1 = MlasLoadFloat16x4(b + ldb); + float16x4_t b2 = MlasLoadFloat16x4(b + 2 * ldb); + float16x4_t b3 = MlasLoadFloat16x4(b + 3 * ldb); + Transpose4x4(b0, b1, b2, b3); + float16x4_t a0 = MlasLoadFloat16x4(a); + float16x4_t a1 = MlasLoadFloat16x4(a + lda); + accu0 = ma_lane_f16_accu(accu0, b0, b1, b2, b3, a0); + accu1 = 
ma_lane_f16_accu(accu1, b0, b1, b2, b3, a1); + k -= 4, a += 4, b += 4; + } + + if (k > 0) { + float16x4_t b0 = MlasLoadPartialFloat16x4(b, k); + float16x4_t b1 = MlasLoadPartialFloat16x4(b + ldb, k); + float16x4_t b2 = MlasLoadPartialFloat16x4(b + 2 * ldb, k); + float16x4_t b3 = MlasLoadPartialFloat16x4(b + 3 * ldb, k); + Transpose4x4(b0, b1, b2, b3); + float16x4_t a0 = MlasLoadPartialFloat16x4(a, k); + float16x4_t a1 = MlasLoadPartialFloat16x4(a + lda, k); + accu0 = vfma_lane_f16(accu0, b0, a0, 0); + accu1 = vfma_lane_f16(accu1, b0, a1, 0); + if (k > 1) { + accu0 = vfma_lane_f16(accu0, b1, a0, 1); + accu1 = vfma_lane_f16(accu1, b1, a1, 1); + } + if (k > 2) { + accu0 = vfma_lane_f16(accu0, b2, a0, 2); + accu1 = vfma_lane_f16(accu1, b2, a1, 2); + } + } + + if constexpr (beta_behavior == 1) { + float16x4_t c0 = MlasLoadFloat16x4(C_data); + float16x4_t c1 = MlasLoadFloat16x4(C_data + ldc); + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + accu0 = vfma_f16(c0, accu0, alpha_v); + accu1 = vfma_f16(c1, accu1, alpha_v); + MlasStoreFloat16x4(C_data, accu0); + MlasStoreFloat16x4(C_data + ldc, accu1); + } else if constexpr (beta_behavior == 2) { + float16x4_t c0 = MlasLoadFloat16x4(C_data); + float16x4_t c1 = MlasLoadFloat16x4(C_data + ldc); + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + float16x4_t beta_v = MlasBroadcastFloat16x4(beta); + accu0 = vfma_f16(vmul_f16(c0, beta_v), accu0, alpha_v); + accu1 = vfma_f16(vmul_f16(c1, beta_v), accu1, alpha_v); + MlasStoreFloat16x4(C_data, accu0); + MlasStoreFloat16x4(C_data + ldc, accu1); + } else { + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + accu0 = vmul_f16(accu0, alpha_v); + accu1 = vmul_f16(accu1, alpha_v); + MlasStoreFloat16x4(C_data, accu0); + MlasStoreFloat16x4(C_data + ldc, accu1); + } + + CountN -= 4, B_data += 4 * ldb, C_data += 4; + } + + if (CountN > 0) { + const auto* a = A_data; + const auto* b = B_data; + size_t k = CountK; + float16x8_t accu0[4]; + float16x8_t accu1[4]; + size_t i = 0; + for (i = 0; i < 4; ++i) { + accu0[i] = MlasZeroFloat16x8(); + accu1[i] = MlasZeroFloat16x8(); + } + for (; k >= 8; k -= 8, a += 8, b += 8) { + float16x8_t a0 = MlasLoadFloat16x8(a); + float16x8_t a1 = MlasLoadFloat16x8(a + lda); + for (i = 0; i < CountN; ++i) { + float16x8_t bi = MlasLoadFloat16x8(b + i * ldb); + accu0[i] = vfmaq_f16(accu0[i], bi, a0); + accu1[i] = vfmaq_f16(accu1[i], bi, a1); + } + } + Transpose4x8(accu0[0], accu0[1], accu0[2], accu0[3]); + Transpose4x8(accu1[0], accu1[1], accu1[2], accu1[3]); + float16x8_t accu00 = addq_f16x4(accu0[0], accu0[1], accu0[2], accu0[3]); + float16x4_t accu_0 = vadd_f16(vget_low_f16(accu00), vget_high_f16(accu00)); + float16x8_t accu10 = addq_f16x4(accu1[0], accu1[1], accu1[2], accu1[3]); + float16x4_t accu_1 = vadd_f16(vget_low_f16(accu10), vget_high_f16(accu10)); + + if (k & 4) { + float16x4_t bs[4]; + for (i = 0; i < CountN; ++i) { + bs[i] = MlasLoadFloat16x4(b + i * ldb); + } + for (; i < 4; ++i) { + bs[i] = MlasZeroFloat16x4(); + } + Transpose4x4(bs[0], bs[1], bs[2], bs[3]); + float16x4_t a0 = MlasLoadFloat16x4(a); + float16x4_t a1 = MlasLoadFloat16x4(a + lda); + accu_0 = ma_lane_f16_accu(accu_0, bs[0], bs[1], bs[2], bs[3], a0); + accu_1 = ma_lane_f16_accu(accu_1, bs[0], bs[1], bs[2], bs[3], a1); + k -= 4, a += 4, b += 4; + } + + if (k > 0) { + float16x4_t bs[4]; + for (i = 0; i < CountN; ++i) { + bs[i] = MlasLoadPartialFloat16x4(b + i * ldb, k); + } + for (; i < 4; ++i) { + bs[i] = MlasZeroFloat16x4(); + } + Transpose4x4(bs[0], bs[1], bs[2], bs[3]); + float16x4_t a0 = 
MlasLoadPartialFloat16x4(a, k); + float16x4_t a1 = MlasLoadPartialFloat16x4(a + lda, k); + accu_0 = vfma_lane_f16(accu_0, bs[0], a0, 0); + accu_1 = vfma_lane_f16(accu_1, bs[0], a1, 0); + if (k > 1) { + accu_0 = vfma_lane_f16(accu_0, bs[1], a0, 1); + accu_1 = vfma_lane_f16(accu_1, bs[1], a1, 1); + } + if (k > 2) { + accu_0 = vfma_lane_f16(accu_0, bs[2], a0, 2); + accu_1 = vfma_lane_f16(accu_1, bs[2], a1, 2); + } + } + + if constexpr (beta_behavior == 1) { + float16x4_t c0 = MlasLoadPartialFloat16x4(C_data, CountN); + float16x4_t c1 = MlasLoadPartialFloat16x4(C_data + ldc, CountN); + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + accu_0 = vfma_f16(c0, accu_0, alpha_v); + accu_1 = vfma_f16(c1, accu_1, alpha_v); + MlasStorePartialFloat16x4(C_data, accu_0, CountN); + MlasStorePartialFloat16x4(C_data + ldc, accu_1, CountN); + } else if constexpr (beta_behavior == 2) { + float16x4_t c0 = MlasLoadPartialFloat16x4(C_data, CountN); + float16x4_t c1 = MlasLoadPartialFloat16x4(C_data + ldc, CountN); + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + float16x4_t beta_v = MlasBroadcastFloat16x4(beta); + accu_0 = vfma_f16(vmul_f16(c0, beta_v), accu_0, alpha_v); + accu_1 = vfma_f16(vmul_f16(c1, beta_v), accu_1, alpha_v); + MlasStorePartialFloat16x4(C_data, accu_0, CountN); + MlasStorePartialFloat16x4(C_data + ldc, accu_1, CountN); + } else { + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + accu_0 = vmul_f16(accu_0, alpha_v); + accu_1 = vmul_f16(accu_1, alpha_v); + MlasStorePartialFloat16x4(C_data, accu_0, CountN); + MlasStorePartialFloat16x4(C_data + ldc, accu_1, CountN); + } + } +} + +// Full K. Directly save to C. +void HGemm_TransposedB_Kernel( + const MLAS_FP16* A, + const MLAS_FP16* B, + MLAS_FP16* C, + size_t CountM, + size_t CountN, + size_t CountK, + size_t lda, + size_t ldb, + size_t ldc, + _mlas_fp16_ alpha, + _mlas_fp16_ beta +) { + if (CountM > 2) { + MLAS_THROW_EX(std::runtime_error, "HGemm_TransposedB_Kernel only supports <= 2 rows"); + } + const auto* A_data = reinterpret_cast<const _mlas_fp16_*>(A); + const auto* B_data = reinterpret_cast<const _mlas_fp16_*>(B); + auto* C_data = reinterpret_cast<_mlas_fp16_*>(C); + const auto f16_0 = MLAS_FP16(0.0f); + const auto f16_1 = MLAS_FP16(1.0f); + if (CountM == 1) { + if (beta == f16_0.val) { + HGemm_TransposedB_Kernel_M1<0>(A_data, B_data, C_data, CountN, CountK, ldb, alpha, beta); + } else if (beta == f16_1.val) { + HGemm_TransposedB_Kernel_M1<1>(A_data, B_data, C_data, CountN, CountK, ldb, alpha, beta); + } else { + HGemm_TransposedB_Kernel_M1<2>(A_data, B_data, C_data, CountN, CountK, ldb, alpha, beta); + } + } else { + if (beta == f16_0.val) { + HGemm_TransposedB_Kernel_M2<0>(A_data, B_data, C_data, CountN, CountK, lda, ldb, ldc, alpha, beta); + } else if (beta == f16_1.val) { + HGemm_TransposedB_Kernel_M2<1>(A_data, B_data, C_data, CountN, CountK, lda, ldb, ldc, alpha, beta); + } else { + HGemm_TransposedB_Kernel_M2<2>(A_data, B_data, C_data, CountN, CountK, lda, ldb, ldc, alpha, beta); + } + } +} + +template <int beta_behavior> // 0: beta == 0, 1: beta == 1, 2: beta != 0 && beta != 1 +void HGemm_TransposedPackedB_Kernel_M1( + const _mlas_fp16_* A, + const _mlas_fp16_* PackedB, + _mlas_fp16_* C, + size_t CountN, + size_t CountK, + _mlas_fp16_ alpha, + _mlas_fp16_ beta +) { + for (; CountN >= 16; CountN -= 16, C += 16) { + const auto* a = A; + size_t k = CountK; + float16x8_t accu0 = MlasZeroFloat16x8(); + float16x8_t accu1 = MlasZeroFloat16x8(); + for (; k >= 8; k -= 8, a += 8, PackedB += 8 * 16) { + float16x8_t b00 = MlasLoadFloat16x8(PackedB); + float16x8_t b01 = 
MlasLoadFloat16x8(PackedB + 8); + float16x8_t b10 = MlasLoadFloat16x8(PackedB + 16); + float16x8_t b11 = MlasLoadFloat16x8(PackedB + 24); + float16x8_t b20 = MlasLoadFloat16x8(PackedB + 32); + float16x8_t b21 = MlasLoadFloat16x8(PackedB + 40); + float16x8_t b30 = MlasLoadFloat16x8(PackedB + 48); + float16x8_t b31 = MlasLoadFloat16x8(PackedB + 56); + float16x8_t b40 = MlasLoadFloat16x8(PackedB + 64); + float16x8_t b41 = MlasLoadFloat16x8(PackedB + 72); + float16x8_t b50 = MlasLoadFloat16x8(PackedB + 80); + float16x8_t b51 = MlasLoadFloat16x8(PackedB + 88); + float16x8_t b60 = MlasLoadFloat16x8(PackedB + 96); + float16x8_t b61 = MlasLoadFloat16x8(PackedB + 104); + float16x8_t b70 = MlasLoadFloat16x8(PackedB + 112); + float16x8_t b71 = MlasLoadFloat16x8(PackedB + 120); + float16x8_t a0 = MlasLoadFloat16x8(a); + accu0 = maq_laneq_f16_accu(accu0, b00, b10, b20, b30, b40, b50, b60, b70, a0); + accu1 = maq_laneq_f16_accu(accu1, b01, b11, b21, b31, b41, b51, b61, b71, a0); + } + + if (k & 4) { + float16x8_t b00 = MlasLoadFloat16x8(PackedB); + float16x8_t b01 = MlasLoadFloat16x8(PackedB + 8); + float16x8_t b10 = MlasLoadFloat16x8(PackedB + 16); + float16x8_t b11 = MlasLoadFloat16x8(PackedB + 24); + float16x8_t b20 = MlasLoadFloat16x8(PackedB + 32); + float16x8_t b21 = MlasLoadFloat16x8(PackedB + 40); + float16x8_t b30 = MlasLoadFloat16x8(PackedB + 48); + float16x8_t b31 = MlasLoadFloat16x8(PackedB + 56); + float16x4_t a0 = MlasLoadFloat16x4(a); + accu0 = maq_lane_f16_accu(accu0, b00, b10, b20, b30, a0); + accu1 = maq_lane_f16_accu(accu1, b01, b11, b21, b31, a0); + k -= 4, a += 4, PackedB += 4 * 16; + } + + if (k > 0) { + float16x4_t a0 = MlasLoadPartialFloat16x4(a, k); + float16x8_t b00 = MlasLoadFloat16x8(PackedB); + float16x8_t b01 = MlasLoadFloat16x8(PackedB + 8); + accu0 = vfmaq_lane_f16(accu0, b00, a0, 0); + accu1 = vfmaq_lane_f16(accu1, b01, a0, 0); + if (k > 1) { + float16x8_t b10 = MlasLoadFloat16x8(PackedB + 16); + float16x8_t b11 = MlasLoadFloat16x8(PackedB + 24); + accu0 = vfmaq_lane_f16(accu0, b10, a0, 1); + accu1 = vfmaq_lane_f16(accu1, b11, a0, 1); + } + if (k > 2) { + float16x8_t b20 = MlasLoadFloat16x8(PackedB + 32); + float16x8_t b21 = MlasLoadFloat16x8(PackedB + 40); + accu0 = vfmaq_lane_f16(accu0, b20, a0, 2); + accu1 = vfmaq_lane_f16(accu1, b21, a0, 2); + } + + PackedB += k * 16; + } + + if constexpr (beta_behavior == 1) { + float16x8_t c0 = MlasLoadFloat16x8(C); + float16x8_t c1 = MlasLoadFloat16x8(C + 8); + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + accu0 = vfmaq_f16(c0, accu0, alpha_v); + accu1 = vfmaq_f16(c1, accu1, alpha_v); + MlasStoreFloat16x8(C, accu0); + MlasStoreFloat16x8(C + 8, accu1); + } else if constexpr (beta_behavior == 2) { + float16x8_t c0 = MlasLoadFloat16x8(C); + float16x8_t c1 = MlasLoadFloat16x8(C + 8); + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + float16x8_t beta_v = MlasBroadcastFloat16x8(beta); + accu0 = vfmaq_f16(vmulq_f16(c0, beta_v), accu0, alpha_v); + accu1 = vfmaq_f16(vmulq_f16(c1, beta_v), accu1, alpha_v); + MlasStoreFloat16x8(C, accu0); + MlasStoreFloat16x8(C + 8, accu1); + } else { + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + accu0 = vmulq_f16(accu0, alpha_v); + accu1 = vmulq_f16(accu1, alpha_v); + MlasStoreFloat16x8(C, accu0); + MlasStoreFloat16x8(C + 8, accu1); + } + } + + if (CountN & 8) { + const auto* a = A; + size_t k = CountK; + float16x8_t accu0 = MlasZeroFloat16x8(); + for (; k >= 8; k -= 8, a += 8, PackedB += 8 * 8) { + float16x8_t b0 = MlasLoadFloat16x8(PackedB); + float16x8_t b1 = 
MlasLoadFloat16x8(PackedB + 8); + float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16); + float16x8_t b3 = MlasLoadFloat16x8(PackedB + 24); + float16x8_t b4 = MlasLoadFloat16x8(PackedB + 32); + float16x8_t b5 = MlasLoadFloat16x8(PackedB + 40); + float16x8_t b6 = MlasLoadFloat16x8(PackedB + 48); + float16x8_t b7 = MlasLoadFloat16x8(PackedB + 56); + float16x8_t a0 = MlasLoadFloat16x8(a); + accu0 = maq_laneq_f16_accu(accu0, b0, b1, b2, b3, b4, b5, b6, b7, a0); + } + + if (k & 4) { + float16x8_t b0 = MlasLoadFloat16x8(PackedB); + float16x8_t b1 = MlasLoadFloat16x8(PackedB + 8); + float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16); + float16x8_t b3 = MlasLoadFloat16x8(PackedB + 24); + float16x4_t a0 = MlasLoadFloat16x4(a); + accu0 = maq_lane_f16_accu(accu0, b0, b1, b2, b3, a0); + k -= 4, a += 4, PackedB += 4 * 8; + } + + if (k > 0) { + float16x4_t a0 = MlasLoadPartialFloat16x4(a, k); + float16x8_t b0 = MlasLoadFloat16x8(PackedB); + accu0 = vfmaq_lane_f16(accu0, b0, a0, 0); + if (k > 1) { + float16x8_t b1 = MlasLoadFloat16x8(PackedB + 8); + accu0 = vfmaq_lane_f16(accu0, b1, a0, 1); + } + if (k > 2) { + float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16); + accu0 = vfmaq_lane_f16(accu0, b2, a0, 2); + } + PackedB += k * 8; + } + + if constexpr (beta_behavior == 1) { + float16x8_t c0 = MlasLoadFloat16x8(C); + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + accu0 = vfmaq_f16(c0, accu0, alpha_v); + MlasStoreFloat16x8(C, accu0); + } else if constexpr (beta_behavior == 2) { + float16x8_t c0 = MlasLoadFloat16x8(C); + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + float16x8_t beta_v = MlasBroadcastFloat16x8(beta); + accu0 = vfmaq_f16(vmulq_f16(c0, beta_v), accu0, alpha_v); + MlasStoreFloat16x8(C, accu0); + } else { + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + accu0 = vmulq_f16(accu0, alpha_v); + MlasStoreFloat16x8(C, accu0); + } + + CountN -= 8, C += 8; + } + + if (CountN > 0) { + const auto* a = A; + size_t k = CountK; + float16x8_t accu0 = MlasZeroFloat16x8(); + for (; k >= 8; k -= 8, a += 8, PackedB += 8 * 8) { + float16x8_t b0 = MlasLoadFloat16x8(PackedB); + float16x8_t b1 = MlasLoadFloat16x8(PackedB + 8); + float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16); + float16x8_t b3 = MlasLoadFloat16x8(PackedB + 24); + float16x8_t b4 = MlasLoadFloat16x8(PackedB + 32); + float16x8_t b5 = MlasLoadFloat16x8(PackedB + 40); + float16x8_t b6 = MlasLoadFloat16x8(PackedB + 48); + float16x8_t b7 = MlasLoadFloat16x8(PackedB + 56); + float16x8_t a0 = MlasLoadFloat16x8(a); + accu0 = maq_laneq_f16_accu(accu0, b0, b1, b2, b3, b4, b5, b6, b7, a0); + } + + if (k & 4) { + float16x8_t b0 = MlasLoadFloat16x8(PackedB); + float16x8_t b1 = MlasLoadFloat16x8(PackedB + 8); + float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16); + float16x8_t b3 = MlasLoadFloat16x8(PackedB + 24); + float16x4_t a0 = MlasLoadFloat16x4(a); + accu0 = maq_lane_f16_accu(accu0, b0, b1, b2, b3, a0); + k -= 4, a += 4, PackedB += 4 * 8; + } + + if (k > 0) { + float16x4_t a0 = MlasLoadPartialFloat16x4(a, k); + float16x8_t b0 = MlasLoadFloat16x8(PackedB); + accu0 = vfmaq_lane_f16(accu0, b0, a0, 0); + if (k > 1) { + float16x8_t b1 = MlasLoadFloat16x8(PackedB + 8); + accu0 = vfmaq_lane_f16(accu0, b1, a0, 1); + } + if (k > 2) { + float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16); + accu0 = vfmaq_lane_f16(accu0, b2, a0, 2); + } + PackedB += k * 8; + } + + float16x4_t accu_low = vget_low_f16(accu0); + float16x4_t accu_high = vget_high_f16(accu0); + + if (CountN & 4) { + if constexpr (beta_behavior == 1) { + float16x4_t c0 = MlasLoadFloat16x4(C); + 
float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + MlasStoreFloat16x4(C, vfma_f16(c0, accu_low, alpha_v)); + } else if constexpr (beta_behavior == 2) { + float16x4_t c0 = MlasLoadFloat16x4(C); + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + float16x4_t beta_v = MlasBroadcastFloat16x4(beta); + MlasStoreFloat16x4(C, vfma_f16(vmul_f16(c0, beta_v), accu_low, alpha_v)); + } else { + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + MlasStoreFloat16x4(C, vmul_f16(accu_low, alpha_v)); + } + + CountN -= 4, C += 4; + accu_low = accu_high; + } + + if (CountN) { + if constexpr (beta_behavior == 1) { + float16x4_t c0 = MlasLoadPartialFloat16x4(C, CountN); + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + MlasStorePartialFloat16x4(C, vfma_f16(c0, accu_low, alpha_v), CountN); + } else if constexpr (beta_behavior == 2) { + float16x4_t c0 = MlasLoadPartialFloat16x4(C, CountN); + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + float16x4_t beta_v = MlasBroadcastFloat16x4(beta); + MlasStorePartialFloat16x4(C, vfma_f16(vmul_f16(c0, beta_v), accu_low, alpha_v), CountN); + } else { + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + MlasStorePartialFloat16x4(C, vmul_f16(accu_low, alpha_v), CountN); + } + } + } +} + +template <int beta_behavior> // 0: beta == 0, 1: beta == 1, 2: beta != 0 && beta != 1 +void HGemm_TransposedPackedB_Kernel_M2( + const _mlas_fp16_* A, + const _mlas_fp16_* PackedB, + _mlas_fp16_* C, + size_t CountN, + size_t CountK, + size_t lda, + size_t ldc, + _mlas_fp16_ alpha, + _mlas_fp16_ beta +) { + for (; CountN >= 16; CountN -= 16, C += 16) { + const auto* a = A; + size_t k = CountK; + float16x8_t accu00 = MlasZeroFloat16x8(); + float16x8_t accu01 = MlasZeroFloat16x8(); + float16x8_t accu10 = MlasZeroFloat16x8(); + float16x8_t accu11 = MlasZeroFloat16x8(); + for (; k >= 8; k -= 8, a += 8, PackedB += 8 * 16) { + float16x8_t b00 = MlasLoadFloat16x8(PackedB); + float16x8_t b01 = MlasLoadFloat16x8(PackedB + 8); + float16x8_t b10 = MlasLoadFloat16x8(PackedB + 16); + float16x8_t b11 = MlasLoadFloat16x8(PackedB + 24); + float16x8_t b20 = MlasLoadFloat16x8(PackedB + 32); + float16x8_t b21 = MlasLoadFloat16x8(PackedB + 40); + float16x8_t b30 = MlasLoadFloat16x8(PackedB + 48); + float16x8_t b31 = MlasLoadFloat16x8(PackedB + 56); + float16x8_t b40 = MlasLoadFloat16x8(PackedB + 64); + float16x8_t b41 = MlasLoadFloat16x8(PackedB + 72); + float16x8_t b50 = MlasLoadFloat16x8(PackedB + 80); + float16x8_t b51 = MlasLoadFloat16x8(PackedB + 88); + float16x8_t b60 = MlasLoadFloat16x8(PackedB + 96); + float16x8_t b61 = MlasLoadFloat16x8(PackedB + 104); + float16x8_t b70 = MlasLoadFloat16x8(PackedB + 112); + float16x8_t b71 = MlasLoadFloat16x8(PackedB + 120); + float16x8_t a0 = MlasLoadFloat16x8(a); + float16x8_t a1 = MlasLoadFloat16x8(a + lda); + accu00 = maq_laneq_f16_accu(accu00, b00, b10, b20, b30, b40, b50, b60, b70, a0); + accu01 = maq_laneq_f16_accu(accu01, b01, b11, b21, b31, b41, b51, b61, b71, a0); + accu10 = maq_laneq_f16_accu(accu10, b00, b10, b20, b30, b40, b50, b60, b70, a1); + accu11 = maq_laneq_f16_accu(accu11, b01, b11, b21, b31, b41, b51, b61, b71, a1); + } + + if (k & 4) { + float16x8_t b00 = MlasLoadFloat16x8(PackedB); + float16x8_t b01 = MlasLoadFloat16x8(PackedB + 8); + float16x8_t b10 = MlasLoadFloat16x8(PackedB + 16); + float16x8_t b11 = MlasLoadFloat16x8(PackedB + 24); + float16x8_t b20 = MlasLoadFloat16x8(PackedB + 32); + float16x8_t b21 = MlasLoadFloat16x8(PackedB + 40); + float16x8_t b30 = MlasLoadFloat16x8(PackedB + 48); + float16x8_t b31 = MlasLoadFloat16x8(PackedB 
+ 56); + float16x4_t a0 = MlasLoadFloat16x4(a); + float16x4_t a1 = MlasLoadFloat16x4(a + lda); + accu00 = maq_lane_f16_accu(accu00, b00, b10, b20, b30, a0); + accu01 = maq_lane_f16_accu(accu01, b01, b11, b21, b31, a0); + accu10 = maq_lane_f16_accu(accu10, b00, b10, b20, b30, a1); + accu11 = maq_lane_f16_accu(accu11, b01, b11, b21, b31, a1); + k -= 4, a += 4, PackedB += 4 * 16; + } + + if (k > 0) { + float16x4_t a0 = MlasLoadPartialFloat16x4(a, k); + float16x4_t a1 = MlasLoadPartialFloat16x4(a + lda, k); + float16x8_t b00 = MlasLoadFloat16x8(PackedB); + float16x8_t b01 = MlasLoadFloat16x8(PackedB + 8); + accu00 = vfmaq_lane_f16(accu00, b00, a0, 0); + accu01 = vfmaq_lane_f16(accu01, b01, a0, 0); + accu10 = vfmaq_lane_f16(accu10, b00, a1, 0); + accu11 = vfmaq_lane_f16(accu11, b01, a1, 0); + if (k > 1) { + float16x8_t b10 = MlasLoadFloat16x8(PackedB + 16); + float16x8_t b11 = MlasLoadFloat16x8(PackedB + 24); + accu00 = vfmaq_lane_f16(accu00, b10, a0, 1); + accu01 = vfmaq_lane_f16(accu01, b11, a0, 1); + accu10 = vfmaq_lane_f16(accu10, b10, a1, 1); + accu11 = vfmaq_lane_f16(accu11, b11, a1, 1); + } + if (k > 2) { + float16x8_t b20 = MlasLoadFloat16x8(PackedB + 32); + float16x8_t b21 = MlasLoadFloat16x8(PackedB + 40); + accu00 = vfmaq_lane_f16(accu00, b20, a0, 2); + accu01 = vfmaq_lane_f16(accu01, b21, a0, 2); + accu10 = vfmaq_lane_f16(accu10, b20, a1, 2); + accu11 = vfmaq_lane_f16(accu11, b21, a1, 2); + } + PackedB += k * 16; + } + + if constexpr (beta_behavior == 1) { + float16x8_t c00 = MlasLoadFloat16x8(C); + float16x8_t c01 = MlasLoadFloat16x8(C + 8); + float16x8_t c10 = MlasLoadFloat16x8(C + ldc); + float16x8_t c11 = MlasLoadFloat16x8(C + ldc + 8); + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + accu00 = vfmaq_f16(c00, accu00, alpha_v); + accu01 = vfmaq_f16(c01, accu01, alpha_v); + accu10 = vfmaq_f16(c10, accu10, alpha_v); + accu11 = vfmaq_f16(c11, accu11, alpha_v); + MlasStoreFloat16x8(C, accu00); + MlasStoreFloat16x8(C + 8, accu01); + MlasStoreFloat16x8(C + ldc, accu10); + MlasStoreFloat16x8(C + ldc + 8, accu11); + } else if constexpr (beta_behavior == 2) { + float16x8_t c00 = MlasLoadFloat16x8(C); + float16x8_t c01 = MlasLoadFloat16x8(C + 8); + float16x8_t c10 = MlasLoadFloat16x8(C + ldc); + float16x8_t c11 = MlasLoadFloat16x8(C + ldc + 8); + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + float16x8_t beta_v = MlasBroadcastFloat16x8(beta); + accu00 = vfmaq_f16(vmulq_f16(c00, beta_v), accu00, alpha_v); + accu01 = vfmaq_f16(vmulq_f16(c01, beta_v), accu01, alpha_v); + accu10 = vfmaq_f16(vmulq_f16(c10, beta_v), accu10, alpha_v); + accu11 = vfmaq_f16(vmulq_f16(c11, beta_v), accu11, alpha_v); + MlasStoreFloat16x8(C, accu00); + MlasStoreFloat16x8(C + 8, accu01); + MlasStoreFloat16x8(C + ldc, accu10); + MlasStoreFloat16x8(C + ldc + 8, accu11); + } else { + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + accu00 = vmulq_f16(accu00, alpha_v); + accu01 = vmulq_f16(accu01, alpha_v); + accu10 = vmulq_f16(accu10, alpha_v); + accu11 = vmulq_f16(accu11, alpha_v); + MlasStoreFloat16x8(C, accu00); + MlasStoreFloat16x8(C + 8, accu01); + MlasStoreFloat16x8(C + ldc, accu10); + MlasStoreFloat16x8(C + ldc + 8, accu11); + } + } + + if (CountN & 8) { + const auto* a = A; + size_t k = CountK; + float16x8_t accu00 = MlasZeroFloat16x8(); + float16x8_t accu10 = MlasZeroFloat16x8(); + for (; k >= 8; k -= 8, a += 8, PackedB += 8 * 8) { + float16x8_t b0 = MlasLoadFloat16x8(PackedB); + float16x8_t b1 = MlasLoadFloat16x8(PackedB + 8); + float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16); + float16x8_t 
b3 = MlasLoadFloat16x8(PackedB + 24); + float16x8_t b4 = MlasLoadFloat16x8(PackedB + 32); + float16x8_t b5 = MlasLoadFloat16x8(PackedB + 40); + float16x8_t b6 = MlasLoadFloat16x8(PackedB + 48); + float16x8_t b7 = MlasLoadFloat16x8(PackedB + 56); + float16x8_t a0 = MlasLoadFloat16x8(a); + float16x8_t a1 = MlasLoadFloat16x8(a + lda); + accu00 = maq_laneq_f16_accu(accu00, b0, b1, b2, b3, b4, b5, b6, b7, a0); + accu10 = maq_laneq_f16_accu(accu10, b0, b1, b2, b3, b4, b5, b6, b7, a1); + } + + if (k & 4) { + float16x8_t b0 = MlasLoadFloat16x8(PackedB); + float16x8_t b1 = MlasLoadFloat16x8(PackedB + 8); + float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16); + float16x8_t b3 = MlasLoadFloat16x8(PackedB + 24); + float16x4_t a0 = MlasLoadFloat16x4(a); + float16x4_t a1 = MlasLoadFloat16x4(a + lda); + accu00 = maq_lane_f16_accu(accu00, b0, b1, b2, b3, a0); + accu10 = maq_lane_f16_accu(accu10, b0, b1, b2, b3, a1); + k -= 4, a += 4, PackedB += 4 * 8; + } + + if (k > 0) { + float16x4_t a0 = MlasLoadPartialFloat16x4(a, k); + float16x4_t a1 = MlasLoadPartialFloat16x4(a + lda, k); + float16x8_t b0 = MlasLoadFloat16x8(PackedB); + accu00 = vfmaq_lane_f16(accu00, b0, a0, 0); + accu10 = vfmaq_lane_f16(accu10, b0, a1, 0); + if (k > 1) { + float16x8_t b1 = MlasLoadFloat16x8(PackedB + 8); + accu00 = vfmaq_lane_f16(accu00, b1, a0, 1); + accu10 = vfmaq_lane_f16(accu10, b1, a1, 1); + } + if (k > 2) { + float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16); + accu00 = vfmaq_lane_f16(accu00, b2, a0, 2); + accu10 = vfmaq_lane_f16(accu10, b2, a1, 2); + } + PackedB += k * 8; + } + + if constexpr (beta_behavior == 1) { + float16x8_t c0 = MlasLoadFloat16x8(C); + float16x8_t c1 = MlasLoadFloat16x8(C + ldc); + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + accu00 = vfmaq_f16(c0, accu00, alpha_v); + accu10 = vfmaq_f16(c1, accu10, alpha_v); + MlasStoreFloat16x8(C, accu00); + MlasStoreFloat16x8(C + ldc, accu10); + } else if constexpr (beta_behavior == 2) { + float16x8_t c0 = MlasLoadFloat16x8(C); + float16x8_t c1 = MlasLoadFloat16x8(C + ldc); + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + float16x8_t beta_v = MlasBroadcastFloat16x8(beta); + accu00 = vfmaq_f16(vmulq_f16(c0, beta_v), accu00, alpha_v); + accu10 = vfmaq_f16(vmulq_f16(c1, beta_v), accu10, alpha_v); + MlasStoreFloat16x8(C, accu00); + MlasStoreFloat16x8(C + ldc, accu10); + } else { + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + accu00 = vmulq_f16(accu00, alpha_v); + accu10 = vmulq_f16(accu10, alpha_v); + MlasStoreFloat16x8(C, accu00); + MlasStoreFloat16x8(C + ldc, accu10); + } + + CountN -= 8, C += 8; + } + + if (CountN > 0) { + const auto* a = A; + size_t k = CountK; + float16x8_t accu0 = MlasZeroFloat16x8(); + float16x8_t accu1 = MlasZeroFloat16x8(); + for (; k >= 8; k -= 8, a += 8, PackedB += 8 * 8) { + float16x8_t b0 = MlasLoadFloat16x8(PackedB); + float16x8_t b1 = MlasLoadFloat16x8(PackedB + 8); + float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16); + float16x8_t b3 = MlasLoadFloat16x8(PackedB + 24); + float16x8_t b4 = MlasLoadFloat16x8(PackedB + 32); + float16x8_t b5 = MlasLoadFloat16x8(PackedB + 40); + float16x8_t b6 = MlasLoadFloat16x8(PackedB + 48); + float16x8_t b7 = MlasLoadFloat16x8(PackedB + 56); + float16x8_t a0 = MlasLoadFloat16x8(a); + float16x8_t a1 = MlasLoadFloat16x8(a + lda); + accu0 = maq_laneq_f16_accu(accu0, b0, b1, b2, b3, b4, b5, b6, b7, a0); + accu1 = maq_laneq_f16_accu(accu1, b0, b1, b2, b3, b4, b5, b6, b7, a1); + } + + if (k & 4) { + float16x8_t b0 = MlasLoadFloat16x8(PackedB); + float16x8_t b1 = MlasLoadFloat16x8(PackedB + 
8); + float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16); + float16x8_t b3 = MlasLoadFloat16x8(PackedB + 24); + float16x4_t a0 = MlasLoadFloat16x4(a); + float16x4_t a1 = MlasLoadFloat16x4(a + lda); + accu0 = maq_lane_f16_accu(accu0, b0, b1, b2, b3, a0); + accu1 = maq_lane_f16_accu(accu1, b0, b1, b2, b3, a1); + k -= 4, a += 4, PackedB += 4 * 8; + } + + if (k > 0) { + float16x4_t a0 = MlasLoadPartialFloat16x4(a, k); + float16x4_t a1 = MlasLoadPartialFloat16x4(a + lda, k); + float16x8_t b0 = MlasLoadFloat16x8(PackedB); + accu0 = vfmaq_lane_f16(accu0, b0, a0, 0); + accu1 = vfmaq_lane_f16(accu1, b0, a1, 0); + if (k > 1) { + float16x8_t b1 = MlasLoadFloat16x8(PackedB + 8); + accu0 = vfmaq_lane_f16(accu0, b1, a0, 1); + accu1 = vfmaq_lane_f16(accu1, b1, a1, 1); + } + if (k > 2) { + float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16); + accu0 = vfmaq_lane_f16(accu0, b2, a0, 2); + accu1 = vfmaq_lane_f16(accu1, b2, a1, 2); + } + PackedB += k * 8; + } + + float16x4_t accu0_low = vget_low_f16(accu0); + float16x4_t accu0_high = vget_high_f16(accu0); + float16x4_t accu1_low = vget_low_f16(accu1); + float16x4_t accu1_high = vget_high_f16(accu1); + + if (CountN & 4) { + if constexpr (beta_behavior == 1) { + float16x4_t c0 = MlasLoadFloat16x4(C); + float16x4_t c1 = MlasLoadFloat16x4(C + ldc); + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + MlasStoreFloat16x4(C, vfma_f16(c0, accu0_low, alpha_v)); + MlasStoreFloat16x4(C + ldc, vfma_f16(c1, accu1_low, alpha_v)); + } else if constexpr (beta_behavior == 2) { + float16x4_t c0 = MlasLoadFloat16x4(C); + float16x4_t c1 = MlasLoadFloat16x4(C + ldc); + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + float16x4_t beta_v = MlasBroadcastFloat16x4(beta); + MlasStoreFloat16x4(C, vfma_f16(vmul_f16(c0, beta_v), accu0_low, alpha_v)); + MlasStoreFloat16x4(C + ldc, vfma_f16(vmul_f16(c1, beta_v), accu1_low, alpha_v)); + } else { + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + MlasStoreFloat16x4(C, vmul_f16(accu0_low, alpha_v)); + MlasStoreFloat16x4(C + ldc, vmul_f16(accu1_low, alpha_v)); + } + CountN -= 4, C += 4; + accu0_low = accu0_high; + accu1_low = accu1_high; + } + + if (CountN) { + if constexpr (beta_behavior == 1) { + float16x4_t c0 = MlasLoadPartialFloat16x4(C, CountN); + float16x4_t c1 = MlasLoadPartialFloat16x4(C + ldc, CountN); + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + MlasStorePartialFloat16x4(C, vfma_f16(c0, accu0_low, alpha_v), CountN); + MlasStorePartialFloat16x4(C + ldc, vfma_f16(c1, accu1_low, alpha_v), CountN); + } else if constexpr (beta_behavior == 2) { + float16x4_t c0 = MlasLoadPartialFloat16x4(C, CountN); + float16x4_t c1 = MlasLoadPartialFloat16x4(C + ldc, CountN); + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + float16x4_t beta_v = MlasBroadcastFloat16x4(beta); + MlasStorePartialFloat16x4(C, vfma_f16(vmul_f16(c0, beta_v), accu0_low, alpha_v), CountN); + MlasStorePartialFloat16x4(C + ldc, vfma_f16(vmul_f16(c1, beta_v), accu1_low, alpha_v), CountN); + } else { + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + MlasStorePartialFloat16x4(C, vmul_f16(accu0_low, alpha_v), CountN); + MlasStorePartialFloat16x4(C + ldc, vmul_f16(accu1_low, alpha_v), CountN); + } + } + } +} + +void HGemm_TransposedPackedB_Kernel( + const MLAS_FP16* A, + const MLAS_FP16* PackedB, + MLAS_FP16* C, + size_t CountM, + size_t CountN, + size_t CountK, + size_t lda, + size_t ldc, + _mlas_fp16_ alpha, + _mlas_fp16_ beta +) { + if (CountM > 2) { + MLAS_THROW_EX(std::runtime_error, "HGemm_TransposedPackedB_Kernel only support <= 2 rows"); + } 
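+ + // The dispatch below selects a compile-time epilogue: beta == 0 stores alpha * (A x PackedB) over C, beta == 1 adds it to the existing C, and any other beta scales C by beta before adding.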
+ + const auto* A_data = reinterpret_cast<const _mlas_fp16_*>(A); + const auto* PackedB_data = reinterpret_cast<const _mlas_fp16_*>(PackedB); + auto* C_data = reinterpret_cast<_mlas_fp16_*>(C); + const auto f16_0 = MLAS_FP16(0.0f); + const auto f16_1 = MLAS_FP16(1.0f); + if (CountM == 1) { + if (beta == f16_0.val) { + HGemm_TransposedPackedB_Kernel_M1<0>(A_data, PackedB_data, C_data, CountN, CountK, alpha, beta); + } else if (beta == f16_1.val) { + HGemm_TransposedPackedB_Kernel_M1<1>(A_data, PackedB_data, C_data, CountN, CountK, alpha, beta); + } else { + HGemm_TransposedPackedB_Kernel_M1<2>(A_data, PackedB_data, C_data, CountN, CountK, alpha, beta); + } + } else { + if (beta == f16_0.val) { + HGemm_TransposedPackedB_Kernel_M2<0>(A_data, PackedB_data, C_data, CountN, CountK, lda, ldc, alpha, beta); + } else if (beta == f16_1.val) { + HGemm_TransposedPackedB_Kernel_M2<1>(A_data, PackedB_data, C_data, CountN, CountK, lda, ldc, alpha, beta); + } else { + HGemm_TransposedPackedB_Kernel_M2<2>(A_data, PackedB_data, C_data, CountN, CountK, lda, ldc, alpha, beta); + } + } +} + +} // namespace hgemm_neon diff --git a/onnxruntime/core/mlas/lib/hgemm_kernel_neon.cpp b/onnxruntime/core/mlas/lib/hgemm_kernel_neon.cpp new file mode 100644 index 0000000000000..5b131a8e41f21 --- /dev/null +++ b/onnxruntime/core/mlas/lib/hgemm_kernel_neon.cpp @@ -0,0 +1,28 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + hgemm_kernel_neon.cpp + +Abstract: + + This module implements half precision GEMM kernel for neon. + +--*/ + +#include "mlasi.h" +#include "halfgemm.h" + +const MLAS_HGEMM_DISPATCH MlasHGemmDispatchNeon = [](){ + MLAS_HGEMM_DISPATCH d; +#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64) + d.HPackBKernel_TransposedB = hgemm_neon::HPackB_TransposedB_Kernel; + d.HGemmKernel_TransposedB = hgemm_neon::HGemm_TransposedB_Kernel; + d.HGemmKernel_TransposedPackedB = hgemm_neon::HGemm_TransposedPackedB_Kernel; +#endif + return d; +}(); diff --git a/onnxruntime/core/mlas/lib/hqnbitgemm_kernel_neon_fp16.cpp b/onnxruntime/core/mlas/lib/hqnbitgemm_kernel_neon_fp16.cpp index 69e37d2b916d1..5b1f9d7d4a2dc 100644 --- a/onnxruntime/core/mlas/lib/hqnbitgemm_kernel_neon_fp16.cpp +++ b/onnxruntime/core/mlas/lib/hqnbitgemm_kernel_neon_fp16.cpp @@ -93,39 +93,6 @@ Transpose8x8(uint8x8_t& v0, uint8x8_t& v1, uint8x8_t& v2, uint8x8_t& v3, v7 = vreinterpret_u8_u32(c3.val[1]); } -MLAS_FORCEINLINE void -Transpose4x8(float16x8_t& v0, float16x8_t& v1, float16x8_t& v2, float16x8_t& v3) -{ - // |v00|v01|v02|v03|v04|v05|v06|v07| - // |v10|v11|v12|v13|v14|v15|v16|v17| - // |v20|v21|v22|v23|v24|v25|v26|v27| - // |v30|v31|v32|v33|v34|v35|v36|v37| - // => - // |v00|v10|v20|v30|v04|v14|v24|v34| - // |v01|v11|v21|v31|v05|v15|v25|v35| - // |v02|v12|v22|v32|v06|v16|v26|v36| - // |v03|v13|v23|v33|v07|v17|v27|v37| - float16x8x2_t t01 = vtrnq_f16(v0, v1); - float16x8x2_t t23 = vtrnq_f16(v2, v3); - - v0 = vreinterpretq_f16_f32(vtrn1q_f32(vreinterpretq_f32_f16(t01.val[0]), vreinterpretq_f32_f16(t23.val[0]))); - v1 = vreinterpretq_f16_f32(vtrn1q_f32(vreinterpretq_f32_f16(t01.val[1]), vreinterpretq_f32_f16(t23.val[1]))); - v2 = vreinterpretq_f16_f32(vtrn2q_f32(vreinterpretq_f32_f16(t01.val[0]), vreinterpretq_f32_f16(t23.val[0]))); - v3 = vreinterpretq_f16_f32(vtrn2q_f32(vreinterpretq_f32_f16(t01.val[1]), vreinterpretq_f32_f16(t23.val[1]))); -} - -MLAS_FORCEINLINE void -Transpose4x4(float16x4_t& v0, float16x4_t& v1, float16x4_t& v2, float16x4_t& v3) -{ - float16x4x2_t t01 = vtrn_f16(v0, v1); - 
float16x4x2_t t23 = vtrn_f16(v2, v3); - - v0 = vreinterpret_f16_f32(vtrn1_f32(vreinterpret_f32_f16(t01.val[0]), vreinterpret_f32_f16(t23.val[0]))); - v1 = vreinterpret_f16_f32(vtrn1_f32(vreinterpret_f32_f16(t01.val[1]), vreinterpret_f32_f16(t23.val[1]))); - v2 = vreinterpret_f16_f32(vtrn2_f32(vreinterpret_f32_f16(t01.val[0]), vreinterpret_f32_f16(t23.val[0]))); - v3 = vreinterpret_f16_f32(vtrn2_f32(vreinterpret_f32_f16(t01.val[1]), vreinterpret_f32_f16(t23.val[1]))); -} - void HQ4BitGemmPackQuantBData_CompFp16( size_t N, diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h index 100d7d47751aa..56fad6bb3412a 100644 --- a/onnxruntime/core/mlas/lib/mlasi.h +++ b/onnxruntime/core/mlas/lib/mlasi.h @@ -301,6 +301,8 @@ static_assert(sizeof(MLAS_FP16) == FP16_SIZE); // Define the default strides to step through slices of the input matrices. // +#define MLAS_HGEMM_STRIDEN 32 +#define MLAS_HGEMM_STRIDEK 512 #define MLAS_SGEMM_STRIDEN 128 #define MLAS_SGEMM_STRIDEK 128 #define MLAS_SGEMM_PACKED_STRIDEN 128 @@ -317,6 +319,7 @@ static_assert(sizeof(MLAS_FP16) == FP16_SIZE); // the effort at this time. // +#define MLAS_HGEMM_STRIDEN_THREAD_ALIGN 16 #define MLAS_SGEMM_STRIDEN_THREAD_ALIGN 16 #define MLAS_DGEMM_STRIDEN_THREAD_ALIGN 8 #define MLAS_QGEMM_STRIDEN_THREAD_ALIGN 16 @@ -944,6 +947,7 @@ extern "C" { #define MLAS_SGEMM_THREAD_COMPLEXITY (size_t(64) * size_t(1024)) #define MLAS_DGEMM_THREAD_COMPLEXITY (size_t(64) * size_t(1024)) #define MLAS_QGEMM_THREAD_COMPLEXITY 65536 +#define MLAS_HGEMM_THREAD_COMPLEXITY 65536 #if defined(__aarch64__) && defined(__linux__) #define MLAS_SBGEMM_THREAD_COMPLEXITY (size_t(64) * size_t(1024)) @@ -1055,6 +1059,12 @@ extern const MLAS_QNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchAvx512vnni; struct MLAS_ROPE_DISPATCH; extern const MLAS_ROPE_DISPATCH MlasRopeDispatchNeon; +// +// half gemm dispatch structure +// +struct MLAS_HGEMM_DISPATCH; +extern const MLAS_HGEMM_DISPATCH MlasHGemmDispatchNeon; + // // Quantized depthwise convolution kernels. @@ -1217,6 +1227,7 @@ struct MLAS_PLATFORM { MLAS_CAST_F32_TO_F16_KERNEL* CastF32ToF16Kernel; const MLAS_ROPE_DISPATCH* RopeDispatch{nullptr}; + const MLAS_HGEMM_DISPATCH* HGemmDispatch{nullptr}; }; inline diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp index ec572a4150292..026a954bbc6c2 100644 --- a/onnxruntime/core/mlas/lib/platform.cpp +++ b/onnxruntime/core/mlas/lib/platform.cpp @@ -544,6 +544,7 @@ Return Value: this->ConvSymS8S8Dispatch = &MlasConvSymS8DispatchNeon; this->QNBitGemmDispatch = &MlasSQNBitGemmDispatchNeon; this->RopeDispatch = &MlasRopeDispatchNeon; + this->HGemmDispatch = &MlasHGemmDispatchNeon; // // Check if the processor supports ASIMD dot product instructions. 
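The HGEMM constants introduced in mlasi.h above follow the pattern of the existing SGEMM/DGEMM/QGEMM entries: work is stepped through in N-by-K slices of at most MLAS_HGEMM_STRIDEN x MLAS_HGEMM_STRIDEK elements, N slices stay aligned to MLAS_HGEMM_STRIDEN_THREAD_ALIGN so threads never split a packed panel, and the M*N*K complexity relative to MLAS_HGEMM_THREAD_COMPLEXITY caps the thread count. A minimal sketch of that slicing logic, assuming the half-precision driver mirrors the single-precision one (TileHGemm is a hypothetical name, not the PR's actual entry point):

#include <algorithm>
#include <cstddef>

// Illustrative tiling driver for the stride constants defined in mlasi.h.
void TileHGemm(size_t M, size_t N, size_t K) {
    constexpr size_t StrideN = 32;   // MLAS_HGEMM_STRIDEN
    constexpr size_t StrideK = 512;  // MLAS_HGEMM_STRIDEK
    for (size_t n = 0; n < N; n += StrideN) {
        const size_t countN = std::min(StrideN, N - n);
        for (size_t k = 0; k < K; k += StrideK) {
            const size_t countK = std::min(StrideK, K - k);
            // A real driver would pack the countK x countN tile of B here and
            // run the <= 2-row kernels over A. The caller's beta applies only
            // to the first K slice (k == 0); later slices pass beta == 1 so
            // partial products accumulate into C.
            (void)M; (void)countN; (void)countK;
        }
    }
}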
diff --git a/onnxruntime/core/optimizer/selectors_actions/selector_action_transformer.cc b/onnxruntime/core/optimizer/selectors_actions/selector_action_transformer.cc index b68cbaf85bcff..b1d6c51f693fd 100644 --- a/onnxruntime/core/optimizer/selectors_actions/selector_action_transformer.cc +++ b/onnxruntime/core/optimizer/selectors_actions/selector_action_transformer.cc @@ -147,7 +147,7 @@ static Status MatchAndProcess( RuntimeOptimizationRecord::ProducedOpIdVector produced_op_ids{}; produced_op_ids.reserve(action_saved_state.produced_node_op_schemas.size()); - for (const auto op_schema : action_saved_state.produced_node_op_schemas) { + for (const auto& op_schema : action_saved_state.produced_node_op_schemas) { produced_op_ids.push_back(utils::MakeOpId(*op_schema)); if (save_context->record_produced_node_op_schema) { status = save_context->record_produced_node_op_schema(*op_schema); diff --git a/onnxruntime/core/platform/windows/logging/etw_sink.cc b/onnxruntime/core/platform/windows/logging/etw_sink.cc index 950ac247a2046..489cd19b11302 100644 --- a/onnxruntime/core/platform/windows/logging/etw_sink.cc +++ b/onnxruntime/core/platform/windows/logging/etw_sink.cc @@ -64,6 +64,10 @@ EtwRegistrationManager& EtwRegistrationManager::Instance() { return instance; } +bool EtwRegistrationManager::SupportsETW() { + return true; +} + bool EtwRegistrationManager::IsEnabled() const { std::lock_guard lock(provider_change_mutex_); return is_enabled_; @@ -248,5 +252,19 @@ void EtwSink::SendImpl(const Timestamp& timestamp, const std::string& logger_id, } } // namespace logging } // namespace onnxruntime +#else +// ETW is not supported on this platform but should still define a dummy EtwRegistrationManager +// so that it can be used in the EP provider bridge. +namespace onnxruntime { +namespace logging { +EtwRegistrationManager& EtwRegistrationManager::Instance() { + static EtwRegistrationManager instance; + return instance; +} +bool EtwRegistrationManager::SupportsETW() { + return false; +} +} // namespace logging +} // namespace onnxruntime #endif // ETW_TRACE_LOGGING_SUPPORTED diff --git a/onnxruntime/core/platform/windows/logging/etw_sink.h b/onnxruntime/core/platform/windows/logging/etw_sink.h index 2a798a28f13de..62b762886ca82 100644 --- a/onnxruntime/core/platform/windows/logging/etw_sink.h +++ b/onnxruntime/core/platform/windows/logging/etw_sink.h @@ -60,6 +60,9 @@ class EtwRegistrationManager { // Singleton instance access static EtwRegistrationManager& Instance(); + // Returns true if ETW is supported at all. + static bool SupportsETW(); + // Check if ETW logging is enabled bool IsEnabled() const; @@ -110,5 +113,33 @@ class EtwRegistrationManager { } // namespace logging } // namespace onnxruntime +#else +// ETW is not supported on this platform but should still define a dummy EtwRegistrationManager +// so that it can be used in the EP provider bridge. 
+#include "core/common/logging/severity.h" +namespace onnxruntime { +namespace logging { +class EtwRegistrationManager { + public: + using EtwInternalCallback = std::function; + + static EtwRegistrationManager& Instance(); + static bool SupportsETW(); + bool IsEnabled() const { return false; } + UCHAR Level() const { return 0; } + Severity MapLevelToSeverity() { return Severity::kFATAL; } + uint64_t Keyword() const { return 0; } + HRESULT Status() const { return 0; } + void RegisterInternalCallback(const EtwInternalCallback& callback) {} + void UnregisterInternalCallback(const EtwInternalCallback& callback) {} + + private: + EtwRegistrationManager() = default; + ~EtwRegistrationManager() = default; +}; +} // namespace logging +} // namespace onnxruntime #endif // ETW_TRACE_LOGGING_SUPPORTED diff --git a/onnxruntime/core/providers/cpu/nn/pool_attributes.h b/onnxruntime/core/providers/cpu/nn/pool_attributes.h index 118cb4a3ba4bd..fbbd4273757d5 100644 --- a/onnxruntime/core/providers/cpu/nn/pool_attributes.h +++ b/onnxruntime/core/providers/cpu/nn/pool_attributes.h @@ -150,14 +150,14 @@ struct PoolAttributes { case AutoPadType::VALID: *pad_head = 0; *pad_tail = 0; - *out_size = ComputeOutputSize(in_size, stride, kernel, 0, dilation); + *out_size = ComputeOutputSize(in_size, stride, kernel, 0, 0, dilation); break; case AutoPadType::SAME_LOWER: { int64_t legacy_target_size = (in_size + stride - 1) / stride; int64_t pad_needed = (legacy_target_size - 1) * stride + kernel - in_size; *pad_head = (pad_needed + 1) / 2; *pad_tail = pad_needed - *pad_head; - *out_size = ComputeOutputSize(in_size, stride, kernel, pad_needed, dilation); + *out_size = ComputeOutputSize(in_size, stride, kernel, *pad_head, *pad_tail, dilation); break; } case AutoPadType::SAME_UPPER: { @@ -165,7 +165,7 @@ struct PoolAttributes { int64_t pad_needed = (legacy_target_size - 1) * stride + kernel - in_size; *pad_head = pad_needed / 2; *pad_tail = pad_needed - *pad_head; - *out_size = ComputeOutputSize(in_size, stride, kernel, pad_needed, dilation); + *out_size = ComputeOutputSize(in_size, stride, kernel, *pad_head, *pad_tail, dilation); break; } default: { @@ -173,7 +173,7 @@ struct PoolAttributes { } } } else { - *out_size = ComputeOutputSize(in_size, stride, kernel, *pad_head + *pad_tail, dilation); + *out_size = ComputeOutputSize(in_size, stride, kernel, *pad_head, *pad_tail, dilation); } } #if defined(_MSC_VER) && !defined(__clang__) @@ -184,13 +184,21 @@ struct PoolAttributes { int64_t ComputeOutputSize(int64_t in_size, int64_t stride, int64_t kernel, - int64_t pad_needed, + int64_t pad_head, + int64_t pad_tail, int64_t dilation) const { - if (ceil_mode == 0) { - return static_cast(static_cast(in_size + pad_needed - dilation * (kernel - 1) - 1) / stride + 1); + int64_t numerator = in_size + pad_head + pad_tail - dilation * (kernel - 1) - 1; + int64_t out_size = numerator / stride + 1; + + if (ceil_mode == 1) { + out_size = static_cast(std::ceil(static_cast(numerator) / stride)) + 1; + // Ensure that the last pooling starts inside the image (at least 1 pixel) + // Reference: https://github.com/onnx/onnx/pull/5741 + if ((out_size - 1) * stride >= in_size + pad_head) { + --out_size; + } } - return static_cast( - std::ceil(static_cast(in_size + pad_needed - dilation * (kernel - 1) - 1) / stride + 1)); + return out_size; } #if defined(_MSC_VER) && !defined(__clang__) #pragma warning(pop) diff --git a/onnxruntime/core/providers/cpu/nn/pool_functors.h b/onnxruntime/core/providers/cpu/nn/pool_functors.h index 
d3205278b72f6..476a9a0338969 100644 --- a/onnxruntime/core/providers/cpu/nn/pool_functors.h +++ b/onnxruntime/core/providers/cpu/nn/pool_functors.h @@ -406,6 +406,7 @@ struct AveragePool1DTask final { for (int64_t ph = 0; ph < pooled_height; ++ph) { int64_t hstart = ph * stride_h - pads[0]; int64_t hend = hstart + kernel_shape[0] * dilation_h; + hend = std::min(hend, height + pads[1]); y_d[ph] = 0; int total_elements = 0; for (int64_t h = hstart; h < hend; h += dilation_h) { @@ -461,9 +462,11 @@ struct AveragePool2DTask final { for (int64_t ph = 0; ph < pooled_height; ++ph) { int64_t hstart = ph * stride_h - pads[0]; int64_t hend = hstart + kernel_shape[0] * dilation_h; + hend = std::min(hend, height + pads[1]); for (int64_t pw = 0; pw < pooled_width; ++pw) { int64_t wstart = pw * stride_w - pads[1]; int64_t wend = wstart + kernel_shape[1] * dilation_w; + wend = std::min(wend, width + pads[3]); const int64_t pool_index = ph * pooled_width + pw; y_d[pool_index] = 0; int total_elements = 0; @@ -532,12 +535,15 @@ struct AveragePool3DTask { for (int64_t ph = 0; ph < pooled_height; ++ph) { int64_t hstart = ph * stride_h - pads[0]; int64_t hend = hstart + kernel_shape[0] * dilation_h; + hend = std::min(hend, height + pads[1]); for (int64_t pw = 0; pw < pooled_width; ++pw) { int64_t wstart = pw * stride_w - pads[1]; int64_t wend = wstart + kernel_shape[1] * dilation_w; + wend = std::min(wend, width + pads[3]); for (int64_t pd = 0; pd < pooled_depth; ++pd) { int64_t dstart = pd * stride_d - pads[2]; int64_t dend = dstart + kernel_shape[2] * dilation_d; + dend = std::min(dend, depth + pads[5]); const int64_t pool_index = ph * pooled_width * pooled_depth + pw * pooled_depth + pd; y_d[pool_index] = 0; int total_elements = 0; diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/LRN_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/LRN_op_builder.cc index 91cad034d8854..fd1720d69eebd 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/LRN_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/LRN_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/batchnorm_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/batchnorm_op_builder.cc index 75a66d3a14643..5874eb1e7dc3b 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/batchnorm_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/batchnorm_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/binary_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/binary_op_builder.cc index 5599fbdc69bdd..91d1a38e71e6f 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/binary_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/binary_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
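The pooling fixes above change two coupled things: ComputeOutputSize now receives pad_head and pad_tail separately and, for ceil_mode == 1, drops a final window that would start at or beyond the end of the head-padded input (the clamp referenced from onnx/onnx#5741), while the AveragePool functors clamp each window end (hend/wend/dend) to the padded extent so positions past it are never counted. Below is a standalone restatement of the output-size rule with a worked example; it is my own re-derivation mirroring the formula in the diff, not a call into ONNX Runtime.

    #include <cassert>
    #include <cmath>
    #include <cstdint>

    int64_t ComputeOutSize(int64_t in_size, int64_t stride, int64_t kernel,
                           int64_t pad_head, int64_t pad_tail, int64_t dilation,
                           int ceil_mode) {
        const int64_t numerator = in_size + pad_head + pad_tail - dilation * (kernel - 1) - 1;
        int64_t out_size = numerator / stride + 1;
        if (ceil_mode == 1) {
            out_size = static_cast<int64_t>(std::ceil(static_cast<double>(numerator) / stride)) + 1;
            // The last window must start inside the head-padded input (onnx/onnx#5741).
            if ((out_size - 1) * stride >= in_size + pad_head) {
                --out_size;
            }
        }
        return out_size;
    }

    int main() {
        // in_size=2, kernel=1, stride=2, pad_tail=1: the plain ceil formula gives 2
        // windows, but the second would start at index 2, entirely inside the tail
        // padding, so the clamp removes it.
        assert(ComputeOutSize(2, 2, 1, 0, 1, 1, /*ceil_mode*/ 1) == 1);
        assert(ComputeOutSize(3, 2, 2, 0, 0, 1, /*ceil_mode*/ 1) == 2);  // clamp does not fire
        assert(ComputeOutSize(3, 2, 2, 0, 0, 1, /*ceil_mode*/ 0) == 1);  // floor mode
        return 0;
    }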
-#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/cast_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/cast_op_builder.cc index 9059de817e210..03329b9159c06 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/cast_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/cast_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/clip_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/clip_op_builder.cc index 9821d9267c71f..becd677e32ac1 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/clip_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/clip_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/concat_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/concat_op_builder.cc index a8394faec51be..fa5e292be0ecd 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/concat_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/concat_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/conv_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/conv_op_builder.cc index 5477cd16f9c01..a7a837ae210b4 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/conv_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/conv_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/depthtospace_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/depthtospace_op_builder.cc index ef8709641e2d0..039d8510bb8d2 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/depthtospace_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/depthtospace_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/dequantizelinear_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/dequantizelinear_op_builder.cc index 7d0e04fbd7b0e..ed9062f894660 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/dequantizelinear_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/dequantizelinear_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/elu_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/elu_op_builder.cc index 218c41d6f07c0..fc2348951edb7 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/elu_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/elu_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/flatten_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/flatten_op_builder.cc index b5e9c011990ce..986ce78fb1acc 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/flatten_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/flatten_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/gather_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/gather_op_builder.cc index d6da9181b5a3d..ccd3f8b571fcb 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/gather_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/gather_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/gemm_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/gemm_op_builder.cc index 66eefcd6e4840..cff96c2f1ff99 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/gemm_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/gemm_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/identity_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/identity_op_builder.cc index d7b35572e6cd1..250b190091a52 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/identity_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/identity_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/leakyrelu_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/leakyrelu_op_builder.cc index 6a633c443c9e5..e3dcee1e3d597 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/leakyrelu_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/leakyrelu_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/framework/tensorprotoutils.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/minmax_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/minmax_op_builder.cc index aeadbd17053cf..a80742aef9cb6 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/minmax_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/minmax_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/pad_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/pad_op_builder.cc index b0404ebec0583..8127de0a0f05f 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/pad_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/pad_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/pool_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/pool_op_builder.cc index a2a4786b72ec7..10c5efb84ed8f 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/pool_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/pool_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/quantizelinear_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/quantizelinear_op_builder.cc index d13b81c2a14b8..eb81f5e3f59ee 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/quantizelinear_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/quantizelinear_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/reduction_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/reduction_op_builder.cc index a6da290753b74..fbb353f949f48 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/reduction_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/reduction_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/relu_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/relu_op_builder.cc index c8641093ee7eb..d65c069851c1f 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/relu_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/relu_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/reshape_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/reshape_op_builder.cc index f2f9165d2f3cc..fad5d8289c6b0 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/reshape_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/reshape_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/resize_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/resize_op_builder.cc index 44403010c936c..af5aeba6c8236 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/resize_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/resize_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/slice_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/slice_op_builder.cc index facdc7132dc00..52b075b0271ef 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/slice_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/slice_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/softmax_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/softmax_op_builder.cc index a2a8b4512b028..8fa915de95a72 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/softmax_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/softmax_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/split_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/split_op_builder.cc index edee298ad1ccf..7509fd15f1c5e 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/split_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/split_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include #include "core/common/logging/logging.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/squeeze_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/squeeze_op_builder.cc index fb3ca5e6175fa..44510c33c004c 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/squeeze_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/squeeze_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/transpose_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/transpose_op_builder.cc index 6fe5ca32fe044..4a9e3eb00a787 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/transpose_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/transpose_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/unary_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/unary_op_builder.cc index dbd960ee5536c..77df9d2fd771c 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/unary_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/unary_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/unsqueeze_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/unsqueeze_op_builder.cc index 95cd813800c9a..b9ebbace8d391 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/unsqueeze_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/unsqueeze_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h index 4db335afa98b0..3cbf7d1ee40e8 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h @@ -2,7 +2,7 @@ // Licensed under the MIT License. #pragma once -#include +#include "core/graph/onnx_protobuf.h" #include #include "core/common/inlined_containers_fwd.h" diff --git a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc index 79674fd706151..3df231e53e7c0 100644 --- a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc +++ b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc @@ -2,13 +2,15 @@ // Licensed under the MIT License. 
#include "core/providers/qnn/builder/onnx_ctx_model_helper.h" -#include "core/graph/constants.h" -#include "core/providers/qnn/builder/qnn_model.h" #include #include #include +#include "core/providers/qnn/ort_api.h" +#include "core/providers/qnn/builder/qnn_utils.h" +#include "core/providers/qnn/builder/qnn_model.h" + namespace onnxruntime { namespace qnn { @@ -51,9 +53,9 @@ Status GetMainContextNode(const std::vectorOpType(), "Should only filter in the EPContext node."); - NodeAttrHelper node_helper(*ep_context_node); + const Node& ep_context_node = *graph_viewer.Nodes().begin(); + ORT_RETURN_IF_NOT(EPCONTEXT_OP == ep_context_node.OpType(), "Should only filter in the EPContext node."); + NodeAttrHelper node_helper(ep_context_node); int64_t is_main_context = node_helper.Get(MAIN_CONTEXT, static_cast(0)); if (1 == is_main_context) { main_context_pos.push_back(static_cast(i)); @@ -68,17 +70,16 @@ Status CreateNodeArgs(const std::vector& names, const std::unordered_map& tensor_info_table, std::vector& node_args, onnxruntime::Graph& graph) { - using namespace ONNX_NAMESPACE; for (size_t i = 0; i < names.size(); ++i) { std::string name = names[i]; ORT_RETURN_IF(tensor_info_table.find(name) == tensor_info_table.end(), "Tensor name: ", name, " not found in tensor_info_table"); const OnnxTensorInfo& tensor_info = tensor_info_table.at(name); - TypeProto tensor_type; - tensor_type.mutable_tensor_type()->set_elem_type(tensor_info.data_type_); + std::unique_ptr tensor_type = Factory::Create(); + tensor_type->mutable_tensor_type()->set_elem_type(tensor_info.data_type_); for (size_t j = 0; j < tensor_info.shape_.size(); ++j) { - tensor_type.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(tensor_info.shape_[j]); + tensor_type->mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(tensor_info.shape_[j]); } - auto& input_arg = graph.GetOrCreateNodeArg(name, &tensor_type); + auto& input_arg = graph.GetOrCreateNodeArg(name, tensor_type.get()); node_args.push_back(&input_arg); } return Status::OK(); @@ -161,8 +162,8 @@ Status TryGetMaxSpillFillSize(const std::vector(0)); if (max_size > max_spill_fill_size) { max_spill_fill_size = max_size; diff --git a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h index 92c5391b40f09..3dfa0ae21001b 100644 --- a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h +++ b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h @@ -6,12 +6,8 @@ #include #include -#include "qnn_def.h" -#include "core/common/logging/logging.h" -#include "core/graph/graph_viewer.h" -#include "core/providers/shared/utils/utils.h" -#include "core/graph/model.h" -#include "core/framework/execution_provider.h" +#include "core/providers/qnn/builder/qnn_def.h" +#include "core/providers/qnn/ort_api.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/op_builder.h b/onnxruntime/core/providers/qnn/builder/op_builder.h index 05398c3f22ea2..0846275496ebf 100644 --- a/onnxruntime/core/providers/qnn/builder/op_builder.h +++ b/onnxruntime/core/providers/qnn/builder/op_builder.h @@ -3,9 +3,7 @@ #pragma once -#include "core/graph/graph_viewer.h" -#include "core/framework/node_unit.h" -#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/ort_api.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc b/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc index 
e411c2a6bf536..3d66003fb2bca 100644 --- a/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc +++ b/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc @@ -5,8 +5,6 @@ #include #include -#include - #include "op_builder_factory.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/argmax_min_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/argmax_min_op_builder.cc index c685fa065e2ba..e3a6141c292dd 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/argmax_min_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/argmax_min_op_builder.cc @@ -1,14 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" -#include "core/framework/tensorprotoutils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" +#include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "base_op_builder.h" - namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc index ed70111087e19..cd1ee72e00d4f 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc @@ -2,15 +2,9 @@ // Licensed under the MIT License. #include "core/providers/qnn/builder/opbuilder/base_op_builder.h" +#include #include "core/providers/qnn/builder/qnn_utils.h" -#include - -#include "core/providers/shared/utils/utils.h" -#include "core/framework/tensorprotoutils.h" -#include "core/providers/cpu/tensor/transpose.h" -#include "core/common/safeint.h" - namespace onnxruntime { namespace qnn { @@ -271,37 +265,189 @@ Status BaseOpBuilder::SetOutputQParamEqualToInputIfNearlyEqual(QnnModelWrapper& return Status::OK(); } -Status BaseOpBuilder::TransposeInitializer(const QnnModelWrapper& qnn_model_wrapper, - const onnx::TensorProto& initializer, - const std::vector<size_t>& perm, - std::vector<uint8_t>& transposed_data) const { - const DataTypeImpl* tensor_dtype = DataTypeImpl::TensorTypeFromONNXEnum(initializer.data_type())->GetElementType(); - const auto tensor_shape_dims = onnxruntime::utils::GetTensorShapeFromTensorProto(initializer); - TensorShape tensor_shape{tensor_shape_dims}; - AllocatorPtr cpu_allocator = std::make_shared<CPUAllocator>(); - Tensor in_tensor = Tensor(tensor_dtype, tensor_shape, cpu_allocator); - - auto rank = perm.size(); - std::vector<int64_t> new_tensor_shape_dims; - std::vector<size_t> permutations; - new_tensor_shape_dims.reserve(rank); - permutations.reserve(rank); - for (int64_t p : perm) { - permutations.push_back(p); - new_tensor_shape_dims.push_back(tensor_shape_dims[p]); +static Status GetTransposeStrides(const TensorShape& input_shape, + gsl::span<const size_t> perm, + gsl::span<size_t> input_strides, + gsl::span<size_t> output_strides) { + const size_t rank = input_shape.NumDimensions(); + ORT_RETURN_IF_NOT(perm.size() == rank, "Expected perm size of ", rank); + ORT_RETURN_IF_NOT(input_strides.size() == rank, "Expected input_strides size of ", rank); + ORT_RETURN_IF_NOT(output_strides.size() == rank, "Expected output_strides size of ", rank); + std::vector<int64_t> output_shape_dims(rank); + ORT_RETURN_IF_ERROR((qnn::utils::PermuteShape(input_shape.GetDims(), perm, output_shape_dims))); + const TensorShape output_shape =
TensorShape::FromExistingBuffer(output_shape_dims); + + for (size_t i = 0; i < rank; ++i) { + int64_t stride = (i < rank - 1) ? input_shape.SizeFromDimension(i + 1) : 1; + ORT_RETURN_IF_NOT(stride > 0, "Expected positive shape dims when computing strides."); + input_strides[i] = static_cast<size_t>(stride); + } + + for (size_t i = 0; i < rank; ++i) { + int64_t stride = (i < rank - 1) ? output_shape.SizeFromDimension(i + 1) : 1; + ORT_RETURN_IF_NOT(stride > 0, "Expected positive shape dims when computing strides."); + output_strides[i] = static_cast<size_t>(stride); } - TensorShape new_tensor_shape(new_tensor_shape_dims); - Tensor out_tensor = Tensor(tensor_dtype, new_tensor_shape, cpu_allocator); - ORT_RETURN_IF_ERROR(onnxruntime::utils::TensorProtoToTensor( - Env::Default(), qnn_model_wrapper.GetGraphViewer().ModelPath(), initializer, in_tensor)); - ORT_RETURN_IF_ERROR(Transpose::DoTranspose(permutations, in_tensor, out_tensor)); - onnx::TensorProto new_tensor_proto = onnxruntime::utils::TensorToTensorProto(out_tensor, "test"); - ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(new_tensor_proto, transposed_data)); + return Status::OK(); +} + +// Internal function to transpose data of rank 5 with the given permutation. +// Example: transpose input from either (N,C,H,W,D) or (C,N,H,W,D) to (H,W,D,C,N). +static Status TransposeDataRank5(const TensorShape& input_shape, + gsl::span<const size_t> perm, + size_t elem_byte_size, + gsl::span<const uint8_t> input_buffer, + gsl::span<uint8_t> output_buffer) { + std::array<size_t, 5> input_strides = {}; + std::array<size_t, 5> output_strides = {}; + ORT_RETURN_IF_ERROR(GetTransposeStrides(input_shape, perm, input_strides, output_strides)); + + std::vector<size_t> perm_inverse(perm.size()); + ORT_RETURN_IF_ERROR(qnn::utils::InvertPerm(perm, perm_inverse)); + + for (int64_t d0 = 0; d0 < input_shape[0]; ++d0) { + for (int64_t d1 = 0; d1 < input_shape[1]; ++d1) { + for (int64_t d2 = 0; d2 < input_shape[2]; ++d2) { + for (int64_t d3 = 0; d3 < input_shape[3]; ++d3) { + for (int64_t d4 = 0; d4 < input_shape[4]; ++d4) { + const size_t src_elem_index = ((d0 * input_strides[0]) + + (d1 * input_strides[1]) + + (d2 * input_strides[2]) + + (d3 * input_strides[3]) + + (d4 * input_strides[4])); + const size_t dst_elem_index = ((d0 * output_strides[perm_inverse[0]]) + + (d1 * output_strides[perm_inverse[1]]) + + (d2 * output_strides[perm_inverse[2]]) + + (d3 * output_strides[perm_inverse[3]]) + + (d4 * output_strides[perm_inverse[4]])); + + const size_t src_byte_index = src_elem_index * elem_byte_size; + const size_t dst_byte_index = dst_elem_index * elem_byte_size; + assert(src_byte_index < input_buffer.size()); + assert(dst_byte_index < output_buffer.size()); + + std::memcpy(&output_buffer[dst_byte_index], &input_buffer[src_byte_index], elem_byte_size); + } + } + } + } + } return Status::OK(); } +Status BaseOpBuilder::TwoDimensionTranspose(const QnnModelWrapper& qnn_model_wrapper, + std::vector<uint32_t>& data_shape, + const onnx::TensorProto& initializer, + std::vector<uint8_t>& transposed_data) const { + ORT_RETURN_IF_NOT(data_shape.size() == 2, "Expected shape of rank 2"); + + std::array<size_t, 2> perm = {1, 0}; + std::vector<uint32_t> output_shape(data_shape.size()); + ORT_RETURN_IF_ERROR((qnn::utils::PermuteShape(data_shape, perm, output_shape))); + + auto onnx_type = static_cast<ONNX_NAMESPACE::TensorProto_DataType>(initializer.data_type()); + const size_t elem_byte_size = qnn::utils::GetElementSizeByType(onnx_type); + ORT_RETURN_IF_NOT(elem_byte_size != 0, "Can't get element byte size from given ONNX type"); + + std::vector<uint8_t> input_buffer; +
ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(initializer, input_buffer)); + transposed_data.resize(input_buffer.size()); + + for (size_t row = 0; row < data_shape[0]; row++) { + for (size_t col = 0; col < data_shape[1]; col++) { + const size_t src_elem_index = (row * data_shape[1] + col); + const size_t dst_elem_index = (col * output_shape[1] + row); + const size_t src_byte_index = src_elem_index * elem_byte_size; + const size_t dst_byte_index = dst_elem_index * elem_byte_size; + assert(src_byte_index < input_buffer.size()); + assert(dst_byte_index < transposed_data.size()); + + std::memcpy(&transposed_data[dst_byte_index], &input_buffer[src_byte_index], elem_byte_size); + } + } + + data_shape = std::move(output_shape); // Update parameter with final transposed shape + return Status::OK(); +} + +Status BaseOpBuilder::TransposeFromNchwToHwcn(const QnnModelWrapper& qnn_model_wrapper, + const onnx::TensorProto& initializer, + std::vector& transposed_data, + bool is_3d) const { + auto onnx_type = static_cast(initializer.data_type()); + const size_t elem_byte_size = qnn::utils::GetElementSizeByType(onnx_type); + std::vector input_shape = qnn::utils::GetInitializerShape(initializer); + std::vector input_buffer; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(initializer, input_buffer)); + transposed_data.resize(input_buffer.size()); + return TransposeFromNchwToHwcn(std::move(input_shape), elem_byte_size, input_buffer, transposed_data, is_3d); +} + +Status BaseOpBuilder::TransposeFromNchwToHwcn(std::vector&& original_input_shape_dims, + size_t elem_byte_size, + gsl::span input_buffer, + gsl::span output_buffer, + bool is_3d) const { + std::vector input_shape_dims = std::move(original_input_shape_dims); + const size_t rank = input_shape_dims.size(); + ORT_RETURN_IF_NOT((is_3d && rank == 5) || (!is_3d && rank == 4), "Only support input of rank 4 or 5 but got rank ", + rank); + ORT_RETURN_IF_NOT(output_buffer.size() == input_buffer.size(), + "Expected output buffer's size to equal the input buffer's size: ", + output_buffer.size(), " != ", input_buffer.size()); + ORT_RETURN_IF_NOT(elem_byte_size != 0, "Invalid element byte size due to potentially unsupported type"); + + if (!is_3d) { + input_shape_dims.push_back(1); // Make it 3D by making shape (N,C,H,W,1) + } + + return TransposeDataRank5(TensorShape::FromExistingBuffer(input_shape_dims), + nchw2hwcn_perm_3d, + elem_byte_size, + input_buffer, + output_buffer); +} + +Status BaseOpBuilder::TransposeFromCnhwToHwcn(const QnnModelWrapper& qnn_model_wrapper, + const onnx::TensorProto& initializer, + std::vector& transposed_data, + bool is_3d) const { + auto onnx_type = static_cast(initializer.data_type()); + const size_t elem_byte_size = qnn::utils::GetElementSizeByType(onnx_type); + std::vector input_shape = qnn::utils::GetInitializerShape(initializer); + std::vector input_buffer; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(initializer, input_buffer)); + transposed_data.resize(input_buffer.size()); + return TransposeFromCnhwToHwcn(std::move(input_shape), elem_byte_size, input_buffer, transposed_data, is_3d); +} + +Status BaseOpBuilder::TransposeFromCnhwToHwcn(std::vector&& original_input_shape_dims, + size_t elem_byte_size, + gsl::span input_buffer, + gsl::span output_buffer, + bool is_3d) const { + std::vector input_shape_dims = std::move(original_input_shape_dims); + const size_t rank = input_shape_dims.size(); + ORT_RETURN_IF_NOT((is_3d && rank == 5) || (!is_3d && rank == 4), "Only support input of rank 
4 or 5 but got rank ", + rank); + ORT_RETURN_IF_NOT(output_buffer.size() == input_buffer.size(), + "Expected output buffer's size to equal the input buffer's size: ", + output_buffer.size(), " != ", input_buffer.size()); + ORT_RETURN_IF_NOT(elem_byte_size != 0, "Invalid element byte size due to potentially unsupported type"); + + if (!is_3d) { + input_shape_dims.push_back(1); // Make it 3D by making shape (C,N,H,W,1) + } + + return TransposeDataRank5(TensorShape::FromExistingBuffer(input_shape_dims), + cnhw2hwcn_perm_3d, + elem_byte_size, + input_buffer, + output_buffer); +} + Status BaseOpBuilder::ProcessAxisAttribute(const QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, Qnn_Scalar_t& axis_qnn_scalar, diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h index 055c0f6ccf2fa..8e34b5d87cc68 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h @@ -3,11 +3,11 @@ #pragma once -#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/ort_api.h" +#include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder.h" #include "core/providers/qnn/builder/qnn_quant_params_wrapper.h" -#include "core/framework/allocator.h" #include "QnnOpDef.h" @@ -215,7 +215,8 @@ class BaseOpBuilder : public IOpBuilder { } // NCHW shape to channel last - Status NchwShapeToNhwc(const std::vector& nchw_shape, std::vector& nhwc_shape) const { + template + Status NchwShapeToNhwc(gsl::span nchw_shape, gsl::span nhwc_shape) const { ORT_RETURN_IF_NOT(nchw_shape.size() == 4, "shape should have 4 dimension NCHW."); nhwc_shape[0] = nchw_shape[0]; nhwc_shape[1] = nchw_shape[2]; @@ -226,7 +227,8 @@ class BaseOpBuilder : public IOpBuilder { } // NCHW shape to HWCN shape, required for Conv weight - Status NchwShapeToHwcn(const std::vector& nchw_shape, std::vector& hwcn_shape) const { + template + Status NchwShapeToHwcn(gsl::span nchw_shape, gsl::span hwcn_shape) const { if (nchw_shape.size() == 4) { hwcn_shape[0] = nchw_shape[2]; hwcn_shape[1] = nchw_shape[3]; @@ -246,7 +248,8 @@ class BaseOpBuilder : public IOpBuilder { } // CNHW shape to HWCN shape, required for Conv weight - Status CnhwShapeToHwcn(const std::vector& cnhw_shape, std::vector& hwcn_shape) const { + template + Status CnhwShapeToHwcn(gsl::span cnhw_shape, gsl::span hwcn_shape) const { if (cnhw_shape.size() == 4) { hwcn_shape[0] = cnhw_shape[2]; hwcn_shape[1] = cnhw_shape[3]; @@ -264,37 +267,31 @@ class BaseOpBuilder : public IOpBuilder { return Status::OK(); } - Status TransposeInitializer(const QnnModelWrapper& qnn_model_wrapper, - const onnx::TensorProto& initializer, - const std::vector& perm, - std::vector& transposed_data) const; Status TransposeFromNchwToHwcn(const QnnModelWrapper& qnn_model_wrapper, const onnx::TensorProto& initializer, std::vector& transposed_data, - bool is_3d = false) const { - auto& perm = is_3d ? 
nchw2hwcn_perm_3d : nchw2hwcn_perm; - return TransposeInitializer(qnn_model_wrapper, initializer, perm, transposed_data); - } + bool is_3d = false) const; + Status TransposeFromNchwToHwcn(std::vector&& input_shape_dims, + size_t elem_byte_size, + gsl::span input_buffer, + gsl::span output_buffer, + bool is_3d = false) const; Status TransposeFromCnhwToHwcn(const QnnModelWrapper& qnn_model_wrapper, const onnx::TensorProto& initializer, std::vector& transposed_data, - bool is_3d = false) const { - auto& perm = is_3d ? cnhw2hwcn_perm_3d : cnhw2hwcn_perm; - return TransposeInitializer(qnn_model_wrapper, initializer, perm, transposed_data); - } + bool is_3d = false) const; + Status TransposeFromCnhwToHwcn(std::vector&& input_shape_dims, + size_t elem_byte_size, + gsl::span input_buffer, + gsl::span output_buffer, + bool is_3d = false) const; Status TwoDimensionTranspose(const QnnModelWrapper& qnn_model_wrapper, std::vector& data_shape, const onnx::TensorProto& initializer, - std::vector& transposed_data) const { - auto tmp = data_shape[0]; - data_shape[0] = data_shape[1]; - data_shape[1] = tmp; - std::vector two_dim_trans_perm{1, 0}; - return TransposeInitializer(qnn_model_wrapper, initializer, two_dim_trans_perm, transposed_data); - } + std::vector& transposed_data) const; // Onnx Pads is [x1_begin, x2_begin, x1_end, x2_end], QNN requires [x1_begin, x1_end, x2_begin, x2_end] void ReArranagePads(std::vector& pads) const { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/batch_norm_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/batch_norm_op_builder.cc index 07abcf1c7bf84..14f50fa78c1a9 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/batch_norm_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/batch_norm_op_builder.cc @@ -5,16 +5,11 @@ #include #include -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" -#include "core/framework/float16.h" -#include "core/framework/tensorprotoutils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "base_op_builder.h" - namespace onnxruntime { namespace qnn { class BatchNormOpBuilder : public BaseOpBuilder { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/cast_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/cast_op_builder.cc index d3bdee02437e4..3139c05378171 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/cast_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/cast_op_builder.cc @@ -4,12 +4,11 @@ #include #include +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_utils.h" -#include "base_op_builder.h" - namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc index e5dc4d04afefd..23b3dfb063ba2 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc @@ -4,14 +4,11 @@ #include #include -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" +#include 
"core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_utils.h" -#include "base_op_builder.h" - namespace onnxruntime { namespace qnn { class ClipOpBuilder : public BaseOpBuilder { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc index 12887f0fb72d6..0f92778252d48 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc @@ -1,16 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" -#include "core/framework/tensorprotoutils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" #include "core/providers/qnn/builder/qnn_utils.h" -#include "base_op_builder.h" - namespace onnxruntime { namespace qnn { @@ -211,9 +206,9 @@ Status ConvOpBuilder::ProcessConv2D3DInputs(QnnModelWrapper& qnn_model_wrapper, // Change shape to HWCN, it could be initializer or normal input if (conv_type == OnnxConvType::kConv) { - ORT_RETURN_IF_ERROR(NchwShapeToHwcn(input_info.shape, actual_shape)); + ORT_RETURN_IF_ERROR(NchwShapeToHwcn(input_info.shape, actual_shape)); } else if (conv_type == OnnxConvType::kConvTranspose) { - ORT_RETURN_IF_ERROR(CnhwShapeToHwcn(input_info.shape, actual_shape)); + ORT_RETURN_IF_ERROR(CnhwShapeToHwcn(input_info.shape, actual_shape)); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN EP: Unexpected convolution op type: ", node_unit.OpType().c_str()); } @@ -413,9 +408,9 @@ Status ConvOpBuilder::ProcessConv1DInputs(QnnModelWrapper& qnn_model_wrapper, // Create the final shape after the weights are transposed to HWCN. if (conv_type == OnnxConvType::kConv) { - ORT_RETURN_IF_ERROR(NchwShapeToHwcn(shape_2d, final_shape)); + ORT_RETURN_IF_ERROR(NchwShapeToHwcn(shape_2d, final_shape)); } else if (conv_type == OnnxConvType::kConvTranspose) { - ORT_RETURN_IF_ERROR(CnhwShapeToHwcn(shape_2d, final_shape)); + ORT_RETURN_IF_ERROR(CnhwShapeToHwcn(shape_2d, final_shape)); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN EP: Unexpected convolution op type: ", node_unit.OpType().c_str()); } @@ -434,16 +429,6 @@ Status ConvOpBuilder::ProcessConv1DInputs(QnnModelWrapper& qnn_model_wrapper, return static_cast(dim); }); - const TensorShape tensor_shape = TensorShape::FromExistingBuffer(shape_2d_int64); // Does not own shape data. - const DataTypeImpl* tensor_dtype = DataTypeImpl::TensorTypeFromONNXEnum( - input_info.initializer_tensor->data_type()) - ->GetElementType(); - ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_info.initializer_tensor, unpacked_tensor)); - - Tensor tensor_2d(tensor_dtype, tensor_shape, unpacked_tensor.data(), OrtMemoryInfo{}); // Does not own data. - ONNX_NAMESPACE::TensorProto reshaped_initializer = onnxruntime::utils::TensorToTensorProto(tensor_2d, - reshape_output); - // The reshape (unsqueeze) may require us to shift the quant parameter's axis. 
if (input_info.quant_param.IsPerChannel()) { ORT_RETURN_IF_ERROR(input_info.quant_param.HandleUnsqueeze(input_info.shape, shape_2d)); @@ -452,10 +437,21 @@ Status ConvOpBuilder::ProcessConv1DInputs(QnnModelWrapper& qnn_model_wrapper, // // Get transposed initializer bytes. // + std::vector original_tensor_bytes; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_info.initializer_tensor, + original_tensor_bytes)); + unpacked_tensor.resize(original_tensor_bytes.size()); + const size_t elem_byte_size = qnn::utils::GetElementSizeByType( + static_cast(input_info.initializer_tensor->data_type())); + ORT_RETURN_IF(elem_byte_size == 0, "Can't get element byte size from given ONNX type for initializer ", + input1_name.c_str()); + if (conv_type == OnnxConvType::kConv) { - ORT_RETURN_IF_ERROR(TransposeFromNchwToHwcn(qnn_model_wrapper, reshaped_initializer, unpacked_tensor)); + ORT_RETURN_IF_ERROR(TransposeFromNchwToHwcn(std::move(shape_2d_int64), elem_byte_size, original_tensor_bytes, + unpacked_tensor, /*is_3d*/ false)); } else if (conv_type == OnnxConvType::kConvTranspose) { - ORT_RETURN_IF_ERROR(TransposeFromCnhwToHwcn(qnn_model_wrapper, reshaped_initializer, unpacked_tensor)); + ORT_RETURN_IF_ERROR(TransposeFromCnhwToHwcn(std::move(shape_2d_int64), elem_byte_size, original_tensor_bytes, + unpacked_tensor, /*is_3d*/ false)); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN EP: Unexpected convolution op type: ", node_unit.OpType().c_str()); } diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc index 64f676aaa9875..2bae3452199a5 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc @@ -1,14 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_utils.h" -#include "core/common/safeint.h" - -#include "base_op_builder.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/gather_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/gather_op_builder.cc index 5549716751d4b..d25ec3f333bf1 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/gather_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/gather_op_builder.cc @@ -2,14 +2,10 @@ // Licensed under the MIT License. 
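The conv builder change above removes the Tensor/TensorProto round trip when transposing weights: the initializer is unpacked to raw bytes once, and the new base-op helpers copy each element directly to its permuted offset using the element byte size. A minimal sketch of that byte-level approach for the rank-2 case follows (standalone; Transpose2D is an illustrative name, not an ONNX Runtime API); the real helpers generalize this to rank 5 with precomputed strides.

    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Transpose a rows x cols row-major buffer of opaque elements into a
    // cols x rows row-major buffer, elem_byte_size bytes per element.
    void Transpose2D(const std::vector<uint8_t>& src, std::vector<uint8_t>& dst,
                     size_t rows, size_t cols, size_t elem_byte_size) {
        dst.resize(src.size());
        for (size_t r = 0; r < rows; ++r) {
            for (size_t c = 0; c < cols; ++c) {
                const size_t src_byte = (r * cols + c) * elem_byte_size;  // (r, c) in input
                const size_t dst_byte = (c * rows + r) * elem_byte_size;  // (c, r) in output
                std::memcpy(&dst[dst_byte], &src[src_byte], elem_byte_size);
            }
        }
    }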
#include -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_utils.h" -#include "core/common/safeint.h" - -#include "base_op_builder.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/gemm_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/gemm_op_builder.cc index eeee26c177281..76bc766d2b04d 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/gemm_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/gemm_op_builder.cc @@ -1,14 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_utils.h" -#include "core/common/safeint.h" - -#include "base_op_builder.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/instance_norm_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/instance_norm_op_builder.cc index 4b8d079c0062a..d77d9534bf1c4 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/instance_norm_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/instance_norm_op_builder.cc @@ -1,16 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" -#include "core/framework/tensorprotoutils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" -#include "onnx/defs/data_type_utils.h" - -#include "base_op_builder.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc index d1a0e88686f39..fc92f42b376bc 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc @@ -2,16 +2,10 @@ // Licensed under the MIT License. 
#include -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" -#include "core/framework/tensorprotoutils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" -#include "onnx/defs/data_type_utils.h" - -#include "base_op_builder.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/lrn_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/lrn_op_builder.cc index 2f66069b6609e..3c9bdf0e7f8aa 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/lrn_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/lrn_op_builder.cc @@ -2,11 +2,9 @@ // Licensed under the MIT License. #include "core/providers/qnn/builder/opbuilder/base_op_builder.h" -#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" -#include "onnx/defs/data_type_utils.h" #include "QnnOpDef.h" // From QNN SDK: contains QNN constants (e.g., op names, param values). diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc index 850fd2875818e..5a158af8d542a 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc @@ -1,13 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/common/safeint.h" -#include "core/providers/common.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/qnn_utils.h" -#include "core/providers/shared/utils/utils.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc index 5fc6d42a8a179..40e0ccdd4a6dd 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc @@ -1,15 +1,9 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/providers/cpu/tensor/slice_helper.h" -#include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" - -#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/pool_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/pool_op_builder.cc index ef1990ad8e69a..795886fa255ed 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/pool_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/pool_op_builder.cc @@ -1,16 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" -#include "core/framework/tensorprotoutils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" -#include "onnx/defs/data_type_utils.h" - -#include "base_op_builder.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc index 77bc58bd6f833..a98110bc96fb2 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc @@ -2,16 +2,13 @@ // Licensed under the MIT License. #include -#include #include +#include +#include #include -#include "core/common/safeint.h" -#include "onnx/defs/data_type_utils.h" -#include "core/providers/common.h" -#include "core/framework/endian_utils.h" -#include "core/providers/shared/utils/utils.h" #include "core/providers/qnn/builder/opbuilder/base_op_builder.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/qnn_utils.h" @@ -71,7 +68,7 @@ class ReduceOpBuilder : public BaseOpBuilder { using AxesQnnIntType = uint32_t; Status GetAxesSet(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, - InlinedHashSet& axes_set) const; + std::set& axes_set) const; // Maps an operator type to the opset in which "axes" became an input instead of an attribute. 
static const std::array opset_with_axes_as_input; @@ -87,7 +84,7 @@ const std::array ReduceOpBuilder::opset_with_axes_as_ }; Status ReduceOpBuilder::GetAxesSet(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, - InlinedHashSet<AxesOnnxIntType>& axes_set) const { + std::set<AxesOnnxIntType>& axes_set) const { ReduceOpType reduce_op_type = GetReduceOpType(node_unit.OpType()); if (reduce_op_type == ReduceOpType::REDUCE_OP_TYPE_UNKNOWN) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN EP: Unknown reduce operator ", node_unit.OpType()); @@ -146,10 +143,7 @@ Status ReduceOpBuilder::GetAxesSet(QnnModelWrapper& qnn_model_wrapper, const Nod auto src_span = gsl::make_span(axes_bytes.data(), axes_bytes.size()); auto dst_span = gsl::make_span(reduce_axes.data(), reduce_axes.size()); - // Copy initializer bytes (stored in little-endian order) to vector of int64_t. - // ReadLittleEndian returns a status error if the source and destination spans do not have - // matching byte sizes. - ORT_RETURN_IF_ERROR(onnxruntime::utils::ReadLittleEndian(src_span, dst_span)); + std::memcpy(dst_span.data(), src_span.data(), src_span.size_bytes()); } } @@ -218,7 +212,7 @@ Status ReduceOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_w // // Handle axes param. // - InlinedHashSet<AxesOnnxIntType> axes_set; + std::set<AxesOnnxIntType> axes_set; ORT_RETURN_IF_ERROR(GetAxesSet(qnn_model_wrapper, node_unit, axes_set)); const size_t num_axes = axes_set.size(); diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/reshape_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/reshape_op_builder.cc index b6f414da950d8..6fd67a72b64e1 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/reshape_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/reshape_op_builder.cc @@ -1,15 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License.
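In the reduce hunk above, ReadLittleEndian is replaced by a plain std::memcpy. That is sound only because ONNX stores initializer bytes in little-endian order and the platforms QNN EP targets are little-endian, so no byte swapping is needed. A minimal sketch of the decode under that assumption (hypothetical helper name):

#include <cstdint>
#include <cstring>
#include <vector>

// Reinterprets raw little-endian initializer bytes as int64 axes values.
// Assumes bytes.size() is a multiple of sizeof(int64_t) and a little-endian host.
static std::vector<int64_t> DecodeAxes(const std::vector<uint8_t>& bytes) {
  std::vector<int64_t> axes(bytes.size() / sizeof(int64_t));
  std::memcpy(axes.data(), bytes.data(), axes.size() * sizeof(int64_t));
  return axes;
}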
-#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" -#include "core/framework/tensorprotoutils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "base_op_builder.h" - namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc index c62fca88b6ec2..5e173b7aff030 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc @@ -5,17 +5,10 @@ #include #include -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" -#include "core/framework/tensorprotoutils.h" -#include "core/providers/qnn/builder/qnn_model_wrapper.h" -#include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/providers/cpu/tensor/slice_helper.h" -#include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" - #include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" +#include "core/providers/qnn/builder/qnn_model_wrapper.h" +#include "core/providers/qnn/builder/op_builder_factory.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc index cf8726675434f..48c637cd2e951 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc @@ -1,16 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" -#include "core/framework/tensorprotoutils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_utils.h" -#include "core/common/safeint.h" -#include "core/util/qmath.h" - -#include "base_op_builder.h" namespace onnxruntime { namespace qnn { @@ -170,15 +164,16 @@ Status ProcessAlphaAttributeAsInput(QnnModelWrapper& qnn_model_wrapper, // Check LeakyRelu input 0 to see if it's quantized tensor bool is_quantized_tensor = node_unit.Outputs()[0].quant_param.has_value(); if (is_quantized_tensor) { - float scale; - uint8_t zero_point; - int64_t num_of_elements = 1; - concurrency::ThreadPool* thread_pool = nullptr; - GetQuantizationParameter(&tensor_data.alpha, num_of_elements, scale, zero_point, thread_pool); - unpacked_data.resize(1); - ParQuantizeLinearStd(&tensor_data.alpha, unpacked_data.data(), num_of_elements, scale, zero_point, thread_pool); - quantize_param = QnnQuantParamsWrapper(scale, static_cast(zero_point)); qnn_data_type = QNN_DATATYPE_UFIXED_POINT_8; + std::array scales = {1.0f}; + std::array offsets = {0}; + std::array shape = {1}; + auto float_data = gsl::make_span(&tensor_data.alpha, 1); + ORT_RETURN_IF_ERROR(qnn::utils::GetDataQuantParams(float_data, shape, scales, offsets, qnn_data_type)); + + unpacked_data.resize(1); + ORT_RETURN_IF_ERROR(qnn::utils::QuantizeData(float_data, shape, scales, offsets, unpacked_data, qnn_data_type)); + quantize_param = QnnQuantParamsWrapper(scales[0], static_cast(offsets[0])); } else { const auto& inputs = node_unit.Inputs(); TensorInfo input_info = {}; diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc index b033c8723ea86..fcc7d27c3ada4 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc @@ -1,17 +1,12 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/cpu/tensor/slice_helper.h" -#include "core/framework/tensorprotoutils.h" - -#include "base_op_builder.h" - namespace onnxruntime { namespace qnn { @@ -86,26 +81,22 @@ static Status GetInitializerInputData(const NodeUnitIODef& input, const QnnModel ORT_RETURN_IF_NOT(initializer_proto->has_data_type(), "Expected initializer ", input_name.c_str(), " to have a proto data type."); - // Create empty Tensor. - const auto* dtype = DataTypeImpl::TensorTypeFromONNXEnum(initializer_proto->data_type())->GetElementType(); - TensorShape shape = onnxruntime::utils::GetTensorShapeFromTensorProto(*initializer_proto); - Tensor tensor(dtype, shape, std::make_shared()); - - // Deserialize initializer into Tensor. 
- ORT_RETURN_IF_ERROR(onnxruntime::utils::TensorProtoToTensor( - onnxruntime::Env::Default(), qnn_model_wrapper.GetGraphViewer().ModelPath(), *initializer_proto, tensor)); + // Deserialize initializer into byte buffer + std::vector<uint8_t> initializer_bytes; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*initializer_proto, initializer_bytes)); Status status; // Copy Tensor of int32_t or int64_t elems into output (int64_ts). - if (tensor.IsDataType<int64_t>()) { - gsl::span<const int64_t> tensor_elems = tensor.DataAsSpan<int64_t>(); + auto onnx_type = static_cast<ONNX_NAMESPACE::TensorProto_DataType>(initializer_proto->data_type()); + if (onnx_type == ONNX_NAMESPACE::TensorProto_DataType_INT64) { + gsl::span<const int64_t> tensor_elems = ReinterpretAsSpan<const int64_t>(initializer_bytes); output.insert(output.end(), tensor_elems.begin(), tensor_elems.end()); - } else if (tensor.IsDataType<int32_t>()) { - gsl::span<const int32_t> tensor_elems = tensor.DataAsSpan<int32_t>(); + } else if (onnx_type == ONNX_NAMESPACE::TensorProto_DataType_INT32) { + gsl::span<const int32_t> tensor_elems = ReinterpretAsSpan<const int32_t>(initializer_bytes); output.insert(output.end(), tensor_elems.begin(), tensor_elems.end()); } else { - status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Data type ", DataTypeImpl::ToString(dtype), + status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Data type ", onnx_type, " is not supported for Slice initializer input ", input.node_arg.Name().c_str()); } diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/softmax_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/softmax_op_builder.cc index b62534bacf426..7326523737383 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/softmax_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/softmax_op_builder.cc @@ -1,15 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" -#include "core/framework/tensorprotoutils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" - -#include "base_op_builder.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/split_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/split_op_builder.cc index ba5ad2cf03cef..1db9a8f1e3e15 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/split_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/split_op_builder.cc @@ -1,16 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License.
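The slice hunk above views the unpacked initializer bytes in place instead of deserializing them into a Tensor. A sketch of the zero-copy reinterpretation it relies on (hypothetical helper; the real ReinterpretAsSpan is an ORT utility):

#include <cstdint>
#include <vector>
#include <gsl/gsl>

// Views a byte buffer as int64 elements without copying. Valid only when the
// buffer size is a multiple of sizeof(int64_t) and the data is suitably aligned.
static gsl::span<const int64_t> ViewAsInt64(const std::vector<uint8_t>& bytes) {
  return {reinterpret_cast<const int64_t*>(bytes.data()), bytes.size() / sizeof(int64_t)};
}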
-#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/cpu/tensor/slice_helper.h" -#include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" - -#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/tile_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/tile_op_builder.cc index 851ca84dce075..1d518c3ed5359 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/tile_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/tile_op_builder.cc @@ -1,16 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/cpu/tensor/slice_helper.h" -#include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" - -#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/topk.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/topk.cc index d22c0811682d0..adaa13912ae50 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/topk.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/topk.cc @@ -1,8 +1,8 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" -#include "core/framework/utils.h" #include "core/providers/qnn/builder/op_builder_factory.h" +#include "core/providers/qnn/builder/qnn_utils.h" namespace onnxruntime { namespace qnn { const int TOPK_MIN_INPUT = 2; diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/transpose_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/transpose_op_builder.cc index a42d7312f0203..bcd8a6d0f78f6 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/transpose_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/transpose_op_builder.cc @@ -4,12 +4,11 @@ #include #include +#include "core/providers/qnn/ort_api.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_utils.h" -#include "core/common/safeint.h" - -#include "base_op_builder.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc index 077e89a6c701c..8df4e5bb3ba39 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc @@ -17,18 +17,14 @@ #include "HTP/QnnHtpSystemContext.h" #include "Saver/QnnSaver.h" #include -#include "core/framework/endian_utils.h" -#include "core/common/logging/capture.h" + +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/qnn_allocator.h" +#include "core/providers/qnn/qnn_telemetry.h" #include "core/providers/qnn/builder/onnx_ctx_model_helper.h" #include "core/providers/qnn/builder/qnn_configs_helper.h" #include "core/providers/qnn/builder/qnn_utils.h" -#ifdef _WIN32 -#include -#include "core/platform/tracing.h" -#endif - // Flag to determine if Backend should do node validation for each opNode added #define DO_GRAPH_NODE_VALIDATIONS 1 @@ -262,12 +258,12 @@ void QnnLogging(const char* format, const auto data_type = ::onnxruntime::logging::DataType::SYSTEM; if (logger.OutputIsEnabled(severity, data_type)) { - ::onnxruntime::logging::Capture(logger, - severity, - ::onnxruntime::logging::Category::onnxruntime, - data_type, - ORT_WHERE) - .ProcessPrintf(format, argument_parameter); + auto log_capture = Factory::Create(logger, + severity, + logging::Category::onnxruntime, + data_type, + ORT_WHERE); + log_capture->ProcessPrintf(format, argument_parameter); } } @@ -408,25 +404,25 @@ Status QnnBackendManager::CreateDevice() { // Set SoC Model. The *enum* Qnn_SocModel_t is deprecated and will not be updated in the future. Therefore, // must use the latest SDK documentation to get the SoC model of the latest HW. 
if (soc_model_ != QNN_SOC_MODEL_UNKNOWN) { - QnnHtpDevice_CustomConfig_t& custom_config = device_configs_builder.PushCustomConfig(); - custom_config.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; - custom_config.socModel = soc_model_; + gsl::not_null<QnnHtpDevice_CustomConfig_t*> custom_config = device_configs_builder.PushCustomConfig(); + custom_config->option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; + custom_config->socModel = soc_model_; - QnnDevice_Config_t& device_config = device_configs_builder.PushConfig(); - device_config.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; - device_config.customConfig = &custom_config; + gsl::not_null<QnnDevice_Config_t*> device_config = device_configs_builder.PushConfig(); + device_config->option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + device_config->customConfig = custom_config; } // Set the minimum HTP architecture. The driver will use ops that are compatible with this minimum architecture. if (htp_arch_ != QNN_HTP_DEVICE_ARCH_NONE) { - QnnHtpDevice_CustomConfig_t& custom_config = device_configs_builder.PushCustomConfig(); - custom_config.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; - custom_config.arch.arch = htp_arch_; - custom_config.arch.deviceId = device_id_; - - QnnDevice_Config_t& device_config = device_configs_builder.PushConfig(); - device_config.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; - device_config.customConfig = &custom_config; + gsl::not_null<QnnHtpDevice_CustomConfig_t*> custom_config = device_configs_builder.PushCustomConfig(); + custom_config->option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; + custom_config->arch.arch = htp_arch_; + custom_config->arch.deviceId = device_id_; + + gsl::not_null<QnnDevice_Config_t*> device_config = device_configs_builder.PushConfig(); + device_config->option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + device_config->customConfig = custom_config; } } @@ -1163,15 +1159,16 @@ Status QnnBackendManager::ExtractBackendProfilingInfo() { } bool tracelogging_provider_ep_enabled = false; - const Env& env = Env::Default(); - auto& provider = env.GetTelemetryProvider(); - auto level = provider.Level(); +#ifdef _WIN32 + auto& provider = QnnTelemetry::Instance(); if (provider.IsEnabled()) { + auto level = provider.Level(); auto keyword = provider.Keyword(); if ((keyword & static_cast<uint64_t>(onnxruntime::logging::ORTTraceLoggingKeyword::Profiling)) != 0 && level >= 5) { tracelogging_provider_ep_enabled = true; } } +#endif // defined(_WIN32) // ETW disabled previously, but enabled now if (ProfilingLevel::INVALID == profiling_level_etw_ && tracelogging_provider_ep_enabled) { @@ -1389,18 +1386,8 @@ void QnnBackendManager::LogQnnProfileEventAsTraceLogging( const std::string& timingSource, const std::string& eventLevel, const char* eventIdentifier) { - TraceLoggingWrite( - telemetry_provider_handle, - "QNNProfilingEvent", - TraceLoggingKeyword(static_cast<uint64_t>(onnxruntime::logging::ORTTraceLoggingKeyword::Profiling)), - TraceLoggingLevel(WINEVENT_LEVEL_VERBOSE), - TraceLoggingValue(timestamp, "Timestamp"), - TraceLoggingString(message.c_str(), "Message"), - TraceLoggingString(qnnScalarValue.c_str(), "Value"), - TraceLoggingString(unit.c_str(), "Unit of Measurement"), - TraceLoggingString(timingSource.c_str(), "Timing Source"), - TraceLoggingString(eventLevel.c_str(), "Event Level"), - TraceLoggingString(eventIdentifier, "Event Identifier")); + QnnTelemetry& qnn_telemetry = QnnTelemetry::Instance(); + qnn_telemetry.LogQnnProfileEvent(timestamp, message, qnnScalarValue, unit, timingSource, eventLevel, eventIdentifier); } #endif @@ -1552,7 +1539,8 @@ void* QnnBackendManager::LoadLib(const char* file_name, int flags, std::string& auto file_path =
std::filesystem::path(file_name); if (!file_path.is_absolute()) { // construct an absolute path from ORT runtime path + file_name and check whether it exists. - auto pathstring = Env::Default().GetRuntimePath() + ToPathString(file_name); + const Env& env = GetDefaultEnv(); + auto pathstring = env.GetRuntimePath() + ToPathString(file_name); auto absolute_path = pathstring.c_str(); if (std::filesystem::exists(std::filesystem::path(absolute_path))) { // load library from absolute path and search for dependencies there. diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h index 685e03f17cdd3..4a69859a7e841 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h @@ -22,9 +22,8 @@ #include "QnnLog.h" #include "QnnTypes.h" #include "System/QnnSystemInterface.h" -#include "core/common/status.h" -#include "core/common/logging/logging.h" -#include "core/common/path_string.h" + +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_context_mem_handle_manager.h" #include "core/providers/qnn/builder/qnn_def.h" diff --git a/onnxruntime/core/providers/qnn/builder/qnn_configs_helper.h b/onnxruntime/core/providers/qnn/builder/qnn_configs_helper.h index 9dd9bbaa08d64..b581cd90537d9 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_configs_helper.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_configs_helper.h @@ -3,7 +3,8 @@ #pragma once -#include +#include +#include namespace onnxruntime { namespace qnn { @@ -49,9 +50,9 @@ class QnnConfigsBuilder { * * \return A reference to a default CustomConfigType object. */ - CustomConfigType& PushCustomConfig() { - custom_configs_.push_back(custom_config_init_); - return custom_configs_.back(); + gsl::not_null<CustomConfigType*> PushCustomConfig() { + custom_configs_.push_back(std::make_unique<CustomConfigType>(custom_config_init_)); + return custom_configs_.back().get(); } /** @@ -60,15 +61,15 @@ class QnnConfigsBuilder { * * \return A reference to a default BaseConfigType object. */ - BaseConfigType& PushConfig() { - configs_.push_back(base_config_init_); - BaseConfigType& config = configs_.back(); + gsl::not_null<BaseConfigType*> PushConfig() { + configs_.push_back(std::make_unique<BaseConfigType>(base_config_init_)); + BaseConfigType* config = configs_.back().get(); // Add pointer to this new config to the list of config pointers. if (IsNullTerminated()) { - config_ptrs_.back() = &config; // Replace last nullptr entry. + config_ptrs_.back() = config; // Replace last nullptr entry. } else { - config_ptrs_.push_back(&config); + config_ptrs_.push_back(config); } return config; @@ -81,9 +82,14 @@ BaseConfigType base_config_init_; CustomConfigType custom_config_init_; - InlinedVector<CustomConfigType> custom_configs_; - InlinedVector<BaseConfigType> configs_; - InlinedVector<BaseConfigType*> config_ptrs_; + + // Store elements as unique_ptrs instead of by value because std::vector reallocation would change the + // location of elements in memory. BaseConfigType objects may contain pointers to CustomConfigType objects, + // so we need to make sure that pointers to these objects are stable in memory.
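// Illustrative sketch of the hazard being avoided (not code from this file): with
// by-value storage, growth can reallocate and move elements, so a previously
// handed-out pointer dangles.
//
//   std::vector<QnnDevice_Config_t> by_value;
//   by_value.push_back(config_a);
//   QnnDevice_Config_t* p = &by_value.back();
//   by_value.push_back(config_b);  // may reallocate; p now dangles
//
// Heap-allocating each element behind a unique_ptr keeps every element's address
// stable no matter how the owning vector grows.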
+ std::vector<std::unique_ptr<CustomConfigType>> custom_configs_; + std::vector<std::unique_ptr<BaseConfigType>> configs_; + + std::vector<BaseConfigType*> config_ptrs_; }; } // namespace qnn diff --git a/onnxruntime/core/providers/qnn/builder/qnn_context_mem_handle_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_context_mem_handle_manager.cc index 22bbc2d48e8e4..4d868c6ab96f6 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_context_mem_handle_manager.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_context_mem_handle_manager.cc @@ -5,7 +5,7 @@ #include "HTP/QnnHtpMem.h" -#include "core/common/common.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_def.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/qnn_allocator.h" diff --git a/onnxruntime/core/providers/qnn/builder/qnn_context_mem_handle_manager.h b/onnxruntime/core/providers/qnn/builder/qnn_context_mem_handle_manager.h index 397ea8bad6d9a..0dd8a8466d1cf 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_context_mem_handle_manager.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_context_mem_handle_manager.h @@ -9,10 +9,7 @@ #include "QnnInterface.h" -#include "core/common/common.h" -#include "core/common/inlined_containers.h" -#include "core/common/logging/logging.h" -#include "core/common/status.h" +#include "core/providers/qnn/ort_api.h" namespace onnxruntime::qnn { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_def.h b/onnxruntime/core/providers/qnn/builder/qnn_def.h index f0619eb218245..148fa115d40e5 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_def.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_def.h @@ -9,8 +9,7 @@ #include #include #include -#include "core/graph/basic_types.h" -#include "core/common/common.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_quant_params_wrapper.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model.cc b/onnxruntime/core/providers/qnn/builder/qnn_model.cc index 5f8b7f35eea8b..a9ccb9cc15206 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_model.cc @@ -7,15 +7,12 @@ #include #include "QnnOpDef.h" -#include "core/framework/utils.h" -#include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h" -#include "core/optimizer/qdq_transformer/selectors_actions/shared/utils.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_node_group.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/qnn_allocator.h" #include "core/providers/qnn/shared_context.h" -#include "core/providers/shared/utils/utils.h" namespace onnxruntime { namespace qnn { @@ -104,7 +101,7 @@ Status QnnModel::ComposeGraph(const GraphViewer& graph_viewer, // valid throughout the lifetime of the ModelBuilder std::vector<std::unique_ptr<NodeUnit>> node_unit_holder; std::unordered_map<const Node*, const NodeUnit*> node_unit_map; - std::tie(node_unit_holder, node_unit_map) = QDQ::GetAllNodeUnits(graph_viewer, logger); + std::tie(node_unit_holder, node_unit_map) = GetQDQNodeUnits(graph_viewer, logger); // This name must be same with the EPContext node name const auto& graph_name = fused_node.Name(); diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model.h b/onnxruntime/core/providers/qnn/builder/qnn_model.h index 2f220e708c50e..3a2a080aa391f 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_model.h @@ -6,14 +6,11 @@ #include #include
-#include "core/common/status.h" -#include "core/framework/node_unit.h" -#include "core/graph/graph_viewer.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_def.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/qnn_backend_manager.h" #include "core/providers/qnn/rpcmem_library.h" -#include "core/session/onnxruntime_cxx_api.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc index 79f8f176a2e76..6bd12959afbdf 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc @@ -1,6 +1,8 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include "core/providers/qnn/builder/qnn_model_wrapper.h" + #include #include #include @@ -8,10 +10,7 @@ #include #include -#include "qnn_model_wrapper.h" -#include "core/common/safeint.h" -#include "core/framework/tensorprotoutils.h" -#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_utils.h" namespace onnxruntime { @@ -461,7 +460,7 @@ Status QnnModelWrapper::IsPerChannelQuantized(const onnxruntime::NodeUnitIODef& ORT_RETURN_IF(iter == graph_initializers.end(), "Unable to find initializer for scale(s): ", scale_name.c_str()); gsl::not_null scale_tensor_proto = iter->second; - TensorShape scale_shape = onnxruntime::utils::GetTensorShapeFromTensorProto(*scale_tensor_proto); + TensorShape scale_shape(qnn::utils::GetInitializerShape(*scale_tensor_proto)); // Check the number of scale values to determine if the tensor is per-channel. // This is consistent with CPU EP's Quant/Dequant logic. We can't use the presence of an axis because even a @@ -636,29 +635,13 @@ Status QnnModelWrapper::UnpackInitializerData(const ONNX_NAMESPACE::TensorProto& // If this is an int4, we need to unpack it because QNN treats int4 as a full int8. if (onnx_data_type == ONNX_NAMESPACE::TensorProto_DataType_INT4) { - TensorShape shape = onnxruntime::utils::GetTensorShapeFromTensorProto(initializer); - const size_t num_elems = shape.Size(); - std::vector packed_int4_bytes = std::move(unpacked_tensor); - unpacked_tensor = std::vector(num_elems); - - auto dst = gsl::make_span(reinterpret_cast(unpacked_tensor.data()), unpacked_tensor.size()); - auto src = gsl::make_span(reinterpret_cast(packed_int4_bytes.data()), packed_int4_bytes.size()); - ORT_RETURN_IF_NOT(Int4x2::Unpack(dst, src), "Failed to unpack Tensor for QNN"); - - // NOTE: Masking off top 4 bits to workaround a QNN INT4 accuracy bug. - // Docs explicitly state that masking off top 4 bits should not be required. 
- for (size_t i = 0; i < dst.size(); i++) { - dst[i] &= 0x0F; // -3 (0b1111_1101) becomes 13 (0b0000_1101) - } + TensorShape shape(qnn::utils::GetInitializerShape(initializer)); + const size_t num_int4_elems = shape.Size(); + ORT_RETURN_IF_ERROR(qnn::utils::UnpackInt4ToInt8(num_int4_elems, unpacked_tensor)); } else if (onnx_data_type == ONNX_NAMESPACE::TensorProto_DataType_UINT4) { - TensorShape shape = onnxruntime::utils::GetTensorShapeFromTensorProto(initializer); - const size_t num_elems = shape.Size(); - std::vector packed_int4_bytes = std::move(unpacked_tensor); - unpacked_tensor = std::vector(num_elems); - - auto dst = gsl::make_span(reinterpret_cast(unpacked_tensor.data()), unpacked_tensor.size()); - auto src = gsl::make_span(reinterpret_cast(packed_int4_bytes.data()), packed_int4_bytes.size()); - ORT_RETURN_IF_NOT(UInt4x2::Unpack(dst, src), "Failed to unpack Tensor for QNN"); + TensorShape shape(qnn::utils::GetInitializerShape(initializer)); + const size_t num_uint4_elems = shape.Size(); + ORT_RETURN_IF_ERROR(qnn::utils::UnpackInt4ToInt8(num_uint4_elems, unpacked_tensor)); } return Status::OK(); diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h index d018ca12d6451..203250204d7f8 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h @@ -7,13 +7,10 @@ #include #include -#include "core/common/status.h" #include "QnnInterface.h" #include "qnn_def.h" -#include "core/common/logging/logging.h" -#include "core/framework/node_unit.h" -#include "core/graph/graph_viewer.h" -#include "core/providers/shared/utils/utils.h" + +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_quant_params_wrapper.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group.h b/onnxruntime/core/providers/qnn/builder/qnn_node_group.h index f9ef01411310f..276fbaae3b3c9 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group.h @@ -8,8 +8,7 @@ #include #include -#include "core/common/logging/logging.h" -#include "core/framework/node_unit.h" +#include "core/providers/qnn/ort_api.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.cc b/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.cc index caf4725626338..3af2fdd1f0276 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.cc @@ -6,9 +6,8 @@ #include #include #include -#include "core/graph/graph_utils.h" -#include "core/framework/node_unit.h" -#include "core/providers/shared/utils/utils.h" + +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_node_group/utils.h" diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.h b/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.h index 90fe44c3af059..d3d552bc172ec 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.h @@ -7,8 +7,7 @@ #include #include -#include "core/common/common.h" -#include "core/framework/node_unit.h" +#include "core/providers/qnn/ort_api.h" #include 
"core/providers/qnn/builder/qnn_node_group.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.cc b/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.cc index 76b1726646486..5094ad96724f5 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.cc @@ -6,9 +6,8 @@ #include #include #include -#include "core/graph/graph_utils.h" -#include "core/framework/node_unit.h" -#include "core/providers/shared/utils/utils.h" + +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.h b/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.h index 3b67f13492a46..0a1b16d24ffcd 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.h @@ -7,8 +7,7 @@ #include #include -#include "core/common/common.h" -#include "core/framework/node_unit.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_node_group.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.cc b/onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.cc index c398d1fae5097..e947da1a60e7a 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.cc @@ -10,8 +10,7 @@ #include #include #include -#include "core/graph/graph_utils.h" -#include "core/framework/node_unit.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.cc b/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.cc index 5548d7d37c378..93b2fca296389 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.cc @@ -4,8 +4,7 @@ #include #include -#include "core/graph/graph_viewer.h" -#include "core/framework/node_unit.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_node_group.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.h b/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.h index 0d11d21906ccb..c4cf4e8a20a92 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.h @@ -7,8 +7,7 @@ #include #include -#include "core/graph/graph_viewer.h" -#include "core/framework/node_unit.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_node_group.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h index 23330f5616d73..01c15cf4bebe6 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h +++ 
b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h @@ -4,10 +4,10 @@ #pragma once #include #include -#include "QnnTypes.h" -#include "core/common/common.h" #include -#include "core/framework/node_unit.h" + +#include "core/providers/qnn/ort_api.h" +#include "QnnTypes.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc index 08d3120260cea..56c3d3e803d9b 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc @@ -5,14 +5,13 @@ #include #include +#include #include #include #include #include -#include "core/common/common.h" -#include "core/common/safeint.h" -#include "core/framework/data_types.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_def.h" namespace onnxruntime { @@ -66,6 +65,42 @@ size_t GetElementSizeByType(ONNXTensorElementDataType elem_type) { return pos->second; } +size_t GetElementSizeByType(ONNX_NAMESPACE::TensorProto_DataType onnx_type) { + switch (onnx_type) { + case ONNX_NAMESPACE::TensorProto_DataType_INT4: + return sizeof(Int4x2); + case ONNX_NAMESPACE::TensorProto_DataType_UINT4: + return sizeof(UInt4x2); + case ONNX_NAMESPACE::TensorProto_DataType_INT8: + return sizeof(int8_t); + case ONNX_NAMESPACE::TensorProto_DataType_UINT8: + return sizeof(uint8_t); + case ONNX_NAMESPACE::TensorProto_DataType_INT16: + return sizeof(int16_t); + case ONNX_NAMESPACE::TensorProto_DataType_UINT16: + return sizeof(uint16_t); + case ONNX_NAMESPACE::TensorProto_DataType_INT32: + return sizeof(int32_t); + case ONNX_NAMESPACE::TensorProto_DataType_UINT32: + return sizeof(uint32_t); + case ONNX_NAMESPACE::TensorProto_DataType_INT64: + return sizeof(int64_t); + case ONNX_NAMESPACE::TensorProto_DataType_UINT64: + return sizeof(uint64_t); + case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16: + return 2; + case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: + return sizeof(float); + case ONNX_NAMESPACE::TensorProto_DataType_DOUBLE: + return sizeof(double); + case ONNX_NAMESPACE::TensorProto_DataType_BOOL: + return sizeof(bool); + default: + return 0; + } + // Unreachable +} + size_t GetQnnTensorDataSizeInBytes(gsl::span shape, Qnn_DataType_t element_type) { ORT_ENFORCE(!shape.empty(), "Empty shape not allowed."); // TODO can we just treat empty shape as a scalar? 
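// Worked example: shape {2, 3, 4} with QNN_DATATYPE_FLOAT_32 gives
// 4 bytes/elem * (2 * 3 * 4) elems = 96 bytes. The SafeInt accumulator below
// turns a multiplication overflow into an exception instead of silent wraparound.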
SafeInt<size_t> data_length = GetElementSizeByType(element_type); @@ -507,39 +542,22 @@ bool OnnxDataTypeToQnnDataType(const int32_t onnx_data_type, Qnn_DataType_t& qnn } std::pair<float, float> CheckMinMax(float rmin, float rmax) { - // Ensure a minimum range of 0.0001 (required by QNN) - rmax = std::max(rmax, rmin + 0.0001f); - // Both QNN and ORT require the range to include 0.0f rmin = std::min(rmin, 0.0f); rmax = std::max(rmax, 0.0f); + // Ensure a minimum range of 0.0001 (required by QNN) + rmax = std::max(rmax, rmin + 0.0001f); + return std::make_pair(rmin, rmax); } -template <typename T> -Status GetQminQmax(const Qnn_DataType_t qnn_data_type, - T& qmin, - T& qmax) { - if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_8) { - qmin = static_cast<T>(std::numeric_limits<int8_t>::min()); - qmax = static_cast<T>(std::numeric_limits<int8_t>::max()); - } else if (qnn_data_type == QNN_DATATYPE_UFIXED_POINT_8) { - qmin = static_cast<T>(std::numeric_limits<uint8_t>::min()); - qmax = static_cast<T>(std::numeric_limits<uint8_t>::max()); - } else if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_16) { - qmin = static_cast<T>(std::numeric_limits<int16_t>::min()); - qmax = static_cast<T>(std::numeric_limits<int16_t>::max()); - } else if (qnn_data_type == QNN_DATATYPE_UFIXED_POINT_16) { - qmin = static_cast<T>(std::numeric_limits<uint16_t>::min()); - qmax = static_cast<T>(std::numeric_limits<uint16_t>::max()); - } else if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_32) { - qmin = static_cast<T>(std::numeric_limits<int32_t>::min()); - qmax = static_cast<T>(std::numeric_limits<int32_t>::max()); - } else { - ORT_RETURN_IF(true, "Qnn Data Type: %d not supported yet.", qnn_data_type); +inline float RoundHalfToEven(float input) { + if (!std::isfinite(input)) { + return input; } - return Status::OK(); + // std::remainder returns x - n, where n is the integral value nearest to x. When |x - n| = 0.5, n is chosen to be even + return input - std::remainderf(input, 1.f); } Status GetQuantParams(float rmin, @@ -555,20 +573,22 @@ Status GetQuantParams(float rmin, rmin = -abs_max; } - float qmin = 0.0f; - float qmax = 255.0f; - ORT_RETURN_IF_ERROR(GetQminQmax(qnn_data_type, qmin, qmax)); + double rmin_dbl = static_cast<double>(rmin); + double rmax_dbl = static_cast<double>(rmax); + double qmin = 0.0; + double qmax = 0.0; + ORT_RETURN_IF_ERROR(GetQminQmax(qnn_data_type, qmin, qmax, symmetric)); - scale = (rmax - rmin) / (qmax - qmin); - float initial_zero_point = 0.0f; + double scale_dbl = (rmax_dbl - rmin_dbl) / (qmax - qmin); + double initial_zero_point = 0.0; if (symmetric) { - initial_zero_point = std::round(rmin + rmax) / 2; + initial_zero_point = std::round(rmin_dbl + rmax_dbl) / 2; } else { - initial_zero_point = qmin - (rmin / scale); + initial_zero_point = qmin - (rmin_dbl / scale_dbl); } - zero_point = static_cast<int32_t>(RoundHalfToEven(Saturate(qmax, qmin, initial_zero_point))); - // To match QNN quantization definition - zero_point = 0 - zero_point; + zero_point = static_cast<int32_t>(RoundHalfToEven(static_cast<float>(Saturate(qmax, qmin, initial_zero_point)))); + zero_point = -zero_point; // Negate to match QNN quantization definition.
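// Worked example (asymmetric uint8): rmin = -1.0, rmax = 3.0 gives
// scale = (3 - (-1)) / (255 - 0) = 4/255 ~= 0.0157, and
// initial_zero_point = 0 - (-1.0 / 0.0157) ~= 63.75, which rounds half-to-even
// to 64. After the negation above, the stored QNN offset is -64.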
+ scale = static_cast<float>(scale_dbl); return Status::OK(); } @@ -590,6 +610,126 @@ Status Quantize(const double double_value, return Status::OK(); } +size_t ShapeSizeCalc(gsl::span<const size_t> shape, size_t start, size_t end) { + size_t size = 1; + for (size_t i = start; i < end; i++) { + size *= shape[i]; + } + return size; +} + +Status GetDataQuantParams(gsl::span<const float> data, gsl::span<const size_t> shape, + /*out*/ gsl::span<float> scales, /*out*/ gsl::span<int32_t> offsets, + Qnn_DataType_t data_type, bool symmetric, std::optional<int64_t> axis) { + const size_t num_dims = shape.size(); + const size_t num_elems = ShapeSizeCalc(shape, 0, num_dims); + ORT_RETURN_IF_NOT(num_elems == data.size(), "Shape mismatch with data to quantize"); + + size_t block_count = 1; + size_t broadcast_dim = 1; + size_t block_size = num_elems; + + if (axis.has_value()) { + size_t axis_no_neg = *axis < 0 ? static_cast<size_t>(*axis) + num_dims : static_cast<size_t>(*axis); + block_count = ShapeSizeCalc(shape, 0, axis_no_neg); + broadcast_dim = shape[axis_no_neg]; + block_size = ShapeSizeCalc(shape, axis_no_neg + 1, num_dims); + } + + ORT_RETURN_IF_NOT(scales.size() == broadcast_dim, "Unexpected size of scales output buffer"); + ORT_RETURN_IF_NOT(offsets.size() == broadcast_dim, "Unexpected size of offsets output buffer"); + + size_t i = 0; + for (size_t n = 0; n < block_count; n++) { + for (size_t bd = 0; bd < broadcast_dim; bd++) { + float rmin = std::numeric_limits<float>::max(); + float rmax = std::numeric_limits<float>::lowest(); + for (size_t j = 0; j < block_size; j++) { + rmin = std::min(rmin, data[i]); + rmax = std::max(rmax, data[i]); + i++; + } + + scales[bd] = 1.0f; + offsets[bd] = 0; + ORT_RETURN_IF_ERROR(GetQuantParams(rmin, rmax, data_type, scales[bd], offsets[bd], symmetric)); + } + } + + assert(i == data.size()); + return Status::OK(); +} + +Status QuantizeData(gsl::span<const float> data, gsl::span<const size_t> shape, + gsl::span<const float> scales, gsl::span<const int32_t> offsets, + /*out*/ gsl::span<uint8_t> quant_bytes, Qnn_DataType_t data_type, + std::optional<int64_t> axis) { + const size_t num_dims = shape.size(); + const size_t num_elems = ShapeSizeCalc(shape, 0, num_dims); + ORT_RETURN_IF_NOT(num_elems == data.size(), "Shape mismatch with data to quantize"); + size_t expected_num_quant_bytes = GetElementSizeByType(data_type) * data.size(); + ORT_RETURN_IF_NOT(quant_bytes.size() == expected_num_quant_bytes, + "Cannot quantize data because output buffer is not the correct size"); + + size_t block_count = 1; + size_t broadcast_dim = 1; + size_t block_size = num_elems; + + if (axis.has_value()) { + size_t axis_no_neg = *axis < 0 ? static_cast<size_t>(*axis) + num_dims : static_cast<size_t>(*axis); + block_count = ShapeSizeCalc(shape, 0, axis_no_neg); + broadcast_dim = shape[axis_no_neg]; + block_size = ShapeSizeCalc(shape, axis_no_neg + 1, num_dims); + } + + ORT_RETURN_IF_NOT(scales.size() == broadcast_dim, "Unexpected size of scales output buffer"); + ORT_RETURN_IF_NOT(offsets.size() == broadcast_dim, "Unexpected size of offsets output buffer"); + + size_t i = 0; + for (size_t n = 0; n < block_count; n++) { + for (size_t bd = 0; bd < broadcast_dim; bd++) { + switch (data_type) { + case QNN_DATATYPE_SFIXED_POINT_8: { + auto input_span = gsl::make_span(&data[i], block_size); + auto output_span = gsl::make_span(&quant_bytes[i * sizeof(int8_t)], sizeof(int8_t) * block_size); + ORT_RETURN_IF_ERROR(QuantizeData<int8_t>(input_span, scales[bd], offsets[bd], output_span)); + break; + } + case QNN_DATATYPE_UFIXED_POINT_8: { + auto input_span = gsl::make_span(&data[i], block_size); + auto output_span = gsl::make_span(&quant_bytes[i * sizeof(uint8_t)], sizeof(uint8_t) * block_size); + ORT_RETURN_IF_ERROR(QuantizeData<uint8_t>(input_span, scales[bd], offsets[bd], output_span)); + break; + } + case QNN_DATATYPE_SFIXED_POINT_16: { + auto input_span = gsl::make_span(&data[i], block_size); + auto output_span = gsl::make_span(&quant_bytes[i * sizeof(int16_t)], sizeof(int16_t) * block_size); + ORT_RETURN_IF_ERROR(QuantizeData<int16_t>(input_span, scales[bd], offsets[bd], output_span)); + break; + } + case QNN_DATATYPE_UFIXED_POINT_16: { + auto input_span = gsl::make_span(&data[i], block_size); + auto output_span = gsl::make_span(&quant_bytes[i * sizeof(uint16_t)], sizeof(uint16_t) * block_size); + ORT_RETURN_IF_ERROR(QuantizeData<uint16_t>(input_span, scales[bd], offsets[bd], output_span)); + break; + } + case QNN_DATATYPE_SFIXED_POINT_32: { + auto input_span = gsl::make_span(&data[i], block_size); + auto output_span = gsl::make_span(&quant_bytes[i * sizeof(int32_t)], sizeof(int32_t) * block_size); + ORT_RETURN_IF_ERROR(QuantizeData<int32_t>(input_span, scales[bd], offsets[bd], output_span)); + break; + } + default: + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unsupported quantization data type for QuantizeData"); + } + i += block_size; + } + } + assert(i == data.size()); + + return Status::OK(); +} + std::string_view GetQnnErrorMessage(const QNN_INTERFACE_VER_TYPE& qnn_interface, Qnn_ErrorHandle_t qnn_error_handle) { // From QNN SDK: The memory is statically owned and should not be freed by the caller. const char* error_msg = nullptr; diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.h b/onnxruntime/core/providers/qnn/builder/qnn_utils.h index 950f349c5006f..853debb61a12f 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_utils.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.h @@ -2,11 +2,13 @@ // Licensed under the MIT License.
#pragma once +#include #include #include #include #include #include +#include #include #include @@ -14,9 +16,7 @@ #include "QnnInterface.h" #include "QnnTypes.h" -#include "core/session/onnxruntime_cxx_api.h" -#include "core/framework/node_unit.h" -#include "core/util/qmath.h" +#include "core/providers/qnn/ort_api.h" namespace onnxruntime { namespace qnn { @@ -27,6 +27,8 @@ size_t GetElementSizeByType(const Qnn_DataType_t& data_type); size_t GetElementSizeByType(ONNXTensorElementDataType elem_type); +size_t GetElementSizeByType(ONNX_NAMESPACE::TensorProto_DataType onnx_type); + size_t GetQnnTensorDataSizeInBytes(gsl::span<const uint32_t> shape, Qnn_DataType_t element_data_type); bool QnnTensorHasDynamicShape(const Qnn_Tensor_t& tensor); @@ -83,7 +85,30 @@ static bool ArrayHasString(const std::array& strings, std:: std::pair<float, float> CheckMinMax(float rmin, float rmax); template <typename T> -Status GetQminQmax(const Qnn_DataType_t qnn_data_type, T& qmin, T& qmax); +Status GetQminQmax(const Qnn_DataType_t qnn_data_type, + T& qmin, + T& qmax, + bool symmetric = false) { + if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_8) { + qmin = static_cast<T>(std::numeric_limits<int8_t>::min() + static_cast<int8_t>(symmetric)); + qmax = static_cast<T>(std::numeric_limits<int8_t>::max()); + } else if (qnn_data_type == QNN_DATATYPE_UFIXED_POINT_8) { + qmin = static_cast<T>(std::numeric_limits<uint8_t>::min()); + qmax = static_cast<T>(std::numeric_limits<uint8_t>::max()); + } else if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_16) { + qmin = static_cast<T>(std::numeric_limits<int16_t>::min() + static_cast<int16_t>(symmetric)); + qmax = static_cast<T>(std::numeric_limits<int16_t>::max()); + } else if (qnn_data_type == QNN_DATATYPE_UFIXED_POINT_16) { + qmin = static_cast<T>(std::numeric_limits<uint16_t>::min()); + qmax = static_cast<T>(std::numeric_limits<uint16_t>::max()); + } else if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_32) { + qmin = static_cast<T>(std::numeric_limits<int32_t>::min() + static_cast<int32_t>(symmetric)); + qmax = static_cast<T>(std::numeric_limits<int32_t>::max()); + } else { + ORT_RETURN_IF(true, "Qnn Data Type: %d not supported yet.", qnn_data_type); + } + return Status::OK(); +} template <typename T> inline T Saturate(const T qmax, @@ -113,6 +138,104 @@ Status Quantize(const double double_value, const Qnn_DataType_t qnn_data_type, int& quant_value); +size_t ShapeSizeCalc(gsl::span<const size_t> shape, size_t start, size_t end); + +// Computes the quantization parameters (scales and offsets) for the given data. +// Supports both per-tensor and per-channel quantization. Must provide an axis argument +// for per-channel quantization. +// The offsets use the QNN convention where offset = -zero_point. +Status GetDataQuantParams(gsl::span<const float> data, gsl::span<const size_t> shape, + /*out*/ gsl::span<float> scales, /*out*/ gsl::span<int32_t> offsets, + Qnn_DataType_t data_type, bool symmetric = false, + std::optional<int64_t> axis = std::nullopt); + +// Quantizes the given float data using the provided quantization parameters (scales and offsets). +// Supports both per-tensor and per-channel quantization. Must provide an axis argument +// for per-channel quantization. +// The provided offsets must use the QNN convention where offset = -zero_point. +Status QuantizeData(gsl::span<const float> data, gsl::span<const size_t> shape, + gsl::span<const float> scales, gsl::span<const int32_t> offsets, + /*out*/ gsl::span<uint8_t> quant_bytes, Qnn_DataType_t data_type, + std::optional<int64_t> axis = std::nullopt); + +// Quantizes (per-tensor) the given float data using the provided scale and offset. +// The provided offset must use the QNN convention where offset = -zero_point.
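// Example usage (illustrative values): per-tensor quantization of three floats
// to uint8 with scale ~= 1/127.5 and QNN offset -128 produces approximately
// {0, 128, 255}:
//
//   std::vector<float> vals = {-1.0f, 0.0f, 1.0f};
//   std::vector<uint8_t> out(vals.size() * sizeof(uint8_t));
//   ORT_RETURN_IF_ERROR(QuantizeData<uint8_t>(vals, 0.0078431f, -128, out));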
+template <typename QuantType> +inline Status QuantizeData(gsl::span<const float> data, float scale, int32_t offset, + /*out*/ gsl::span<uint8_t> quant_bytes) { + const size_t num_elems = data.size(); + const size_t expected_output_bytes = sizeof(QuantType) * num_elems; + ORT_RETURN_IF_NOT(expected_output_bytes == quant_bytes.size(), + "Output buffer is not large enough to hold quantized bytes."); + const double clip_min = static_cast<double>(std::numeric_limits<QuantType>::lowest()); + const double clip_max = static_cast<double>(std::numeric_limits<QuantType>::max()); + + QuantType* output = reinterpret_cast<QuantType*>(quant_bytes.data()); + for (size_t i = 0; i < num_elems; ++i) { + const double scale_dbl = static_cast<double>(scale); + const double offset_dbl = static_cast<double>(offset); + double float_val = std::nearbyint(static_cast<double>(data[i]) / scale_dbl) - offset_dbl; + float_val = std::max(float_val, clip_min); + float_val = std::min(float_val, clip_max); + output[i] = static_cast<QuantType>(float_val); + } + return Status::OK(); +} + +// Re-writes a buffer of packed 4-bit elements to a buffer of unpacked 8-bit elements. +// QNN requires that 4-bit weights are unpacked to 8-bit. +template <bool Signed> +Status UnpackInt4ToInt8(size_t num_int4_elems, std::vector<uint8_t>& data_bytes) { + if constexpr (Signed) { // INT4 + std::vector<uint8_t> packed_int4_bytes = std::move(data_bytes); + data_bytes = std::vector<uint8_t>(num_int4_elems); + + auto dst = gsl::make_span(reinterpret_cast<int8_t*>(data_bytes.data()), data_bytes.size()); + auto src = gsl::make_span(reinterpret_cast<const Int4x2*>(packed_int4_bytes.data()), packed_int4_bytes.size()); + ORT_RETURN_IF_NOT(Int4x2::Unpack(dst, src), "Failed to unpack Tensor for QNN"); + + // NOTE: Masking off top 4 bits to workaround a QNN INT4 accuracy bug. + // Docs explicitly state that masking off top 4 bits should not be required, but we have to do it. + for (size_t i = 0; i < dst.size(); i++) { + dst[i] &= 0x0F; // -3 (0b1111_1101) becomes 13 (0b0000_1101) + } + } else { // UINT4 + std::vector<uint8_t> packed_uint4_bytes = std::move(data_bytes); + data_bytes = std::vector<uint8_t>(num_int4_elems); + + auto dst = gsl::make_span(reinterpret_cast<uint8_t*>(data_bytes.data()), data_bytes.size()); + auto src = gsl::make_span(reinterpret_cast<const UInt4x2*>(packed_uint4_bytes.data()), packed_uint4_bytes.size()); + ORT_RETURN_IF_NOT(UInt4x2::Unpack(dst, src), "Failed to unpack Tensor for QNN"); + } + + return Status::OK(); +} + +template <typename T> +std::vector<T> GetInitializerShape(const ONNX_NAMESPACE::TensorProto& tensor_proto) { + const auto& dims = tensor_proto.dims(); + std::vector<T> tensor_shape_vec(static_cast<size_t>(dims.size())); + for (int i = 0; i < dims.size(); ++i) { + tensor_shape_vec[i] = static_cast<T>(dims[i]); + } + + return tensor_shape_vec; +} + +template <typename T, typename P> +Status PermuteShape(gsl::span<const T> input_shape, gsl::span<const P> perm, gsl::span<T> output_shape) { + const size_t rank = input_shape.size(); + ORT_RETURN_IF_NOT(rank == perm.size() && rank == output_shape.size(), + "PermuteShape(): expect all arguments to have the same rank."); + + for (size_t i = 0; i < rank; ++i) { + size_t p = static_cast<size_t>(perm[i]); + output_shape[i] = input_shape[p]; + } + + return Status::OK(); +} + // Gets error message associated with QNN error handle value. std::string_view GetQnnErrorMessage(const QNN_INTERFACE_VER_TYPE& qnn_interface, Qnn_ErrorHandle_t qnn_error_handle); diff --git a/onnxruntime/core/providers/qnn/ort_api.cc b/onnxruntime/core/providers/qnn/ort_api.cc new file mode 100644 index 0000000000000..809593b409dad --- /dev/null +++ b/onnxruntime/core/providers/qnn/ort_api.cc @@ -0,0 +1,211 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License.
+ +#include "core/providers/qnn/ort_api.h" + +#include +#include +#include + +namespace onnxruntime { + +#if BUILD_QNN_EP_STATIC_LIB +static std::unique_ptr>> s_run_on_unload_; + +void RunOnUnload(std::function function) { + static std::mutex mutex; + std::lock_guard guard(mutex); + if (!s_run_on_unload_) { + s_run_on_unload_ = std::make_unique>>(); + } + s_run_on_unload_->push_back(std::move(function)); +} + +struct OnUnload { + ~OnUnload() { + if (!s_run_on_unload_) + return; + + for (auto& function : *s_run_on_unload_) + function(); + + s_run_on_unload_.reset(); + } + +} g_on_unload; +#endif // BUILD_QNN_EP_STATIC_LIB + +std::vector Graph__Nodes(const Graph& graph) { +#if BUILD_QNN_EP_STATIC_LIB + std::vector nodes; + nodes.reserve(graph.NumberOfNodes()); + + for (const Node& node : graph.Nodes()) { + nodes.push_back(&node); + } + + return nodes; +#else + return graph.Nodes(); +#endif +} + +#if BUILD_QNN_EP_STATIC_LIB +#define NODE_ATTR_ITER_VAL(iter) (iter)->second +#else +#define NODE_ATTR_ITER_VAL(iter) (iter)->second() +#endif + +NodeAttrHelper::NodeAttrHelper(const onnxruntime::Node& node) + : node_attributes_(node.GetAttributes()) {} + +NodeAttrHelper::NodeAttrHelper(const NodeUnit& node_unit) + : node_attributes_(node_unit.GetNode().GetAttributes()) {} + +float NodeAttrHelper::Get(const std::string& key, float def_val) const { + if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { + return NODE_ATTR_ITER_VAL(entry).f(); + } + + return def_val; +} + +int32_t NodeAttrHelper::Get(const std::string& key, int32_t def_val) const { + if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { + return narrow(NODE_ATTR_ITER_VAL(entry).i()); + } + + return def_val; +} + +uint32_t NodeAttrHelper::Get(const std::string& key, uint32_t def_val) const { + if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { + return narrow(NODE_ATTR_ITER_VAL(entry).i()); + } + + return def_val; +} + +int64_t NodeAttrHelper::Get(const std::string& key, int64_t def_val) const { + if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { + return NODE_ATTR_ITER_VAL(entry).i(); + } + + return def_val; +} + +const std::string& NodeAttrHelper::Get(const std::string& key, const std::string& def_val) const { + if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { + return NODE_ATTR_ITER_VAL(entry).s(); + } + + return def_val; +} + +std::vector NodeAttrHelper::Get(const std::string& key, const std::vector& def_val) const { + if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { + const auto& values = NODE_ATTR_ITER_VAL(entry).ints(); + const int64_t* cbegin = values.data(); + const int64_t* cend = values.data() + values.size(); + std::vector v; + v.reserve(static_cast(values.size())); + std::transform(cbegin, cend, std::back_inserter(v), + [](int64_t val) -> int32_t { return narrow(val); }); + return v; + } + + return def_val; +} + +std::vector NodeAttrHelper::Get(const std::string& key, const std::vector& def_val) const { + if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { + const auto& values = NODE_ATTR_ITER_VAL(entry).ints(); + const int64_t* cbegin = values.data(); + const int64_t* cend = values.data() + values.size(); + std::vector v; + v.reserve(static_cast(values.size())); + std::transform(cbegin, cend, std::back_inserter(v), + [](int64_t val) -> uint32_t { return narrow(val); }); + return v; + } + + return def_val; +} + +std::vector 
+
+std::vector<const Node*> Graph__Nodes(const Graph& graph) {
+#if BUILD_QNN_EP_STATIC_LIB
+  std::vector<const Node*> nodes;
+  nodes.reserve(graph.NumberOfNodes());
+
+  for (const Node& node : graph.Nodes()) {
+    nodes.push_back(&node);
+  }
+
+  return nodes;
+#else
+  return graph.Nodes();
+#endif
+}
+
+#if BUILD_QNN_EP_STATIC_LIB
+#define NODE_ATTR_ITER_VAL(iter) (iter)->second
+#else
+#define NODE_ATTR_ITER_VAL(iter) (iter)->second()
+#endif
+
+NodeAttrHelper::NodeAttrHelper(const onnxruntime::Node& node)
+    : node_attributes_(node.GetAttributes()) {}
+
+NodeAttrHelper::NodeAttrHelper(const NodeUnit& node_unit)
+    : node_attributes_(node_unit.GetNode().GetAttributes()) {}
+
+float NodeAttrHelper::Get(const std::string& key, float def_val) const {
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    return NODE_ATTR_ITER_VAL(entry).f();
+  }
+
+  return def_val;
+}
+
+int32_t NodeAttrHelper::Get(const std::string& key, int32_t def_val) const {
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    return narrow<int32_t>(NODE_ATTR_ITER_VAL(entry).i());
+  }
+
+  return def_val;
+}
+
+uint32_t NodeAttrHelper::Get(const std::string& key, uint32_t def_val) const {
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    return narrow<uint32_t>(NODE_ATTR_ITER_VAL(entry).i());
+  }
+
+  return def_val;
+}
+
+int64_t NodeAttrHelper::Get(const std::string& key, int64_t def_val) const {
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    return NODE_ATTR_ITER_VAL(entry).i();
+  }
+
+  return def_val;
+}
+
+const std::string& NodeAttrHelper::Get(const std::string& key, const std::string& def_val) const {
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    return NODE_ATTR_ITER_VAL(entry).s();
+  }
+
+  return def_val;
+}
+
+std::vector<int32_t> NodeAttrHelper::Get(const std::string& key, const std::vector<int32_t>& def_val) const {
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    const auto& values = NODE_ATTR_ITER_VAL(entry).ints();
+    const int64_t* cbegin = values.data();
+    const int64_t* cend = values.data() + values.size();
+    std::vector<int32_t> v;
+    v.reserve(static_cast<size_t>(values.size()));
+    std::transform(cbegin, cend, std::back_inserter(v),
+                   [](int64_t val) -> int32_t { return narrow<int32_t>(val); });
+    return v;
+  }
+
+  return def_val;
+}
+
+std::vector<uint32_t> NodeAttrHelper::Get(const std::string& key, const std::vector<uint32_t>& def_val) const {
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    const auto& values = NODE_ATTR_ITER_VAL(entry).ints();
+    const int64_t* cbegin = values.data();
+    const int64_t* cend = values.data() + values.size();
+    std::vector<uint32_t> v;
+    v.reserve(static_cast<size_t>(values.size()));
+    std::transform(cbegin, cend, std::back_inserter(v),
+                   [](int64_t val) -> uint32_t { return narrow<uint32_t>(val); });
+    return v;
+  }
+
+  return def_val;
+}
+
+std::vector<int64_t> NodeAttrHelper::Get(const std::string& key, const std::vector<int64_t>& def_val) const {
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    const auto& values = NODE_ATTR_ITER_VAL(entry).ints();
+    const int64_t* cbegin = values.data();
+    const int64_t* cend = values.data() + values.size();
+    return std::vector<int64_t>{cbegin, cend};
+  }
+
+  return def_val;
+}
+
+std::vector<float> NodeAttrHelper::Get(const std::string& key, const std::vector<float>& def_val) const {
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    const auto& values = NODE_ATTR_ITER_VAL(entry).floats();
+    const float* cbegin = values.data();
+    const float* cend = values.data() + values.size();
+    return std::vector<float>{cbegin, cend};
+  }
+
+  return def_val;
+}
+
+std::optional<float> NodeAttrHelper::GetFloat(const std::string& key) const {
+  std::optional<float> result;
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    result = NODE_ATTR_ITER_VAL(entry).f();
+  }
+
+  return result;
+}
+
+std::optional<int64_t> NodeAttrHelper::GetInt64(const std::string& key) const {
+  std::optional<int64_t> result;
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    result = NODE_ATTR_ITER_VAL(entry).i();
+  }
+
+  return result;
+}
+
+std::optional<std::vector<float>> NodeAttrHelper::GetFloats(const std::string& key) const {
+  std::optional<std::vector<float>> result;
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    const auto& values = NODE_ATTR_ITER_VAL(entry).floats();
+    const float* cbegin = values.data();
+    const float* cend = values.data() + values.size();
+    result = std::vector<float>(cbegin, cend);
+  }
+
+  return result;
+}
+
+std::optional<std::vector<int64_t>> NodeAttrHelper::GetInt64s(const std::string& key) const {
+  std::optional<std::vector<int64_t>> result;
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    const auto& values = NODE_ATTR_ITER_VAL(entry).ints();
+    const int64_t* cbegin = values.data();
+    const int64_t* cend = values.data() + values.size();
+    result = std::vector<int64_t>(cbegin, cend);
+  }
+
+  return result;
+}
+
+std::optional<std::string> NodeAttrHelper::GetString(const std::string& key) const {
+  std::optional<std::string> result;
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    result = NODE_ATTR_ITER_VAL(entry).s();
+  }
+
+  return result;
+}
+
+bool NodeAttrHelper::HasAttr(const std::string& key) const {
+  return node_attributes_.find(key) != node_attributes_.end();
+}
+}  // namespace onnxruntime
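A caller-side sketch for the accessors above; the attribute names are illustrative:

// Sketch only: reading node attributes without caring which build mode
// resolved the underlying NodeAttributes iterator.
NodeAttrHelper helper(node_unit);
const int64_t group = helper.Get("group", static_cast<int64_t>(1));
const std::vector<int32_t> strides = helper.Get("strides", std::vector<int32_t>{1, 1});
const std::optional<float> alpha = helper.GetFloat("alpha");  // std::nullopt if absent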
diff --git a/onnxruntime/core/providers/qnn/ort_api.h b/onnxruntime/core/providers/qnn/ort_api.h
new file mode 100644
index 0000000000000..030ebbb54c615
--- /dev/null
+++ b/onnxruntime/core/providers/qnn/ort_api.h
@@ -0,0 +1,178 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License
+
+#pragma once
+
+// This compilation unit (ort_api.h/.cc) encapsulates the interface between the EP and ORT in a manner
+// that allows QNN EP to be built either as a static library or a dynamic shared library.
+// The preprocessor macro `BUILD_QNN_EP_STATIC_LIB` is defined and set to 1 if QNN EP
+// is built as a static library.
+
+#if BUILD_QNN_EP_STATIC_LIB
+// Includes when building QNN EP statically
+#ifdef _WIN32
+#include
+#include
+#include "core/platform/tracing.h"
+#include "core/platform/windows/logging/etw_sink.h"
+#endif
+
+#include "onnx/defs/data_type_utils.h"
+#include "core/common/common.h"
+#include "core/common/status.h"
+#include "core/common/safeint.h"
+#include "core/common/logging/logging.h"
+#include "core/common/logging/capture.h"
+#include "core/common/path_string.h"
+#include "core/platform/env.h"
+#include "core/framework/data_types.h"
+#include "core/framework/float16.h"
+#include "core/framework/run_options.h"
+#include "core/framework/execution_provider.h"
+#include "core/framework/model_metadef_id_generator.h"
+#include "core/framework/compute_capability.h"
+#include "core/framework/tensor_shape.h"
+#include "core/framework/node_unit.h"
+#include "core/framework/tensorprotoutils.h"
+#include "core/framework/utils.h"
+#include "core/graph/constants.h"
+#include "core/graph/basic_types.h"
+#include "core/graph/model.h"
+#include "core/graph/graph_viewer.h"
+#include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h"
+#include "core/optimizer/qdq_transformer/selectors_actions/shared/utils.h"
+#include "core/providers/common.h"
+#include "core/providers/partitioning_utils.h"
+#include "core/session/onnxruntime_cxx_api.h"
+#else
+// Includes when building QNN EP as a shared library
+#include "core/providers/shared_library/provider_api.h"
+#define ORT_API_MANUAL_INIT
+#include "core/session/onnxruntime_cxx_api.h"
+#endif
+
+#include "core/common/inlined_containers.h"
+#include "core/session/onnxruntime_session_options_config_keys.h"
+#include "core/session/onnxruntime_run_options_config_keys.h"
+
+#include
+#include
+
+namespace onnxruntime {
+#if BUILD_QNN_EP_STATIC_LIB
+using Node_EdgeEnd = Node::EdgeEnd;
+#endif
+
+#if BUILD_QNN_EP_STATIC_LIB
+void RunOnUnload(std::function<void()> function);
+inline const Env& GetDefaultEnv() { return Env::Default(); }
+#endif
+
+inline void InitOrtCppApi() {
+#if BUILD_QNN_EP_STATIC_LIB
+  // Do nothing. Including "onnxruntime_cxx_api.h" normally initializes the global api_ object.
+#else
+  // Call util function in provider bridge that initializes the global api_ object.
+  InitProviderOrtApi();
+#endif
+}
+
+/// <summary>
+/// Creates an onnxruntime or onnx object. Works for both static and shared library builds of QNN EP.
+///
+/// Example: auto model = Factory<Model>::Create(/* args ... */);
+/// </summary>
+/// <typeparam name="T">Type of the object to create</typeparam>
+template <typename T>
+struct Factory {
+  template <typename... Params>
+  static inline std::unique_ptr<T> Create(Params&&... params) {
+#if BUILD_QNN_EP_STATIC_LIB
+    return std::make_unique<T>(std::forward<Params>(params)...);
+#else
+    return T::Create(std::forward<Params>(params)...);
+#endif
+  }
+};
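Everything that follows in this header repeats the same shape as Factory; a condensed sketch of the idiom, with hypothetical type and member names:

// Sketch only: each wrapper is a direct member access in the static build
// and a provider-bridge call in the shared build (names illustrative).
inline const Thing& Owner__Thing(const Owner& owner) {
#if BUILD_QNN_EP_STATIC_LIB
  return owner.thing;    // internal ORT type is visible; touch it directly
#else
  return owner.Thing();  // virtual call routed through provider_interfaces.h
#endif
}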
+
+inline const ConfigOptions& RunOptions__GetConfigOptions(const RunOptions& run_options) {
+#if BUILD_QNN_EP_STATIC_LIB
+  return run_options.config_options;
+#else
+  return run_options.GetConfigOptions();
+#endif
+}
+
+inline std::unique_ptr<IndexedSubGraph>& ComputeCapability__SubGraph(ComputeCapability& compute_capability) {
+#if BUILD_QNN_EP_STATIC_LIB
+  return compute_capability.sub_graph;
+#else
+  return compute_capability.SubGraph();
+#endif
+}
+
+inline std::vector<onnxruntime::NodeIndex>& IndexedSubGraph__Nodes(IndexedSubGraph& indexed_sub_graph) {
+#if BUILD_QNN_EP_STATIC_LIB
+  return indexed_sub_graph.nodes;
+#else
+  return indexed_sub_graph.Nodes();
+#endif
+}
+
+std::vector<const Node*> Graph__Nodes(const Graph& graph);
+
+inline std::pair<std::vector<std::unique_ptr<NodeUnit>>, std::unordered_map<const Node*, const NodeUnit*>>
+GetQDQNodeUnits(const GraphViewer& graph_viewer, const logging::Logger& logger) {
+#if BUILD_QNN_EP_STATIC_LIB
+  return QDQ::GetAllNodeUnits(graph_viewer, logger);
+#else
+  return QDQ::GetAllNodeUnits(&graph_viewer, logger);
+#endif
+}
+
+/**
+ * Wrapping onnxruntime::Node for retrieving attribute values
+ */
+class NodeAttrHelper {
+ public:
+  explicit NodeAttrHelper(const Node& node);
+
+  // Get the attributes from the target node of the node_unit
+  explicit NodeAttrHelper(const NodeUnit& node_unit);
+
+  /*
+   * Get with default
+   */
+  float Get(const std::string& key, float def_val) const;
+  std::vector<float> Get(const std::string& key, const std::vector<float>& def_val) const;
+
+  int64_t Get(const std::string& key, int64_t def_val) const;
+  std::vector<int64_t> Get(const std::string& key, const std::vector<int64_t>& def_val) const;
+
+  const std::string& Get(const std::string& key, const std::string& def_val) const;
+
+  // Convert the i() or ints() of the attribute from int64_t to int32_t
+  int32_t Get(const std::string& key, int32_t def_val) const;
+  std::vector<int32_t> Get(const std::string& key, const std::vector<int32_t>& def_val) const;
+
+  // Convert the i() or ints() of the attribute from int64_t to uint32_t
+  uint32_t Get(const std::string& key, uint32_t def_val) const;
+  std::vector<uint32_t> Get(const std::string& key, const std::vector<uint32_t>& def_val) const;
+
+  /*
+   * Get without default.
+   */
+  std::optional<float> GetFloat(const std::string& key) const;
+  std::optional<std::vector<float>> GetFloats(const std::string& key) const;
+
+  std::optional<int64_t> GetInt64(const std::string& key) const;
+  std::optional<std::vector<int64_t>> GetInt64s(const std::string& key) const;
+
+  std::optional<std::string> GetString(const std::string& key) const;
+
+  bool HasAttr(const std::string& key) const;
+
+ private:
+  const NodeAttributes& node_attributes_;
+};
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/qnn/qnn_allocator.cc b/onnxruntime/core/providers/qnn/qnn_allocator.cc
index 68dac682756d5..1fb8742f724cd 100644
--- a/onnxruntime/core/providers/qnn/qnn_allocator.cc
+++ b/onnxruntime/core/providers/qnn/qnn_allocator.cc
@@ -7,9 +7,7 @@
 #include
 #include
 
-#include "core/common/common.h"
-#include "core/common/safeint.h"
-#include "core/mlas/inc/mlas.h"  // for MlasGetPreferredBufferAlignment()
+#include "core/providers/qnn/ort_api.h"
 
 namespace onnxruntime::qnn {
 
@@ -52,7 +50,8 @@ struct AllocationHeader {
 };
 
 size_t AllocationAlignment() {
-  return std::max(alignof(AllocationHeader), MlasGetPreferredBufferAlignment());
+  constexpr size_t min_allocation_alignment = 64;  // Equal to MlasGetPreferredBufferAlignment()
+  return std::max(alignof(AllocationHeader), min_allocation_alignment);
 }
 
 size_t DivRoundUp(size_t a, size_t b) {  // TODO is there already a helper function somewhere for this?
diff --git a/onnxruntime/core/providers/qnn/qnn_allocator.h b/onnxruntime/core/providers/qnn/qnn_allocator.h
index f642368697aae..e64f38f494b35 100644
--- a/onnxruntime/core/providers/qnn/qnn_allocator.h
+++ b/onnxruntime/core/providers/qnn/qnn_allocator.h
@@ -6,11 +6,7 @@
 #include
 #include
 
-#include "core/common/common.h"
-#include "core/common/inlined_containers.h"
-#include "core/common/logging/logging.h"
-#include "core/common/status.h"
-#include "core/framework/allocator.h"
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/rpcmem_library.h"
 
 namespace onnxruntime::qnn {
diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
index e9d6884b8c8ca..b1555b6050928 100644
--- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
+++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
@@ -6,60 +6,22 @@
 #include
 #include
 
-#include "core/framework/compute_capability.h"
-#include "core/framework/kernel_registry.h"
-#include "core/framework/run_options.h"
-#include "core/graph/graph_viewer.h"
-#include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h"
-#include "core/optimizer/qdq_transformer/selectors_actions/shared/utils.h"
-#include "core/platform/env.h"
-#include "core/providers/common.h"
-#include "core/providers/partitioning_utils.h"
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/builder/onnx_ctx_model_helper.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
 #include "core/providers/qnn/builder/qnn_def.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/qnn_node_group.h"
+#include "core/providers/qnn/builder/qnn_utils.h"
 #include "core/providers/qnn/qnn_allocator.h"
+#include "core/providers/qnn/qnn_telemetry.h"
 #include "core/providers/qnn/rpcmem_library.h"
 #include "core/providers/qnn/shared_context.h"
-#include "core/session/onnxruntime_cxx_api.h"
-#include "core/session/onnxruntime_run_options_config_keys.h"
-#include "core/session/onnxruntime_session_options_config_keys.h"
-
-#ifdef _WIN32
-#include
-#include "core/platform/windows/logging/etw_sink.h"
-#endif
"core/platform/windows/logging/etw_sink.h" -#endif namespace onnxruntime { constexpr const char* QNN = "QNN"; -static std::unique_ptr>> s_run_on_unload_; - -void RunOnUnload(std::function function) { - static std::mutex mutex; - std::lock_guard guard(mutex); - if (!s_run_on_unload_) { - s_run_on_unload_ = std::make_unique>>(); - } - s_run_on_unload_->push_back(std::move(function)); -} - -struct OnUnload { - ~OnUnload() { - if (!s_run_on_unload_) - return; - - for (auto& function : *s_run_on_unload_) - function(); - - s_run_on_unload_.reset(); - } - -} g_on_unload; - static void ParseProfilingLevel(std::string profiling_level_string, qnn::ProfilingLevel& profiling_level) { std::transform(profiling_level_string.begin(), @@ -196,17 +158,20 @@ qnn::ProfilingLevel QNNExecutionProvider::GetProfilingLevelFromETWLevel(unsigned } QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_options_map, - const SessionOptions* session_options) + const ConfigOptions* config_options) : IExecutionProvider{onnxruntime::kQnnExecutionProvider} { - if (session_options) { - disable_cpu_ep_fallback_ = session_options->config_options.GetConfigOrDefault( + InitOrtCppApi(); + metadef_id_generator_ = Factory::Create(); + + if (config_options) { + disable_cpu_ep_fallback_ = config_options->GetConfigOrDefault( kOrtSessionOptionsDisableCPUEPFallback, "0") == "1"; - context_cache_enabled_ = session_options->config_options.GetConfigOrDefault( + context_cache_enabled_ = config_options->GetConfigOrDefault( kOrtSessionOptionEpContextEnable, "0") == "1"; LOGS_DEFAULT(VERBOSE) << "Context cache enable: " << context_cache_enabled_; - std::string embed_mode = session_options->config_options.GetConfigOrDefault( + std::string embed_mode = config_options->GetConfigOrDefault( kOrtSessionOptionEpContextEmbedMode, "0"); if ("1" == embed_mode) { qnn_context_embed_mode_ = true; @@ -217,18 +182,18 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio } LOGS_DEFAULT(VERBOSE) << "User specified context cache embed mode: " << qnn_context_embed_mode_; - context_cache_path_cfg_ = session_options->config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, ""); + context_cache_path_cfg_ = config_options->GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, ""); LOGS_DEFAULT(VERBOSE) << "User specified context cache path: " << context_cache_path_cfg_; // For the case that workaround QNN context PD memory limit, user need split the model into pieces and // generate the QNN context model separately. // It could happen that the generated EPContext node in separate graph has same node name. // User can set this context_node_name_prefix for each split pieces to avoid that happens. 
-    context_node_name_prefix_ = session_options->config_options.GetConfigOrDefault(kOrtSessionOptionEpContextNodeNamePrefix, "");
+    context_node_name_prefix_ = config_options->GetConfigOrDefault(kOrtSessionOptionEpContextNodeNamePrefix, "");
     LOGS_DEFAULT(VERBOSE) << "User specified QNN context node name prefix: " << context_node_name_prefix_;
 
     share_ep_contexts_ =
-        session_options->config_options.GetConfigOrDefault(kOrtSessionOptionShareEpContexts, "0") == "1";
+        config_options->GetConfigOrDefault(kOrtSessionOptionShareEpContexts, "0") == "1";
     LOGS_DEFAULT(VERBOSE) << "User specified option - share EP contexts across sessions: " << share_ep_contexts_;
   }
 
@@ -249,8 +214,9 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
   // separate out the profiling level for ETW in case it gets disabled later when we extract the events
   // set to invalid to indicate that ETW is not enabled when we setup QNN
   qnn::ProfilingLevel profiling_level_etw = qnn::ProfilingLevel::INVALID;
-  const Env& env = Env::Default();
-  auto& provider = env.GetTelemetryProvider();
+
+#ifdef _WIN32
+  auto& provider = qnn::QnnTelemetry::Instance();
   if (provider.IsEnabled()) {
     auto level = provider.Level();
     auto keyword = provider.Keyword();
@@ -260,6 +226,7 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
       }
     }
   }
+#endif  // defined(_WIN32)
 
   // In case ETW gets disabled later
   auto profiling_level_pos = provider_options_map.find(PROFILING_LEVEL);
@@ -412,47 +379,53 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
                                                             soc_model,
                                                             enable_htp_weight_sharing});
 
-#ifdef _WIN32
-  auto& etwRegistrationManager = logging::EtwRegistrationManager::Instance();
-  // Register callback for ETW capture state (rundown)
-  callback_ETWSink_provider_ = onnxruntime::logging::EtwRegistrationManager::EtwInternalCallback(
-      [&etwRegistrationManager, this](
-          LPCGUID SourceId,
-          ULONG IsEnabled,
-          UCHAR Level,
-          ULONGLONG MatchAnyKeyword,
-          ULONGLONG MatchAllKeyword,
-          PEVENT_FILTER_DESCRIPTOR FilterData,
-          PVOID CallbackContext) {
-        ORT_UNUSED_PARAMETER(SourceId);
-        ORT_UNUSED_PARAMETER(MatchAnyKeyword);
-        ORT_UNUSED_PARAMETER(MatchAllKeyword);
-        ORT_UNUSED_PARAMETER(FilterData);
-        ORT_UNUSED_PARAMETER(CallbackContext);
-
-        if (IsEnabled == EVENT_CONTROL_CODE_ENABLE_PROVIDER) {
-          if ((MatchAnyKeyword & static_cast<ULONGLONG>(onnxruntime::logging::ORTTraceLoggingKeyword::Logs)) != 0) {
-            auto ortETWSeverity = etwRegistrationManager.MapLevelToSeverity();
-            (void)qnn_backend_manager_->ResetQnnLogLevel(ortETWSeverity);
-          }
-          if ((MatchAnyKeyword & static_cast<ULONGLONG>(onnxruntime::logging::ORTTraceLoggingKeyword::Profiling)) != 0) {
-            if (Level != 0) {
-              // Commenting out Dynamic QNN Profiling for now
-              // There seems to be a crash in 3rd party QC QnnHtp.dll with this.
-              // Repro Scenario - start ETW tracing prior to session creation.
-              // Then disable/enable ETW Tracing with the code below uncommented a few times
-              // auto profiling_level_etw = GetProfilingLevelFromETWLevel(Level);
-              // (void)qnn_backend_manager_->SetProfilingLevelETW(profiling_level_etw);
+#if defined(_WIN32)
+  if (onnxruntime::logging::EtwRegistrationManager::SupportsETW()) {
+    auto& etwRegistrationManager = logging::EtwRegistrationManager::Instance();
+    // Register callback for ETW capture state (rundown)
+    callback_ETWSink_provider_ = onnxruntime::logging::EtwRegistrationManager::EtwInternalCallback(
+        [&etwRegistrationManager, this](
+            LPCGUID SourceId,
+            ULONG IsEnabled,
+            UCHAR Level,
+            ULONGLONG MatchAnyKeyword,
+            ULONGLONG MatchAllKeyword,
+            PEVENT_FILTER_DESCRIPTOR FilterData,
+            PVOID CallbackContext) {
+          ORT_UNUSED_PARAMETER(SourceId);
+          ORT_UNUSED_PARAMETER(MatchAnyKeyword);
+          ORT_UNUSED_PARAMETER(MatchAllKeyword);
+          ORT_UNUSED_PARAMETER(FilterData);
+          ORT_UNUSED_PARAMETER(CallbackContext);
+
+          if (IsEnabled == EVENT_CONTROL_CODE_ENABLE_PROVIDER) {
+            if ((MatchAnyKeyword & static_cast<ULONGLONG>(onnxruntime::logging::ORTTraceLoggingKeyword::Logs)) != 0) {
+              auto ortETWSeverity = etwRegistrationManager.MapLevelToSeverity();
+              (void)qnn_backend_manager_->ResetQnnLogLevel(ortETWSeverity);
+            }
+            if ((MatchAnyKeyword & static_cast<ULONGLONG>(onnxruntime::logging::ORTTraceLoggingKeyword::Profiling)) != 0) {
+              if (Level != 0) {
+                // Commenting out Dynamic QNN Profiling for now
+                // There seems to be a crash in 3rd party QC QnnHtp.dll with this.
+                // Repro Scenario - start ETW tracing prior to session creation.
+                // Then disable/enable ETW Tracing with the code below uncommented a few times
+                // auto profiling_level_etw = GetProfilingLevelFromETWLevel(Level);
+                // (void)qnn_backend_manager_->SetProfilingLevelETW(profiling_level_etw);
+                //
+                // NOTE(1/2/2025): It is possible that the above was not working in part because it is using the
+                // *logging ETW* subsystem to modify profiling, which should use an entirely different
+                // ETW provider (see QnnTelemetry). Should add callbacks for profiling to the QnnTelemetry ETW provider.
+              }
             }
           }
-        }
-        if (IsEnabled == EVENT_CONTROL_CODE_DISABLE_PROVIDER) {
-          // (void)qnn_backend_manager_->SetProfilingLevelETW(qnn::ProfilingLevel::INVALID);
-          (void)qnn_backend_manager_->ResetQnnLogLevel(std::nullopt);
-        }
-      });
-  etwRegistrationManager.RegisterInternalCallback(callback_ETWSink_provider_);
+          if (IsEnabled == EVENT_CONTROL_CODE_DISABLE_PROVIDER) {
+            // (void)qnn_backend_manager_->SetProfilingLevelETW(qnn::ProfilingLevel::INVALID);
+            (void)qnn_backend_manager_->ResetQnnLogLevel(std::nullopt);
+          }
+        });
+    etwRegistrationManager.RegisterInternalCallback(callback_ETWSink_provider_);
+  }
 #endif
 }
 
@@ -466,7 +439,7 @@ QNNExecutionProvider::~QNNExecutionProvider() {
   }
 
   // Unregister the ETW callback
-#ifdef _WIN32
+#if defined(_WIN32)
   if (callback_ETWSink_provider_ != nullptr) {
     logging::EtwRegistrationManager::Instance().UnregisterInternalCallback(callback_ETWSink_provider_);
   }
@@ -498,9 +471,10 @@ static void LogNodeSupport(const logging::Logger& logger,
     oss << "\tREASON : " << support_status.ErrorMessage() << std::endl;
  }
 
-  logging::Capture(logger, log_severity, logging::Category::onnxruntime,
-                   log_data_type, call_site)
-      .Stream()
+  auto log_capture = Factory<logging::Capture>::Create(logger, log_severity,
+                                                       logging::Category::onnxruntime,
+                                                       log_data_type, call_site);
+  log_capture->Stream()
       << (support_status.IsOK() ? "Validation PASSED " : "Validation FAILED ") << "for " << num_nodes
       << " nodes in " << qnn_node_group.Type() << " (" << qnn_node_group.GetTargetNodeUnit()->OpType() << ") :"
       << std::endl
"Validation PASSED " : "Validation FAILED ") << "for " << num_nodes << " nodes in " << qnn_node_group.Type() << " (" << qnn_node_group.GetTargetNodeUnit()->OpType() << ") :" << std::endl @@ -604,11 +578,11 @@ static bool EpSharedContextsHasAllGraphs(const std::vectorName(); + const std::string& graph_name = ep_context_node.Name(); bool has_shared_qnn_model = SharedContext::GetInstance().HasQnnModel(graph_name); if (!has_shared_qnn_model) { LOGS(logger, VERBOSE) << "Graph: " << graph_name << " from EpContext node not found from shared EP contexts."; @@ -623,7 +597,7 @@ static bool EpSharedContextsHasAllGraphs(const std::vector>& result, - const utils::GenerateMetadefNameFn& gen_metadef_name, + const std::function& gen_metadef_name, const logging::Logger& logger) { std::unordered_set supported_nodes{}; std::vector> supported_groups{}; @@ -683,7 +657,7 @@ QNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer const auto gen_metadef_name = [&]() { uint64_t model_hash; - int metadef_id = metadef_id_generator_.GenerateId(graph_viewer, model_hash); + int metadef_id = metadef_id_generator_->GenerateId(graph_viewer, model_hash); return MakeString(QNN, context_node_name_prefix_, "_", model_hash, "_", metadef_id); }; @@ -734,7 +708,7 @@ QNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer std::vector> node_unit_holder; std::unordered_map node_unit_map; - std::tie(node_unit_holder, node_unit_map) = QDQ::GetAllNodeUnits(graph_viewer, logger); + std::tie(node_unit_holder, node_unit_map) = GetQDQNodeUnits(graph_viewer, logger); // remove is_qnn_ctx_model related code const auto supported_nodes = GetSupportedNodes(graph_viewer, node_unit_map, @@ -777,11 +751,14 @@ QNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer bool is_valid_partition = true; size_t nodes_in_partition = 0; - if (partition && partition->sub_graph) { - nodes_in_partition = partition->sub_graph->nodes.size(); + if (partition && ComputeCapability__SubGraph(*partition)) { + const auto& subgraph = ComputeCapability__SubGraph(*partition); + const auto& subgraph_nodes = IndexedSubGraph__Nodes(*subgraph); + + nodes_in_partition = subgraph_nodes.size(); if (nodes_in_partition == 1 && !is_qnn_ctx_model) { - const Node* node = graph_viewer.GetNode(partition->sub_graph->nodes[0]); + const Node* node = graph_viewer.GetNode(subgraph_nodes[0]); if (!node) { LOGS(logger, ERROR) << "QNN EP: Invalid node in partition of one node."; @@ -850,34 +827,34 @@ Status QNNExecutionProvider::CreateComputeFunc(std::vector& nod void QNNExecutionProvider::InitQnnGraphConfigs(qnn::QnnConfigsBuilder& configs_builder) const { if (qnn_backend_manager_->GetQnnBackendType() == qnn::QnnBackendType::HTP) { if (htp_graph_finalization_opt_mode_ != qnn::HtpGraphFinalizationOptimizationMode::kDefault) { - QnnHtpGraph_CustomConfig_t& htp_graph_opt_config = configs_builder.PushCustomConfig(); - htp_graph_opt_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; - htp_graph_opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - htp_graph_opt_config.optimizationOption.floatValue = static_cast(htp_graph_finalization_opt_mode_); - - QnnGraph_Config_t& graph_opt_config = configs_builder.PushConfig(); - graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_opt_config.customConfig = &htp_graph_opt_config; + gsl::not_null htp_graph_opt_config = configs_builder.PushCustomConfig(); + htp_graph_opt_config->option = 
+      htp_graph_opt_config->optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
+      htp_graph_opt_config->optimizationOption.floatValue = static_cast<float>(htp_graph_finalization_opt_mode_);
+
+      gsl::not_null<QnnGraph_Config_t*> graph_opt_config = configs_builder.PushConfig();
+      graph_opt_config->option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+      graph_opt_config->customConfig = htp_graph_opt_config;
     }
 
     if (vtcm_size_in_mb_ > 0) {
-      QnnHtpGraph_CustomConfig_t& htp_graph_opt_config_vtcm = configs_builder.PushCustomConfig();
-      htp_graph_opt_config_vtcm.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
-      htp_graph_opt_config_vtcm.vtcmSizeInMB = static_cast<uint32_t>(vtcm_size_in_mb_);
+      gsl::not_null<QnnHtpGraph_CustomConfig_t*> htp_graph_opt_config_vtcm = configs_builder.PushCustomConfig();
+      htp_graph_opt_config_vtcm->option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
+      htp_graph_opt_config_vtcm->vtcmSizeInMB = static_cast<uint32_t>(vtcm_size_in_mb_);
 
-      QnnGraph_Config_t& graph_opt_config_vtcm = configs_builder.PushConfig();
-      graph_opt_config_vtcm.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
-      graph_opt_config_vtcm.customConfig = &htp_graph_opt_config_vtcm;
+      gsl::not_null<QnnGraph_Config_t*> graph_opt_config_vtcm = configs_builder.PushConfig();
+      graph_opt_config_vtcm->option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+      graph_opt_config_vtcm->customConfig = htp_graph_opt_config_vtcm;
     }
 
     if (enable_HTP_FP16_precision_) {
-      QnnHtpGraph_CustomConfig_t& htp_graph_precision_config = configs_builder.PushCustomConfig();
-      htp_graph_precision_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION;
-      htp_graph_precision_config.precision = QNN_PRECISION_FLOAT16;
+      gsl::not_null<QnnHtpGraph_CustomConfig_t*> htp_graph_precision_config = configs_builder.PushCustomConfig();
+      htp_graph_precision_config->option = QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION;
+      htp_graph_precision_config->precision = QNN_PRECISION_FLOAT16;
 
-      QnnGraph_Config_t& graph_precision_config = configs_builder.PushConfig();
-      graph_precision_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
-      graph_precision_config.customConfig = &htp_graph_precision_config;
+      gsl::not_null<QnnGraph_Config_t*> graph_precision_config = configs_builder.PushConfig();
+      graph_precision_config->option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+      graph_precision_config->customConfig = htp_graph_precision_config;
     }
   }
 }
@@ -933,10 +910,10 @@ Status QNNExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fused_nodes_and_graphs,
   if (EpSharedContextsHasAllGraphs(fused_nodes_and_graphs, logger)) {
     for (auto fused_node_and_graph : fused_nodes_and_graphs) {
       const onnxruntime::GraphViewer& graph_viewer(fused_node_and_graph.filtered_graph);
-      const auto& ep_context_node = graph_viewer.Nodes().begin();
+      const Node& ep_context_node = *graph_viewer.Nodes().begin();
       const Node& fused_node = fused_node_and_graph.fused_node;
       const std::string& graph_meta_id = fused_node.Name();
-      std::string key = ep_context_node->Name();
+      std::string key = ep_context_node.Name();
       auto qnn_model_shared = SharedContext::GetInstance().GetSharedQnnModel(key);
       ORT_RETURN_IF(nullptr == qnn_model_shared, "Graph: " + key + " not found from shared EP contexts.");
       ORT_RETURN_IF_ERROR(qnn_model_shared->SetGraphInputOutputInfo(graph_viewer, fused_node, logger));
@@ -978,10 +955,10 @@ Status QNNExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fused_nodes_and_graphs,
 
     for (auto fused_node_and_graph : fused_nodes_and_graphs) {
       const onnxruntime::GraphViewer& graph_viewer(fused_node_and_graph.filtered_graph);
-      const auto& ep_context_node = graph_viewer.Nodes().begin();
+      const Node& ep_context_node = *graph_viewer.Nodes().begin();
       const Node& fused_node = fused_node_and_graph.fused_node;
       const std::string& graph_meta_id = fused_node.Name();
-      std::string key = ep_context_node->Name();
+      std::string key = ep_context_node.Name();
       ORT_RETURN_IF(qnn_models.find(key) == qnn_models.end(), key + " key name not exist in table qnn_models.");
       auto qnn_model = std::move(qnn_models[key]);
       ORT_RETURN_IF_ERROR(qnn_model->SetGraphInputOutputInfo(graph_viewer, fused_node, logger));
@@ -1022,7 +999,7 @@ Status QNNExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fused_nodes_and_graphs,
                                                          buffer_size,
                                                          max_spill_fill_buffer_size));
   }
-  qnn_ep_context_model_ = std::make_unique<Model>("qnn_ep_context_model", false, logger);
+  qnn_ep_context_model_ = Factory<Model>::Create(std::string{"qnn_ep_context_model"}, false, logger);
   ORT_RETURN_IF_ERROR(qnn::CreateEPContextNodes(qnn_ep_context_model_.get(),
                                                 context_buffer.get(),
                                                 buffer_size,
@@ -1041,8 +1018,8 @@ const InlinedVector<const Node*> QNNExecutionProvider::GetEpContextNodes() const
   InlinedVector<const Node*> ep_context_nodes;
   if (qnn_ep_context_model_) {
     const auto& graph = qnn_ep_context_model_->MainGraph();
-    for (const auto& node : graph.Nodes()) {
-      ep_context_nodes.push_back(graph.GetNode(node.Index()));
+    for (gsl::not_null<const Node*> node : Graph__Nodes(graph)) {
+      ep_context_nodes.push_back(graph.GetNode(node->Index()));
     }
   }
 
@@ -1133,22 +1110,34 @@ void QNNExecutionProvider::ReleasePerThreadContext() const {
 
   per_thread_context_cache->erase(cached_context_it);
 }
 
+static bool TryGetConfigEntry(const ConfigOptions& config_options, const std::string& key, std::string& value) {
+  std::optional<std::string> new_value = config_options.GetConfigEntry(key);
+  if (!new_value.has_value()) {
+    return false;
+  }
+
+  value = *new_value;
+  return true;
+}
+
 Status QNNExecutionProvider::OnRunStart(const onnxruntime::RunOptions& run_options) {
   auto backend_type = qnn_backend_manager_->GetQnnBackendType();
   if (qnn::QnnBackendType::HTP != backend_type && qnn::QnnBackendType::DSP != backend_type) {
     return Status::OK();
   }
 
+  const ConfigOptions& config_options = RunOptions__GetConfigOptions(run_options);
+
   std::string htp_perf_mode = "";
   qnn::HtpPerformanceMode htp_performance_mode = qnn::HtpPerformanceMode::kHtpDefault;
-  if (run_options.config_options.TryGetConfigEntry(kOrtRunOptionsConfigQnnPerfMode, htp_perf_mode)) {
+  if (TryGetConfigEntry(config_options, kOrtRunOptionsConfigQnnPerfMode, htp_perf_mode)) {
     // set power mode
     ParseHtpPerformanceMode(htp_perf_mode, htp_performance_mode);
   }
 
   std::string rpc_latency = "";
   uint32_t rpc_control_latency = 0;
-  if (run_options.config_options.TryGetConfigEntry(kOrtRunOptionsConfigQnnRpcControlLatency, rpc_latency)) {
+  if (TryGetConfigEntry(config_options, kOrtRunOptionsConfigQnnRpcControlLatency, rpc_latency)) {
     rpc_control_latency = static_cast<uint32_t>(std::stoul(rpc_latency));
     LOGS_DEFAULT(VERBOSE) << "rpc_control_latency: " << rpc_control_latency;
   }
@@ -1174,9 +1163,11 @@ Status QNNExecutionProvider::OnRunEnd(bool /*sync_stream*/, const onnxruntime::RunOptions& run_options) {
     return Status::OK();
   }
 
+  const ConfigOptions& config_options = RunOptions__GetConfigOptions(run_options);
+
   std::string htp_perf_mode = "";
   qnn::HtpPerformanceMode htp_performance_mode = qnn::HtpPerformanceMode::kHtpDefault;
-  if (run_options.config_options.TryGetConfigEntry(kOrtRunOptionsConfigQnnPerfModePostRun, htp_perf_mode)) {
+  if (TryGetConfigEntry(config_options, kOrtRunOptionsConfigQnnPerfModePostRun, htp_perf_mode)) {
     // set power mode
     ParseHtpPerformanceMode(htp_perf_mode, htp_performance_mode);
   }
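OnRunStart and OnRunEnd read these keys from the per-run configuration; a caller-side sketch of driving them through the public C++ API, with illustrative values:

// Sketch only: the run-option keys consumed above are set by applications
// through Ort::RunOptions before calling Session::Run.
Ort::RunOptions run_options;
run_options.AddConfigEntry(kOrtRunOptionsConfigQnnPerfMode, "burst");
run_options.AddConfigEntry(kOrtRunOptionsConfigQnnRpcControlLatency, "100");
// session.Run(run_options, ...) then applies the HTP power mode for this run.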
diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.h b/onnxruntime/core/providers/qnn/qnn_execution_provider.h
index 317b34e66a6e4..48f41c4da384f 100644
--- a/onnxruntime/core/providers/qnn/qnn_execution_provider.h
+++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.h
@@ -3,32 +3,25 @@
 
 #pragma once
 
-#include "core/framework/execution_provider.h"
-#include "core/framework/session_options.h"
-#include "core/framework/model_metadef_id_generator.h"
-#include "core/graph/model.h"
+#include
+#include
+#include
+#include
+#include
+
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/builder/qnn_backend_manager.h"
 #include "core/providers/qnn/builder/qnn_model.h"
 #include "core/providers/qnn/builder/qnn_configs_helper.h"
 #include "core/providers/qnn/rpcmem_library.h"
 
 #include "HTP/QnnHtpGraph.h"
 
-#include
-#include
-#include
-#include
-#include
-#ifdef _WIN32
-#include "core/platform/windows/logging/etw_sink.h"
-#endif
 
 namespace onnxruntime {
 
-void RunOnUnload(std::function<void()> function);
-
 // Logical device representation.
 class QNNExecutionProvider : public IExecutionProvider {
  public:
-  explicit QNNExecutionProvider(const ProviderOptions& provider_options_map, const SessionOptions* session_options);
+  explicit QNNExecutionProvider(const ProviderOptions& provider_options_map, const ConfigOptions* config_options);
   virtual ~QNNExecutionProvider();
   ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(QNNExecutionProvider);
 
@@ -90,14 +83,14 @@ class QNNExecutionProvider : public IExecutionProvider {
   bool qnn_context_embed_mode_ = true;
   int32_t vtcm_size_in_mb_ = 0;
   std::unique_ptr<onnxruntime::Model> qnn_ep_context_model_;
-  ModelMetadefIdGenerator metadef_id_generator_;
+  std::unique_ptr<ModelMetadefIdGenerator> metadef_id_generator_;
   uint32_t device_id_ = 0;
   qnn::HtpPerformanceMode default_htp_performance_mode_ = qnn::HtpPerformanceMode::kHtpDefault;
   uint32_t default_rpc_control_latency_ = 0;
   bool enable_HTP_FP16_precision_ = true;
   bool share_ep_contexts_ = false;
   bool enable_spill_fill_buffer_ = false;
-#ifdef _WIN32
+#if defined(_WIN32)
   onnxruntime::logging::EtwRegistrationManager::EtwInternalCallback callback_ETWSink_provider_ = nullptr;
 #endif
   qnn::ModelSettings model_settings_ = {};
diff --git a/onnxruntime/core/providers/qnn/qnn_provider_factory.cc b/onnxruntime/core/providers/qnn/qnn_provider_factory.cc
index 4095d7ff02a33..d4dd446751359 100644
--- a/onnxruntime/core/providers/qnn/qnn_provider_factory.cc
+++ b/onnxruntime/core/providers/qnn/qnn_provider_factory.cc
@@ -2,32 +2,68 @@
 // Licensed under the MIT License
 
 #include "core/providers/qnn/qnn_provider_factory_creator.h"
-
-#include "core/session/abi_session_options_impl.h"
 #include "core/providers/qnn/qnn_execution_provider.h"
-#include "core/session/ort_apis.h"
 
 namespace onnxruntime {
 
 struct QNNProviderFactory : IExecutionProviderFactory {
-  QNNProviderFactory(const ProviderOptions& provider_options_map, const SessionOptions* session_options)
-      : provider_options_map_(provider_options_map), session_options_(session_options) {
+  QNNProviderFactory(const ProviderOptions& provider_options_map, const ConfigOptions* config_options)
+      : provider_options_map_(provider_options_map), config_options_(config_options) {
   }
 
   ~QNNProviderFactory() override {
   }
 
   std::unique_ptr<IExecutionProvider> CreateProvider() override {
-    return std::make_unique<QNNExecutionProvider>(provider_options_map_, session_options_);
+    return std::make_unique<QNNExecutionProvider>(provider_options_map_, config_options_);
   }
 
 private:
  ProviderOptions provider_options_map_;
-  const SessionOptions* session_options_;
+  const ConfigOptions* config_options_;
 };
 
+#if BUILD_QNN_EP_STATIC_LIB
 std::shared_ptr<IExecutionProviderFactory> QNNProviderFactoryCreator::Create(const ProviderOptions& provider_options_map,
                                                                              const SessionOptions* session_options) {
-  return std::make_shared<QNNProviderFactory>(provider_options_map, session_options);
+  const ConfigOptions* config_options = nullptr;
+  if (session_options != nullptr) {
+    config_options = &session_options->config_options;
+  }
+
+  return std::make_shared<QNNProviderFactory>(provider_options_map, config_options);
 }
+#else
+struct QNN_Provider : Provider {
+  std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory(const void* param) override {
+    if (param == nullptr) {
+      LOGS_DEFAULT(ERROR) << "[QNN EP] Passed NULL options to CreateExecutionProviderFactory()";
+      return nullptr;
+    }
+
+    std::array<const void*, 2> pointers_array = *reinterpret_cast<const std::array<const void*, 2>*>(param);
+    const ProviderOptions* provider_options = reinterpret_cast<const ProviderOptions*>(pointers_array[0]);
+    const ConfigOptions* config_options = reinterpret_cast<const ConfigOptions*>(pointers_array[1]);
+
+    if (provider_options == nullptr) {
+      LOGS_DEFAULT(ERROR) << "[QNN EP] Passed NULL ProviderOptions to CreateExecutionProviderFactory()";
+      return nullptr;
+    }
+
+    return std::make_shared<QNNProviderFactory>(*provider_options, config_options);
+  }
+
+  void Initialize() override {}
+  void Shutdown() override {}
+} g_provider;
+#endif  // BUILD_QNN_EP_STATIC_LIB
 
 }  // namespace onnxruntime
+
+#if !BUILD_QNN_EP_STATIC_LIB
+extern "C" {
+
+ORT_API(onnxruntime::Provider*, GetProvider) {
+  return &onnxruntime::g_provider;
+}
+}
+#endif  // !BUILD_QNN_EP_STATIC_LIB
diff --git a/onnxruntime/core/providers/qnn/qnn_provider_factory_creator.h b/onnxruntime/core/providers/qnn/qnn_provider_factory_creator.h
index 80f9d99b804e7..46b6c15b40553 100644
--- a/onnxruntime/core/providers/qnn/qnn_provider_factory_creator.h
+++ b/onnxruntime/core/providers/qnn/qnn_provider_factory_creator.h
@@ -11,6 +11,9 @@
 namespace onnxruntime {
 struct SessionOptions;
 
+// Defined in core/session/provider_bridge_ort.cc if built as a shared library (default build config).
+// Defined in core/providers/qnn/qnn_provider_factory.cc if built as a static library.
+// The preprocessor macro `BUILD_QNN_EP_STATIC_LIB` is defined and set to 1 if QNN is built as a static library.
 struct QNNProviderFactoryCreator {
   static std::shared_ptr<IExecutionProviderFactory> Create(const ProviderOptions& provider_options_map,
                                                            const SessionOptions* session_options);
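Whichever path reaches the factory, applications select the EP the same way; a minimal caller sketch with illustrative options:

// Sketch only: enabling QNN EP through the public C++ API works the same
// whether the EP is linked statically or loaded as a shared library.
Ort::SessionOptions session_options;
std::unordered_map<std::string, std::string> qnn_options;
qnn_options["backend_path"] = "QnnHtp.dll";  // illustrative backend choice
session_options.AppendExecutionProvider("QNN", qnn_options);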
diff --git a/onnxruntime/core/providers/qnn/qnn_telemetry.cc b/onnxruntime/core/providers/qnn/qnn_telemetry.cc
new file mode 100644
index 0000000000000..b2c8350bfe8ca
--- /dev/null
+++ b/onnxruntime/core/providers/qnn/qnn_telemetry.cc
@@ -0,0 +1,211 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/providers/qnn/qnn_telemetry.h"
+
+#ifdef _WIN32
+#if !BUILD_QNN_EP_STATIC_LIB
+// ETW includes
+// need space after Windows.h to prevent clang-format re-ordering breaking the build.
+// TraceLoggingProvider.h must follow Windows.h
+#include <Windows.h>
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 26440)  // Warning C26440 from TRACELOGGING_DEFINE_PROVIDER
+#endif
+
+#include <TraceLoggingProvider.h>
+#include <winmeta.h>
+#include <evntrace.h>
+#include "core/platform/windows/TraceLoggingConfig.h"
+
+// Seems this workaround can be dropped when we drop support for VS2017 toolchains
+// https://developercommunity.visualstudio.com/content/problem/85934/traceloggingproviderh-is-incompatible-with-utf-8.html
+#ifdef _TlgPragmaUtf8Begin
+#undef _TlgPragmaUtf8Begin
+#define _TlgPragmaUtf8Begin
+#endif
+
+#ifdef _TlgPragmaUtf8End
+#undef _TlgPragmaUtf8End
+#define _TlgPragmaUtf8End
+#endif
+
+// Different versions of TraceLoggingProvider.h contain different macro variable names for the utf8 begin and end,
+// and we need to cover the lower case version as well.
+#ifdef _tlgPragmaUtf8Begin
+#undef _tlgPragmaUtf8Begin
+#define _tlgPragmaUtf8Begin
+#endif
+
+#ifdef _tlgPragmaUtf8End
+#undef _tlgPragmaUtf8End
+#define _tlgPragmaUtf8End
+#endif
+
+TRACELOGGING_DEFINE_PROVIDER(telemetry_provider_handle, "Microsoft.ML.ONNXRuntime",
+                             // {3a26b1ff-7484-7484-7484-15261f42614d}
+                             (0x3a26b1ff, 0x7484, 0x7484, 0x74, 0x84, 0x15, 0x26, 0x1f, 0x42, 0x61, 0x4d),
+                             TraceLoggingOptionMicrosoftTelemetry());
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+#endif  // !BUILD_QNN_EP_STATIC_LIB
+
+#include "core/providers/qnn/ort_api.h"
+
+namespace onnxruntime {
+namespace qnn {
+
+#if !BUILD_QNN_EP_STATIC_LIB
+std::mutex QnnTelemetry::mutex_;
+std::mutex QnnTelemetry::provider_change_mutex_;
+uint32_t QnnTelemetry::global_register_count_ = 0;
+bool QnnTelemetry::enabled_ = true;
+UCHAR QnnTelemetry::level_ = 0;
+UINT64 QnnTelemetry::keyword_ = 0;
+std::vector<const QnnTelemetry::EtwInternalCallback*> QnnTelemetry::callbacks_;
+std::mutex QnnTelemetry::callbacks_mutex_;
+#endif  // !BUILD_QNN_EP_STATIC_LIB
+
+QnnTelemetry::QnnTelemetry() {
+#if !BUILD_QNN_EP_STATIC_LIB
+  std::lock_guard<std::mutex> lock(mutex_);
+  if (global_register_count_ == 0) {
+    // TraceLoggingRegister is fancy in that you can only register once GLOBALLY for the whole process
+    HRESULT hr = TraceLoggingRegisterEx(telemetry_provider_handle, ORT_TL_EtwEnableCallback, nullptr);
+    if (SUCCEEDED(hr)) {
+      global_register_count_ += 1;
+    }
+  }
+#endif  // !BUILD_QNN_EP_STATIC_LIB
+}
+
+QnnTelemetry::~QnnTelemetry() {
+#if !BUILD_QNN_EP_STATIC_LIB
+  std::lock_guard<std::mutex> lock(mutex_);
+  if (global_register_count_ > 0) {
+    global_register_count_ -= 1;
+    if (global_register_count_ == 0) {
+      TraceLoggingUnregister(telemetry_provider_handle);
+    }
+  }
+
+  std::lock_guard<std::mutex> lock_callbacks(callbacks_mutex_);
+  callbacks_.clear();
+#endif  // !BUILD_QNN_EP_STATIC_LIB
+}
+
+QnnTelemetry& QnnTelemetry::Instance() {
+  static QnnTelemetry instance;
+  return instance;
+}
+
+bool QnnTelemetry::IsEnabled() const {
+#if BUILD_QNN_EP_STATIC_LIB
+  const Env& env = GetDefaultEnv();
+  auto& provider = env.GetTelemetryProvider();
+  return provider.IsEnabled();
+#else
+  std::lock_guard<std::mutex> lock(provider_change_mutex_);
+  return enabled_;
+#endif
+}
+
+UCHAR QnnTelemetry::Level() const {
+#if BUILD_QNN_EP_STATIC_LIB
+  const Env& env = GetDefaultEnv();
+  auto& provider = env.GetTelemetryProvider();
+  return provider.Level();
+#else
+  std::lock_guard<std::mutex> lock(provider_change_mutex_);
+  return level_;
+#endif
+}
+
+UINT64 QnnTelemetry::Keyword() const {
+#if BUILD_QNN_EP_STATIC_LIB
+  const Env& env = GetDefaultEnv();
+  auto& provider = env.GetTelemetryProvider();
+  return provider.Keyword();
+#else
+  std::lock_guard<std::mutex> lock(provider_change_mutex_);
+  return keyword_;
+#endif
+}
+
+void QnnTelemetry::LogQnnProfileEvent(uint64_t timestamp,
+                                      const std::string& message,
+                                      const std::string& qnnScalarValue,
+                                      const std::string& unit,
+                                      const std::string& timingSource,
+                                      const std::string& eventLevel,
+                                      const char* eventIdentifier) const {
+  TraceLoggingWrite(
+      telemetry_provider_handle,
+      "QNNProfilingEvent",
+      TraceLoggingKeyword(static_cast<uint64_t>(onnxruntime::logging::ORTTraceLoggingKeyword::Profiling)),
+      TraceLoggingLevel(WINEVENT_LEVEL_VERBOSE),
+      TraceLoggingValue(timestamp, "Timestamp"),
+      TraceLoggingString(message.c_str(), "Message"),
+      TraceLoggingString(qnnScalarValue.c_str(), "Value"),
+      TraceLoggingString(unit.c_str(), "Unit of Measurement"),
+      TraceLoggingString(timingSource.c_str(), "Timing Source"),
+      TraceLoggingString(eventLevel.c_str(), "Event Level"),
+      TraceLoggingString(eventIdentifier, "Event Identifier"));
+}
+
+void QnnTelemetry::RegisterInternalCallback(const EtwInternalCallback& callback) {
+#if BUILD_QNN_EP_STATIC_LIB
+  WindowsTelemetry::RegisterInternalCallback(callback);
+#else
+  std::lock_guard<std::mutex> lock_callbacks(callbacks_mutex_);
+  callbacks_.push_back(&callback);
+#endif
+}
+
+void QnnTelemetry::UnregisterInternalCallback(const EtwInternalCallback& callback) {
+#if BUILD_QNN_EP_STATIC_LIB
+  WindowsTelemetry::UnregisterInternalCallback(callback);
+#else
+  std::lock_guard<std::mutex> lock_callbacks(callbacks_mutex_);
+  auto new_end = std::remove_if(callbacks_.begin(), callbacks_.end(),
+                                [&callback](const EtwInternalCallback* ptr) {
+                                  return ptr == &callback;
+                                });
+  callbacks_.erase(new_end, callbacks_.end());
+#endif
+}
+
+#if !BUILD_QNN_EP_STATIC_LIB
+void NTAPI QnnTelemetry::ORT_TL_EtwEnableCallback(
+    _In_ LPCGUID SourceId,
+    _In_ ULONG IsEnabled,
+    _In_ UCHAR Level,
+    _In_ ULONGLONG MatchAnyKeyword,
+    _In_ ULONGLONG MatchAllKeyword,
+    _In_opt_ PEVENT_FILTER_DESCRIPTOR FilterData,
+    _In_opt_ PVOID CallbackContext) {
+  std::lock_guard<std::mutex> lock(provider_change_mutex_);
+  enabled_ = (IsEnabled != 0);
+  level_ = Level;
+  keyword_ = MatchAnyKeyword;
+
+  InvokeCallbacks(SourceId, IsEnabled, Level, MatchAnyKeyword, MatchAllKeyword, FilterData, CallbackContext);
+}
+
+void QnnTelemetry::InvokeCallbacks(LPCGUID SourceId, ULONG IsEnabled, UCHAR Level, ULONGLONG MatchAnyKeyword,
+                                   ULONGLONG MatchAllKeyword, PEVENT_FILTER_DESCRIPTOR FilterData,
+                                   PVOID CallbackContext) {
+  std::lock_guard<std::mutex> lock_callbacks(callbacks_mutex_);
+  for (const auto& callback : callbacks_) {
+    (*callback)(SourceId, IsEnabled, Level, MatchAnyKeyword, MatchAllKeyword, FilterData, CallbackContext);
+  }
+}
+#endif  // !BUILD_QNN_EP_STATIC_LIB
+
+}  // namespace qnn
+}  // namespace onnxruntime
+#endif  // defined(_WIN32)
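A usage sketch for the singleton above; the field values are illustrative:

// Sketch only: emitting one QNN profiling event when the provider is enabled.
auto& telemetry = onnxruntime::qnn::QnnTelemetry::Instance();
if (telemetry.IsEnabled()) {
  telemetry.LogQnnProfileEvent(/*timestamp*/ 1234567890ULL, "RunBackend", "42", "us",
                               "BACKEND", "SUB_EVENT", "Conv_0");
}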
diff --git a/onnxruntime/core/providers/qnn/qnn_telemetry.h b/onnxruntime/core/providers/qnn/qnn_telemetry.h
new file mode 100644
index 0000000000000..a2d42c518c1ac
--- /dev/null
+++ b/onnxruntime/core/providers/qnn/qnn_telemetry.h
@@ -0,0 +1,98 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#ifdef _WIN32
+#include <Windows.h>
+
+#if !BUILD_QNN_EP_STATIC_LIB
+#include <TraceLoggingProvider.h>
+#endif
+
+#include <functional>
+#include <mutex>
+#include <string>
+#include <vector>
+
+#include "core/providers/qnn/ort_api.h"
+
+#if !BUILD_QNN_EP_STATIC_LIB
+TRACELOGGING_DECLARE_PROVIDER(telemetry_provider_handle);
+#endif
+
+namespace onnxruntime {
+namespace qnn {
+
+/// <summary>
+/// Singleton class used to log QNN profiling events to the ONNX Runtime telemetry tracelogging provider.
+///
+/// When QNN EP is a DLL, we must define our own tracelogging provider handle via TRACELOGGING_DEFINE_PROVIDER.
+/// TraceLogging documentation states that separate DLLs cannot share the same tracelogging provider handle. See:
+/// https://learn.microsoft.com/en-us/windows/win32/api/traceloggingprovider/nf-traceloggingprovider-tracelogging_define_provider#remarks
+///
+/// When QNN EP is a static library, we use the tracelogging provider handle already defined
+/// in core/platform/windows/telemetry.h/.cc. In this case, we forward method calls to the
+/// ORT Env's telemetry provider.
+/// </summary>
+class QnnTelemetry {
+ public:
+  static QnnTelemetry& Instance();
+  bool IsEnabled() const;
+
+  // Get the current logging level
+  unsigned char Level() const;
+
+  // Get the current keyword
+  UINT64 Keyword() const;
+
+  // Logs QNN profiling event as trace logging event.
+  void LogQnnProfileEvent(uint64_t timestamp,
+                          const std::string& message,
+                          const std::string& qnnScalarValue,
+                          const std::string& unit,
+                          const std::string& timingSource,
+                          const std::string& eventLevel,
+                          const char* eventIdentifier) const;
+
+  using EtwInternalCallback = std::function<void(LPCGUID SourceId, ULONG IsEnabled, UCHAR Level,
+                                                 ULONGLONG MatchAnyKeyword, ULONGLONG MatchAllKeyword,
+                                                 PEVENT_FILTER_DESCRIPTOR FilterData, PVOID CallbackContext)>;
+
+  static void RegisterInternalCallback(const EtwInternalCallback& callback);
+
+  static void UnregisterInternalCallback(const EtwInternalCallback& callback);
+
+ private:
+  QnnTelemetry();
+  ~QnnTelemetry();
+  ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(QnnTelemetry);
+
+#if !BUILD_QNN_EP_STATIC_LIB
+  static std::mutex mutex_;
+  static uint32_t global_register_count_;
+  static bool enabled_;
+
+  static std::vector<const EtwInternalCallback*> callbacks_;
+  static std::mutex callbacks_mutex_;
+  static std::mutex provider_change_mutex_;
+  static UCHAR level_;
+  static ULONGLONG keyword_;
+
+  static void InvokeCallbacks(LPCGUID SourceId, ULONG IsEnabled, UCHAR Level, ULONGLONG MatchAnyKeyword,
+                              ULONGLONG MatchAllKeyword, PEVENT_FILTER_DESCRIPTOR FilterData, PVOID CallbackContext);
+
+  static void NTAPI ORT_TL_EtwEnableCallback(
+      _In_ LPCGUID SourceId,
+      _In_ ULONG IsEnabled,
+      _In_ UCHAR Level,
+      _In_ ULONGLONG MatchAnyKeyword,
+      _In_ ULONGLONG MatchAllKeyword,
+      _In_opt_ PEVENT_FILTER_DESCRIPTOR FilterData,
+      _In_opt_ PVOID CallbackContext);
+#endif
+};
+
+}  // namespace qnn
+}  // namespace onnxruntime
+
+#endif  // defined(_WIN32)
diff --git a/onnxruntime/core/providers/qnn/rpcmem_library.cc b/onnxruntime/core/providers/qnn/rpcmem_library.cc
index 59e6cff925668..93c5ed54ab371 100644
--- a/onnxruntime/core/providers/qnn/rpcmem_library.cc
+++ b/onnxruntime/core/providers/qnn/rpcmem_library.cc
@@ -2,9 +2,7 @@
 // Licensed under the MIT License
 
 #include "core/providers/qnn/rpcmem_library.h"
-
-#include "core/common/logging/logging.h"
-#include "core/platform/env.h"
+#include "core/providers/qnn/ort_api.h"
 
 namespace onnxruntime::qnn {
 
@@ -25,7 +23,7 @@ DynamicLibraryHandle LoadDynamicLibrary(const PathString& path, bool global_symbols) {
       return;
     }
 
-    const auto& env = Env::Default();
+    const auto& env = GetDefaultEnv();
     const auto unload_status = env.UnloadDynamicLibrary(library_handle);
 
     if (!unload_status.IsOK()) {
@@ -33,7 +31,7 @@ DynamicLibraryHandle LoadDynamicLibrary(const PathString& path, bool global_symbols) {
     }
   };
 
-  const auto& env = Env::Default();
+  const auto& env = GetDefaultEnv();
   void* library_handle = nullptr;
 
   const auto load_status = env.LoadDynamicLibrary(path, global_symbols, &library_handle);
@@ -47,7 +45,7 @@ DynamicLibraryHandle LoadDynamicLibrary(const PathString& path, bool global_symbols) {
 
 RpcMemApi CreateApi(void* library_handle) {
   RpcMemApi api{};
 
-  const auto& env = Env::Default();
+  const auto& env = GetDefaultEnv();
   ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(library_handle, "rpcmem_alloc", (void**)&api.alloc));
 
   ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(library_handle, "rpcmem_free", (void**)&api.free));
diff --git a/onnxruntime/core/providers/qnn/rpcmem_library.h b/onnxruntime/core/providers/qnn/rpcmem_library.h
index d5697ff298e79..0642c96798188 100644
--- a/onnxruntime/core/providers/qnn/rpcmem_library.h
+++ b/onnxruntime/core/providers/qnn/rpcmem_library.h
@@ -6,7 +6,7 @@
 #include
 #include
 
-#include "core/common/common.h"
+#include "core/providers/qnn/ort_api.h"
 
 namespace onnxruntime::qnn {
diff --git a/onnxruntime/core/providers/qnn/shared_context.h b/onnxruntime/core/providers/qnn/shared_context.h
index a111e57038304..81de357dbe677 100644
--- a/onnxruntime/core/providers/qnn/shared_context.h
+++ b/onnxruntime/core/providers/qnn/shared_context.h
@@ -5,7 +5,7 @@
 #include
 #include
 
-#include "core/common/common.h"
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/builder/qnn_model.h"
 
 #pragma once
diff --git a/onnxruntime/core/providers/qnn/symbols.def b/onnxruntime/core/providers/qnn/symbols.def
new file mode 100644
index 0000000000000..4ec2f7914c208
--- /dev/null
+++ b/onnxruntime/core/providers/qnn/symbols.def
@@ -0,0 +1,2 @@
+EXPORTS
+  GetProvider
diff --git a/onnxruntime/core/providers/qnn/version_script.lds b/onnxruntime/core/providers/qnn/version_script.lds
new file mode 100644
index 0000000000000..094abb3329781
--- /dev/null
+++ b/onnxruntime/core/providers/qnn/version_script.lds
@@ -0,0 +1,9 @@
+#_init and _fini should be local
+VERS_1.0 {
+  global:
+    GetProvider;
+
+  # Hide everything else.
+  local:
+    *;
+};
diff --git a/onnxruntime/core/providers/rknpu/node_attr_helper.h b/onnxruntime/core/providers/rknpu/node_attr_helper.h
index 6ab8f8c6bb953..76a0c721f70aa 100644
--- a/onnxruntime/core/providers/rknpu/node_attr_helper.h
+++ b/onnxruntime/core/providers/rknpu/node_attr_helper.h
@@ -4,7 +4,7 @@
 
 #pragma once
 
-#include
+#include "core/graph/onnx_protobuf.h"
 
 #include
 #include
diff --git a/onnxruntime/core/providers/rknpu/onnx_converter.h b/onnxruntime/core/providers/rknpu/onnx_converter.h
index e90efd75b9c7f..10cc09a9dba92 100644
--- a/onnxruntime/core/providers/rknpu/onnx_converter.h
+++ b/onnxruntime/core/providers/rknpu/onnx_converter.h
@@ -2,7 +2,7 @@
 
 #pragma once
 
-#include
+#include "core/graph/onnx_protobuf.h"
 
 #include
 #include
diff --git a/onnxruntime/core/providers/shared_library/provider_api.h b/onnxruntime/core/providers/shared_library/provider_api.h
index 45f81ed22b7f7..6ff2572e5e668 100644
--- a/onnxruntime/core/providers/shared_library/provider_api.h
+++ b/onnxruntime/core/providers/shared_library/provider_api.h
@@ -9,6 +9,11 @@
 #pragma once
 #define SHARED_PROVIDER 1
 
+#ifdef _WIN32
+#include
+#include
+#endif  // defined(_WIN32)
+
 #include
 #include
 #include
@@ -136,6 +141,17 @@ enum class DataType {
   USER = 1  ///< Contains potentially sensitive user data.
 };
 
+enum class ORTTraceLoggingKeyword : uint64_t {
+  Session = 0x1,     // ORT Session TraceLoggingWrite
+  Logs = 0x2,        // LOGS() Macro ORT logs. Pair with an appropriate level depending on detail required
+  Reserved1 = 0x4,   // Reserved if we want to add some specific sub-categories instead of just LOGS() or other uses
+  Reserved2 = 0x8,
+  Reserved3 = 0x10,
+  Reserved4 = 0x20,
+  Reserved5 = 0x40,
+  Reserved6 = 0x80,
+  Profiling = 0x100  // Enables profiling. At higher levels >5 can impact inference performance
+};
 }  // namespace logging
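A short sketch of how a callback can test this keyword mask, mirroring the checks in qnn_execution_provider.cc; the variable name is illustrative:

// Sketch only: test whether an ETW consumer requested QNN profiling events.
const uint64_t profiling_mask = static_cast<uint64_t>(logging::ORTTraceLoggingKeyword::Profiling);
if ((match_any_keyword & profiling_mask) != 0) {
  // profiling requested via ETW
}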
 
 // OnnxRuntime Types (these are the internal types)
 struct CPUIDInfo;
 namespace logging {
 struct Logger;
 struct Capture;
+#ifdef _WIN32
+struct EtwRegistrationManager;
+using EtwRegistrationManager_EtwInternalCallback =
+    std::function<void(LPCGUID SourceId, ULONG IsEnabled, UCHAR Level, ULONGLONG MatchAnyKeyword,
+                       ULONGLONG MatchAllKeyword, PEVENT_FILTER_DESCRIPTOR FilterData, PVOID CallbackContext)>;
+#endif
 }  // namespace logging
 struct ComputeCapability;
 struct ConfigOptions;
@@ -157,10 +180,12 @@ struct KernelRegistry;
 struct Function;
 struct Graph;
 class GraphViewer;
+struct ConstGraphNodes;
 enum class DataLayout;
 struct Model;
 struct Path;
 struct Node;
+struct Node_EdgeEnd;
 struct NodeArg;
 struct NodeAttributes;
 struct NodeUnitIODef;
@@ -215,6 +240,7 @@ using DeleteFunc = void (*)(void*);
 using NodeArgInfo = ONNX_NAMESPACE::ValueInfoProto;
 
 using NameMLValMap = std::unordered_map<std::string, OrtValue>;
+
 }  // namespace onnxruntime
 
 #include "core/platform/threadpool.h"
@@ -368,6 +394,28 @@ template <>
 constexpr ONNXTensorElementDataType GetONNXTensorElementDataType<UInt4x2>() {
   return ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT4;
 }
+
+inline std::vector<std::unique_ptr<ComputeCapability>>
+CreateSupportedPartitions(const GraphViewer& graph_viewer,
+                          const std::unordered_set<const Node*>& supported_nodes,
+                          const std::unordered_set<std::string>& stop_ops,
+                          const std::function<std::string()>& generate_metadef_name,
+                          const std::string& execution_provider_name,
+                          const std::string& execution_provider_type,
+                          const std::unordered_map<const Node*, const NodeUnit*>* node_unit_map,
+                          bool drop_constant_initializers = false) {
+  return g_host->Utils__CreateSupportedPartitions(graph_viewer, supported_nodes, stop_ops, generate_metadef_name,
+                                                  execution_provider_name, execution_provider_type, node_unit_map,
+                                                  drop_constant_initializers);
+}
+
+inline std::unique_ptr<ComputeCapability> MakeComputeCapability(const GraphViewer& graph_viewer,
+                                                                const std::vector<const Node*>& group,
+                                                                const std::function<std::string()>& generate_metadef_name,
+                                                                const std::string& execution_provider_name,
+                                                                bool drop_constant_initializers) {
+  return g_host->Utils__MakeComputeCapability(graph_viewer, group, generate_metadef_name,
+                                              execution_provider_name, drop_constant_initializers);
+}
 }  // namespace utils
 
 namespace QDQ {
@@ -381,6 +429,10 @@ GetAllNodeUnits(const GraphViewer* graph_viewer, const logging::Logger& logger);
 
 // So the C API (and C++) becomes available when ORT_API_MANUAL_INIT is used.
 void InitProviderOrtApi();
+inline Env& GetDefaultEnv() { + return g_host->Env__Default(); +} } // namespace onnxruntime #define CREATE_MESSAGE(logger, severity, category, datatype) \ diff --git a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc index aa8c367d25d51..4c050534456da 100644 --- a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc +++ b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc @@ -505,6 +505,9 @@ Status UnpackInitializerData(const ONNX_NAMESPACE::TensorProto& tensor, const st /*out*/ std::vector& unpacked_tensor) { return g_host->UnpackInitializerData(tensor, model_path, unpacked_tensor); } +Status UnpackInitializerData(const ONNX_NAMESPACE::TensorProto& tensor, /*out*/ std::vector& unpacked_tensor) { + return g_host->UnpackInitializerData(tensor, std::filesystem::path(), unpacked_tensor); +} } // namespace utils @@ -788,5 +791,5 @@ std::string ToUTF8String(const std::wstring& s) { std::wstring ToWideString(const std::string& s) { return g_host->ToWideString(s); } -#endif +#endif // _WIN32 } // namespace onnxruntime diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index 5a179ec622f8c..962d10d8952d6 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -120,11 +120,20 @@ struct Node__EdgeIterator { virtual bool operator!=(const Node__EdgeIterator& p) const = 0; virtual void operator++() = 0; + virtual const Node_EdgeEnd& operator*() const = 0; virtual const Node& GetNode() const = 0; virtual int GetSrcArgIndex() const = 0; virtual int GetDstArgIndex() const = 0; }; +struct ConstGraphNodes_Iterator { + virtual ~ConstGraphNodes_Iterator() {} + + virtual bool operator!=(const ConstGraphNodes_Iterator& other) const = 0; + virtual void operator++() = 0; + virtual const Node& operator*() = 0; +}; + // There are two ways to route a function, one is a virtual method and the other is a function pointer (or pointer to // member function). 
// The function pointers are nicer in that they directly call the target function, but they cannot be used in cases @@ -273,20 +282,41 @@ struct ProviderHost { // logging::Logger virtual bool logging__Logger__OutputIsEnabled(const logging::Logger* p, logging::Severity severity, logging::DataType data_type) = 0; + virtual logging::Severity logging__Logger__GetSeverity(const logging::Logger* p) = 0; // logging::LoggingManager virtual const logging::Logger& logging__LoggingManager__DefaultLogger() = 0; + virtual bool logging__LoggingManager__HasDefaultLogger() = 0; // logging::Capture - virtual std::unique_ptr logging__Capture__construct(const logging::Logger& logger, logging::Severity severity, const char* category, logging::DataType dataType, const CodeLocation& location) = 0; + virtual std::unique_ptr logging__Capture__construct(const logging::Logger& logger, + logging::Severity severity, + const char* category, + logging::DataType data_type, + const CodeLocation& location) = 0; virtual void logging__Capture__operator_delete(logging::Capture* p) noexcept = 0; virtual std::ostream& logging__Capture__Stream(logging::Capture* p) noexcept = 0; + virtual void logging__Capture__ProcessPrintf(logging::Capture* p, const char* format, va_list args) = 0; + +#if defined(_WIN32) + // logging::EtwRegistrationManager + virtual logging::EtwRegistrationManager& logging__EtwRegistrationManager__Instance() = 0; + virtual bool logging__EtwRegistrationManager__SupportsETW() = 0; + virtual logging::Severity logging__EtwRegistrationManager__MapLevelToSeverity(logging::EtwRegistrationManager* p) = 0; + virtual void logging__EtwRegistrationManager__RegisterInternalCallback( + logging::EtwRegistrationManager* p, + const logging::EtwRegistrationManager_EtwInternalCallback& callback) = 0; + virtual void logging__EtwRegistrationManager__UnregisterInternalCallback( + logging::EtwRegistrationManager* p, + const logging::EtwRegistrationManager_EtwInternalCallback& callback) = 0; +#endif // defined(_WIN32) // Env virtual Env& Env__Default() = 0; // Utils::DataTypeUtils virtual const std::string* Utils__DataTypeUtils__ToType(const ONNX_NAMESPACE::TypeProto& type_proto) = 0; + virtual const std::string* Utils__DataTypeUtils__ToType(const std::string& type_str) = 0; // int64s virtual int int64s__size(const ONNX_NAMESPACE::int64s* p) = 0; @@ -328,6 +358,7 @@ struct ProviderHost { virtual bool TypeProto_Tensor__has_shape(const ONNX_NAMESPACE::TypeProto_Tensor* p) = 0; virtual const ONNX_NAMESPACE::TensorShapeProto& TypeProto_Tensor__shape(const ONNX_NAMESPACE::TypeProto_Tensor* p) = 0; virtual ONNX_NAMESPACE::TensorShapeProto* TypeProto_Tensor__mutable_shape(ONNX_NAMESPACE::TypeProto_Tensor* p) = 0; + virtual bool TypeProto_Tensor__has_elem_type(const ONNX_NAMESPACE::TypeProto_Tensor* p) = 0; virtual int32_t TypeProto_Tensor__elem_type(const ONNX_NAMESPACE::TypeProto_Tensor* p) = 0; virtual void TypeProto_Tensor__set_elem_type(ONNX_NAMESPACE::TypeProto_Tensor* p, int32_t value) = 0; @@ -342,6 +373,7 @@ struct ProviderHost { // TypeProto virtual std::unique_ptr TypeProto__construct() = 0; virtual void TypeProto__CopyFrom(ONNX_NAMESPACE::TypeProto* p, const ONNX_NAMESPACE::TypeProto* other) = 0; + virtual bool TypeProto__has_tensor_type(const ONNX_NAMESPACE::TypeProto* p) = 0; virtual const ONNX_NAMESPACE::TypeProto_Tensor& TypeProto__tensor_type(const ONNX_NAMESPACE::TypeProto* p) = 0; virtual ONNX_NAMESPACE::TypeProto_Tensor* TypeProto__mutable_tensor_type(ONNX_NAMESPACE::TypeProto* p) = 0; @@ -462,6 +494,7 @@ struct 
ProviderHost { virtual bool TensorProto__has_raw_data(const ONNX_NAMESPACE::TensorProto* p) = 0; virtual const std::string& TensorProto__raw_data(const ONNX_NAMESPACE::TensorProto* p) = 0; virtual std::string* TensorProto__mutable_raw_data(ONNX_NAMESPACE::TensorProto* p) = 0; + virtual bool TensorProto__has_data_type(const ONNX_NAMESPACE::TensorProto* p) = 0; virtual int32_t TensorProto__data_type(const ONNX_NAMESPACE::TensorProto* p) = 0; virtual void TensorProto__set_data_type(ONNX_NAMESPACE::TensorProto* p, int32_t type) = 0; virtual void TensorProto__CopyFrom(ONNX_NAMESPACE::TensorProto* p, const ONNX_NAMESPACE::TensorProto* other) = 0; @@ -495,6 +528,7 @@ struct ProviderHost { // TensorShapeProto_Dimensions virtual std::unique_ptr TensorShapeProto_Dimensions__begin(const ONNX_NAMESPACE::TensorShapeProto_Dimensions* p) = 0; virtual std::unique_ptr TensorShapeProto_Dimensions__end(const ONNX_NAMESPACE::TensorShapeProto_Dimensions* p) = 0; + virtual size_t TensorShapeProto_Dimensions__size(const ONNX_NAMESPACE::TensorShapeProto_Dimensions* p) = 0; // TensorShapeProto virtual int TensorShapeProto__dim_size(const ONNX_NAMESPACE::TensorShapeProto* p) = 0; @@ -823,6 +857,8 @@ struct ProviderHost { virtual const NodeAttributes& Node__GetAttributes(const Node* p) noexcept = 0; virtual void Node__AddAttribute(Node* p, const ::std::string& attr_name, const ONNX_NAMESPACE::GraphProto& value) = 0; + virtual void Node__AddAttribute(Node* p, const ::std::string& attr_name, const std::string& value) = 0; + virtual void Node__AddAttribute(Node* p, const ::std::string& attr_name, int64_t value) = 0; virtual size_t Node__GetInputEdgesCount(const Node* p) noexcept = 0; virtual size_t Node__GetOutputEdgesCount(const Node* p) noexcept = 0; @@ -842,6 +878,11 @@ struct ProviderHost { virtual const std::unordered_map>& Node__GetAttributeNameToMutableSubgraphMap(Node* p) = 0; virtual std::unordered_map> Node__GetAttributeNameToSubgraphMap(const Node* p) const = 0; + // Node_EdgeEnd + virtual const Node& Node_EdgeEnd__GetNode(const Node_EdgeEnd* p) = 0; + virtual int Node_EdgeEnd__GetSrcArgIndex(const Node_EdgeEnd* p) = 0; + virtual int Node_EdgeEnd__GetDstArgIndex(const Node_EdgeEnd* p) = 0; + // NodeArg virtual const std::string& NodeArg__Name(const NodeArg* p) noexcept = 0; virtual const ONNX_NAMESPACE::TensorShapeProto* NodeArg__Shape(const NodeArg* p) = 0; @@ -872,6 +913,8 @@ struct ProviderHost { virtual void NodeAttributes__reserve(NodeAttributes* p, size_t size) = 0; // NodeUnit + virtual void NodeUnit__operator_delete(NodeUnit* p) noexcept = 0; + virtual int NodeUnit__UnitType(const NodeUnit* p) noexcept = 0; virtual const std::vector& NodeUnit__Inputs(const NodeUnit* p) noexcept = 0; @@ -897,10 +940,29 @@ struct ProviderHost { virtual std::pair>, std::unordered_map> QDQ__GetAllNodeUnits(const GraphViewer* graph_viewer, const logging::Logger& logger) = 0; + // Partitioning utils + virtual std::vector> + Utils__CreateSupportedPartitions(const GraphViewer& graph_viewer, + const std::unordered_set& supported_nodes, + const std::unordered_set& stop_ops, + const std::function& generate_metadef_name, + const std::string& execution_provider_name, + const std::string& execution_provider_type, + const std::unordered_map* node_unit_map, + bool drop_constant_initializers) = 0; + + virtual std::unique_ptr + Utils__MakeComputeCapability(const GraphViewer& graph_viewer, + const std::vector& group, + const std::function& generate_metadef_name, + const std::string& execution_provider_name, + bool 
drop_constant_initializers) = 0; // Model virtual std::unique_ptr Model__construct(ONNX_NAMESPACE::ModelProto&& model_proto, const PathString& model_path, const IOnnxRuntimeOpSchemaRegistryList* local_registries, const logging::Logger& logger) = 0; + virtual std::unique_ptr Model__construct(const std::string& graph_name, bool is_onnx_domain_only, + const logging::Logger& logger) = 0; virtual void Model__operator_delete(Model* p) = 0; virtual Graph& Model__MainGraph(Model* p) = 0; virtual std::unique_ptr Model__ToProto(Model* p) = 0; @@ -974,6 +1036,7 @@ struct ProviderHost { virtual const std::string& GraphViewer__Name(const GraphViewer* p) noexcept = 0; virtual const std::filesystem::path& GraphViewer__ModelPath(const GraphViewer* p) noexcept = 0; + virtual const ConstGraphNodes& GraphViewer__Nodes(const GraphViewer* p) noexcept = 0; virtual const Node* GraphViewer__GetNode(const GraphViewer* p, NodeIndex node_index) = 0; virtual const NodeArg* GraphViewer__GetNodeArg(const GraphViewer* p, const std::string& name) = 0; @@ -989,6 +1052,7 @@ struct ProviderHost { virtual const std::vector& GraphViewer__GetInputs(const GraphViewer* p) noexcept = 0; virtual const std::vector& GraphViewer__GetOutputs(const GraphViewer* p) noexcept = 0; + virtual bool GraphViewer__NodeProducesGraphOutput(const GraphViewer* p, const Node& node) = 0; virtual const std::unordered_set& GraphViewer__GetValueInfo(const GraphViewer* p) noexcept = 0; virtual const InitializedTensorSet& GraphViewer__GetAllInitializedTensors(const GraphViewer* p) = 0; @@ -1007,6 +1071,13 @@ struct ProviderHost { virtual const Node* GraphViewer__GetProducerNode(const GraphViewer* p, const std::string& node_arg_name) const = 0; virtual IOnnxRuntimeOpSchemaCollectionPtr GraphViewer__GetSchemaRegistry(const GraphViewer* p) const = 0; + // ConstGraphNodes + virtual std::unique_ptr ConstGraphNodes__begin(const ConstGraphNodes* p) = 0; + virtual std::unique_ptr ConstGraphNodes__end(const ConstGraphNodes* p) = 0; + virtual std::unique_ptr ConstGraphNodes__cbegin(const ConstGraphNodes* p) = 0; + virtual std::unique_ptr ConstGraphNodes__cend(const ConstGraphNodes* p) = 0; + virtual bool ConstGraphNodes__empty(const ConstGraphNodes* p) noexcept = 0; + // OpKernel virtual const Node& OpKernel__Node(const OpKernel* p) = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index 76b6d8063fd66..e434935343663 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -23,27 +23,50 @@ namespace logging { struct Logger final { bool OutputIsEnabled(Severity severity, DataType data_type) const noexcept { return g_host->logging__Logger__OutputIsEnabled(this, severity, data_type); } + Severity GetSeverity() const noexcept { + return g_host->logging__Logger__GetSeverity(this); + } PROVIDER_DISALLOW_ALL(Logger) }; struct LoggingManager final { static const Logger& DefaultLogger() { return g_host->logging__LoggingManager__DefaultLogger(); } + static bool HasDefaultLogger() { return g_host->logging__LoggingManager__HasDefaultLogger(); } PROVIDER_DISALLOW_ALL(LoggingManager) }; struct Capture final { static std::unique_ptr Create(const Logger& logger, logging::Severity severity, const char* category, - logging::DataType dataType, const CodeLocation& location) { return g_host->logging__Capture__construct(logger, severity, category, dataType, location); } + logging::DataType 
data_type, const CodeLocation& location) { + return g_host->logging__Capture__construct(logger, severity, category, data_type, location); + } static void operator delete(void* p) { g_host->logging__Capture__operator_delete(reinterpret_cast(p)); } std::ostream& Stream() noexcept { return g_host->logging__Capture__Stream(this); } + void ProcessPrintf(const char* format, va_list args) { g_host->logging__Capture__ProcessPrintf(this, format, args); } Capture() = delete; Capture(const Capture&) = delete; void operator=(const Capture&) = delete; }; + +#if defined(_WIN32) +struct EtwRegistrationManager final { + using EtwInternalCallback = EtwRegistrationManager_EtwInternalCallback; + static EtwRegistrationManager& Instance() { return g_host->logging__EtwRegistrationManager__Instance(); } + static bool SupportsETW() { return g_host->logging__EtwRegistrationManager__SupportsETW(); } + Severity MapLevelToSeverity() { return g_host->logging__EtwRegistrationManager__MapLevelToSeverity(this); } + void RegisterInternalCallback(const EtwInternalCallback& callback) { + g_host->logging__EtwRegistrationManager__RegisterInternalCallback(this, callback); + } + void UnregisterInternalCallback(const EtwInternalCallback& callback) { + g_host->logging__EtwRegistrationManager__UnregisterInternalCallback(this, callback); + } +}; +#endif // defined(_WIN32) + } // namespace logging } // namespace onnxruntime @@ -234,6 +257,7 @@ struct TensorProto final { const std::string& raw_data() const { return g_host->TensorProto__raw_data(this); } std::string* mutable_raw_data() { return g_host->TensorProto__mutable_raw_data(this); } + bool has_data_type() const { return g_host->TensorProto__has_data_type(this); } int32_t data_type() const { return g_host->TensorProto__data_type(this); } void set_data_type(int32_t type) { return g_host->TensorProto__set_data_type(this, type); } @@ -286,6 +310,7 @@ struct TensorShapeProto_Dimension final { struct TensorShapeProto_Dimensions final { IteratorHolder begin() const { return g_host->TensorShapeProto_Dimensions__begin(this); } IteratorHolder end() const { return g_host->TensorShapeProto_Dimensions__end(this); } + size_t size() const { return g_host->TensorShapeProto_Dimensions__size(this); } PROVIDER_DISALLOW_ALL(TensorShapeProto_Dimensions) }; @@ -305,6 +330,7 @@ struct TypeProto_Tensor final { bool has_shape() const { return g_host->TypeProto_Tensor__has_shape(this); } const TensorShapeProto& shape() const { return g_host->TypeProto_Tensor__shape(this); } TensorShapeProto* mutable_shape() { return g_host->TypeProto_Tensor__mutable_shape(this); } + bool has_elem_type() const { return g_host->TypeProto_Tensor__has_elem_type(this); } int32_t elem_type() const { return g_host->TypeProto_Tensor__elem_type(this); } void set_elem_type(int32_t value) { g_host->TypeProto_Tensor__set_elem_type(this, value); } @@ -339,6 +365,7 @@ struct TypeProto_Sequence final { struct TypeProto final { static std::unique_ptr Create() { return g_host->TypeProto__construct(); } + bool has_tensor_type() const { return g_host->TypeProto__has_tensor_type(this); } const TypeProto_Tensor& tensor_type() const { return g_host->TypeProto__tensor_type(this); } TypeProto_Tensor* mutable_tensor_type() { return g_host->TypeProto__mutable_tensor_type(this); } @@ -475,6 +502,7 @@ namespace Utils { struct DataTypeUtils final { static const std::string* ToType(const ONNX_NAMESPACE::TypeProto& type_proto) { return g_host->Utils__DataTypeUtils__ToType(type_proto); } + static const std::string* ToType(const std::string& type_str) 
{ return g_host->Utils__DataTypeUtils__ToType(type_str); } PROVIDER_DISALLOW_ALL(DataTypeUtils) }; @@ -770,6 +798,14 @@ struct Function final { PROVIDER_DISALLOW_ALL(Function) }; +struct Node_EdgeEnd final { + const Node& GetNode() const { return g_host->Node_EdgeEnd__GetNode(this); } + int GetSrcArgIndex() const { return g_host->Node_EdgeEnd__GetSrcArgIndex(this); } + int GetDstArgIndex() const { return g_host->Node_EdgeEnd__GetDstArgIndex(this); } + + PROVIDER_DISALLOW_ALL(Node_EdgeEnd) +}; + struct Node final { enum class Type { Primitive = 0, @@ -801,6 +837,12 @@ struct Node final { void AddAttribute(const ::std::string& attr_name, const ONNX_NAMESPACE::GraphProto& value) { g_host->Node__AddAttribute(this, attr_name, value); } + void AddAttribute(const std::string& attr_name, const std::string& value) { + g_host->Node__AddAttribute(this, attr_name, value); + } + void AddAttribute(const std::string& attr_name, int64_t value) { + g_host->Node__AddAttribute(this, attr_name, value); + } size_t GetInputEdgesCount() const noexcept { return g_host->Node__GetInputEdgesCount(this); } size_t GetOutputEdgesCount() const noexcept { return g_host->Node__GetOutputEdgesCount(this); } @@ -832,6 +874,7 @@ struct Node final { } void operator++() { impl_->operator++(); } + const Node_EdgeEnd& operator*() { return impl_->operator*(); } const Node__EdgeIterator* operator->() const { return impl_.get(); } std::unique_ptr impl_; @@ -906,6 +949,13 @@ struct NodeUnit final { QDQGroup, // The NodeUnit contain a QDQ group of nodes, such as "DQ->Sigmoid->Q" }; + NodeUnit() = delete; + NodeUnit(const NodeUnit&) = delete; + void operator=(const NodeUnit& v) = delete; + + // Need delete because of APIs that return unique_ptr + static void operator delete(void* p) { g_host->NodeUnit__operator_delete(reinterpret_cast(p)); } + Type UnitType() const noexcept { return static_cast(g_host->NodeUnit__UnitType(this)); } const std::vector& Inputs() const noexcept { return g_host->NodeUnit__Inputs(this); } @@ -941,6 +991,9 @@ struct Model final { const IOnnxRuntimeOpSchemaRegistryList* local_registries, const logging::Logger& logger) { return g_host->Model__construct(std::move(model_proto), model_path, local_registries, logger); } + static std::unique_ptr Create(const std::string& graph_name, bool is_onnx_domain_only, const logging::Logger& logger) { + return g_host->Model__construct(graph_name, is_onnx_domain_only, logger); + } static void operator delete(void* p) { g_host->Model__operator_delete(reinterpret_cast(p)); } static Status Load(const PathString& file_path, /*out*/ ONNX_NAMESPACE::ModelProto& model_proto) { return g_host->Model__Load(file_path, model_proto); } @@ -1041,6 +1094,7 @@ class GraphViewer final { const std::string& Name() const noexcept { return g_host->GraphViewer__Name(this); } const std::filesystem::path& ModelPath() const noexcept { return g_host->GraphViewer__ModelPath(this); } + const ConstGraphNodes& Nodes() const noexcept { return g_host->GraphViewer__Nodes(this); } const Node* GetNode(NodeIndex node_index) const { return g_host->GraphViewer__GetNode(this, node_index); } const NodeArg* GetNodeArg(const std::string& name) const { return g_host->GraphViewer__GetNodeArg(this, name); } @@ -1058,6 +1112,9 @@ class GraphViewer final { const std::vector& GetInputs() const noexcept { return g_host->GraphViewer__GetInputs(this); } const std::vector& GetOutputs() const noexcept { return g_host->GraphViewer__GetOutputs(this); } + bool NodeProducesGraphOutput(const Node& node) const { + return 
g_host->GraphViewer__NodeProducesGraphOutput(this, node); + } const std::unordered_set& GetValueInfo() const noexcept { return g_host->GraphViewer__GetValueInfo(this); } const InitializedTensorSet& GetAllInitializedTensors() const noexcept { return g_host->GraphViewer__GetAllInitializedTensors(this); } @@ -1085,6 +1142,25 @@ class GraphViewer final { void operator=(const GraphViewer&) = delete; }; +struct ConstGraphNodes final { + IteratorHolder begin() const { + return g_host->ConstGraphNodes__begin(this); + } + IteratorHolder end() const { + return g_host->ConstGraphNodes__end(this); + } + IteratorHolder cbegin() const { + return g_host->ConstGraphNodes__cbegin(this); + } + IteratorHolder cend() const { + return g_host->ConstGraphNodes__cend(this); + } + + bool empty() const noexcept { return g_host->ConstGraphNodes__empty(this); } + + PROVIDER_DISALLOW_ALL(ConstGraphNodes) +}; + struct OpKernelContext final { template const T& RequiredInput(int index) const; diff --git a/onnxruntime/core/providers/webgpu/generator/range.cc b/onnxruntime/core/providers/webgpu/generator/range.cc index ee7c67ec24185..a0b65f08a5b4e 100644 --- a/onnxruntime/core/providers/webgpu/generator/range.cc +++ b/onnxruntime/core/providers/webgpu/generator/range.cc @@ -25,6 +25,11 @@ Status Range::ComputeInternal(ComputeContext& context) const { uint32_t output_size = gsl::narrow(n); RangeProgram program{}; +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif + program.AddOutput({output_tensor, ProgramTensorMetadataDependency::Type}) .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) .AddUniformVariables({ @@ -33,6 +38,10 @@ Status Range::ComputeInternal(ComputeContext& context) const { *reinterpret_cast(&delta), }); +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + return context.RunProgram(program); } diff --git a/onnxruntime/core/providers/webgpu/math/unary_elementwise_ops.cc b/onnxruntime/core/providers/webgpu/math/unary_elementwise_ops.cc index 8dcf63671092b..eaaad206ebaf5 100644 --- a/onnxruntime/core/providers/webgpu/math/unary_elementwise_ops.cc +++ b/onnxruntime/core/providers/webgpu/math/unary_elementwise_ops.cc @@ -194,6 +194,10 @@ class Clip final : public UnaryElementwise { "Clip", std::is_same_v ? ClipF16Impl : ClipImpl, "", ShaderUsage::UseElementTypeAlias} {} +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif Status ConfigureProgram(const ComputeContext& context, UnaryElementwiseProgram& program) const override { const auto* clip_min_tensor = context.Input(1); @@ -214,6 +218,9 @@ class Clip final : public UnaryElementwise { } return Status::OK(); } +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif // uniforms.attr is a f32 value. It is encoded as a float for 2 f16 values. 
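// For example (an illustrative sketch with hypothetical names, not code from this patch):
// the host side can produce that single f32 uniform by packing the two fp16 bit patterns
// into the low and high halves of one 32-bit value, then bit-copying it into a float:
//   uint32_t packed = uint32_t(clip_min_fp16.val) | (uint32_t(clip_max_fp16.val) << 16);
//   float attr;
//   std::memcpy(&attr, &packed, sizeof(attr));  // bit-preserving fp16x2 -> f32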
// bitcast>(uniforms.attr)[0] is clip_min, bitcast>(uniforms.attr)[1] is clip_max diff --git a/onnxruntime/core/providers/webgpu/program.h b/onnxruntime/core/providers/webgpu/program.h index 1562ec158b40a..7bfd9e8800099 100644 --- a/onnxruntime/core/providers/webgpu/program.h +++ b/onnxruntime/core/providers/webgpu/program.h @@ -150,6 +150,11 @@ enum class ProgramTensorMetadataDependency : int { }; std::ostream& operator<<(std::ostream& os, ProgramTensorMetadataDependency); +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif + inline ProgramTensorMetadataDependency operator|(ProgramTensorMetadataDependency a, ProgramTensorMetadataDependency b) { return (ProgramTensorMetadataDependency)((int&)a | (int&)b); } @@ -163,6 +168,10 @@ inline ProgramTensorMetadataDependency& operator&=(ProgramTensorMetadataDependen return (ProgramTensorMetadataDependency&)((int&)a &= (int&)b); } +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + constexpr SafeInt WORKGROUP_SIZE = 64; // data type of variable diff --git a/onnxruntime/core/providers/webgpu/program_manager.cc b/onnxruntime/core/providers/webgpu/program_manager.cc index 109bac34d6503..1fdd312d4f0d8 100644 --- a/onnxruntime/core/providers/webgpu/program_manager.cc +++ b/onnxruntime/core/providers/webgpu/program_manager.cc @@ -147,16 +147,16 @@ Status ProgramManager::Build(const ProgramBase& program, } } - wgpu::ProgrammableStageDescriptor compute_stage{}; - compute_stage.module = shader_module; - compute_stage.entryPoint = "main"; + wgpu::ComputeState compute_state{}; + compute_state.module = shader_module; + compute_state.entryPoint = "main"; if (!constant_entries.empty()) { - compute_stage.constants = constant_entries.data(); - compute_stage.constantCount = constant_entries.size(); + compute_state.constants = constant_entries.data(); + compute_state.constantCount = constant_entries.size(); } wgpu::ComputePipelineDescriptor pipeline_descriptor{}; - pipeline_descriptor.compute = compute_stage; + pipeline_descriptor.compute = compute_state; #ifndef NDEBUG // if debug build pipeline_descriptor.label = program.Name().c_str(); #endif diff --git a/onnxruntime/core/providers/webgpu/shader_variable.h b/onnxruntime/core/providers/webgpu/shader_variable.h index 4c87bc9158890..2aba2a59d157f 100644 --- a/onnxruntime/core/providers/webgpu/shader_variable.h +++ b/onnxruntime/core/providers/webgpu/shader_variable.h @@ -189,6 +189,10 @@ class ShaderVariableHelper : public ShaderIndicesHelper { friend class ShaderHelper; }; +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif inline ShaderUsage operator|(ShaderUsage a, ShaderUsage b) { return (uint32_t)a.usage | (uint32_t)b.usage; @@ -205,6 +209,10 @@ inline ShaderUsage& operator&=(ShaderUsage& a, ShaderUsage b) { return a; } +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + namespace detail { template >> std::string pass_as_string(T&& v) { diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc index 1c9a16bf36e8e..99a645878cd7e 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_context.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc @@ -4,12 +4,20 @@ #include #include +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif + #if !defined(__wasm__) #include "dawn/dawn_proc.h" #if !defined(USE_EXTERNAL_DAWN) #include "dawn/native/DawnNative.h" 
#endif #endif +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif #include "core/common/common.h" #include "core/common/path_string.h" @@ -107,12 +115,12 @@ void WebGpuContext::Initialize(const WebGpuBufferCacheConfig& buffer_cache_confi device_desc.requiredLimits = &required_limits; // TODO: revise temporary error handling - device_desc.SetUncapturedErrorCallback([](const wgpu::Device& /*device*/, wgpu::ErrorType type, const char* message) { - LOGS_DEFAULT(ERROR) << "WebGPU device error(" << int(type) << "): " << message; + device_desc.SetUncapturedErrorCallback([](const wgpu::Device& /*device*/, wgpu::ErrorType type, wgpu::StringView message) { + LOGS_DEFAULT(ERROR) << "WebGPU device error(" << int(type) << "): " << std::string_view{message}; }); // TODO: revise temporary device lost handling - device_desc.SetDeviceLostCallback(wgpu::CallbackMode::AllowSpontaneous, [](const wgpu::Device& /*device*/, wgpu::DeviceLostReason reason, const char* message) { - LOGS_DEFAULT(INFO) << "WebGPU device lost (" << int(reason) << "): " << message; + device_desc.SetDeviceLostCallback(wgpu::CallbackMode::AllowSpontaneous, [](const wgpu::Device& /*device*/, wgpu::DeviceLostReason reason, wgpu::StringView message) { + LOGS_DEFAULT(INFO) << "WebGPU device lost (" << int(reason) << "): " << std::string_view{message}; }); ORT_ENFORCE(wgpu::WaitStatus::Success == instance_.WaitAny(adapter_.RequestDevice( diff --git a/onnxruntime/core/providers/webnn/builders/impl/activation_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/activation_op_builder.cc index 781ddcb896155..585fddfd1ff2c 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/activation_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/activation_op_builder.cc @@ -17,10 +17,6 @@ class ActivationOpBuilder : public BaseOpBuilder { private: Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const override ORT_MUST_USE_RESULT; - - // Operator support related. - bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, - WebnnDeviceType device_type, const logging::Logger& logger) const override; }; // Add operator related. @@ -68,30 +64,6 @@ Status ActivationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, return Status::OK(); } -// Operator support related. 
-bool ActivationOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, - const Node& node, - WebnnDeviceType device_type, - const logging::Logger& logger) const { - const auto& input_defs = node.InputDefs(); - const auto& op_type = node.OpType(); - - std::vector input_shape; - if (!GetShape(*input_defs[0], input_shape, logger)) - return false; - - if (op_type == "Elu" && device_type == WebnnDeviceType::CPU) { - NodeAttrHelper helper(node); - float alpha = helper.Get("alpha", 1.0f); - if (alpha != 1.0f) { - LOGS(logger, VERBOSE) << "WebNN CPU backend only supports Elu's alpha == 1.0"; - return false; - } - } - - return true; -} - void CreateActivationOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) { if (op_registrations.op_builder_map.count(op_type) > 0) return; diff --git a/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc index e14507e8f5aea..c5493f97fdb21 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc @@ -20,8 +20,6 @@ class BinaryOpBuilder : public BaseOpBuilder { const logging::Logger& logger) const override ORT_MUST_USE_RESULT; // Operator support related. - bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, - const WebnnDeviceType device_type, const logging::Logger& logger) const override; bool HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const override; }; @@ -59,33 +57,6 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const return Status::OK(); } -bool BinaryOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, - const Node& node, - const WebnnDeviceType device_type, - const logging::Logger& logger) const { - const auto& input_defs = node.InputDefs(); - const auto& op_type = node.OpType(); - - std::vector input0_shape; - std::vector input1_shape; - if (!GetShape(*input_defs[0], input0_shape, logger) || - !GetShape(*input_defs[1], input1_shape, logger)) { - return false; - } - - // 'prelu' op in WebNN CPU backend restricts the last dimension of input and slope to be same. - // TODO: Remove this workaround once the associated issue is resolved in Chromium: - // https://issues.chromium.org/issues/335517470. - if (op_type == "PRelu" && device_type == WebnnDeviceType::CPU) { - if (input0_shape.back() != input1_shape.back()) { - LOGS(logger, VERBOSE) << "The last dimension of input and slope for PRelu must be same for WebNN CPU backend."; - return false; - } - } - - return true; -} - bool BinaryOpBuilder::HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); diff --git a/onnxruntime/core/providers/webnn/builders/impl/clip_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/clip_op_builder.cc index 374143c886849..a244efdd9b2eb 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/clip_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/clip_op_builder.cc @@ -69,27 +69,7 @@ bool ClipOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, // can ensure initializers are constant. See #19401 for details of how this update was made to the NNAPI EP. 
// GetClipMinMax(graph_viewer, node, minValue, maxValue, logger) float min, max; - if (GetClipMinMax(initializers, node, min, max, logger)) { - // WebNN CPU backend only supports 3 specific ranges: [0.0, infinity], [-1.0, 1.0], [0.0, 6.0]. - // TODO: Remove this workaround once the associated issue is resolved in Chromium: - // https://issues.chromium.org/issues/326156496. - if (device_type == WebnnDeviceType::CPU) { - if ((min == 0.0f && max == std::numeric_limits::infinity()) || - (min == -1.0f && max == 1.0f) || - (min == 0.0f && max == 6.0f)) { - return true; - } else { - LOGS(logger, VERBOSE) << "Clip min and max values (" - << min << ", " - << max << ") are not supported for WebNN CPU backend"; - return false; - } - } - - return true; - } else { - return false; - }; + return GetClipMinMax(initializers, node, min, max, logger); } void CreateClipOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) { diff --git a/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc index 548e718b8774e..e623590e3bc1a 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc @@ -378,22 +378,6 @@ bool ConvOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, return false; } - // WebNN CPU backend (TFLite) only supports default dilations and group. - // https://source.chromium.org/chromium/chromium/src/+/main:services/webnn/tflite/graph_builder_tflite.cc;l=1040 - if (device_type == WebnnDeviceType::CPU && op_type == "ConvTranspose") { - NodeAttrHelper helper(node); - const auto dilations = helper.Get("dilations", std::vector{1, 1}); - const auto group = helper.Get("group", 1); - if (dilations[0] != 1 || (dilations.size() > 1 && dilations[1] != 1)) { - LOGS(logger, VERBOSE) << op_type << " for WebNN CPU backend only supports default dilation 1."; - return false; - } - if (group != 1) { - LOGS(logger, VERBOSE) << op_type << " for WebNN CPU backend only supports default group 1."; - return false; - } - } - return true; } diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index 223eed248800e..26ffeb93ab3b6 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -921,7 +921,7 @@ common::Status InferenceSession::SaveToOrtFormat(const std::filesystem::path& fi ORT_RETURN_IF_ERROR(kernel_type_str_resolver.RegisterGraphNodeOpSchemas(model_->MainGraph())); ORT_RETURN_IF_ERROR(standalone::RegisterCustomOpNodeSchemas(kernel_type_str_resolver, model_->MainGraph())); - for (const auto op_schema : saved_runtime_optimization_produced_node_op_schemas_) { + for (const auto& op_schema : saved_runtime_optimization_produced_node_op_schemas_) { ORT_RETURN_IF_ERROR(kernel_type_str_resolver.RegisterOpSchema(*op_schema)); } diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index af39edae2074d..d7c6dab72fde8 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -37,7 +37,6 @@ #include "core/framework/model_metadef_id_generator.h" #include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h" #include "core/optimizer/qdq_transformer/selectors_actions/shared/utils.h" -#include "core/session/onnxruntime_session_options_config_keys.h" #include 
"core/session/onnxruntime_c_api.h" #include "core/common/string_helper.h" @@ -62,6 +61,10 @@ #include "orttraining/core/framework/distributed_run_context.h" #endif +#ifdef _WIN32 +#include "core/platform/windows/logging/etw_sink.h" +#endif + namespace ONNX_NAMESPACE { // We use these names in the provider API because we don't have the protobuf definitions of the RepeatedField* types using int64s = google::protobuf::RepeatedField; @@ -76,11 +79,18 @@ using FunctionProtos = google::protobuf::RepeatedPtrField; namespace onnxruntime { using IndexedSubGraph_MetaDef = IndexedSubGraph::MetaDef; using IndexedSubGraph_SourceOfSchema = IndexedSubGraph::SourceOfSchema; +using Node_EdgeEnd = Node::EdgeEnd; +#ifdef _WIN32 +namespace logging { +using EtwRegistrationManager_EtwInternalCallback = EtwRegistrationManager::EtwInternalCallback; +} +#endif } // namespace onnxruntime #include "core/common/cpuid_info.h" #include "core/common/logging/logging.h" #include "core/providers/shared_library/provider_interfaces.h" +#include "core/providers/partitioning_utils.h" #include "core/providers/cuda/cuda_provider_factory_creator.h" #include "core/providers/cann/cann_provider_factory_creator.h" @@ -90,6 +100,7 @@ using IndexedSubGraph_SourceOfSchema = IndexedSubGraph::SourceOfSchema; #include "core/providers/openvino/openvino_provider_factory_creator.h" #include "core/providers/tensorrt/tensorrt_provider_factory_creator.h" #include "core/providers/vitisai/vitisai_provider_factory_creator.h" +#include "core/providers/qnn/qnn_provider_factory_creator.h" #include "core/providers/cuda/cuda_provider_factory.h" #include "core/providers/cann/cann_provider_factory.h" @@ -181,6 +192,7 @@ struct Node__EdgeIterator_Impl : Node__EdgeIterator { bool operator!=(const Node__EdgeIterator& p) const override { return v_ != static_cast(&p)->v_; } void operator++() override { v_.operator++(); } + const Node_EdgeEnd& operator*() const override { return v_.operator*(); } const Node& GetNode() const override { return v_->GetNode(); } int GetSrcArgIndex() const override { return v_->GetSrcArgIndex(); } int GetDstArgIndex() const override { return v_->GetDstArgIndex(); } @@ -188,6 +200,18 @@ struct Node__EdgeIterator_Impl : Node__EdgeIterator { Node::EdgeConstIterator v_; }; +struct ConstGraphNodes_Iterator_Impl : ConstGraphNodes_Iterator { + ConstGraphNodes_Iterator_Impl(ConstGraphNodes::ConstNodeIterator&& v) : v_{std::move(v)} {} + + bool operator!=(const ConstGraphNodes_Iterator& other) const override { + return v_ != static_cast(&other)->v_; + } + void operator++() override { v_.operator++(); } + const Node& operator*() override { return *v_; } + + ConstGraphNodes::ConstNodeIterator v_; +}; + #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_MINIMAL_BUILD_CUSTOM_OPS) common::Status LoadDynamicLibraryFromProvider(onnxruntime::PathString library_name) { const auto& platform_env = onnxruntime::Env::Default(); @@ -367,22 +391,58 @@ struct ProviderHostImpl : ProviderHost { // logging::Logger (wrapped) bool logging__Logger__OutputIsEnabled(const logging::Logger* p, logging::Severity severity, logging::DataType data_type) override { return p->OutputIsEnabled(severity, data_type); } + logging::Severity logging__Logger__GetSeverity(const logging::Logger* p) override { + return p->GetSeverity(); + } // logging::LoggingManager (wrapped) const logging::Logger& logging__LoggingManager__DefaultLogger() override { return logging::LoggingManager::DefaultLogger(); } + bool logging__LoggingManager__HasDefaultLogger() override { return 
logging::LoggingManager::HasDefaultLogger(); } // logging::Capture (wrapped) - std::unique_ptr logging__Capture__construct(const logging::Logger& logger, logging::Severity severity, const char* category, logging::DataType dataType, const CodeLocation& location) override { - return std::make_unique(logger, severity, category, dataType, location); + std::unique_ptr logging__Capture__construct(const logging::Logger& logger, + logging::Severity severity, const char* category, + logging::DataType data_type, + const CodeLocation& location) override { + return std::make_unique(logger, severity, category, data_type, location); } void logging__Capture__operator_delete(logging::Capture* p) noexcept override { delete p; } std::ostream& logging__Capture__Stream(logging::Capture* p) noexcept override { return p->Stream(); } + void logging__Capture__ProcessPrintf(logging::Capture* p, const char* format, va_list args) override { + p->ProcessPrintf(format, args); + } + +#if defined(_WIN32) + // logging::EtwRegistrationManager + logging::EtwRegistrationManager& logging__EtwRegistrationManager__Instance() override { + return logging::EtwRegistrationManager::Instance(); + } + bool logging__EtwRegistrationManager__SupportsETW() override { + return logging::EtwRegistrationManager::SupportsETW(); + } + logging::Severity logging__EtwRegistrationManager__MapLevelToSeverity(logging::EtwRegistrationManager* p) override { + return p->MapLevelToSeverity(); + } + void logging__EtwRegistrationManager__RegisterInternalCallback( + logging::EtwRegistrationManager* p, + const logging::EtwRegistrationManager_EtwInternalCallback& callback) override { + p->RegisterInternalCallback(callback); + } + void logging__EtwRegistrationManager__UnregisterInternalCallback( + logging::EtwRegistrationManager* p, + const logging::EtwRegistrationManager_EtwInternalCallback& callback) override { + p->UnregisterInternalCallback(callback); + } +#endif // defined(_WIN32) // Env Env& Env__Default() override { return Env::Default(); } // Utils::DataTypeUtils (wrapped) const std::string* Utils__DataTypeUtils__ToType(const ONNX_NAMESPACE::TypeProto& type_proto) override { return ONNX_NAMESPACE::Utils::DataTypeUtils::ToType(type_proto); } + const std::string* Utils__DataTypeUtils__ToType(const std::string& type_str) override { + return ONNX_NAMESPACE::Utils::DataTypeUtils::ToType(type_str); + } // int64s (wrapped) int int64s__size(const ONNX_NAMESPACE::int64s* p) override { return p->size(); } @@ -424,6 +484,7 @@ struct ProviderHostImpl : ProviderHost { bool TypeProto_Tensor__has_shape(const ONNX_NAMESPACE::TypeProto_Tensor* p) override { return p->has_shape(); } const ONNX_NAMESPACE::TensorShapeProto& TypeProto_Tensor__shape(const ONNX_NAMESPACE::TypeProto_Tensor* p) override { return p->shape(); } ONNX_NAMESPACE::TensorShapeProto* TypeProto_Tensor__mutable_shape(ONNX_NAMESPACE::TypeProto_Tensor* p) override { return p->mutable_shape(); } + bool TypeProto_Tensor__has_elem_type(const ONNX_NAMESPACE::TypeProto_Tensor* p) override { return p->has_elem_type(); } int32_t TypeProto_Tensor__elem_type(const ONNX_NAMESPACE::TypeProto_Tensor* p) override { return p->elem_type(); } void TypeProto_Tensor__set_elem_type(ONNX_NAMESPACE::TypeProto_Tensor* p, int32_t value) override { p->set_elem_type(value); }; @@ -444,6 +505,7 @@ struct ProviderHostImpl : ProviderHost { // TypeProto (wrapped) std::unique_ptr TypeProto__construct() override { return std::make_unique(); } void TypeProto__CopyFrom(ONNX_NAMESPACE::TypeProto* p, const ONNX_NAMESPACE::TypeProto* 
other) override { p->CopyFrom(*other); } + bool TypeProto__has_tensor_type(const ONNX_NAMESPACE::TypeProto* p) override { return p->has_tensor_type(); } const ONNX_NAMESPACE::TypeProto_Tensor& TypeProto__tensor_type(const ONNX_NAMESPACE::TypeProto* p) override { return p->tensor_type(); } ONNX_NAMESPACE::TypeProto_Tensor* TypeProto__mutable_tensor_type(ONNX_NAMESPACE::TypeProto* p) override { return p->mutable_tensor_type(); } int TypeProto__value_case(const ONNX_NAMESPACE::TypeProto* p) override { return p->value_case(); } @@ -572,6 +634,7 @@ struct ProviderHostImpl : ProviderHost { const std::string& TensorProto__raw_data(const ONNX_NAMESPACE::TensorProto* p) override { return p->raw_data(); } std::string* TensorProto__mutable_raw_data(ONNX_NAMESPACE::TensorProto* p) override { return p->mutable_raw_data(); } + bool TensorProto__has_data_type(const ONNX_NAMESPACE::TensorProto* p) override { return p->has_data_type(); } int32_t TensorProto__data_type(const ONNX_NAMESPACE::TensorProto* p) override { return p->data_type(); } void TensorProto__set_data_type(ONNX_NAMESPACE::TensorProto* p, int32_t type) override { p->set_data_type(type); } @@ -610,6 +673,10 @@ struct ProviderHostImpl : ProviderHost { return std::make_unique(p->end()); } + size_t TensorShapeProto_Dimensions__size(const ONNX_NAMESPACE::TensorShapeProto_Dimensions* p) override { + return p->size(); + } + // TensorShapeProto (wrapped) int TensorShapeProto__dim_size(const ONNX_NAMESPACE::TensorShapeProto* p) override { return p->dim_size(); } const ONNX_NAMESPACE::TensorShapeProto_Dimensions& TensorShapeProto__dim(const ONNX_NAMESPACE::TensorShapeProto* p) override { return p->dim(); } @@ -960,6 +1027,12 @@ struct ProviderHostImpl : ProviderHost { void Node__AddAttribute(Node* p, const ::std::string& attr_name, const ONNX_NAMESPACE::GraphProto& value) override { p->AddAttribute(attr_name, value); } + void Node__AddAttribute(Node* p, const ::std::string& attr_name, const std::string& value) override { + p->AddAttribute(attr_name, value); + } + void Node__AddAttribute(Node* p, const ::std::string& attr_name, int64_t value) override { + p->AddAttribute(attr_name, value); + } size_t Node__GetInputEdgesCount(const Node* p) noexcept override { return p->GetInputEdgesCount(); } size_t Node__GetOutputEdgesCount(const Node* p) noexcept override { return p->GetOutputEdgesCount(); } @@ -982,6 +1055,11 @@ struct ProviderHostImpl : ProviderHost { std::unordered_map> Node__GetAttributeNameToSubgraphMap(const Node* p) const override { return p->GetAttributeNameToSubgraphMap(); } int Node__NodeType(const Node* p) const noexcept override { return int(p->NodeType()); } + // Node_EdgeEnd (wrapped). Maps to Node::EdgeEnd struct. 
+ const Node& Node_EdgeEnd__GetNode(const Node_EdgeEnd* p) override { return p->GetNode(); } + int Node_EdgeEnd__GetSrcArgIndex(const Node_EdgeEnd* p) override { return p->GetSrcArgIndex(); } + int Node_EdgeEnd__GetDstArgIndex(const Node_EdgeEnd* p) override { return p->GetDstArgIndex(); } + // NodeArg (wrapped) const std::string& NodeArg__Name(const NodeArg* p) noexcept override { return p->Name(); } const ONNX_NAMESPACE::TensorShapeProto* NodeArg__Shape(const NodeArg* p) override { return p->Shape(); } @@ -1016,7 +1094,8 @@ struct ProviderHostImpl : ProviderHost { void NodeAttributes__insert_or_assign(NodeAttributes* p, const std::string& k, const ONNX_NAMESPACE::AttributeProto& v) override { p->insert_or_assign(k, v); } void NodeAttributes__reserve(NodeAttributes* p, size_t size) override { p->reserve(size); } - // NodeUnit (wrapped) + void NodeUnit__operator_delete(NodeUnit* p) noexcept override { delete p; } + int NodeUnit__UnitType(const NodeUnit* p) noexcept override { return static_cast(p->UnitType()); } const std::vector& NodeUnit__Inputs(const NodeUnit* p) noexcept override { @@ -1064,12 +1143,46 @@ struct ProviderHostImpl : ProviderHost { return QDQ::GetAllNodeUnits(*graph_viewer, logger); } + // Partitioning utils + std::vector> + Utils__CreateSupportedPartitions(const GraphViewer& graph_viewer, + const std::unordered_set& supported_nodes, + const std::unordered_set& stop_ops, + const utils::GenerateMetadefNameFn& generate_metadef_name, + const std::string& execution_provider_name, + const std::string& execution_provider_type, + const std::unordered_map* node_unit_map, + bool drop_constant_initializers) override { + return onnxruntime::utils::CreateSupportedPartitions(graph_viewer, + supported_nodes, + stop_ops, + generate_metadef_name, + execution_provider_name, + execution_provider_type, + node_unit_map, + drop_constant_initializers); + } + + std::unique_ptr + Utils__MakeComputeCapability(const GraphViewer& graph_viewer, + const std::vector& group, + const std::function& generate_metadef_name, + const std::string& execution_provider_name, + bool drop_constant_initializers) override { + return onnxruntime::utils::MakeComputeCapability(graph_viewer, group, generate_metadef_name, + execution_provider_name, drop_constant_initializers); + } + // Model (wrapped) std::unique_ptr Model__construct(ONNX_NAMESPACE::ModelProto&& model_proto, const PathString& model_path, const IOnnxRuntimeOpSchemaRegistryList* local_registries, const logging::Logger& logger) override { return std::make_unique(model_proto, model_path, local_registries, logger); } + std::unique_ptr Model__construct(const std::string& graph_name, bool is_onnx_domain_only, + const logging::Logger& logger) override { + return std::make_unique(graph_name, is_onnx_domain_only, logger); + } void Model__operator_delete(Model* p) override { delete p; } Graph& Model__MainGraph(Model* p) override { return p->MainGraph(); } std::unique_ptr Model__ToProto(Model* p) override { return std::make_unique(p->ToProto()); } @@ -1179,6 +1292,7 @@ struct ProviderHostImpl : ProviderHost { const std::string& GraphViewer__Name(const GraphViewer* p) noexcept override { return p->Name(); } const std::filesystem::path& GraphViewer__ModelPath(const GraphViewer* p) noexcept override { return p->ModelPath(); } + const ConstGraphNodes& GraphViewer__Nodes(const GraphViewer* p) noexcept override { return p->Nodes(); } const Node* GraphViewer__GetNode(const GraphViewer* p, NodeIndex node_index) override { return p->GetNode(node_index); } const NodeArg* 
GraphViewer__GetNodeArg(const GraphViewer* p, const std::string& name) override { return p->GetNodeArg(name); } @@ -1196,6 +1310,9 @@ struct ProviderHostImpl : ProviderHost { const std::vector& GraphViewer__GetInputs(const GraphViewer* p) noexcept override { return p->GetInputs(); } const std::vector& GraphViewer__GetOutputs(const GraphViewer* p) noexcept override { return p->GetOutputs(); } + bool GraphViewer__NodeProducesGraphOutput(const GraphViewer* p, const Node& node) override { + return p->NodeProducesGraphOutput(node); + } const std::unordered_set& GraphViewer__GetValueInfo(const GraphViewer* p) noexcept override { return p->GetValueInfo(); } const InitializedTensorSet& GraphViewer__GetAllInitializedTensors(const GraphViewer* p) override { return p->GetAllInitializedTensors(); } @@ -1224,6 +1341,21 @@ struct ProviderHostImpl : ProviderHost { const Node* GraphViewer__GetProducerNode(const GraphViewer* p, const std::string& node_arg_name) const override { return p->GetProducerNode(node_arg_name); } IOnnxRuntimeOpSchemaCollectionPtr GraphViewer__GetSchemaRegistry(const GraphViewer* p) const override { return p->GetSchemaRegistry(); } + // ConstGraphNodes + std::unique_ptr ConstGraphNodes__begin(const ConstGraphNodes* p) override { + return std::make_unique(p->begin()); + } + std::unique_ptr ConstGraphNodes__end(const ConstGraphNodes* p) override { + return std::make_unique(p->end()); + } + std::unique_ptr ConstGraphNodes__cbegin(const ConstGraphNodes* p) override { + return std::make_unique(p->cbegin()); + } + std::unique_ptr ConstGraphNodes__cend(const ConstGraphNodes* p) override { + return std::make_unique(p->cend()); + } + bool ConstGraphNodes__empty(const ConstGraphNodes* p) noexcept override { return p->empty(); } + // OpKernel (direct) const Node& OpKernel__Node(const OpKernel* p) override { return p->OpKernel::Node(); } @@ -1651,6 +1783,9 @@ static ProviderLibrary s_library_tensorrt(LIBRARY_PREFIX ORT_TSTR("onnxruntime_p ); static ProviderLibrary s_library_migraphx(LIBRARY_PREFIX ORT_TSTR("onnxruntime_providers_migraphx") LIBRARY_EXTENSION); +// QNN EP can be built either as a static library or a shared library. Can safely define s_library_qnn even if static. +static ProviderLibrary s_library_qnn(LIBRARY_PREFIX ORT_TSTR("onnxruntime_providers_qnn") LIBRARY_EXTENSION); + void UnloadSharedProviders() { s_library_dnnl.Unload(); s_library_vitisai.Unload(); @@ -1662,6 +1797,7 @@ void UnloadSharedProviders() { s_library_rocm.Unload(); s_library_shared.Unload(); s_library_migraphx.Unload(); + s_library_qnn.Unload(); } // Used by test code @@ -1832,6 +1968,20 @@ ProviderOptions OrtOpenVINOProviderOptionsToOrtOpenVINOProviderOptionsV2(const O return ov_options_converted_map; } +#if !BUILD_QNN_EP_STATIC_LIB +std::shared_ptr QNNProviderFactoryCreator::Create(const ProviderOptions& provider_options_map, + const SessionOptions* session_options) { + const ConfigOptions* config_options = nullptr; + if (session_options != nullptr) { + config_options = &session_options->config_options; + } + + std::array configs_array = {&provider_options_map, config_options}; + const void* arg = reinterpret_cast(&configs_array); + return s_library_qnn.Get().CreateExecutionProviderFactory(arg); +} +#endif // !BUILD_QNN_EP_STATIC_LIB + std::shared_ptr OpenVINOProviderFactoryCreator::Create( const ProviderOptions* provider_options_map, const SessionOptions* session_options) { // Append session options applicable for EP to EP Provider options. 
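For context on how the `configs_array` packed by `QNNProviderFactoryCreator::Create` above crosses the shared-library boundary: the factory receives a single opaque pointer, so the QNN EP side must unpack it with the same layout, `{&provider_options_map, config_options}`. The following is a minimal sketch of that receiving side, assuming the argument is a two-element `std::array<const void*, 2>` as packed above; the entry-point name and the placeholder types are illustrative only and not part of this patch.

#include <array>

namespace onnxruntime {

struct ProviderOptions;  // placeholder; the real type is a string-to-string options map
struct ConfigOptions;    // placeholder; session configuration options

// Hypothetical unpacking step inside the dynamically loaded QNN EP.
void CreateQnnFactoryFromPackedArg(const void* arg) {
  const auto& configs = *reinterpret_cast<const std::array<const void*, 2>*>(arg);
  const auto* provider_options = static_cast<const ProviderOptions*>(configs[0]);
  const auto* config_options = static_cast<const ConfigOptions*>(configs[1]);  // null when no session options were supplied
  // ... construct the execution provider factory from the two option sets ...
  (void)provider_options;
  (void)config_options;
}

}  // namespace onnxruntime

Note that `config_options` may legitimately be null, mirroring the null check in the packing code above.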
diff --git a/onnxruntime/python/tools/symbolic_shape_infer.py b/onnxruntime/python/tools/symbolic_shape_infer.py index 7a6028dfbe153..b9675d4280e59 100755 --- a/onnxruntime/python/tools/symbolic_shape_infer.py +++ b/onnxruntime/python/tools/symbolic_shape_infer.py @@ -205,6 +205,7 @@ def __init__(self, int_max, auto_merge, guess_output_rank, verbose, prefix=""): "GemmFastGelu": self._infer_GemmFastGelu, "GemmFloat8": self._infer_GemmFloat8, "GroupNorm": self._infer_GroupNorm, + "GroupNormalization": self._infer_GroupNorm, "GroupQueryAttention": self._infer_GroupQueryAttention, "LayerNormalization": self._infer_LayerNormalization, "LongformerAttention": self._infer_LongformerAttention, @@ -474,6 +475,7 @@ def _onnx_infer_single_node(self, node): "PythonOp", "MultiHeadAttention", "GroupNorm", + "GroupNormalization", "GroupQueryAttention", "SparseAttention", "SkipGroupNorm", diff --git a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py index 0384300b99445..7f418af06a4ec 100644 --- a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py +++ b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py @@ -6,6 +6,8 @@ Builds an Ubuntu-based Docker image with TensorRT. """ +from __future__ import annotations + import argparse import os import pty diff --git a/onnxruntime/test/fuzzing/include/OnnxPrediction.h b/onnxruntime/test/fuzzing/include/OnnxPrediction.h index c169aaa16fd6e..c99120dc45479 100644 --- a/onnxruntime/test/fuzzing/include/OnnxPrediction.h +++ b/onnxruntime/test/fuzzing/include/OnnxPrediction.h @@ -20,7 +20,7 @@ #include #include "BetaDistribution.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "onnxruntime_cxx_api.h" #include "testlog.h" diff --git a/onnxruntime/test/fuzzing/ort_libfuzzer/OrtProtoLibfuzzer.cpp b/onnxruntime/test/fuzzing/ort_libfuzzer/OrtProtoLibfuzzer.cpp index 607d9cfd9c755..472122be58e89 100644 --- a/onnxruntime/test/fuzzing/ort_libfuzzer/OrtProtoLibfuzzer.cpp +++ b/onnxruntime/test/fuzzing/ort_libfuzzer/OrtProtoLibfuzzer.cpp @@ -5,7 +5,7 @@ #include "OnnxPrediction.h" #include "onnxruntime_session_options_config_keys.h" #include "src/libfuzzer/libfuzzer_macro.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include diff --git a/onnxruntime/test/mlas/bench/bench_hgemm.cpp b/onnxruntime/test/mlas/bench/bench_hgemm.cpp new file mode 100644 index 0000000000000..1e8b0eb7c34d6 --- /dev/null +++ b/onnxruntime/test/mlas/bench/bench_hgemm.cpp @@ -0,0 +1,86 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "mlas.h" +#include "bench_util.h" +#include "core/util/thread_utils.h" + +#include +#include + +static const std::vector hgemm_bench_arg_names = {"M", "N", "K"}; + +void HGEMM(benchmark::State& state, bool transA, bool transB) { + if (state.range(0) <= 0) throw std::invalid_argument("M must greater than 0!"); + if (state.range(1) <= 0) throw std::invalid_argument("N must greater than 0!"); + if (state.range(2) <= 0) throw std::invalid_argument("K must greater than 0!"); + const size_t M = static_cast(state.range(0)); + const size_t N = static_cast(state.range(1)); + const size_t K = static_cast(state.range(2)); + + auto A = RandomVectorUniform(static_cast(M * K), MLAS_FP16(-1.0f), MLAS_FP16(1.0f)); + auto B = RandomVectorUniform(static_cast(N * K), MLAS_FP16(-1.0f), MLAS_FP16(1.0f)); + std::vector C(static_cast(M * N)); + + MLAS_FP16 alpha = MLAS_FP16(1.0f); + MLAS_FP16 beta = MLAS_FP16(0.0f); + OrtThreadPoolParams tpo; + tpo.thread_pool_size = 8; + tpo.auto_set_affinity = true; + std::unique_ptr tp( + onnxruntime::concurrency::CreateThreadPool(&onnxruntime::Env::Default(), + tpo, onnxruntime::concurrency::ThreadPoolType::INTRA_OP)); + MlasGemm( + transA ? CblasTrans : CblasNoTrans, + transB ? CblasTrans : CblasNoTrans, + static_cast(M), + static_cast(N), + static_cast(K), + A.data(), + transA ? M : K, + B.data(), + transB ? K : N, + C.data(), + N, + alpha.val, + beta.val, + tp.get()); + + for (auto _ : state) { + MlasGemm( + transA ? CblasTrans : CblasNoTrans, + transB ? CblasTrans : CblasNoTrans, + static_cast(M), + static_cast(N), + static_cast(K), + A.data(), + transA ? M : K, + B.data(), + transB ? K : N, + C.data(), + N, + alpha.val, + beta.val, + tp.get()); + } +} + +static void GemmSizeWithOne(benchmark::internal::Benchmark* b) { + b->ArgNames(hgemm_bench_arg_names); + b->ArgsProduct({{1}, {63, 255, 1023}, {63, 255, 1023}}); + b->ArgsProduct({{63, 255, 1023}, {1}, {63, 255, 1023}}); + b->ArgsProduct({{63, 255, 1023}, {63, 255, 1023}, {1}}); +} +BENCHMARK_CAPTURE(HGEMM, GEMV_TransB, false, true)->Apply(GemmSizeWithOne)->UseRealTime(); + +static void GemmSizeProducts(benchmark::internal::Benchmark* b) { + b->ArgNames(hgemm_bench_arg_names); + b->ArgsProduct({{63, 255, 1023}, {63, 255, 1023}, {63, 255, 1023}}); +} +BENCHMARK_CAPTURE(HGEMM, NORMAL_TransB, false, true)->Apply(GemmSizeProducts)->UseRealTime(); + +static void GemmLLMSizeProducts(benchmark::internal::Benchmark* b) { + b->ArgNames(hgemm_bench_arg_names); + b->ArgsProduct({{1, 1024, 2048}, {4096, 11008}, {4096, 11008}}); +} +BENCHMARK_CAPTURE(HGEMM, LLM, false, true)->Apply(GemmLLMSizeProducts)->UseRealTime(); diff --git a/onnxruntime/test/mlas/unittest/test_hgemm_neon.cpp b/onnxruntime/test/mlas/unittest/test_hgemm_neon.cpp new file mode 100644 index 0000000000000..4f3d690b432bf --- /dev/null +++ b/onnxruntime/test/mlas/unittest/test_hgemm_neon.cpp @@ -0,0 +1,393 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + test_hgemm_neon.cpp + +Abstract: + + Tests for MLAS fp16 GEMM on ARM CPU. 
diff --git a/onnxruntime/test/mlas/unittest/test_hgemm_neon.cpp b/onnxruntime/test/mlas/unittest/test_hgemm_neon.cpp new file mode 100644 index 0000000000000..4f3d690b432bf --- /dev/null +++ b/onnxruntime/test/mlas/unittest/test_hgemm_neon.cpp @@ -0,0 +1,393 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + test_hgemm_neon.cpp + +Abstract: + + Tests for MLAS fp16 GEMM on ARM CPU. + +--*/ + +#include <cmath> +#include <random> + +#include "test/mlas/unittest/test_util.h" +#include "core/mlas/lib/mlasi.h" +#include "core/mlas/lib/halfgemm.h" + +#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64) + +class MlasNeonHGemmPackBTest : public MlasTestBase { + private: + std::random_device rd_; + unsigned int seed_; + std::mt19937 gen_; // mersenne_twister_engine seeded with rd() + std::uniform_real_distribution<float> distrib_; + MatrixGuardBuffer<MLAS_FP16> input_, ref_, packed_; + + template <size_t N, size_t K> + MLAS_FORCEINLINE void PackB(const MLAS_FP16* src, MLAS_FP16* dst) { + size_t i = 0; + for (; i + 16 <= N; i += 16) { + for (size_t j = 0; j < K; ++j) { + for (size_t k = 0; k < 16; ++k) { + *dst = src[(i + k) * K + j]; + ++dst; + } + } + } + if (i + 8 <= N) { + for (size_t j = 0; j < K; ++j) { + for (size_t k = 0; k < 8; ++k) { + *dst = src[(i + k) * K + j]; + ++dst; + } + } + i += 8; + } + if (i < N) { + for (size_t j = 0; j < K; ++j) { + for (size_t k = 0; k < N - i; ++k) { + *dst = src[(i + k) * K + j]; + ++dst; + } + dst += 8 - (N - i); + } + } + } + + template <size_t N, size_t K> + MLAS_FORCEINLINE void Check(const MLAS_FP16* packed, const MLAS_FP16* ref) { + size_t n = ((N + 7) & ~7) * K; + for (size_t i = 0; i < n; ++i) { + ASSERT_EQ(packed[i].val, ref[i].val) << " seed " << seed_ << " i " << i; + } + } + + template <size_t N, size_t K> + void TestPackB() { + auto InitializeBuffer = [this](MLAS_FP16* buffer, size_t count) { + for (size_t i = 0; i < count; i++) { + buffer[i] = MLAS_FP16(distrib_(gen_)); + } + }; + + const auto* input = input_.GetFilledBuffer(N * K, InitializeBuffer); + auto* packed = packed_.GetBuffer(K * ((N + 7) & ~7), true); + auto* ref = ref_.GetBuffer(K * ((N + 7) & ~7), true); + hgemm_neon::HPackB_TransposedB_Kernel(input, packed, N, K, K); + PackB<N, K>(input, ref); + Check<N, K>(packed, ref); + } + + public: + MlasNeonHGemmPackBTest() + : seed_(rd_()), gen_(seed_), distrib_(-100.f, 100.f) { + } + + static const char* GetTestSuiteName() { + return "NeonHGemmPackB"; + } + + void ExecuteShort(void) override { + TestPackB<1, 1>(); + TestPackB<1, 15>(); + TestPackB<1, 31>(); + TestPackB<8, 1>(); + TestPackB<8, 16>(); + TestPackB<9, 31>(); + TestPackB<9, 33>(); + TestPackB<15, 33>(); + TestPackB<17, 67>(); + TestPackB<17, 96>(); + TestPackB<265, 263>(); + } +};
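The reference PackB above pins down the packed-B layout the NEON kernel is tested against: B is stored transposed (N x K), and columns are emitted in panels 16 wide, then one panel 8 wide, then a tail panel zero-padded to width 8. An equivalent NumPy sketch of that layout (assumes numpy; it mirrors the reference loops and is not ORT code):

    import numpy as np

    def pack_b_transposed(B):  # B has shape (N, K), as in the C++ reference
        N, K = B.shape
        panels, n = [], 0
        while n + 16 <= N:                        # full 16-wide panels
            panels.append(B[n:n + 16].T.ravel())  # K rows of 16 contiguous values
            n += 16
        if n + 8 <= N:                            # one 8-wide panel
            panels.append(B[n:n + 8].T.ravel())
            n += 8
        if n < N:                                 # tail, zero-padded to width 8
            tail = np.zeros((8, K), dtype=B.dtype)
            tail[:N - n] = B[n:]
            panels.append(tail.T.ravel())
        return np.concatenate(panels)

    B = np.arange(9 * 3, dtype=np.float16).reshape(9, 3)  # N=9, K=3
    print(pack_b_transposed(B).size)  # 48 == K * ((N + 7) & ~7)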
+ +class MlasNeonHGemmTransposedBTest : public MlasTestBase { + private: + std::random_device rd_; + unsigned int seed_; + std::mt19937 gen_; // mersenne_twister_engine seeded with rd() + std::uniform_real_distribution<float> distrib_; + MatrixGuardBuffer<MLAS_FP16> A_, B_, ref_, C_; + + template <size_t M, size_t N, size_t K> + MLAS_FORCEINLINE void HGemm(const MLAS_FP16* A, const MLAS_FP16* B, MLAS_FP16* C, MLAS_FP16 alpha, MLAS_FP16 beta) { + float alphaf = alpha.ToFloat(); + float betaf = beta.ToFloat(); + for (size_t m = 0; m < M; ++m) { + for (size_t n = 0; n < N; ++n) { + float accu = 0.0f; + for (size_t k = 0; k < K; ++k) { + accu += (A[m * K + k].ToFloat()) * (B[n * K + k].ToFloat()); + } + C[m * N + n] = MLAS_FP16(accu * alphaf + C[m * N + n].ToFloat() * betaf); + } + } + } + + MLAS_FORCEINLINE + bool FloatEqual(MLAS_FP16 v0, MLAS_FP16 v1, float rtol, float atol) { + float f0 = v0.ToFloat(), f1 = v1.ToFloat(); + return std::abs(f0 - f1) <= std::abs(f1 * rtol) + atol; + } + + template <size_t M, size_t N, size_t K> + MLAS_FORCEINLINE void Check(const MLAS_FP16* C, const MLAS_FP16* ref) { + size_t n = M * N; + for (size_t i = 0; i < n; ++i) { + ASSERT_TRUE(FloatEqual(C[i], ref[i], 0.02f, 0.055f)) + << " seed " << seed_ << " i " << i + << " M " << M << " N " << N << " K " << K + << " v0 " << C[i] << " v1 " << ref[i]; + } + } + + template <size_t M, size_t N, size_t K> + void TestHGemm(MLAS_FP16 alpha, MLAS_FP16 beta) { + auto InitializeBuffer = [this](MLAS_FP16* buffer, size_t count) { + for (size_t i = 0; i < count; i++) { + buffer[i] = MLAS_FP16(distrib_(gen_)); + } + }; + + const auto* A = A_.GetFilledBuffer(M * K, InitializeBuffer); + const auto* B = B_.GetFilledBuffer(K * N, InitializeBuffer); + auto* C = C_.GetBuffer(M * N, true); + auto* ref = ref_.GetBuffer(M * N, true); + hgemm_neon::HGemm_TransposedB_Kernel(A, B, C, M, N, K, K, K, N, alpha.val, beta.val); + HGemm<M, N, K>(A, B, ref, alpha, beta); + Check<M, N, K>(C, ref); + } + + public: + MlasNeonHGemmTransposedBTest() + : seed_(1928375), gen_(seed_), distrib_(-1.f, 1.f) { + } + + static const char* GetTestSuiteName() { + return "NeonHGemmTransposedB"; + } + + void ExecuteShort(void) override { + TestHGemm<2, 1, 1>(MLAS_FP16(1.0f), MLAS_FP16(0.0f)); + TestHGemm<1, 1, 1>(MLAS_FP16(0.5f), MLAS_FP16(1.0f)); + TestHGemm<2, 1, 1>(MLAS_FP16(1.5f), MLAS_FP16(0.5f)); + TestHGemm<1, 15, 17>(MLAS_FP16(1.0f), MLAS_FP16(0.0f)); + TestHGemm<2, 17, 15>(MLAS_FP16(0.5f), MLAS_FP16(1.0f)); + TestHGemm<1, 17, 15>(MLAS_FP16(1.5f), MLAS_FP16(0.5f)); + TestHGemm<1, 33, 31>(MLAS_FP16(1.0f), MLAS_FP16(0.0f)); + TestHGemm<2, 31, 32>(MLAS_FP16(0.5f), MLAS_FP16(1.0f)); + TestHGemm<1, 32, 33>(MLAS_FP16(1.5f), MLAS_FP16(0.5f)); + TestHGemm<1, 78, 263>(MLAS_FP16(0.5f), MLAS_FP16(0.0f)); + TestHGemm<2, 267, 79>(MLAS_FP16(1.5f), MLAS_FP16(1.0f)); + } +};
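FloatEqual above mixes a relative and an absolute tolerance, |v0 - v1| <= |v1| * rtol + atol, with rtol=0.02 and atol=0.055 sized for fp16 rounding error: the relative term governs large magnitudes and the absolute term takes over near zero. The same predicate checked in Python:

    def float_equal(v0, v1, rtol=0.02, atol=0.055):
        # Relative term scaled by the reference value, plus a flat
        # absolute term so values near zero still compare sanely.
        return abs(v0 - v1) <= abs(v1) * rtol + atol

    print(float_equal(10.0, 10.19))  # True: 0.19 <= 10.19*0.02 + 0.055
    print(float_equal(0.0, 0.05))    # True: the absolute term dominates
    print(float_equal(10.0, 10.30))  # False: 0.30 > 10.30*0.02 + 0.055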
+ +class MlasNeonHGemmTransposedPackedBTest : public MlasTestBase { + private: + std::random_device rd_; + unsigned int seed_; + std::mt19937 gen_; // mersenne_twister_engine seeded with rd() + std::uniform_real_distribution<float> distrib_; + MatrixGuardBuffer<MLAS_FP16> A_, B_, ref_, C_; + + template <size_t M, size_t N, size_t K> + MLAS_FORCEINLINE void HGemm(const MLAS_FP16* A, const MLAS_FP16* B, MLAS_FP16* C, MLAS_FP16 alpha, MLAS_FP16 beta) { + float alphaf = alpha.ToFloat(); + float betaf = beta.ToFloat(); + size_t n = 0; + for (; n + 16 <= N; n += 16) { + for (size_t i = 0; i < 16; ++i) { + for (size_t m = 0; m < M; ++m) { + float accu = 0.0f; + for (size_t k = 0; k < K; ++k) { + accu += (A[m * K + k].ToFloat()) * (B[n * K + k * 16 + i].ToFloat()); + } + C[m * N + n + i] = MLAS_FP16(accu * alphaf + C[m * N + n + i].ToFloat() * betaf); + } + } + } + if (n + 8 <= N) { + for (size_t i = 0; i < 8; ++i) { + for (size_t m = 0; m < M; ++m) { + float accu = 0.0f; + for (size_t k = 0; k < K; ++k) { + accu += (A[m * K + k].ToFloat()) * (B[n * K + k * 8 + i].ToFloat()); + } + C[m * N + n + i] = MLAS_FP16(accu * alphaf + C[m * N + n + i].ToFloat() * betaf); + } + } + n += 8; + } + if (n < N) { + for (size_t i = 0; i < N - n; ++i) { + for (size_t m = 0; m < M; ++m) { + float accu = 0.0f; + for (size_t k = 0; k < K; ++k) { + accu += (A[m * K + k].ToFloat()) * (B[n * K + k * 8 + i].ToFloat()); + } + C[m * N + n + i] = MLAS_FP16(accu * alphaf + C[m * N + n + i].ToFloat() * betaf); + } + } + } + } + + MLAS_FORCEINLINE + bool FloatEqual(MLAS_FP16 v0, MLAS_FP16 v1, float rtol, float atol) { + float f0 = v0.ToFloat(), f1 = v1.ToFloat(); + return std::abs(f0 - f1) <= std::abs(f1 * rtol) + atol; + } + + template <size_t M, size_t N, size_t K> + MLAS_FORCEINLINE void Check(const MLAS_FP16* C, const MLAS_FP16* ref) { + size_t n = M * N; + for (size_t i = 0; i < n; ++i) { + ASSERT_TRUE(FloatEqual(C[i], ref[i], 0.02f, 0.055f)) + << " seed " << seed_ << " i " << i + << " M " << M << " K " << K << " N " << N + << " v0 " << C[i] << " v1 " << ref[i]; + } + } + + template <size_t M, size_t N, size_t K> + void TestHGemm(MLAS_FP16 alpha, MLAS_FP16 beta) { + auto InitializeBuffer = [this](MLAS_FP16* buffer, size_t count) { + for (size_t i = 0; i < count; i++) { + buffer[i] = MLAS_FP16(distrib_(gen_)); + } + }; + + const auto* A = A_.GetFilledBuffer(M * K, InitializeBuffer); + const auto* B = B_.GetFilledBuffer(K * ((N + 7) & ~7), InitializeBuffer); + auto* C = C_.GetBuffer(M * N, true); + auto* ref = ref_.GetBuffer(M * N, true); + hgemm_neon::HGemm_TransposedPackedB_Kernel(A, B, C, M, N, K, K, N, alpha.val, beta.val); + HGemm<M, N, K>(A, B, ref, alpha, beta); + Check<M, N, K>(C, ref); + } + + public: + MlasNeonHGemmTransposedPackedBTest() + : seed_(1928372), gen_(seed_), distrib_(-1.f, 1.f) { + } + + static const char* GetTestSuiteName() { + return "NeonHGemmTransposedPackedB"; + } + + void ExecuteShort(void) override { + TestHGemm<2, 1, 1>(MLAS_FP16(1.0f), MLAS_FP16(0.0f)); + TestHGemm<1, 1, 1>(MLAS_FP16(0.5f), MLAS_FP16(1.0f)); + TestHGemm<2, 1, 1>(MLAS_FP16(1.5f), MLAS_FP16(0.5f)); + TestHGemm<1, 15, 17>(MLAS_FP16(1.0f), MLAS_FP16(0.0f)); + TestHGemm<2, 17, 15>(MLAS_FP16(0.5f), MLAS_FP16(1.0f)); + TestHGemm<1, 17, 15>(MLAS_FP16(1.5f), MLAS_FP16(0.5f)); + TestHGemm<1, 33, 31>(MLAS_FP16(1.0f), MLAS_FP16(0.0f)); + TestHGemm<2, 31, 32>(MLAS_FP16(0.5f), MLAS_FP16(1.0f)); + TestHGemm<1, 32, 33>(MLAS_FP16(1.5f), MLAS_FP16(0.5f)); + TestHGemm<1, 78, 263>(MLAS_FP16(0.5f), MLAS_FP16(0.0f)); + TestHGemm<2, 267, 79>(MLAS_FP16(1.5f), MLAS_FP16(1.0f)); + } +};
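The packed-B reference above reads elements through the panel layout directly: inside a 16-wide panel that starts at column n, element (row k, column n+i) sits at offset n*K + k*16 + i. A small self-contained NumPy check of that indexing for one full panel (illustrative only):

    import numpy as np

    N, K = 16, 5
    B = np.arange(N * K, dtype=np.float16).reshape(N, K)  # transposed storage
    packed = B.T.ravel()  # one 16-wide panel: K rows of 16 contiguous columns

    n, k, i = 0, 3, 7
    # Same arithmetic as the reference's B[n * K + k * 16 + i] access.
    assert packed[n * K + k * 16 + i] == B[n + i, k]
    print("packed panel indexing recovers the transposed source")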
+ +class MlasNeonHGemmTest : public MlasTestBase { + private: + std::random_device rd_; + unsigned int seed_; + std::mt19937 gen_; // mersenne_twister_engine seeded with rd() + std::uniform_real_distribution<float> distrib_; + MatrixGuardBuffer<MLAS_FP16> A_, B_, ref_, C_; + + template <size_t M, size_t N, size_t K> + MLAS_FORCEINLINE void HGemm(const MLAS_FP16* A, const MLAS_FP16* B, MLAS_FP16* C, MLAS_FP16 alpha, MLAS_FP16 beta) { + float alphaf = alpha.ToFloat(); + float betaf = beta.ToFloat(); + for (size_t i = 0; i < M; ++i) { + for (size_t j = 0; j < N; ++j) { + float accu = 0.0f; + for (size_t k = 0; k < K; ++k) { + accu += (A[i * K + k].ToFloat()) * (B[j * K + k].ToFloat()); + } + C[i * N + j] = MLAS_FP16(accu * alphaf + C[i * N + j].ToFloat() * betaf); + } + } + } + + MLAS_FORCEINLINE + bool FloatEqual(MLAS_FP16 v0, MLAS_FP16 v1, float rtol, float atol) { + float f0 = v0.ToFloat(), f1 = v1.ToFloat(); + return std::abs(f0 - f1) <= std::abs(f1 * rtol) + atol; + } + + template <size_t M, size_t N, size_t K> + MLAS_FORCEINLINE void Check(const MLAS_FP16* C, const MLAS_FP16* ref) { + for (size_t i = 0; i < M; ++i) { + for (size_t j = 0; j < N; ++j) { + ASSERT_TRUE(FloatEqual(C[i * N + j], ref[i * N + j], 0.02f, 0.055f)) + << " seed " << seed_ << " i " << i << " j " << j + << " M " << M << " K " << K << " N " << N + << " v0 " << C[i * N + j] << " v1 " << ref[i * N + j]; + } + } + } + + template <size_t M, size_t N, size_t K> + void TestHGemm(MLAS_FP16 alpha, MLAS_FP16 beta) { + auto InitializeBuffer = [this](MLAS_FP16* buffer, size_t count) { + for (size_t i = 0; i < count; i++) { + buffer[i] = MLAS_FP16(distrib_(gen_)); + } + }; + + const auto* A = A_.GetFilledBuffer(M * K, InitializeBuffer); + const auto* B = B_.GetFilledBuffer(K * N, InitializeBuffer); + auto* C = C_.GetBuffer(M * N, true); + auto* ref = ref_.GetBuffer(M * N, true); + MlasGemm(CblasNoTrans, CblasTrans, M, N, K, A, K, B, K, C, N, alpha.val, beta.val, nullptr); + HGemm<M, N, K>(A, B, ref, alpha, beta); + Check<M, N, K>(C, ref); + } + + public: + MlasNeonHGemmTest() + : seed_(192837), gen_(seed_), distrib_(-0.25f, 0.25f) { + } + + static const char* GetTestSuiteName() { + return "NeonHGemm"; + } + + void ExecuteShort(void) override { + TestHGemm<2, 1, 1>(MLAS_FP16(1.0f), MLAS_FP16(0.0f)); + TestHGemm<1, 128, 512>(MLAS_FP16(0.5f), MLAS_FP16(1.0f)); + TestHGemm<2, 128, 513>(MLAS_FP16(1.5f), MLAS_FP16(0.5f)); + TestHGemm<1, 128, 511>(MLAS_FP16(1.0f), MLAS_FP16(0.0f)); + TestHGemm<2, 129, 512>(MLAS_FP16(0.5f), MLAS_FP16(1.0f)); + TestHGemm<1, 127, 512>(MLAS_FP16(1.5f), MLAS_FP16(0.5f)); + TestHGemm<1, 513, 1023>(MLAS_FP16(0.5f), MLAS_FP16(1.0f)); + TestHGemm<2, 511, 1025>(MLAS_FP16(1.5f), MLAS_FP16(0.5f)); + TestHGemm<127, 513, 1023>(MLAS_FP16(1.0f), MLAS_FP16(0.0f)); + TestHGemm<129, 511, 1025>(MLAS_FP16(0.5f), MLAS_FP16(1.0f)); + } +}; + +static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_execute) { + size_t count = 0; + if (is_short_execute) { + count += MlasDirectShortExecuteTests<MlasNeonHGemmPackBTest>::RegisterShortExecute(); + count += MlasDirectShortExecuteTests<MlasNeonHGemmTransposedBTest>::RegisterShortExecute(); + count += MlasDirectShortExecuteTests<MlasNeonHGemmTransposedPackedBTest>::RegisterShortExecute(); + count += MlasDirectShortExecuteTests<MlasNeonHGemmTest>::RegisterShortExecute(); + } + return count; +}); + +#endif // defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64) diff --git a/onnxruntime/test/onnx/TestCase.cc b/onnxruntime/test/onnx/TestCase.cc index b9b69fdc74b4d..d44f098db6c4c 100644 --- a/onnxruntime/test/onnx/TestCase.cc +++ b/onnxruntime/test/onnx/TestCase.cc @@ -961,6 +961,7 @@ std::unique_ptr<std::set<BrokenTest>> GetBrokenTests(const std::string& provider {"reduce_prod_empty_set", "unknown version", {}}, {"reduce_sum_empty_set", "unknown version", {}}, {"reduce_sum_square_empty_set_expanded", "unknown version", {}}, + {"averagepool_3d_dilations_large_count_include_pad_is_1_ceil_mode_is_True", "TODO(titaiwang): enable this in the next ONNX release."}, #ifdef ENABLE_TRAINING_CORE {"adagrad", "not a registered function/op", {}}, // Op not registered. {"adagrad_multiple", "not a registered function/op", {}}, // Op not registered.
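All four test classes above share the same oracle recipe: accumulate A times B-transposed in fp32, apply alpha and beta, and round to fp16 once at the end, which is what makes the 2% / 0.055 tolerance workable. The equivalent computation in NumPy (assumes numpy; a sketch of the test oracle, not the MLAS kernel):

    import numpy as np

    def ref_hgemm(A, B, C, alpha, beta):
        # A: (M, K), B: (N, K) transposed storage, C: (M, N), all fp16.
        acc = A.astype(np.float32) @ B.astype(np.float32).T
        return (acc * alpha + C.astype(np.float32) * beta).astype(np.float16)

    rng = np.random.default_rng(0)
    A = rng.uniform(-0.25, 0.25, (4, 8)).astype(np.float16)
    B = rng.uniform(-0.25, 0.25, (3, 8)).astype(np.float16)
    C = np.zeros((4, 3), dtype=np.float16)
    print(ref_hgemm(A, B, C, alpha=1.0, beta=0.0))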
diff --git a/onnxruntime/test/providers/coreml/coreml_basic_test.cc b/onnxruntime/test/providers/coreml/coreml_basic_test.cc index 302ad57fb88c5..a9aa78b7a3229 100644 --- a/onnxruntime/test/providers/coreml/coreml_basic_test.cc +++ b/onnxruntime/test/providers/coreml/coreml_basic_test.cc @@ -15,7 +15,7 @@ #include "test/util/include/inference_session_wrapper.h" #include "test/util/include/test_environment.h" #include "test/util/include/test_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #if !defined(ORT_MINIMAL_BUILD) // if this is a full build we need the provider test utils diff --git a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc index a340f975ec91a..24a8c8491b632 100644 --- a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc @@ -1030,6 +1030,31 @@ TEST(PoolTest, AveragePool_19_dilation_2d) { kTensorrtExecutionProvider, kAclExecutionProvider, kOpenVINOExecutionProvider}); } +TEST(PoolTest, AveragePool_19_ceil_count_include_pad_1d) { + // TODO: Unskip when fixed #41968513 + if (DefaultDmlExecutionProvider().get() != nullptr) { + GTEST_SKIP() << "Skipping because of the following error: MLOperatorAuthorImpl.cpp(2100): The parameter is incorrect."; + } + + OpTester test("AveragePool", 19); + + test.AddAttribute("auto_pad", ""); + test.AddAttribute("strides", std::vector<int64_t>{3}); + test.AddAttribute("pads", vector<int64_t>{3, 3}); + test.AddAttribute("kernel_shape", vector<int64_t>{7}); + test.AddAttribute("ceil_mode", (int64_t)1); + test.AddAttribute("count_include_pad", (int64_t)1); + + std::vector<float> x_vals = {2.0903f, 4.6493f, 1.6320f, -3.2051f, 4.6975f, 4.7296f, 3.3653f, -1.5815f, -2.3832f, 0.9628f, -1.5899f, -2.6820f, 5.7529f, 7.7346f, -0.8910f, -2.0151f, 0.1313f, -0.5374f}; + std::vector<int64_t> x_dims = {1, 2, 9}; + std::vector<int64_t> expected_dims = {1, 2, 4}; + std::vector<float> expected_vals = {0.73807144f, 2.5655572f, 0.8032287f, -0.09990001f, 0.34911433f, 1.0389f, 1.4536142f, -0.40353334f}; + + test.AddInput<float>("X", x_dims, x_vals); + test.AddOutput<float>("Y", expected_dims, expected_vals); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kAclExecutionProvider, kOpenVINOExecutionProvider}); +} + TEST(PoolTest, GlobalAveragePool) { OpTester test("GlobalAveragePool"); diff --git a/onnxruntime/test/providers/qnn/argmaxmin_op_test.cc b/onnxruntime/test/providers/qnn/argmaxmin_op_test.cc index c514cf16b2f3c..da6eda1317778 100644 --- a/onnxruntime/test/providers/qnn/argmaxmin_op_test.cc +++ b/onnxruntime/test/providers/qnn/argmaxmin_op_test.cc @@ -9,7 +9,7 @@ #include "core/graph/node_attr_utils.h" #include "test/providers/qnn/qnn_test_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" namespace onnxruntime { diff --git a/onnxruntime/test/providers/qnn/average_pool_test.cc b/onnxruntime/test/providers/qnn/average_pool_test.cc index 1a0f9bfcbae97..f897a08da6b2e 100644 --- a/onnxruntime/test/providers/qnn/average_pool_test.cc +++ b/onnxruntime/test/providers/qnn/average_pool_test.cc @@ -11,7 +11,7 @@ #include "test/optimizer/qdq_test_utils.h" #include "test/providers/qnn/qnn_test_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" diff --git a/onnxruntime/test/providers/qnn/cast_test.cc b/onnxruntime/test/providers/qnn/cast_test.cc index 9b83dd281a56d..e2e4b0d714e54 100644 --- a/onnxruntime/test/providers/qnn/cast_test.cc +++ b/onnxruntime/test/providers/qnn/cast_test.cc
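The expected values in the new AveragePool test can be reproduced by hand. With kernel 7, stride 3, pads (3, 3), ceil_mode=1 and count_include_pad=1 on a length-9 input, the output length is ceil((9 + 6 - 7) / 3) + 1 = 4; padding positions count toward the divisor, but positions a ceil-mode window slides past the padded extent do not, which is what the last output of each channel exercises (a 7-tap window divided by 6). A NumPy sketch that reproduces the first channel (illustrative, not the ORT kernel):

    import numpy as np

    x = np.array([2.0903, 4.6493, 1.6320, -3.2051, 4.6975,
                  4.7296, 3.3653, -1.5815, -2.3832])
    kernel, stride, pad = 7, 3, 3

    L = x.size
    out_len = -(-(L + 2 * pad - kernel) // stride) + 1  # ceil division -> 4
    for m in range(out_len):
        start = m * stride - pad
        # Drop positions beyond the padded extent [-pad, L + pad - 1];
        # in-range padding contributes 0 but still counts in the divisor.
        pos = [p for p in range(start, start + kernel) if p < L + pad]
        total = sum(x[p] for p in pos if 0 <= p < L)
        print(round(total / len(pos), 6))
    # prints 0.738071, 2.565557, 0.803229, -0.0999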
@@ -9,7 +9,7 @@ #include "test/optimizer/qdq_test_utils.h" #include "test/providers/qnn/qnn_test_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" diff --git a/onnxruntime/test/providers/qnn/clip_op_test.cc b/onnxruntime/test/providers/qnn/clip_op_test.cc index cfa77a46210b3..21bd6fcc98d74 100644 --- a/onnxruntime/test/providers/qnn/clip_op_test.cc +++ b/onnxruntime/test/providers/qnn/clip_op_test.cc @@ -8,7 +8,7 @@ #include "test/providers/qnn/qnn_test_utils.h" #include "core/graph/node_attr_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" namespace onnxruntime { diff --git a/onnxruntime/test/providers/qnn/flatten_op_test.cc b/onnxruntime/test/providers/qnn/flatten_op_test.cc index 637d3257ddea7..b33f8f9c00fc4 100644 --- a/onnxruntime/test/providers/qnn/flatten_op_test.cc +++ b/onnxruntime/test/providers/qnn/flatten_op_test.cc @@ -8,7 +8,7 @@ #include "test/providers/qnn/qnn_test_utils.h" #include "core/graph/node_attr_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" namespace onnxruntime { diff --git a/onnxruntime/test/providers/qnn/gather_elems_op_test.cc b/onnxruntime/test/providers/qnn/gather_elems_op_test.cc index 81c08873064c8..85dc792666827 100644 --- a/onnxruntime/test/providers/qnn/gather_elems_op_test.cc +++ b/onnxruntime/test/providers/qnn/gather_elems_op_test.cc @@ -11,7 +11,7 @@ #include "test/optimizer/qdq_test_utils.h" #include "test/providers/qnn/qnn_test_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" diff --git a/onnxruntime/test/providers/qnn/gemm_op_test.cc b/onnxruntime/test/providers/qnn/gemm_op_test.cc index da0c7f2c36854..0c1146ba22360 100644 --- a/onnxruntime/test/providers/qnn/gemm_op_test.cc +++ b/onnxruntime/test/providers/qnn/gemm_op_test.cc @@ -9,7 +9,7 @@ #include "test/providers/qnn/qnn_test_utils.h" #include "core/graph/node_attr_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" namespace onnxruntime { diff --git a/onnxruntime/test/providers/qnn/logical_comp_ops_test.cc b/onnxruntime/test/providers/qnn/logical_comp_ops_test.cc index 59105136781f4..522b781379119 100644 --- a/onnxruntime/test/providers/qnn/logical_comp_ops_test.cc +++ b/onnxruntime/test/providers/qnn/logical_comp_ops_test.cc @@ -9,7 +9,7 @@ #include "test/optimizer/qdq_test_utils.h" #include "test/providers/qnn/qnn_test_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" diff --git a/onnxruntime/test/providers/qnn/lrn_op_test.cc b/onnxruntime/test/providers/qnn/lrn_op_test.cc index a99cba66bf167..4b26ed0da93c7 100644 --- a/onnxruntime/test/providers/qnn/lrn_op_test.cc +++ b/onnxruntime/test/providers/qnn/lrn_op_test.cc @@ -9,7 +9,7 @@ #include "test/optimizer/qdq_test_utils.h" #include "test/providers/qnn/qnn_test_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" diff --git a/onnxruntime/test/providers/qnn/matmul_test.cpp b/onnxruntime/test/providers/qnn/matmul_test.cpp index f3f584f24a102..dec9369b81748 100644 --- a/onnxruntime/test/providers/qnn/matmul_test.cpp +++ b/onnxruntime/test/providers/qnn/matmul_test.cpp @@ -8,7 +8,7 @@ #include "test/providers/qnn/qnn_test_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" diff --git a/onnxruntime/test/providers/qnn/max_min_op_test.cc 
b/onnxruntime/test/providers/qnn/max_min_op_test.cc index 3deff121f3c72..9a45d11b7e34f 100644 --- a/onnxruntime/test/providers/qnn/max_min_op_test.cc +++ b/onnxruntime/test/providers/qnn/max_min_op_test.cc @@ -7,7 +7,7 @@ #include "test/providers/qnn/qnn_test_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" namespace onnxruntime { diff --git a/onnxruntime/test/providers/qnn/pad_op_test.cpp b/onnxruntime/test/providers/qnn/pad_op_test.cpp index a6b8664c6c0c9..4ce6db7facc69 100644 --- a/onnxruntime/test/providers/qnn/pad_op_test.cpp +++ b/onnxruntime/test/providers/qnn/pad_op_test.cpp @@ -10,7 +10,7 @@ #include "test/optimizer/qdq_test_utils.h" #include "test/providers/qnn/qnn_test_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" diff --git a/onnxruntime/test/providers/qnn/pool_op_test.cpp b/onnxruntime/test/providers/qnn/pool_op_test.cpp index 5dd3a6aaa3620..f0ca3557191c7 100644 --- a/onnxruntime/test/providers/qnn/pool_op_test.cpp +++ b/onnxruntime/test/providers/qnn/pool_op_test.cpp @@ -10,7 +10,7 @@ #include "test/optimizer/qdq_test_utils.h" #include "test/providers/qnn/qnn_test_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" diff --git a/onnxruntime/test/providers/qnn/qnn_basic_test.cc b/onnxruntime/test/providers/qnn/qnn_basic_test.cc index 92ec4ba3b0d28..a6fb66472844a 100644 --- a/onnxruntime/test/providers/qnn/qnn_basic_test.cc +++ b/onnxruntime/test/providers/qnn/qnn_basic_test.cc @@ -6,7 +6,9 @@ #include #include "core/providers/cpu/cpu_provider_factory.h" // For OrtSessionOptionsAppendExecutionProvider_CPU -#include "core/providers/qnn/qnn_allocator.h" +#if BUILD_QNN_EP_STATIC_LIB +#include "core/providers/qnn/qnn_allocator.h" // Used by QnnHTPBackendTests.UseHtpSharedMemoryAllocatorForInputs +#endif #include "core/session/inference_session.h" #include "core/session/onnxruntime_cxx_api.h" #include "core/session/onnxruntime_session_options_config_keys.h" @@ -1099,6 +1101,9 @@ TEST_F(QnnHTPBackendTests, EPOffloadsGraphIOQuantDequant) { } } +// Only compile this test when QNN EP is built as a static library. When QNN EP is a shared library, +// we cannot include internal QNN EP headers that use the provider-bridge API. 
+#if BUILD_QNN_EP_STATIC_LIB TEST_F(QnnHTPBackendTests, UseHtpSharedMemoryAllocatorForInputs) { ProviderOptions provider_options; #if defined(_WIN32) @@ -1145,6 +1150,7 @@ TEST_F(QnnHTPBackendTests, UseHtpSharedMemoryAllocatorForInputs) { ExpectedEPNodeAssignment::All, 0.008f); } +#endif // BUILD_QNN_EP_STATIC_LIB #endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) #endif // !defined(ORT_MINIMAL_BUILD) diff --git a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc index a3f0ed55b83f2..38fde332ca992 100644 --- a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc +++ b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc @@ -7,7 +7,6 @@ #include "core/session/onnxruntime_cxx_api.h" #include "core/session/onnxruntime_session_options_config_keys.h" #include "core/session/inference_session.h" -#include "core/providers/shared/utils/utils.h" #include "test/providers/qnn/qnn_test_utils.h" @@ -25,6 +24,24 @@ namespace test { #if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) +static int64_t GetNodeAttr(const Node& node, const std::string& attr_name, int64_t default_val) { + const auto& attributes = node.GetAttributes(); + if (auto entry = attributes.find(attr_name); entry != attributes.end()) { + return entry->second.i(); + } + + return default_val; +} + +static const std::string& GetNodeAttr(const Node& node, const std::string& attr_name, const std::string& default_val) { + const auto& attributes = node.GetAttributes(); + if (auto entry = attributes.find(attr_name); entry != attributes.end()) { + return entry->second.s(); + } + + return default_val; +} + // Create a model with FusedMatMul + Add (quantized) // input1 -> Add -> Q -> DQ \ // FusedMatMul -> Q -> DQ -> output @@ -873,10 +890,9 @@ static void GetLastContextBinaryFileName(const std::string last_onnx_ctx_file, auto& ctx_graph = ctx_model->MainGraph(); for (auto& node : ctx_graph.Nodes()) { if (node.OpType() == "EPContext") { - NodeAttrHelper node_helper(node); - int64_t is_main_context = node_helper.Get("main_context", static_cast<int64_t>(0)); + int64_t is_main_context = GetNodeAttr(node, "main_context", static_cast<int64_t>(0)); if (1 == is_main_context) { - last_ctx_bin_file = node_helper.Get("ep_cache_context", ""); + last_ctx_bin_file = GetNodeAttr(node, "ep_cache_context", ""); return; } } @@ -899,10 +915,9 @@ for (auto& node : ctx_graph.Nodes()) { if (node.OpType() == "EPContext") { - NodeAttrHelper node_helper(node); - int64_t is_main_context = node_helper.Get("main_context", static_cast<int64_t>(0)); + int64_t is_main_context = GetNodeAttr(node, "main_context", static_cast<int64_t>(0)); if (1 == is_main_context) { - std::string old_qnn_ctx_binary_file_name = node_helper.Get("ep_cache_context", ""); + std::string old_qnn_ctx_binary_file_name = GetNodeAttr(node, "ep_cache_context", ""); auto file_path = path.replace_filename(old_qnn_ctx_binary_file_name); std::remove(file_path.string().c_str()); node.ClearAttribute("ep_cache_context"); diff --git a/onnxruntime/test/providers/qnn/reshape_expand_op_test.cc b/onnxruntime/test/providers/qnn/reshape_expand_op_test.cc index 3964edc11461b..b66547a939983 100644 --- a/onnxruntime/test/providers/qnn/reshape_expand_op_test.cc +++ b/onnxruntime/test/providers/qnn/reshape_expand_op_test.cc @@ -8,7 +8,7 @@ #include "test/providers/qnn/qnn_test_utils.h" #include "core/graph/node_attr_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include
"gtest/gtest.h" namespace onnxruntime { diff --git a/onnxruntime/test/providers/qnn/resize_test.cc b/onnxruntime/test/providers/qnn/resize_test.cc index 15612e3267a75..651f55bc05d3f 100644 --- a/onnxruntime/test/providers/qnn/resize_test.cc +++ b/onnxruntime/test/providers/qnn/resize_test.cc @@ -9,7 +9,7 @@ #include "test/optimizer/qdq_test_utils.h" #include "test/providers/qnn/qnn_test_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" diff --git a/onnxruntime/test/providers/qnn/split_op_test.cc b/onnxruntime/test/providers/qnn/split_op_test.cc index 6dc721edb421e..23682f7e934c3 100644 --- a/onnxruntime/test/providers/qnn/split_op_test.cc +++ b/onnxruntime/test/providers/qnn/split_op_test.cc @@ -7,7 +7,7 @@ #include "test/providers/qnn/qnn_test_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" namespace onnxruntime { diff --git a/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc b/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc index 33d2f64c0315e..abc1b3a89d85c 100644 --- a/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc +++ b/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc @@ -7,7 +7,7 @@ #include "test/providers/qnn/qnn_test_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" namespace onnxruntime { diff --git a/onnxruntime/test/providers/qnn/tile_op_test.cc b/onnxruntime/test/providers/qnn/tile_op_test.cc index 2b35c730ee5fe..85541efe5646c 100644 --- a/onnxruntime/test/providers/qnn/tile_op_test.cc +++ b/onnxruntime/test/providers/qnn/tile_op_test.cc @@ -8,7 +8,7 @@ #include "test/providers/qnn/qnn_test_utils.h" #include "core/graph/node_attr_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" namespace onnxruntime { diff --git a/onnxruntime/test/providers/qnn/topk_op_test.cc b/onnxruntime/test/providers/qnn/topk_op_test.cc index 5a9351b9366ec..354a5d1e3b49a 100644 --- a/onnxruntime/test/providers/qnn/topk_op_test.cc +++ b/onnxruntime/test/providers/qnn/topk_op_test.cc @@ -8,7 +8,7 @@ #include "test/providers/qnn/qnn_test_utils.h" #include "core/graph/node_attr_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" namespace onnxruntime { diff --git a/onnxruntime/test/qnn_ctx_gen/main.cc b/onnxruntime/test/qnn_ctx_gen/main.cc index 3be0bd253c8a4..bb5007b40b072 100644 --- a/onnxruntime/test/qnn_ctx_gen/main.cc +++ b/onnxruntime/test/qnn_ctx_gen/main.cc @@ -16,7 +16,6 @@ #include "core/common/logging/sinks/clog_sink.h" #include "core/graph/model.h" -#include "core/providers/shared/utils/utils.h" #include "core/session/environment.h" #include "core/common/logging/logging.h" @@ -31,6 +30,24 @@ static void CheckStatus(const Status& status) { } } +static int64_t GetNodeAttr(const Node& node, const std::string& attr_name, int64_t default_val) { + const auto& attributes = node.GetAttributes(); + if (auto entry = attributes.find(attr_name); entry != attributes.end()) { + return entry->second.i(); + } + + return default_val; +} + +static const std::string& GetNodeAttr(const Node& node, const std::string& attr_name, const std::string& default_val) { + const auto& attributes = node.GetAttributes(); + if (auto entry = attributes.find(attr_name); entry != attributes.end()) { + return entry->second.s(); + } + + return default_val; +} + // from the last context cache Onnx model, find the EPContext node with 
main_context=1, // and get the QNN context binary file name, this context binary contains all graphs from all Onnx models // get the max spill fill buffer size @@ -44,11 +61,10 @@ static void GetLastContextBinaryFileName(const std::basic_string<ORTCHAR_T> last auto& ctx_graph = ctx_model->MainGraph(); for (auto& node : ctx_graph.Nodes()) { if (node.OpType() == "EPContext") { - NodeAttrHelper node_helper(node); - int64_t is_main_context = node_helper.Get("main_context", static_cast<int64_t>(0)); - max_size = node_helper.Get("max_size", static_cast<int64_t>(0)); + int64_t is_main_context = GetNodeAttr(node, "main_context", static_cast<int64_t>(0)); + max_size = GetNodeAttr(node, "max_size", static_cast<int64_t>(0)); if (1 == is_main_context) { - last_ctx_bin_file = node_helper.Get("ep_cache_context", ""); + last_ctx_bin_file = GetNodeAttr(node, "ep_cache_context", ""); return; } } @@ -72,10 +88,9 @@ static void UpdateEpContextModel(const std::vector<std::basic_string<ORTCHAR_T>> for (auto& node : ctx_graph.Nodes()) { if (node.OpType() == "EPContext") { - NodeAttrHelper node_helper(node); - int64_t is_main_context = node_helper.Get("main_context", static_cast<int64_t>(0)); + int64_t is_main_context = GetNodeAttr(node, "main_context", static_cast<int64_t>(0)); if (1 == is_main_context) { - std::string old_qnn_ctx_binary_file_name = node_helper.Get("ep_cache_context", ""); + std::string old_qnn_ctx_binary_file_name = GetNodeAttr(node, "ep_cache_context", ""); auto file_path = path.replace_filename(old_qnn_ctx_binary_file_name); std::remove(file_path.string().c_str()); node.ClearAttribute("ep_cache_context"); diff --git a/requirements-lintrunner.txt b/requirements-lintrunner.txt index 406d0b7f19818..2ca562e5f5c2c 100644 --- a/requirements-lintrunner.txt +++ b/requirements-lintrunner.txt @@ -3,6 +3,6 @@ lintrunner==0.12.5 lintrunner-adapters==0.12.4 # RUFF -ruff==0.9.1 +ruff==0.9.3 # CLANGFORMAT clang-format==19.1.7 diff --git a/setup.py b/setup.py index a2d50284b03ff..6481f58f69070 100644 --- a/setup.py +++ b/setup.py @@ -315,17 +315,20 @@ def finalize_options(self): providers_tensorrt_or_migraphx = "onnxruntime_providers_" + ("migraphx" if is_migraphx else "tensorrt") providers_openvino = "onnxruntime_providers_openvino" providers_cann = "onnxruntime_providers_cann" +providers_qnn = "onnxruntime_providers_qnn" if platform.system() == "Linux": providers_cuda_or_rocm = "lib" + providers_cuda_or_rocm + ".so" providers_tensorrt_or_migraphx = "lib" + providers_tensorrt_or_migraphx + ".so" providers_openvino = "lib" + providers_openvino + ".so" providers_cann = "lib" + providers_cann + ".so" + providers_qnn = "lib" + providers_qnn + ".so" elif platform.system() == "Windows": providers_cuda_or_rocm = providers_cuda_or_rocm + ".dll" providers_tensorrt_or_migraphx = providers_tensorrt_or_migraphx + ".dll" providers_openvino = providers_openvino + ".dll" providers_cann = providers_cann + ".dll" + providers_qnn = providers_qnn + ".dll" # Additional binaries dl_libs = [] @@ -345,8 +348,9 @@ def finalize_options(self): dl_libs.append(providers_cuda_or_rocm) dl_libs.append(providers_tensorrt_or_migraphx) dl_libs.append(providers_cann) + dl_libs.append(providers_qnn) dl_libs.append("libonnxruntime.so*") - # DNNL, TensorRT & OpenVINO EPs are built as shared libs + # DNNL, TensorRT, OpenVINO, and QNN EPs are built as shared libs libs.extend(["libonnxruntime_providers_shared.so"]) libs.extend(["libonnxruntime_providers_dnnl.so"]) libs.extend(["libonnxruntime_providers_openvino.so"]) @@ -354,6 +358,7 @@ def finalize_options(self): libs.append(providers_cuda_or_rocm)
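The GetNodeAttr helpers added in qnn_ep_context_test.cc and qnn_ctx_gen/main.cc replace NodeAttrHelper, an EP-internal utility these tools can no longer reach once QNN EP may be built as a shared library; the replacement is a plain lookup-with-default over the node's attribute map. The same pattern sketched in Python (the attribute names come from the EPContext nodes above; the dict and its values stand in for node.GetAttributes() and are made up for illustration):

    def get_node_attr(attributes: dict, name: str, default):
        # Return the stored attribute if present, else the caller's default,
        # mirroring the attributes.find(...) pattern in the C++ helpers.
        return attributes.get(name, default)

    ep_context_attrs = {"main_context": 1, "ep_cache_context": "ctx.bin"}
    if get_node_attr(ep_context_attrs, "main_context", 0) == 1:
        print(get_node_attr(ep_context_attrs, "ep_cache_context", ""))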
libs.append(providers_tensorrt_or_migraphx) libs.append(providers_cann) + libs.append(providers_qnn) # QNN qnn_deps = [ "libQnnCpu.so", @@ -392,13 +397,14 @@ def finalize_options(self): providers_cann, "onnxruntime.dll", ] - # DNNL, TensorRT & OpenVINO EPs are built as shared libs + # DNNL, TensorRT, OpenVINO, and QNN EPs are built as shared libs libs.extend(["onnxruntime_providers_shared.dll"]) libs.extend(["onnxruntime_providers_dnnl.dll"]) libs.extend(["onnxruntime_providers_tensorrt.dll"]) libs.extend(["onnxruntime_providers_openvino.dll"]) libs.extend(["onnxruntime_providers_cuda.dll"]) libs.extend(["onnxruntime_providers_vitisai.dll"]) + libs.extend(["onnxruntime_providers_qnn.dll"]) # DirectML Libs libs.extend(["DirectML.dll"]) # QNN V68/V73 dependencies diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index e7d93aeabe113..bce7552854a4c 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -129,6 +129,17 @@ def invalid_hetero_build(): return device_read +def _qnn_verify_library_kind(library_kind): + choices = ["shared_lib", "static_lib"] + if library_kind not in choices: + print("\nYou have specified an invalid library kind for QNN EP.") + print(f"The invalid library kind was: {library_kind}") + print("Provide a library kind from the following options: ", choices) + print(f"Example: --use_qnn {choices[0]}") + sys.exit("Incorrect build configuration") + return library_kind + + def parse_arguments(): class Parser(argparse.ArgumentParser): # override argument file line parsing behavior - allow multiple arguments per line and handle quotes @@ -578,7 +589,14 @@ def convert_arg_line_to_args(self, arg_line): parser.add_argument("--use_jsep", action="store_true", help="Build with JavaScript kernels.") parser.add_argument("--use_webgpu", action="store_true", help="Build with WebGPU support.") parser.add_argument("--use_external_dawn", action="store_true", help="Treat Dawn as an external dependency.") - parser.add_argument("--use_qnn", action="store_true", help="Build with QNN support.") + parser.add_argument( + "--use_qnn", + nargs="?", + const="shared_lib", # If provide --use_qnn without an arg, defaults to a shared library. + type=_qnn_verify_library_kind, + help="Build with QNN support. Specify 'shared_lib' or 'static_lib' to build QNN EP " + "as a shared or static library, respectively.", + ) parser.add_argument("--qnn_home", help="Path to QNN SDK dir.") parser.add_argument("--use_rknpu", action="store_true", help="Build with RKNPU.") parser.add_argument("--use_preinstalled_eigen", action="store_true", help="Use pre-installed Eigen.") @@ -1350,6 +1368,11 @@ def generate_build_tree( raise BuildError("qnn_home=" + qnn_home + " not valid." 
+ " qnn_home paths must be specified and valid.") cmake_args += ["-Donnxruntime_USE_QNN=ON"] + if args.use_qnn == "static_lib": + cmake_args += ["-Donnxruntime_BUILD_QNN_EP_STATIC_LIB=ON"] + if args.android and args.use_qnn != "static_lib": + raise BuildError("Only support Android + QNN builds with QNN EP built as a static library.") + if args.use_coreml: cmake_args += ["-Donnxruntime_USE_COREML=ON"] @@ -2401,6 +2424,8 @@ def build_nuget_package( elif use_rocm: package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.ROCm" elif use_qnn: + if use_qnn != "shared_lib": + raise BuildError("Currently NuGet packages with QNN require QNN EP to be built as a shared library.") execution_provider = "/p:ExecutionProvider=qnn" package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.QNN" elif any("OrtPackageId=" in x for x in msbuild_extra_options): diff --git a/tools/ci_build/github/android/build_aar_package.py b/tools/ci_build/github/android/build_aar_package.py index e9f8fea951661..c2bc5cba82a23 100644 --- a/tools/ci_build/github/android/build_aar_package.py +++ b/tools/ci_build/github/android/build_aar_package.py @@ -72,11 +72,15 @@ def _parse_build_settings(args): return build_settings +def _is_qnn_android_build(build_settings): + return any(build_arg.startswith("--use_qnn") for build_arg in build_settings["build_params"]) + + def _build_aar(args): build_settings = _parse_build_settings(args) build_dir = os.path.abspath(args.build_dir) ops_config_path = os.path.abspath(args.include_ops_by_config) if args.include_ops_by_config else None - qnn_android_build = "--use_qnn" in build_settings["build_params"] + qnn_android_build = _is_qnn_android_build(build_settings) # Setup temp environment for building temp_env = os.environ.copy() diff --git a/tools/ci_build/github/android/default_qnn_aar_build_settings.json b/tools/ci_build/github/android/default_qnn_aar_build_settings.json index 8c362440358c4..5ac49f582d23e 100644 --- a/tools/ci_build/github/android/default_qnn_aar_build_settings.json +++ b/tools/ci_build/github/android/default_qnn_aar_build_settings.json @@ -11,7 +11,7 @@ "--cmake_generator=Ninja", "--build_java", "--build_shared_lib", - "--use_qnn", + "--use_qnn=static_lib", "--cmake_extra_defines=onnxruntime_BUILD_UNIT_TESTS=OFF", "--skip_tests" diff --git a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml index b89aa50171b4c..f237ef37fe82c 100644 --- a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml @@ -72,7 +72,8 @@ jobs: --android_abi=x86_64 \ --android_api=31 \ --parallel \ - --use_qnn \ + --build_shared_lib \ + --use_qnn static_lib \ --qnn_home $(QnnSDKRootDir) \ --cmake_generator=Ninja \ --skip_tests diff --git a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml index 59deb0d4975fe..0eaaea562ca36 100644 --- a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml @@ -41,7 +41,7 @@ parameters: variables: - name: docker_base_image - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250109.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250124.1 - name: 
linux_trt_version value: 10.3.0.26-1.cuda11.8 - name: Repository diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml index 518aec8c2f92a..71f7ab6e49b70 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml @@ -49,9 +49,9 @@ parameters: variables: - name: docker_base_image ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250109.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250124.1 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250109.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250124.1 - name: Repository ${{ if eq(parameters.CudaVersion, '11.8') }}: diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml index 9025f084d5982..c08eaaaa1308d 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml @@ -40,9 +40,9 @@ variables: - template: templates/common-variables.yml - name: docker_base_image ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250109.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250124.1 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250109.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250124.1 - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml index 8d42e7201411b..4a86da167ff1f 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml @@ -40,9 +40,9 @@ variables: - template: templates/common-variables.yml - name: docker_base_image ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250109.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250124.1 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250109.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250124.1 - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} diff --git a/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml index fb235bda24fbf..093db011e44f9 100644 --- 
a/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml @@ -41,7 +41,12 @@ jobs: timeoutInMinutes: 60 workspace: clean: all - + strategy: + matrix: + SHARED_LIB: + QnnLibKind: 'shared_lib' + STATIC_LIB: + QnnLibKind: 'static_lib' steps: - script: | ls -R /data/qnn_test_data @@ -65,7 +70,8 @@ jobs: --config Release \ --use_binskim_compliant_compile_flags \ --build_java \ - --use_qnn \ + --build_shared_lib \ + --use_qnn $(QnnLibKind) \ --qnn_home $(QnnSDKRootDir) \ --cmake_generator=Ninja \ --update --build --parallel @@ -77,7 +83,8 @@ jobs: --config Release \ --use_binskim_compliant_compile_flags \ --build_java \ - --use_qnn \ + --build_shared_lib \ + --use_qnn $(QnnLibKind) \ --qnn_home $(QnnSDKRootDir) \ --cmake_generator=Ninja \ --test diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml index 4b94ffc7e302e..960b59f93bee0 100644 --- a/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml @@ -18,7 +18,7 @@ stages: machine_pool: 'Onnxruntime-Linux-GPU' python_wheel_suffix: '_gpu' timeout: 480 - docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250109.1 + docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250124.1 cuda_version: '11.8' - stage: Republish_Wheels diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml index 48d1e6b1ac7a7..021f7c5ece140 100644 --- a/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml @@ -18,7 +18,7 @@ stages: machine_pool: 'Onnxruntime-Linux-GPU' python_wheel_suffix: '_gpu' timeout: 480 - docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250109.1 + docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250124.1 cuda_version: '12.2' - stage: Republish_Wheels diff --git a/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml index 0517fec3bad04..b081b39ad9bcc 100644 --- a/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml @@ -142,9 +142,9 @@ stages: value: false - name: docker_base_image ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250109.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250124.1 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250109.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250124.1 timeoutInMinutes: 60 steps: diff --git a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml 
index 4adf41d3db4e5..85366ffc28b3a 100644 --- a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml +++ b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml @@ -45,9 +45,9 @@ jobs: - template: ../../templates/common-variables.yml - name: docker_base_image ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250109.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250124.1 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250109.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250124.1 - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} diff --git a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml index a3c804055d8fb..f48573abd3dba 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml @@ -68,9 +68,9 @@ stages: cmake_build_type: ${{ parameters.cmake_build_type }} cuda_version: ${{ parameters.cuda_version }} ${{ if eq(parameters.cuda_version, '11.8') }}: - docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250109.1 + docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250124.1 ${{ if eq(parameters.cuda_version, '12.2') }}: - docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250109.1 + docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250124.1 - ${{ if eq(parameters.enable_windows_dml, true) }}: - ${{ each python_version in parameters.PythonVersions }}: diff --git a/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml b/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml index d44952690f651..ede9ec1a086ca 100644 --- a/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml +++ b/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml @@ -4,11 +4,6 @@ parameters: type: string default: '' -- name: job_name_suffix - displayName: job name - type: string - default: '' - - name: packageName displayName: Package Name type: string @@ -25,17 +20,13 @@ parameters: default: '2.30.0.250109' jobs: -- job: Final_AAR_Testing_Android_${{ parameters.job_name_suffix }} +- job: Final_AAR_Testing_Android + pool: 'onnxruntime-Ubuntu2204-AMD-CPU' workspace: clean: all - pool: - vmImage: 'macOS-13' variables: - - name: runCodesignValidationInjection - value: false + runCodesignValidationInjection: false timeoutInMinutes: 90 - dependsOn: - - Android_Java_API_AAR_Packaging_${{ parameters.job_name_suffix }} steps: - template: set-version-number-variables-step.yml diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml index 98206bcb690c0..1ab4fd2a8e9e7 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml +++ 
b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml @@ -82,10 +82,12 @@ stages: packageName: 'onnxruntime-android' ReleaseVersionSuffix: $(ReleaseVersionSuffix) +- stage: Android_Java_API_AAR_Testing_Full + dependsOn: Android_Java_API_AAR_Packaging_Full + jobs: - template: android-java-api-aar-test.yml parameters: artifactName: 'onnxruntime-android-full-aar' - job_name_suffix: 'Full' ReleaseVersionSuffix: $(ReleaseVersionSuffix) - stage: Android_Java_API_AAR_Packaging_QNN @@ -105,10 +107,12 @@ stages: ReleaseVersionSuffix: $(ReleaseVersionSuffix) QnnSDKVersion: ${{ parameters.QnnSDKVersion }} +- stage: Final_AAR_Testing_Android_QNN + dependsOn: Android_Java_API_AAR_Packaging_QNN + jobs: - template: android-java-api-aar-test.yml parameters: artifactName: 'onnxruntime-android-qnn-aar' - job_name_suffix: 'QNN' packageName: 'onnxruntime-android-qnn' QnnSDKVersion: ${{ parameters.QnnSDKVersion }} diff --git a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml index 1a53ce6a423b6..fe3bc60c83dea 100644 --- a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml @@ -11,7 +11,7 @@ steps: packageType: upack feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' definition: '517c4f6f-5437-4392-a70d-4f15ec5be2f0' - version: 1.0.208 + version: 1.0.213 downloadPath: $(Build.BinariesDirectory)/deps # The private ADO project @@ -22,7 +22,7 @@ steps: packageType: upack feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325' definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a' - version: 1.0.208 + version: 1.0.213 downloadPath: $(Build.BinariesDirectory)/deps # You can add more ADO accounts at here. diff --git a/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml index 8bbe8f82530ea..523f3ab58b982 100644 --- a/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml @@ -102,10 +102,12 @@ stages: packageName: onnxruntime-training-android enable_code_sign: true +- stage: Final_AAR_Testing_Android_Training_Full + dependsOn: Android_Java_API_AAR_Packaging_Training_Full + jobs: - template: android-java-api-aar-test.yml parameters: artifactName: 'onnxruntime-training-android-full-aar' - job_name_suffix: 'Training_Full' packageName: onnxruntime-training-android - stage: NuGet_Packaging_Training_CPU @@ -115,7 +117,7 @@ stages: - Windows_Packaging_Training_CPU_x86_${{ parameters.BuildVariant }} - Windows_Packaging_Training_CPU_x64_${{ parameters.BuildVariant }} - Windows_Packaging_Training_CPU_arm64_${{ parameters.BuildVariant }} - - Android_Java_API_AAR_Packaging_Training_Full + - Final_AAR_Testing_Android_Training_Full condition: succeeded() jobs: - job: NuGet_Packaging_Training_CPU diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml index d1b85e64fa48a..ff2ecb0d3c28f 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml @@ -94,6 +94,7 @@ jobs: --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_generator "$(VSGenerator)" + --build_shared_lib --use_qnn --qnn_home 
$(QnnSDKRootDir) --enable_pybind diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml index 8595a52cdef2b..f382156c03944 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml @@ -92,6 +92,7 @@ jobs: --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_generator "$(VSGenerator)" + --build_shared_lib --use_qnn --qnn_home $(QnnSDKRootDir) --enable_pybind diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml index 979961d06654f..a5f2a481e6ba8 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml @@ -92,6 +92,7 @@ jobs: --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_generator "$(VSGenerator)" + --build_shared_lib --use_qnn --qnn_home $(QnnSDKRootDir) --enable_pybind diff --git a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml index 205bd0b5c3c71..5a74998ca4bc8 100644 --- a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml +++ b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml @@ -93,12 +93,18 @@ stages: workingFolder: '$(Build.BinariesDirectory)\${{ parameters.build_config }}' createLogFile: true + - task: CmdLine@2 + displayName: 'Print contents of binaries directory' + inputs: + script: | + dir $(Build.BinariesDirectory)\${{ parameters.build_config }}\${{ parameters.build_config }} + - template: win-esrp-dll.yml parameters: FolderPath: '$(Build.BinariesDirectory)\${{ parameters.build_config }}\${{ parameters.build_config }}' DisplayName: 'ESRP - Sign dlls' DoEsrp: ${{ parameters.DoEsrp }} - Pattern: 'onnxruntime.dll' + Pattern: 'onnxruntime*.dll' - task: MSBuild@1 displayName: 'Restore NuGet Packages and create project.assets.json' diff --git a/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml index 84b71b37d992a..787c3ffe23bd9 100644 --- a/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml @@ -36,7 +36,7 @@ parameters: default: 2.30.0.250109 jobs: -- job: 'build' +- job: 'BUILD_QNN_EP' pool: 'onnxruntime-qnn-windows-vs-2022-arm64' variables: DOTNET_SKIP_FIRST_TIME_EXPERIENCE: true @@ -46,6 +46,12 @@ jobs: timeoutInMinutes: 240 workspace: clean: all + strategy: + matrix: + SHARED_LIB: + QnnLibKind: 'shared_lib' + STATIC_LIB: + QnnLibKind: 'static_lib' steps: - script: | @@ -79,7 +85,8 @@ jobs: --config $(BuildConfig) --build_dir $(Build.BinariesDirectory) --cmake_generator "Visual Studio 17 2022" - --use_qnn + --build_shared_lib + --use_qnn $(QnnLibKind) --qnn_home $(QnnSDKRootDir) --update --build --parallel @@ -88,7 +95,8 @@ jobs: --config $(BuildConfig) ^ --build_dir $(Build.BinariesDirectory) ^ --cmake_generator "Visual Studio 17 2022" ^ - --use_qnn ^ + --build_shared_lib ^ + --use_qnn $(QnnLibKind) ^ --qnn_home $(QnnSDKRootDir) ^ --test --enable_onnx_tests displayName: 'Run unit tests' @@ -121,7 +129,7 @@ jobs: TargetFolder: '$(Build.ArtifactStagingDirectory)' CleanTargetFolder: true OverWrite: true - condition: and(succeeded(), ne(variables['Build.Reason'], 
'PullRequest')) + condition: and(succeeded(), and(ne(variables['Build.Reason'], 'PullRequest'), eq(variables['QnnLibKind'], 'shared_lib'))) - task: PublishBuildArtifacts@1 displayName: 'Publish Artifact' @@ -129,4 +137,4 @@ jobs: PathtoPublish: '$(Build.ArtifactStagingDirectory)' ArtifactName: 'internal_release' publishLocation: 'Container' - condition: and(succeeded(), ne(variables['Build.Reason'], 'PullRequest')) + condition: and(succeeded(), and(ne(variables['Build.Reason'], 'PullRequest'), eq(variables['QnnLibKind'], 'shared_lib'))) diff --git a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml index e6792bc34aad0..28fbe4a1096b2 100644 --- a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml @@ -36,7 +36,7 @@ parameters: default: 2.30.0.250109 jobs: -- job: 'build' +- job: 'BUILD_QNN_EP' pool: 'Onnxruntime-QNNEP-Windows-2022-CPU' variables: MsbuildArguments: '-detailedsummary -maxcpucount -consoleloggerparameters:PerformanceSummary' @@ -50,6 +50,12 @@ jobs: timeoutInMinutes: 120 workspace: clean: all + strategy: + matrix: + SHARED_LIB: + QnnLibKind: 'shared_lib' + STATIC_LIB: + QnnLibKind: 'static_lib' steps: - task: UsePythonVersion@0 @@ -72,7 +78,8 @@ jobs: --build_dir $(Build.BinariesDirectory) --cmake_generator "Visual Studio 17 2022" --build_java - --use_qnn + --build_shared_lib + --use_qnn $(QnnLibKind) --qnn_home $(QnnSDKRootDir) --use_binskim_compliant_compile_flags --update --parallel @@ -87,7 +94,8 @@ jobs: --build_dir $(Build.BinariesDirectory) ^ --cmake_generator "Visual Studio 17 2022" ^ --build_java ^ - --use_qnn ^ + --build_shared_lib ^ + --use_qnn $(QnnLibKind) ^ --qnn_home $(QnnSDKRootDir) ^ --use_binskim_compliant_compile_flags ^ --test --enable_onnx_tests diff --git a/tools/ci_build/github/linux/build_linux_python_package.sh b/tools/ci_build/github/linux/build_linux_python_package.sh index 6d86a57bd7986..b5999da997589 100755 --- a/tools/ci_build/github/linux/build_linux_python_package.sh +++ b/tools/ci_build/github/linux/build_linux_python_package.sh @@ -75,7 +75,7 @@ fi if [ "$BUILD_DEVICE" == "NPU" ]; then #Enable QNN EP - BUILD_ARGS+=("--use_qnn" "--qnn_home=/qnn_sdk") + BUILD_ARGS+=("--build_shared_lib" "--use_qnn" "--qnn_home=/qnn_sdk") fi export ONNX_ML=1 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu index 72912acce885e..02938f015ec57 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu @@ -1,4 +1,4 @@ -FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc14:20250109.1 +FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc14:20250124.1 ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-17 diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile index 9569aa2fcda63..f9d84e3b0e130 100644 --- a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile @@ -2,7 +2,7 @@ # Licensed under the MIT License. 
 # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline
-FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_ubi8_gcc14_dotnet:20250109.1
+FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_ubi8_gcc14_dotnet:20250124.1

 ENV LANG=en_US.UTF-8
 ENV LC_ALL=en_US.UTF-8
diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile
index 589bd869ba89f..20b9a6c224120 100644
--- a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile
@@ -1,4 +1,4 @@
-FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_ubi8_gcc14:20250109.1
+FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_ubi8_gcc14:20250124.1

 ADD scripts /tmp/scripts
 RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile
index 1c1f716d81e95..d94e7562f19d4 100644
--- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile
@@ -2,7 +2,7 @@
 # Licensed under the MIT License.
 # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline
-FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc14_dotnet:20250109.1
+FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc14_dotnet:20250124.1

 ENV LANG=en_US.UTF-8
 ENV LC_ALL=en_US.UTF-8
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/Dockerfile
index 6caf21c475545..24287fd34d3ea 100644
--- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/Dockerfile
@@ -2,7 +2,7 @@
 # Licensed under the MIT License.
 # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline
-FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11_dotnet:20250109.1
+FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11_dotnet:20250124.1

 ARG TRT_VERSION
 #Install TensorRT only if TRT_VERSION is not empty
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile
index a5dda5904de49..764a79135d7a3 100644
--- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile
@@ -2,7 +2,7 @@
 # Licensed under the MIT License.
 # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline
-FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12_dotnet:20250109.1
+FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12_dotnet:20250124.1

 ARG TRT_VERSION
 #Install TensorRT only if TRT_VERSION is not empty
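The Dockerfile edits above bump every pinned build-image tag from 20250109.1 to 20250124.1 in lockstep. A hypothetical consistency check (not a script that exists in the repo) that scans for stragglers:

    import re
    from pathlib import Path

    # Matches the tag of a pinned onnxruntimebuildcache image on a FROM line.
    TAG_RE = re.compile(r"^FROM onnxruntimebuildcache\.azurecr\.io/\S+:(\S+)$", re.MULTILINE)

    def collect_tags(root: Path) -> dict[Path, str]:
        tags = {}
        for dockerfile in root.rglob("Dockerfile*"):
            match = TAG_RE.search(dockerfile.read_text())
            if match:
                tags[dockerfile] = match.group(1)
        return tags

    if __name__ == "__main__":
        tags = collect_tags(Path("tools/ci_build/github/linux/docker"))
        if len(set(tags.values())) > 1:
            # Tags disagree; print them all so the odd one out is obvious.
            for path, tag in sorted(tags.items()):
                print(f"{tag}\t{path}")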
 # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline
-FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12_dotnet:20250109.1
+FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12_dotnet:20250124.1

 ARG TRT_VERSION
 #Install TensorRT only if TRT_VERSION is not empty
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile
index 04c6398e061b7..7590d5dd18347 100644
--- a/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile
@@ -1,4 +1,4 @@
-FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc14:20250109.1
+FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc14:20250124.1

 ADD scripts /tmp/scripts
 RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && rm -rf /tmp/scripts
diff --git a/tools/nuget/generate_nuspec_for_native_nuget.py b/tools/nuget/generate_nuspec_for_native_nuget.py
index 0568ae864dbfc..8ccb2c054900e 100644
--- a/tools/nuget/generate_nuspec_for_native_nuget.py
+++ b/tools/nuget/generate_nuspec_for_native_nuget.py
@@ -382,6 +382,7 @@ def generate_files(line_list, args):
         "tensorrt_ep_shared_lib": "onnxruntime_providers_tensorrt.dll",
         "openvino_ep_shared_lib": "onnxruntime_providers_openvino.dll",
         "cuda_ep_shared_lib": "onnxruntime_providers_cuda.dll",
+        "qnn_ep_shared_lib": "onnxruntime_providers_qnn.dll",
         "onnxruntime_perf_test": "onnxruntime_perf_test.exe",
         "onnx_test_runner": "onnx_test_runner.exe",
     }
@@ -777,6 +778,24 @@ def generate_files(line_list, args):
             + '\\native" />'
         )

+    if args.execution_provider == "qnn" or (is_qnn_package and not is_ado_packaging_build):
+        files_list.append(
+            "<file src="
+            + '"'
+            + os.path.join(args.native_build_path, nuget_dependencies["providers_shared_lib"])
+            + '" target="runtimes\\win-'
+            + args.target_architecture
+            + '\\native" />'
+        )
+        files_list.append(
+            "<file src="
+            + '"'
+            + os.path.join(args.native_build_path, nuget_dependencies["qnn_ep_shared_lib"])
+            + '" target="runtimes\\win-'
+            + args.target_architecture
+            + '\\native" />'
+        )
+
     # process all other library dependencies
     if is_cpu_package or is_cuda_gpu_package or is_dml_package or is_mklml_package:
         # Process dnnl dependency
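Each files_list entry above is a literal <file .../> element written into the generated .nuspec, mapping a build output into the package's runtimes folder. A self-contained sketch of the pattern, with the path and architecture hard-coded where the real script reads them from args:

    import os

    # Invented stand-ins for values generate_nuspec_for_native_nuget.py takes from args.
    native_build_path = r"C:\b\RelWithDebInfo\RelWithDebInfo"
    target_architecture = "arm64"

    def nuspec_file_entry(filename: str) -> str:
        # Same string-concatenation style as the script: a raw XML element
        # is assembled by hand, no XML library involved.
        return (
            "<file src="
            + '"'
            + os.path.join(native_build_path, filename)
            + '" target="runtimes\\win-'
            + target_architecture
            + '\\native" />'
        )

    print(nuspec_file_entry("onnxruntime_providers_qnn.dll"))
    # On Windows: <file src="C:\b\RelWithDebInfo\RelWithDebInfo\onnxruntime_providers_qnn.dll" target="runtimes\win-arm64\native" />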