diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json index 044588c080072..46f8c8891dda5 100644 --- a/cgmanifests/generated/cgmanifest.json +++ b/cgmanifests/generated/cgmanifest.json @@ -346,7 +346,7 @@ "component": { "type": "git", "git": { - "commitHash": "12a3b24c456cebd9fd11f23ac0164f78129b00c6", + "commitHash": "b9b4a37041dec3dd62ac92014a6cc1aece48d9f3", "repositoryUrl": "https://github.com/google/dawn.git" }, "comments": "dawn" diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 304236743fd42..b332583035890 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -93,6 +93,7 @@ option(onnxruntime_USE_OPENVINO "Build with OpenVINO support" OFF) option(onnxruntime_USE_COREML "Build with CoreML support" OFF) option(onnxruntime_USE_NNAPI_BUILTIN "Build with builtin NNAPI lib for Android NNAPI support" OFF) option(onnxruntime_USE_QNN "Build with QNN support" OFF) +option(onnxruntime_BUILD_QNN_EP_STATIC_LIB "Build with QNN EP as a static library" OFF) option(onnxruntime_USE_SNPE "Build with SNPE support" OFF) option(onnxruntime_USE_RKNPU "Build with RKNPU support" OFF) option(onnxruntime_USE_DNNL "Build with DNNL support" OFF) diff --git a/cmake/deps.txt b/cmake/deps.txt index d1a528bd6b4be..c73d9a4e3532f 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -58,5 +58,5 @@ extensions;https://github.com/microsoft/onnxruntime-extensions/archive/f3f6caa6e composable_kernel;https://github.com/ROCmSoftwarePlatform/composable_kernel/archive/204da9c522cebec5220bba52cd3542ebcaf99e7a.zip;1827348efd47831c13074245274d41b7cae8a557 directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e cudnn_frontend;https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.7.0.zip;d0753d8d5b39947ca0729d7773cb84653a129eb1 -dawn;https://github.com/google/dawn/archive/12a3b24c456cebd9fd11f23ac0164f78129b00c6.zip;ad428f6dc16f1336d584f7bad5714e1097dafc43 +dawn;https://github.com/google/dawn/archive/b9b4a37041dec3dd62ac92014a6cc1aece48d9f3.zip;e8b8c2ebabdedb7c57d931fc4a19ae22146d31e1 kleidiai;https://gitlab.arm.com/kleidi/kleidiai/-/archive/d15722976120710080ca098fe8ddabf4556cb40f/kleidiai-d15722976120710080ca098fe8ddabf4556cb40f.zip;d6c840d00c3b05aedf06e957ddaece1013d1f40b diff --git a/cmake/external/abseil-cpp.cmake b/cmake/external/abseil-cpp.cmake index 7b6e2141eeb1b..6b4404a124926 100644 --- a/cmake/external/abseil-cpp.cmake +++ b/cmake/external/abseil-cpp.cmake @@ -32,7 +32,13 @@ FetchContent_Declare( onnxruntime_fetchcontent_makeavailable(abseil_cpp) FetchContent_GetProperties(abseil_cpp) -set(ABSEIL_SOURCE_DIR ${abseil_cpp_SOURCE_DIR}) +if(abseil_cpp_SOURCE_DIR) + set(ABSEIL_SOURCE_DIR ${abseil_cpp_SOURCE_DIR}) + if(onnxruntime_USE_WEBGPU) + set(DAWN_ABSEIL_DIR ${abseil_cpp_SOURCE_DIR}) + endif() +endif() + # abseil_cpp_SOURCE_DIR is non-empty if we build it from source message(STATUS "Abseil source dir:" ${ABSEIL_SOURCE_DIR}) # abseil_cpp_VERSION is non-empty if we find a preinstalled ABSL diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index b8e90026b4f9a..e95656969866f 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -222,6 +222,11 @@ onnxruntime_fetchcontent_makeavailable(Protobuf) if(Protobuf_FOUND) message(STATUS "Protobuf version: ${Protobuf_VERSION}") else() + if(protobuf_SOURCE_DIR) + if(onnxruntime_USE_WEBGPU) + set(DAWN_PROTOBUF_DIR 
${protobuf_SOURCE_DIR}) + endif() + endif() # Adjust warning flags if (TARGET libprotoc) if (NOT MSVC) @@ -626,7 +631,7 @@ if (onnxruntime_USE_WEBGPU) URL_HASH SHA1=${DEP_SHA1_dawn} # All previous patches are merged into the upstream dawn project. We don't need to apply any patches right now. # if we need to apply patches in the future, we can uncomment the following line. - PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/dawn/dawn.patch + # PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/dawn/dawn.patch ) endif() diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake index d72b61a0859b2..78edb4179fafd 100644 --- a/cmake/onnxruntime.cmake +++ b/cmake/onnxruntime.cmake @@ -199,17 +199,12 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Android" AND onnxruntime_BUILD_JAVA) endforeach() endif() -# This list is a reversed topological ordering of library dependencies. -# Earlier entries may depend on later ones. Later ones should not depend on earlier ones. -set(onnxruntime_INTERNAL_LIBRARIES - onnxruntime_session - ${onnxruntime_libs} +set(onnxruntime_INTERNAL_PROVIDER_LIBRARIES ${PROVIDERS_ACL} ${PROVIDERS_ARMNN} ${PROVIDERS_COREML} ${PROVIDERS_DML} ${PROVIDERS_NNAPI} - ${PROVIDERS_QNN} ${PROVIDERS_SNPE} ${PROVIDERS_RKNPU} ${PROVIDERS_VSINPU} @@ -218,6 +213,18 @@ set(onnxruntime_INTERNAL_LIBRARIES ${PROVIDERS_WEBNN} ${PROVIDERS_AZURE} ${PROVIDERS_INTERNAL_TESTING} +) + +if (onnxruntime_BUILD_QNN_EP_STATIC_LIB) + list(APPEND onnxruntime_INTERNAL_PROVIDER_LIBRARIES onnxruntime_providers_qnn) +endif() + +# This list is a reversed topological ordering of library dependencies. +# Earlier entries may depend on later ones. Later ones should not depend on earlier ones. 
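A minimal illustration of why this ordering matters, using hypothetical targets libA and libB (not taken from this diff): with static archives, single-pass linkers such as GNU ld resolve symbols strictly left to right, so a library must be listed before the archives it pulls symbols from.

  add_executable(app main.cc)
  # libA calls into libB, so libA must come first:
  target_link_libraries(app PRIVATE libA libB)

Listing libB first can fail with undefined references to libB's symbols, which is why this list keeps dependents ahead of their dependencies.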
+set(onnxruntime_INTERNAL_LIBRARIES + onnxruntime_session + ${onnxruntime_libs} + ${onnxruntime_INTERNAL_PROVIDER_LIBRARIES} ${onnxruntime_winml} onnxruntime_optimizer onnxruntime_providers diff --git a/cmake/onnxruntime_java.cmake b/cmake/onnxruntime_java.cmake index b15b9632e9e24..1227264e595ed 100644 --- a/cmake/onnxruntime_java.cmake +++ b/cmake/onnxruntime_java.cmake @@ -148,7 +148,7 @@ if (WIN32) if(NOT onnxruntime_ENABLE_STATIC_ANALYSIS) add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${JAVA_PACKAGE_LIB_DIR}/$) add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${JAVA_PACKAGE_JNI_DIR}/$) - if (onnxruntime_USE_CUDA OR onnxruntime_USE_DNNL OR onnxruntime_USE_OPENVINO OR onnxruntime_USE_TENSORRT) + if (onnxruntime_USE_CUDA OR onnxruntime_USE_DNNL OR onnxruntime_USE_OPENVINO OR onnxruntime_USE_TENSORRT OR (onnxruntime_USE_QNN AND NOT onnxruntime_BUILD_QNN_EP_STATIC_LIB)) add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${JAVA_PACKAGE_LIB_DIR}/$) endif() if (onnxruntime_USE_CUDA) @@ -163,11 +163,14 @@ if (WIN32) if (onnxruntime_USE_TENSORRT) add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${JAVA_PACKAGE_LIB_DIR}/$) endif() + if (onnxruntime_USE_QNN AND NOT onnxruntime_BUILD_QNN_EP_STATIC_LIB) + add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${JAVA_PACKAGE_LIB_DIR}/$) + endif() endif() else() add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${JAVA_PACKAGE_LIB_DIR}/$) add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${JAVA_PACKAGE_JNI_DIR}/$) - if (onnxruntime_USE_CUDA OR onnxruntime_USE_DNNL OR onnxruntime_USE_OPENVINO OR onnxruntime_USE_TENSORRT) + if (onnxruntime_USE_CUDA OR onnxruntime_USE_DNNL OR onnxruntime_USE_OPENVINO OR onnxruntime_USE_TENSORRT OR (onnxruntime_USE_QNN AND NOT onnxruntime_BUILD_QNN_EP_STATIC_LIB)) add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${JAVA_PACKAGE_LIB_DIR}/$) endif() if (onnxruntime_USE_CUDA) @@ -182,6 +185,9 @@ else() if (onnxruntime_USE_TENSORRT) add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${JAVA_PACKAGE_LIB_DIR}/$) endif() + if (onnxruntime_USE_QNN AND NOT onnxruntime_BUILD_QNN_EP_STATIC_LIB) + add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${JAVA_PACKAGE_LIB_DIR}/$) + endif() endif() # run the build process (this copies the results back into CMAKE_CURRENT_BINARY_DIR) diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake index 5124262ec0004..ed3ad89247975 100644 --- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake @@ -95,6 +95,8 @@ function(setup_mlas_source_for_windows) ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon.h ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon.cpp ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon_fp16.cpp + ${MLAS_SRC_DIR}/hgemm_kernel_neon.cpp + ${MLAS_SRC_DIR}/halfgemm_kernel_neon_fp16.cpp ) set(mlas_platform_preprocess_srcs @@ -374,6 +376,7 @@ else() ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8.cpp ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon.h ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon.cpp + ${MLAS_SRC_DIR}/hgemm_kernel_neon.cpp ) 
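The new half-precision GEMM kernels added above follow the file-scoped ISA pattern used immediately below: the -march extension is attached to a single translation unit via set_source_files_properties rather than to the whole target, so the rest of MLAS still builds for the baseline architecture and the fp16 path is only entered behind a runtime CPU-feature check. The shape of the pattern, with a hypothetical file name:

  set_source_files_properties(${MLAS_SRC_DIR}/example_kernel_fp16.cpp
                              PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")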
set_source_files_properties(${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+dotprod") @@ -394,6 +397,7 @@ else() ${MLAS_SRC_DIR}/cast_kernel_neon.cpp ${MLAS_SRC_DIR}/hqnbitgemm_kernel_neon_fp16.cpp ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon_fp16.cpp + ${MLAS_SRC_DIR}/halfgemm_kernel_neon_fp16.cpp ) set_source_files_properties(${MLAS_SRC_DIR}/aarch64/HalfGemmKernelNeon.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ") set_source_files_properties(${MLAS_SRC_DIR}/aarch64/QgemmS8S8KernelSmmla.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+i8mm ") @@ -406,6 +410,7 @@ else() set_source_files_properties(${MLAS_SRC_DIR}/cast_kernel_neon.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ") set_source_files_properties(${MLAS_SRC_DIR}/hqnbitgemm_kernel_neon_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ") set_source_files_properties(${MLAS_SRC_DIR}/rotary_embedding_kernel_neon_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ") + set_source_files_properties(${MLAS_SRC_DIR}/halfgemm_kernel_neon_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ") endif() if(ONNXRUNTIME_MLAS_MULTI_ARCH) diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake index 582491de9503d..67fa48b28278d 100644 --- a/cmake/onnxruntime_providers.cmake +++ b/cmake/onnxruntime_providers.cmake @@ -74,9 +74,6 @@ endif() if(onnxruntime_USE_JSEP) set(PROVIDERS_JS onnxruntime_providers_js) endif() -if(onnxruntime_USE_QNN) - set(PROVIDERS_QNN onnxruntime_providers_qnn) -endif() if(onnxruntime_USE_RKNPU) set(PROVIDERS_RKNPU onnxruntime_providers_rknpu) endif() diff --git a/cmake/onnxruntime_providers_coreml.cmake b/cmake/onnxruntime_providers_coreml.cmake index ec7bc7a98969e..18048c8cdce2f 100644 --- a/cmake/onnxruntime_providers_coreml.cmake +++ b/cmake/onnxruntime_providers_coreml.cmake @@ -8,25 +8,18 @@ endif() add_compile_definitions(USE_COREML=1) # Check if we can build the coremltools code for creating an mlpackage with an mlprogram. -# The coremltools source requires std::filesystem::path which is only available from iOS 13 on. -set(_enable_ML_PROGRAM ON) -if (IOS AND CMAKE_OSX_DEPLOYMENT_TARGET VERSION_LESS 13.0) - message(WARNING "CoreML ML Program is not supported on iOS < 13.0. Excluding ML Program support from build.") - set(_enable_ML_PROGRAM OFF) -elseif(LINUX) - # uuid-dev is required. we don't bother installing on CIs as it's really for manual developer testing. +if(LINUX) find_library(LibUUID_LIBRARY NAMES uuid) find_path(LibUUID_INCLUDE_DIR NAMES uuid/uuid.h) if (NOT LibUUID_INCLUDE_DIR) - message(STATUS "uuid/uuid.h was not found as is required for ML Program support. " + message(FATAL_ERROR "uuid/uuid.h was not found as is required for ML Program support. " "Run `sudo apt install uuid-dev` if you need to test ML Program related CoreML EP code. ") - set(_enable_ML_PROGRAM OFF) endif() endif() -if (_enable_ML_PROGRAM) - add_compile_definitions(COREML_ENABLE_MLPROGRAM=1) -endif() + +add_compile_definitions(COREML_ENABLE_MLPROGRAM=1) + # Compile CoreML proto definition to ${CMAKE_CURRENT_BINARY_DIR}/coreml_proto set(COREML_PROTO_ROOT ${coremltools_SOURCE_DIR}/mlmodel/format) @@ -93,10 +86,10 @@ file(GLOB_RECURSE "${ONNXRUNTIME_ROOT}/core/providers/coreml/builders/*.cc" ) -if(_enable_ML_PROGRAM) + # Add helpers to create mlpackage weights. limit to just the files we need to minimize the changes to make them # build on Windows and Linux. 
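One detail in the globs that follow: the ".?pp" suffix is a CMake glob in which "?" matches exactly one character, so a single pattern picks up a header/source pair. A sketch with a hypothetical variable name:

  file(GLOB example_filewriter_srcs CONFIGURE_DEPENDS
       "${coremltools_SOURCE_DIR}/mlmodel/src/MILBlob/Blob/FileWriter.?pp")
  # matches both FileWriter.cpp and FileWriter.hpp

CONFIGURE_DEPENDS additionally tells CMake to re-check the glob at build time and reconfigure when the matched file set changes.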
- file(GLOB +file(GLOB onnxruntime_providers_coreml_milblob_cc_srcs CONFIGURE_DEPENDS "${coremltools_SOURCE_DIR}/mlmodel/src/MILBlob/*.hpp" "${coremltools_SOURCE_DIR}/mlmodel/src/MILBlob/*.cpp" @@ -105,22 +98,22 @@ if(_enable_ML_PROGRAM) "${coremltools_SOURCE_DIR}/mlmodel/src/MILBlob/Blob/StorageFormat.hpp" "${coremltools_SOURCE_DIR}/mlmodel/src/MILBlob/Blob/FileWriter.?pp" "${coremltools_SOURCE_DIR}/mlmodel/src/MILBlob/Blob/StorageWriter.?pp" - ) +) - # Add helpers to create mlpackage - file(GLOB +# Add helpers to create mlpackage +file(GLOB onnxruntime_providers_coreml_modelpackage_cc_srcs CONFIGURE_DEPENDS "${coremltools_SOURCE_DIR}/modelpackage/src/ModelPackage.?pp" "${coremltools_SOURCE_DIR}/modelpackage/src/utils/JsonMap.?pp" - ) +) - set(coremltools_srcs +set(coremltools_srcs ${onnxruntime_providers_coreml_milblob_cc_srcs} ${onnxruntime_providers_coreml_modelpackage_cc_srcs} - ) +) + +source_group(TREE ${coremltools_SOURCE_DIR} PREFIX coremltools FILES ${coremltools_srcs}) - source_group(TREE ${coremltools_SOURCE_DIR} PREFIX coremltools FILES ${coremltools_srcs}) -endif() # Add CoreML objective c++ source code if (APPLE) @@ -174,34 +167,34 @@ if (APPLE) target_compile_definitions(onnxruntime_providers_coreml PRIVATE __APPLE__) endif() -if (_enable_ML_PROGRAM) - # Setup coremltools fp16 and json dependencies for creating an mlpackage. - # - # fp16 depends on psimd - FetchContent_Declare(psimd URL ${DEP_URL_psimd} URL_HASH SHA1=${DEP_SHA1_psimd}) - onnxruntime_fetchcontent_makeavailable(psimd) - set(PSIMD_SOURCE_DIR ${psimd_SOURCE_DIR}) - FetchContent_Declare(fp16 URL ${DEP_URL_fp16} URL_HASH SHA1=${DEP_SHA1_fp16}) - set(FP16_BUILD_TESTS OFF CACHE INTERNAL "") - set(FP16_BUILD_BENCHMARKS OFF CACHE INTERNAL "") - onnxruntime_fetchcontent_makeavailable(fp16) - - # need to tweak the include paths to match what the coreml source code expects - target_include_directories(onnxruntime_providers_coreml PRIVATE - ${fp16_SOURCE_DIR}/include - ${nlohmann_json_SOURCE_DIR}/single_include/nlohmann - ${coremltools_SOURCE_DIR} - ${coremltools_SOURCE_DIR}/mlmodel/src/ - ${coremltools_SOURCE_DIR}/modelpackage/src/ - ) - add_dependencies(onnxruntime_providers_coreml nlohmann_json::nlohmann_json fp16) +# Setup coremltools fp16 and json dependencies for creating an mlpackage. 
+# +# fp16 depends on psimd +FetchContent_Declare(psimd URL ${DEP_URL_psimd} URL_HASH SHA1=${DEP_SHA1_psimd}) +onnxruntime_fetchcontent_makeavailable(psimd) +set(PSIMD_SOURCE_DIR ${psimd_SOURCE_DIR}) +FetchContent_Declare(fp16 URL ${DEP_URL_fp16} URL_HASH SHA1=${DEP_SHA1_fp16}) +set(FP16_BUILD_TESTS OFF CACHE INTERNAL "") +set(FP16_BUILD_BENCHMARKS OFF CACHE INTERNAL "") +onnxruntime_fetchcontent_makeavailable(fp16) + +# need to tweak the include paths to match what the coreml source code expects +target_include_directories(onnxruntime_providers_coreml PRIVATE + ${fp16_SOURCE_DIR}/include + ${nlohmann_json_SOURCE_DIR}/single_include/nlohmann + ${coremltools_SOURCE_DIR} + ${coremltools_SOURCE_DIR}/mlmodel/src/ + ${coremltools_SOURCE_DIR}/modelpackage/src/ +) - if (LINUX) - target_link_libraries(onnxruntime_providers_coreml PRIVATE uuid) - endif() +add_dependencies(onnxruntime_providers_coreml nlohmann_json::nlohmann_json fp16) + +if (LINUX) + target_link_libraries(onnxruntime_providers_coreml PRIVATE uuid) endif() + if (APPLE) target_link_libraries(onnxruntime_providers_coreml PRIVATE "-framework Foundation" "-framework CoreML") endif() diff --git a/cmake/onnxruntime_providers_cpu.cmake b/cmake/onnxruntime_providers_cpu.cmake index 91a2b13002ec9..4ae89a392278f 100644 --- a/cmake/onnxruntime_providers_cpu.cmake +++ b/cmake/onnxruntime_providers_cpu.cmake @@ -239,7 +239,9 @@ if (NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_EXTENDED_MINIMAL_BUILD set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker -exported_symbols_list ${ONNXRUNTIME_ROOT}/core/providers/shared/exported_symbols.lst") elseif(UNIX) if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") - set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/shared/version_script.lds -Xlinker --gc-sections") + target_link_options(onnxruntime_providers_shared PRIVATE + "LINKER:--version-script=${ONNXRUNTIME_ROOT}/core/providers/shared/version_script.lds" + "LINKER:--gc-sections") endif() elseif(WIN32) set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/shared/symbols.def") diff --git a/cmake/onnxruntime_providers_qnn.cmake b/cmake/onnxruntime_providers_qnn.cmake index b68d84c23bb32..303020145889b 100644 --- a/cmake/onnxruntime_providers_qnn.cmake +++ b/cmake/onnxruntime_providers_qnn.cmake @@ -3,41 +3,89 @@ add_compile_definitions(USE_QNN=1) - # These are shared utils, - # TODO, move to a separate lib when used by EPs other than QNN, NNAPI and CoreML - file(GLOB onnxruntime_providers_shared_utils_cc_srcs CONFIGURE_DEPENDS - "${ONNXRUNTIME_ROOT}/core/providers/shared/utils/utils.h" - "${ONNXRUNTIME_ROOT}/core/providers/shared/utils/utils.cc" - ) + if(onnxruntime_BUILD_QNN_EP_STATIC_LIB) + add_compile_definitions(BUILD_QNN_EP_STATIC_LIB=1) + endif() file(GLOB_RECURSE - onnxruntime_providers_qnn_ep_cc_srcs CONFIGURE_DEPENDS - "${ONNXRUNTIME_ROOT}/core/providers/qnn/*.h" - "${ONNXRUNTIME_ROOT}/core/providers/qnn/*.cc" + onnxruntime_providers_qnn_ep_srcs CONFIGURE_DEPENDS + "${ONNXRUNTIME_ROOT}/core/providers/qnn/*.h" + "${ONNXRUNTIME_ROOT}/core/providers/qnn/*.cc" ) - file(GLOB_RECURSE - onnxruntime_providers_qnn_builder_cc_srcs CONFIGURE_DEPENDS - "${ONNXRUNTIME_ROOT}/core/providers/qnn/builder/*.h" - "${ONNXRUNTIME_ROOT}/core/providers/qnn/builder/*.cc" - ) + if(onnxruntime_BUILD_QNN_EP_STATIC_LIB) + # + # Build QNN EP as a static library + # + 
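For orientation: this first branch archives the EP directly into the main onnxruntime library, while the shared branch further down builds a module that the ONNX Runtime core locates and loads at runtime (the EP links onnxruntime_providers_shared for the common provider-bridge glue). Assuming the repo's onnxruntime_add_static_library and onnxruntime_add_shared_library_module helpers reduce to the stock CMake calls (an assumption, with placeholder names):

  add_library(example_ep STATIC ${example_srcs})         # linked into the main library
  add_library(example_ep_module MODULE ${example_srcs})  # loaded with dlopen()/LoadLibrary(); nothing links against it

MODULE rather than SHARED is the idiomatic CMake choice for the runtime-loaded case, since no build-time consumer needs an import library for it.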
set(onnxruntime_providers_qnn_srcs ${onnxruntime_providers_qnn_ep_srcs}) + source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_qnn_srcs}) + onnxruntime_add_static_library(onnxruntime_providers_qnn ${onnxruntime_providers_qnn_srcs}) + onnxruntime_add_include_to_target(onnxruntime_providers_qnn onnxruntime_common onnxruntime_framework onnx + onnx_proto protobuf::libprotobuf-lite + flatbuffers::flatbuffers Boost::mp11) + add_dependencies(onnxruntime_providers_qnn onnx ${onnxruntime_EXTERNAL_DEPENDENCIES}) + set_target_properties(onnxruntime_providers_qnn PROPERTIES CXX_STANDARD_REQUIRED ON) + set_target_properties(onnxruntime_providers_qnn PROPERTIES FOLDER "ONNXRuntime") + target_include_directories(onnxruntime_providers_qnn PRIVATE ${ONNXRUNTIME_ROOT} + ${onnxruntime_QNN_HOME}/include/QNN + ${onnxruntime_QNN_HOME}/include) + set_target_properties(onnxruntime_providers_qnn PROPERTIES LINKER_LANGUAGE CXX) - set(onnxruntime_providers_qnn_cc_srcs - ${onnxruntime_providers_shared_utils_cc_srcs} - ${onnxruntime_providers_qnn_ep_cc_srcs} - ${onnxruntime_providers_qnn_builder_cc_srcs} - ) + # ignore the warning unknown-pragmas on "pragma region" + if(NOT MSVC) + target_compile_options(onnxruntime_providers_qnn PRIVATE "-Wno-unknown-pragmas") + endif() + else() + # + # Build QNN EP as a shared library + # + file(GLOB_RECURSE + onnxruntime_providers_qnn_shared_lib_srcs CONFIGURE_DEPENDS + "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.h" + "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.cc" + ) + set(onnxruntime_providers_qnn_srcs ${onnxruntime_providers_qnn_ep_srcs} + ${onnxruntime_providers_qnn_shared_lib_srcs}) + + source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_qnn_srcs}) + onnxruntime_add_shared_library_module(onnxruntime_providers_qnn ${onnxruntime_providers_qnn_srcs}) + onnxruntime_add_include_to_target(onnxruntime_providers_qnn ${ONNXRUNTIME_PROVIDERS_SHARED} ${GSL_TARGET} onnx + onnxruntime_common Boost::mp11 safeint_interface) + target_link_libraries(onnxruntime_providers_qnn PRIVATE ${ONNXRUNTIME_PROVIDERS_SHARED} ${ABSEIL_LIBS} ${CMAKE_DL_LIBS}) + add_dependencies(onnxruntime_providers_qnn onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES}) + target_include_directories(onnxruntime_providers_qnn PRIVATE ${ONNXRUNTIME_ROOT} + ${CMAKE_CURRENT_BINARY_DIR} + ${onnxruntime_QNN_HOME}/include/QNN + ${onnxruntime_QNN_HOME}/include) + + # Set linker flags for function(s) exported by EP DLL + if(UNIX) + target_link_options(onnxruntime_providers_qnn PRIVATE + "LINKER:--version-script=${ONNXRUNTIME_ROOT}/core/providers/qnn/version_script.lds" + "LINKER:--gc-sections" + "LINKER:-rpath=\$ORIGIN" + ) + elseif(WIN32) + set_property(TARGET onnxruntime_providers_qnn APPEND_STRING PROPERTY LINK_FLAGS + "-DEF:${ONNXRUNTIME_ROOT}/core/providers/qnn/symbols.def") + else() + message(FATAL_ERROR "onnxruntime_providers_qnn unknown platform, need to specify shared library exports for it") + endif() + + # Set compile options + if(MSVC) + target_compile_options(onnxruntime_providers_qnn PUBLIC /wd4099 /wd4005) + else() + # ignore the warning unknown-pragmas on "pragma region" + target_compile_options(onnxruntime_providers_qnn PRIVATE "-Wno-unknown-pragmas") + endif() + + set_target_properties(onnxruntime_providers_qnn PROPERTIES LINKER_LANGUAGE CXX) + set_target_properties(onnxruntime_providers_qnn PROPERTIES CXX_STANDARD_REQUIRED ON) + set_target_properties(onnxruntime_providers_qnn PROPERTIES FOLDER "ONNXRuntime") - source_group(TREE 
${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_qnn_cc_srcs}) - onnxruntime_add_static_library(onnxruntime_providers_qnn ${onnxruntime_providers_qnn_cc_srcs}) - onnxruntime_add_include_to_target(onnxruntime_providers_qnn onnxruntime_common onnxruntime_framework onnx onnx_proto protobuf::libprotobuf-lite flatbuffers::flatbuffers Boost::mp11) - target_link_libraries(onnxruntime_providers_qnn) - add_dependencies(onnxruntime_providers_qnn onnx ${onnxruntime_EXTERNAL_DEPENDENCIES}) - set_target_properties(onnxruntime_providers_qnn PROPERTIES CXX_STANDARD_REQUIRED ON) - set_target_properties(onnxruntime_providers_qnn PROPERTIES FOLDER "ONNXRuntime") - target_include_directories(onnxruntime_providers_qnn PRIVATE ${ONNXRUNTIME_ROOT} ${onnxruntime_QNN_HOME}/include/QNN ${onnxruntime_QNN_HOME}/include) - set_target_properties(onnxruntime_providers_qnn PROPERTIES LINKER_LANGUAGE CXX) - # ignore the warning unknown-pragmas on "pragma region" - if(NOT MSVC) - target_compile_options(onnxruntime_providers_qnn PRIVATE "-Wno-unknown-pragmas") + install(TARGETS onnxruntime_providers_qnn + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) endif() diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake index 5b29d1093aa5c..15a2862cede0c 100644 --- a/cmake/onnxruntime_python.cmake +++ b/cmake/onnxruntime_python.cmake @@ -169,9 +169,7 @@ if (onnxruntime_ENABLE_LAZY_TENSOR) endif() endif() -target_link_libraries(onnxruntime_pybind11_state PRIVATE - onnxruntime_session - ${onnxruntime_libs} +set(onnxruntime_pybind11_state_static_providers ${PROVIDERS_NNAPI} ${PROVIDERS_VSINPU} ${PROVIDERS_XNNPACK} @@ -183,7 +181,16 @@ target_link_libraries(onnxruntime_pybind11_state PRIVATE ${PROVIDERS_XNNPACK} ${PROVIDERS_WEBGPU} ${PROVIDERS_AZURE} - ${PROVIDERS_QNN} +) + +if(onnxruntime_BUILD_QNN_EP_STATIC_LIB) + list(APPEND onnxruntime_pybind11_state_static_providers PRIVATE onnxruntime_providers_qnn) +endif() + +target_link_libraries(onnxruntime_pybind11_state PRIVATE + onnxruntime_session + ${onnxruntime_libs} + ${onnxruntime_pybind11_state_static_providers} onnxruntime_optimizer onnxruntime_providers onnxruntime_util @@ -1000,6 +1007,16 @@ if (onnxruntime_USE_COREML) endif() if (onnxruntime_USE_QNN) + if(NOT onnxruntime_BUILD_QNN_EP_STATIC_LIB) + add_custom_command( + TARGET onnxruntime_pybind11_state POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + $ + $ + $/onnxruntime/capi/ + ) + endif() + add_custom_command( TARGET onnxruntime_pybind11_state POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 7c1b9ddc1548f..c727f4b7e381b 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -619,16 +619,13 @@ if(onnxruntime_USE_ARMNN) list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_armnn) endif() -set(ONNXRUNTIME_TEST_LIBS - onnxruntime_session - ${ONNXRUNTIME_INTEROP_TEST_LIBS} - ${onnxruntime_libs} - # CUDA, ROCM, TENSORRT, MIGRAPHX, DNNL, and OpenVINO are dynamically loaded at runtime +set(ONNXRUNTIME_TEST_STATIC_PROVIDER_LIBS + # CUDA, ROCM, TENSORRT, MIGRAPHX, DNNL, and OpenVINO are dynamically loaded at runtime. + # QNN EP can be built as either a dynamic or a static library. 
${PROVIDERS_NNAPI} ${PROVIDERS_VSINPU} ${PROVIDERS_JS} ${PROVIDERS_WEBGPU} - ${PROVIDERS_QNN} ${PROVIDERS_SNPE} ${PROVIDERS_RKNPU} ${PROVIDERS_DML} @@ -637,6 +634,17 @@ set(ONNXRUNTIME_TEST_LIBS ${PROVIDERS_COREML} ${PROVIDERS_XNNPACK} ${PROVIDERS_AZURE} +) + +if (onnxruntime_BUILD_QNN_EP_STATIC_LIB) + list(APPEND ONNXRUNTIME_TEST_STATIC_PROVIDER_LIBS onnxruntime_providers_qnn) +endif() + +set(ONNXRUNTIME_TEST_LIBS + onnxruntime_session + ${ONNXRUNTIME_INTEROP_TEST_LIBS} + ${onnxruntime_libs} + ${ONNXRUNTIME_TEST_STATIC_PROVIDER_LIBS} onnxruntime_optimizer onnxruntime_providers onnxruntime_util @@ -700,7 +708,9 @@ if(onnxruntime_USE_QNN AND NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_RED list(APPEND onnxruntime_test_framework_src_patterns ${TEST_SRC_DIR}/providers/qnn/*) list(APPEND onnxruntime_test_framework_libs onnxruntime_providers_qnn) list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_qnn) - list(APPEND onnxruntime_test_providers_libs onnxruntime_providers_qnn) + if(NOT onnxruntime_BUILD_QNN_EP_STATIC_LIB) + list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_shared) + endif() endif() if(onnxruntime_USE_SNPE) diff --git a/cmake/patches/coremltools/crossplatformbuild.patch b/cmake/patches/coremltools/crossplatformbuild.patch index 7f2268f50c82e..832191b366d4d 100644 --- a/cmake/patches/coremltools/crossplatformbuild.patch +++ b/cmake/patches/coremltools/crossplatformbuild.patch @@ -3,7 +3,7 @@ index adc7bfcf..7b2bf9cc 100644 --- a/mlmodel/src/MILBlob/Blob/FileWriter.cpp +++ b/mlmodel/src/MILBlob/Blob/FileWriter.cpp @@ -8,8 +8,12 @@ - + #include #include + @@ -12,17 +12,31 @@ index adc7bfcf..7b2bf9cc 100644 #include #include +#endif - + using namespace MILBlob; using namespace MILBlob::Blob; +diff --git a/mlmodel/src/MILBlob/Blob/FileWriter.hpp b/mlmodel/src/MILBlob/Blob/FileWriter.hpp +index 2bc99403..49239513 100644 +--- a/mlmodel/src/MILBlob/Blob/FileWriter.hpp ++++ b/mlmodel/src/MILBlob/Blob/FileWriter.hpp +@@ -6,7 +6,8 @@ + #pragma once + + #include "MILBlob/Util/Span.hpp" +- ++// ORT_EDIT: add missing header ++#include + #include + #include + #include diff --git a/mlmodel/src/MILBlob/Fp16.cpp b/mlmodel/src/MILBlob/Fp16.cpp index ae1e71a1..77a7161f 100644 --- a/mlmodel/src/MILBlob/Fp16.cpp +++ b/mlmodel/src/MILBlob/Fp16.cpp @@ -5,6 +5,8 @@ - + #include "MILBlob/Fp16.hpp" - + +// ORT_EDIT: Exclude clang specific pragmas from other builds +#if defined(__clang__) // fp16 lib code has some conversion warnings we don't want to globally ignore @@ -35,11 +49,11 @@ index ae1e71a1..77a7161f 100644 +#else +#include "fp16/fp16.h" +#endif - + using namespace MILBlob; - + diff --git a/modelpackage/src/ModelPackage.cpp b/modelpackage/src/ModelPackage.cpp -index 8fee56b9..99e0d8d6 100644 +index 8fee56b9..5508e316 100644 --- a/modelpackage/src/ModelPackage.cpp +++ b/modelpackage/src/ModelPackage.cpp @@ -26,7 +26,14 @@ namespace std { @@ -55,22 +69,22 @@ index 8fee56b9..99e0d8d6 100644 #include +#endif #include - + #if defined(__cplusplus) @@ -187,7 +194,10 @@ public: ModelPackageItemInfo createFile(const std::string& name, const std::string& author, const std::string& description); }; - + +// ORT_EDIT: pragma only available on APPLE platforms +#if defined(__APPLE__) #pragma mark ModelPackageImpl +#endif - + ModelPackageImpl::ModelPackageImpl(const std::filesystem::path& path, bool createIfNecessary, bool readOnly) : m_packagePath(path), @@ -372,6 +382,20 @@ std::filesystem::path ModelPackageImpl::getItemPath(const std::string& name, con } - + 
std::string ModelPackageImpl::generateIdentifier() const { +// ORT_EDIT: Use built-in UUID generation on Windows +#if defined(_WIN32) @@ -87,20 +101,20 @@ index 8fee56b9..99e0d8d6 100644 + return uuidStrCpp; +#else uuid_t uuid; - + // uuid_unparse generates a 36-character null-terminated string (37 bytes). @@ -383,6 +407,7 @@ std::string ModelPackageImpl::generateIdentifier() const { uuid_unparse(uuid, buf); - + return std::string(buf); +#endif } - + ModelPackageItemInfo ModelPackageImpl::createFile(const std::string& name, const std::string& author, const std::string& description) { -@@ -468,7 +493,13 @@ std::shared_ptr ModelPackageImpl::findItem(const std::stri +@@ -468,7 +493,14 @@ std::shared_ptr ModelPackageImpl::findItem(const std::stri auto author = itemInfoEntry->getString(kModelPackageItemInfoAuthorKey); auto description = itemInfoEntry->getString(kModelPackageItemInfoDescriptionKey); - + +// ORT_EDIT: need to use path.string() on Windows +#if defined(_WIN32) + return std::make_shared(std::make_shared(identifier, path.string(), name, author, description)); @@ -108,12 +122,13 @@ index 8fee56b9..99e0d8d6 100644 +#else return std::make_shared(std::make_shared(identifier, path, name, author, description)); +#endif ++ } - + std::shared_ptr ModelPackageImpl::findItem(const std::string& name, const std::string& author) const -@@ -514,7 +545,9 @@ void ModelPackageImpl::removeItem(const std::string& identifier) +@@ -514,7 +546,9 @@ void ModelPackageImpl::removeItem(const std::string& identifier) } - + auto path = m_packageDataDirPath / itemInfoEntry->getString(kModelPackageItemInfoPathKey); - if (0 != std::remove(path.c_str())) { + // ORT_EDIT: std::remove doesn't work on Windows. Use std::filesystem::remove instead. @@ -121,8 +136,8 @@ index 8fee56b9..99e0d8d6 100644 + if (!std::filesystem::remove(path)) { throw std::runtime_error("Failed to remove file at path: " + path.string()); } - -@@ -525,13 +558,16 @@ bool ModelPackageImpl::isValid(const std::filesystem::path& path) + +@@ -525,13 +559,16 @@ bool ModelPackageImpl::isValid(const std::filesystem::path& path) { try { ModelPackageImpl(path, false, true); @@ -132,16 +147,16 @@ index 8fee56b9..99e0d8d6 100644 } return true; } - + +// ORT_EDIT: pragma only available on APPLE platforms +#if defined(__APPLE__) #pragma mark ModelPackage +#endif - + ModelPackage::ModelPackage(const std::string& packagePath, bool createIfNecessary, bool readOnly) : m_modelPackageImpl(std::make_shared(packagePath, createIfNecessary, readOnly)) -@@ -544,7 +580,12 @@ ModelPackage::~ModelPackage() - +@@ -544,7 +581,12 @@ ModelPackage::~ModelPackage() + std::string ModelPackage::path() const { +// ORT_EDIT: Windows doesn't automatically convert to std::string as the native format could be char or wchar. 
@@ -151,5 +166,19 @@ index 8fee56b9..99e0d8d6 100644 return m_modelPackageImpl->path(); +#endif } - + std::string ModelPackage::setRootModel(const std::string& path, const std::string& name, const std::string& author, const std::string& description) +diff --git a/modelpackage/src/utils/JsonMap.hpp b/modelpackage/src/utils/JsonMap.hpp +index 0d7dc3f4..b700cfd5 100644 +--- a/modelpackage/src/utils/JsonMap.hpp ++++ b/modelpackage/src/utils/JsonMap.hpp +@@ -10,7 +10,8 @@ + #include + #include + #include +- ++// ORT_EDIT: add missing header ++#include + class JsonMapImpl; + + class JsonMap { diff --git a/cmake/patches/dawn/dawn.patch b/cmake/patches/dawn/dawn.patch deleted file mode 100644 index 868db0c43e3a5..0000000000000 --- a/cmake/patches/dawn/dawn.patch +++ /dev/null @@ -1,118 +0,0 @@ -diff --git a/src/emdawnwebgpu/CMakeLists.txt b/src/emdawnwebgpu/CMakeLists.txt -index de673537d3..c98dc46de7 100644 ---- a/src/emdawnwebgpu/CMakeLists.txt -+++ b/src/emdawnwebgpu/CMakeLists.txt -@@ -78,6 +78,7 @@ if (${DAWN_ENABLE_EMSCRIPTEN}) - endif() - - set(ARGS -+ ${Python3_EXECUTABLE} - "${DAWN_EMSCRIPTEN_TOOLCHAIN}/tools/maint/gen_struct_info.py" - -q - "${EM_BUILD_GEN_DIR}/struct_info_webgpu.json" -diff --git a/third_party/emdawnwebgpu/library_webgpu.js b/third_party/emdawnwebgpu/library_webgpu.js -index d1835cb090..df03ea2f94 100644 ---- a/third_party/emdawnwebgpu/library_webgpu.js -+++ b/third_party/emdawnwebgpu/library_webgpu.js -@@ -16,10 +16,19 @@ - throw new Error("To use Dawn's library_webgpu.js, disable -sUSE_WEBGPU and first include Dawn's library_webgpu_struct_info.js and library_webgpu_enum_tables.js (before library_webgpu.js)"); - } - -+ if (MEMORY64) { -+ throw new Error("The current implementation of Dawn's library_webgpu.js does not support MEMORY64 yet"); -+ } -+ - // Helper functions for code generation - globalThis.gpu = { -- convertSentinelToUndefined: function(name) { -- return `if (${name} == -1) ${name} = undefined;`; -+ convertSentinelToUndefined: function(name, isPtr = false) { -+ // When `CAN_ADDRESS_2GB` is true, value `-1` is normalized to `0xFFFFFFFF` for pointer. 
-+ if (CAN_ADDRESS_2GB && isPtr) { -+ return `if (${name} == 0xFFFFFFFF) ${name} = undefined;`; -+ } else { -+ return `if (${name} == -1) ${name} = undefined;`; -+ } - }, - - makeGetBool: function(struct, offset) { -@@ -700,6 +709,7 @@ var LibraryWebGPU = { - {{{ makeSetValue('info', C_STRUCTS.WGPUAdapterInfo.adapterType, 'adapterType', 'i32') }}}; - {{{ makeSetValue('info', C_STRUCTS.WGPUAdapterInfo.vendorID, '0', 'i32') }}}; - {{{ makeSetValue('info', C_STRUCTS.WGPUAdapterInfo.deviceID, '0', 'i32') }}}; -+ return 1; - }, - - wgpuAdapterGetLimits: (adapterPtr, limitsOutPtr) => { -@@ -882,7 +892,7 @@ var LibraryWebGPU = { - - if (size === 0) warnOnce('getMappedRange size=0 no longer means WGPU_WHOLE_MAP_SIZE'); - -- {{{ gpu.convertSentinelToUndefined('size') }}} -+ {{{ gpu.convertSentinelToUndefined('size', true) }}} - - var mapped; - try { -@@ -909,7 +919,7 @@ var LibraryWebGPU = { - - if (size === 0) warnOnce('getMappedRange size=0 no longer means WGPU_WHOLE_MAP_SIZE'); - -- {{{ gpu.convertSentinelToUndefined('size') }}} -+ {{{ gpu.convertSentinelToUndefined('size', true) }}} - - var mapped; - try { -@@ -950,7 +960,7 @@ var LibraryWebGPU = { - var buffer = WebGPU.getJsObject(bufferPtr); - WebGPU.Internals.bufferOnUnmaps[bufferPtr] = []; - -- {{{ gpu.convertSentinelToUndefined('size') }}} -+ {{{ gpu.convertSentinelToUndefined('size', true) }}} - - {{{ runtimeKeepalivePush() }}} - WebGPU.Internals.futureInsert(futureId, buffer.mapAsync(mode, offset, size).then(() => { -@@ -1145,7 +1155,7 @@ var LibraryWebGPU = { - - wgpuCommandEncoderClearBuffer: (encoderPtr, bufferPtr, offset, size) => { - var commandEncoder = WebGPU.getJsObject(encoderPtr); -- {{{ gpu.convertSentinelToUndefined('size') }}} -+ {{{ gpu.convertSentinelToUndefined('size', true) }}} - - var buffer = WebGPU.getJsObject(bufferPtr); - commandEncoder.clearBuffer(buffer, offset, size); -@@ -2103,7 +2113,7 @@ var LibraryWebGPU = { - wgpuRenderBundleEncoderSetIndexBuffer: (passPtr, bufferPtr, format, offset, size) => { - var pass = WebGPU.getJsObject(passPtr); - var buffer = WebGPU.getJsObject(bufferPtr); -- {{{ gpu.convertSentinelToUndefined('size') }}} -+ {{{ gpu.convertSentinelToUndefined('size', true) }}} - pass.setIndexBuffer(buffer, WebGPU.IndexFormat[format], offset, size); - }, - -@@ -2116,7 +2126,7 @@ var LibraryWebGPU = { - wgpuRenderBundleEncoderSetVertexBuffer: (passPtr, slot, bufferPtr, offset, size) => { - var pass = WebGPU.getJsObject(passPtr); - var buffer = WebGPU.getJsObject(bufferPtr); -- {{{ gpu.convertSentinelToUndefined('size') }}} -+ {{{ gpu.convertSentinelToUndefined('size', true) }}} - pass.setVertexBuffer(slot, buffer, offset, size); - }, - -@@ -2211,7 +2221,7 @@ var LibraryWebGPU = { - wgpuRenderPassEncoderSetIndexBuffer: (passPtr, bufferPtr, format, offset, size) => { - var pass = WebGPU.getJsObject(passPtr); - var buffer = WebGPU.getJsObject(bufferPtr); -- {{{ gpu.convertSentinelToUndefined('size') }}} -+ {{{ gpu.convertSentinelToUndefined('size', true) }}} - pass.setIndexBuffer(buffer, WebGPU.IndexFormat[format], offset, size); - }, - -@@ -2234,7 +2244,7 @@ var LibraryWebGPU = { - wgpuRenderPassEncoderSetVertexBuffer: (passPtr, slot, bufferPtr, offset, size) => { - var pass = WebGPU.getJsObject(passPtr); - var buffer = WebGPU.getJsObject(bufferPtr); -- {{{ gpu.convertSentinelToUndefined('size') }}} -+ {{{ gpu.convertSentinelToUndefined('size', true) }}} - pass.setVertexBuffer(slot, buffer, offset, size); - }, - diff --git a/cmake/vcpkg-ports/cpuinfo/portfile.cmake 
b/cmake/vcpkg-ports/cpuinfo/portfile.cmake new file mode 100644 index 0000000000000..e61308bf643b4 --- /dev/null +++ b/cmake/vcpkg-ports/cpuinfo/portfile.cmake @@ -0,0 +1,63 @@ +# On Windows, we can get a cpuinfo.dll, but it exports no symbols. +if(VCPKG_TARGET_IS_WINDOWS) + vcpkg_check_linkage(ONLY_STATIC_LIBRARY) +endif() + +vcpkg_from_github( + OUT_SOURCE_PATH SOURCE_PATH + REPO pytorch/cpuinfo + REF 8a1772a0c5c447df2d18edf33ec4603a8c9c04a6 + SHA512 b94ccbfa886221d6bb16513d074675af0a72928a9dd9485dcacdc1124a8a60aacbbe91913a1579e766dfb024f0be1d52eeead40342004ff0238a8b94a095ed08 + HEAD_REF master +) + +vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS + FEATURES + tools CPUINFO_BUILD_TOOLS +) + +set(LINK_OPTIONS "") +if(VCPKG_LIBRARY_LINKAGE STREQUAL "dynamic") + list(APPEND LINK_OPTIONS -DCPUINFO_LIBRARY_TYPE=shared) +else() + list(APPEND LINK_OPTIONS -DCPUINFO_LIBRARY_TYPE=static) +endif() + +if(VCPKG_CRT_LINKAGE STREQUAL "dynamic") + list(APPEND LINK_OPTIONS -DCPUINFO_RUNTIME_TYPE=shared) +else() + list(APPEND LINK_OPTIONS -DCPUINFO_RUNTIME_TYPE=static) +endif() + +vcpkg_cmake_configure( + SOURCE_PATH "${SOURCE_PATH}" + OPTIONS + ${FEATURE_OPTIONS} + ${LINK_OPTIONS} + -DCPUINFO_BUILD_UNIT_TESTS=OFF + -DCPUINFO_BUILD_MOCK_TESTS=OFF + -DCPUINFO_BUILD_BENCHMARKS=OFF + OPTIONS_DEBUG + -DCPUINFO_LOG_LEVEL=debug + OPTIONS_RELEASE + -DCPUINFO_LOG_LEVEL=default +) +vcpkg_cmake_install() +vcpkg_cmake_config_fixup() +vcpkg_copy_pdbs() +vcpkg_fixup_pkgconfig() # pkg_check_modules(libcpuinfo) + +file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/include") + +if("tools" IN_LIST FEATURES) + set(additional_tools "") + if(EXISTS "${CURRENT_PACKAGES_DIR}/bin/cpuid-dump${VCPKG_TARGET_EXECUTABLE_SUFFIX}") + list(APPEND additional_tools "cpuid-dump") + endif() + vcpkg_copy_tools( + TOOL_NAMES cache-info cpu-info isa-info ${additional_tools} + AUTO_CLEAN + ) +endif() + +file(INSTALL "${SOURCE_PATH}/LICENSE" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}" RENAME copyright) diff --git a/cmake/vcpkg-ports/cpuinfo/vcpkg.json b/cmake/vcpkg-ports/cpuinfo/vcpkg.json new file mode 100644 index 0000000000000..ce93591dba5ac --- /dev/null +++ b/cmake/vcpkg-ports/cpuinfo/vcpkg.json @@ -0,0 +1,25 @@ +{ + "name": "cpuinfo", + "version-date": "2024-12-09", + "port-version": 3, + "description": "CPU INFOrmation library (x86/x86-64/ARM/ARM64, Linux/Windows/Android/macOS/iOS)", + "homepage": "https://github.com/pytorch/cpuinfo", + "license": "BSD-2-Clause", + "supports": "!(uwp & arm32)", + "dependencies": [ + { + "name": "vcpkg-cmake", + "host": true + }, + { + "name": "vcpkg-cmake-config", + "host": true + } + ], + "features": { + "tools": { + "description": "Build cpuinfo command-line tools", + "supports": "!uwp" + } + } +} diff --git a/cmake/vcpkg-ports/onnx/fix-cmakelists.patch b/cmake/vcpkg-ports/onnx/fix-cmakelists.patch new file mode 100644 index 0000000000000..f8d300103ab20 --- /dev/null +++ b/cmake/vcpkg-ports/onnx/fix-cmakelists.patch @@ -0,0 +1,67 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 4dd56b6..2ff3e29 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -65,6 +65,27 @@ endif() + + include(GNUInstallDirs) + ++# install protobuf files ++install(FILES ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx-data.proto ++ ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx-data.proto3 ++ ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx-ml.proto ++ ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx-ml.proto3 ++ ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx-operators-ml.proto ++ ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx-operators-ml.proto3 ++ DESTINATION 
${CMAKE_INSTALL_INCLUDEDIR}/onnx ++) ++# install python files ++if(BUILD_ONNX_PYTHON) ++ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx_data_pb.py ++ ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx_data_pb2.py ++ ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx_ml_pb2.py ++ ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx_operators_ml_pb2.py ++ ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx_operators_pb.py ++ ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx_pb.py ++ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnx ++ ) ++endif() ++ + set(ONNX_ROOT ${PROJECT_SOURCE_DIR}) + + # Read ONNX version +@@ -116,7 +137,8 @@ endif() + # find_package Python has replaced PythonInterp and PythonLibs since cmake 3.12 + # Use the following command in the future; now this is only compatible with the latest pybind11 + # find_package(Python ${PY_VERSION} COMPONENTS Interpreter Development REQUIRED) +-find_package(PythonInterp ${PY_VERSION} REQUIRED) ++find_package(Python3 ${PY_VERSION} COMPONENTS Interpreter REQUIRED) ++set(PYTHON_EXECUTABLE ${Python3_EXECUTABLE}) + if(BUILD_ONNX_PYTHON) + find_package(PythonLibs ${PY_VERSION}) + endif() +@@ -434,6 +456,7 @@ target_link_libraries(onnx PUBLIC onnx_proto) + add_onnx_global_defines(onnx) + + if(BUILD_ONNX_PYTHON) ++ find_package(Python3 ${PY_VERSION} COMPONENTS Development REQUIRED) + if("${PY_EXT_SUFFIX}" STREQUAL "") + if(MSVC) + set(PY_EXT_SUFFIX ".pyd") +@@ -452,10 +475,14 @@ if(BUILD_ONNX_PYTHON) + target_include_directories(onnx_cpp2py_export PRIVATE + $ + $ +- $) ++ ${Python3_INCLUDE_DIRS}) ++ target_link_directories(onnx_cpp2py_export PRIVATE ++ ${Python3_LIBRARY_DIRS}) ++ target_link_libraries(onnx_cpp2py_export PRIVATE ++ ${Python3_LIBRARIES}) + + # pybind11 is a header only lib +- find_package(pybind11 2.2 CONFIG) ++ find_package(pybind11 2.2 CONFIG REQUIRED) + if(NOT pybind11_FOUND) + if(EXISTS "${ONNX_ROOT}/third_party/pybind11/include/pybind11/pybind11.h") + add_subdirectory("${ONNX_ROOT}/third_party/pybind11") diff --git a/cmake/vcpkg-ports/onnx/fix-dependency-protobuf.patch b/cmake/vcpkg-ports/onnx/fix-dependency-protobuf.patch new file mode 100644 index 0000000000000..c435922d0103d --- /dev/null +++ b/cmake/vcpkg-ports/onnx/fix-dependency-protobuf.patch @@ -0,0 +1,28 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index d81ac1d..9f97998 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -149,6 +149,7 @@ if(ONNX_BUILD_TESTS) + set(googletest_STATIC_LIBRARIES GTest::gtest) + endif() + ++find_package(protobuf CONFIG REQUIRED) + if((ONNX_USE_LITE_PROTO AND TARGET protobuf::libprotobuf-lite) OR ((NOT ONNX_USE_LITE_PROTO) AND TARGET protobuf::libprotobuf)) + # Sometimes we need to use protoc compiled for host architecture while linking + # libprotobuf against target architecture. 
See https://github.com/caffe2/caffe +diff --git a/cmake/ONNXConfig.cmake.in b/cmake/ONNXConfig.cmake.in +index d588f8a..dbd4398 100644 +--- a/cmake/ONNXConfig.cmake.in ++++ b/cmake/ONNXConfig.cmake.in +@@ -6,9 +6,8 @@ + # library version information + set(ONNX_VERSION "@ONNX_VERSION@") + +-list(APPEND CMAKE_PREFIX_PATH "@PROTOBUF_DIR@") +-set(Protobuf_INCLUDE_DIR "@PROTOBUF_INCLUDE_DIR@") +-find_package(Protobuf REQUIRED) ++include(CMakeFindDependencyMacro) ++find_dependency(protobuf CONFIG) + + # import targets + include ("${CMAKE_CURRENT_LIST_DIR}/ONNXTargets.cmake") diff --git a/cmake/vcpkg-ports/onnx/portfile.cmake b/cmake/vcpkg-ports/onnx/portfile.cmake new file mode 100644 index 0000000000000..a0c997803870d --- /dev/null +++ b/cmake/vcpkg-ports/onnx/portfile.cmake @@ -0,0 +1,83 @@ +vcpkg_check_linkage(ONLY_STATIC_LIBRARY) + +vcpkg_from_github( + OUT_SOURCE_PATH SOURCE_PATH + REPO onnx/onnx + REF "v${VERSION}" + SHA512 5a18e2b19ec9c18c8b115fb7e12ed98eddaa581c95f15c4dd420cd6c86e7caa04f9a393da589e76b89cf9b3544abd3749a8c77c2446782f37502eb74e9b1f661 + PATCHES + fix-cmakelists.patch + fix-dependency-protobuf.patch +) + +string(COMPARE EQUAL "${VCPKG_CRT_LINKAGE}" "static" USE_STATIC_RUNTIME) + +# ONNX_USE_PROTOBUF_SHARED_LIBS: find the library and check its file extension +find_library(PROTOBUF_LIBPATH NAMES protobuf PATHS "${CURRENT_INSTALLED_DIR}/bin" "${CURRENT_INSTALLED_DIR}/lib" REQUIRED) +get_filename_component(PROTOBUF_LIBNAME "${PROTOBUF_LIBPATH}" NAME) + +set(USE_PROTOBUF_SHARED OFF) + + + +# Like protoc, python is required for codegen. +vcpkg_find_acquire_program(PYTHON3) + +# PATH for .bat scripts so it can find 'python' +get_filename_component(PYTHON_DIR "${PYTHON3}" PATH) +vcpkg_add_to_path(PREPEND "${PYTHON_DIR}") + +vcpkg_cmake_configure( + SOURCE_PATH "${SOURCE_PATH}" + OPTIONS + ${FEATURE_OPTIONS} + -DPython3_EXECUTABLE=${PYTHON3} + -DONNX_ML=ON + -DONNX_GEN_PB_TYPE_STUBS=ON + -DONNX_USE_PROTOBUF_SHARED_LIBS=${USE_PROTOBUF_SHARED} + -DONNX_USE_LITE_PROTO=OFF + -DONNX_USE_MSVC_STATIC_RUNTIME=${USE_STATIC_RUNTIME} + -DONNX_BUILD_TESTS=OFF + -DONNX_BUILD_BENCHMARKS=OFF + -DONNX_DISABLE_STATIC_REGISTRATION=ON + MAYBE_UNUSED_VARIABLES + ONNX_USE_MSVC_STATIC_RUNTIME +) + +vcpkg_cmake_install() +vcpkg_cmake_config_fixup(CONFIG_PATH lib/cmake/ONNX) + +vcpkg_install_copyright(FILE_LIST "${SOURCE_PATH}/LICENSE") + +file(REMOVE_RECURSE + "${CURRENT_PACKAGES_DIR}/debug/include" + "${CURRENT_PACKAGES_DIR}/debug/share" + # the others are empty + "${CURRENT_PACKAGES_DIR}/include/onnx/backend" + "${CURRENT_PACKAGES_DIR}/include/onnx/bin" + "${CURRENT_PACKAGES_DIR}/include/onnx/defs/controlflow" + "${CURRENT_PACKAGES_DIR}/include/onnx/defs/generator" + "${CURRENT_PACKAGES_DIR}/include/onnx/defs/image" + "${CURRENT_PACKAGES_DIR}/include/onnx/defs/logical" + "${CURRENT_PACKAGES_DIR}/include/onnx/defs/math" + "${CURRENT_PACKAGES_DIR}/include/onnx/defs/nn" + "${CURRENT_PACKAGES_DIR}/include/onnx/defs/object_detection" + "${CURRENT_PACKAGES_DIR}/include/onnx/defs/optional" + "${CURRENT_PACKAGES_DIR}/include/onnx/defs/quantization" + "${CURRENT_PACKAGES_DIR}/include/onnx/defs/reduction" + "${CURRENT_PACKAGES_DIR}/include/onnx/defs/rnn" + "${CURRENT_PACKAGES_DIR}/include/onnx/defs/sequence" + "${CURRENT_PACKAGES_DIR}/include/onnx/defs/text" + "${CURRENT_PACKAGES_DIR}/include/onnx/defs/traditionalml" + "${CURRENT_PACKAGES_DIR}/include/onnx/defs/training" + "${CURRENT_PACKAGES_DIR}/include/onnx/examples" + "${CURRENT_PACKAGES_DIR}/include/onnx/frontend" + 
"${CURRENT_PACKAGES_DIR}/include/onnx/onnx_cpp2py_export" + "${CURRENT_PACKAGES_DIR}/include/onnx/test" + "${CURRENT_PACKAGES_DIR}/include/onnx/tools" + "${CURRENT_PACKAGES_DIR}/include/onnx/onnx_ml" + "${CURRENT_PACKAGES_DIR}/include/onnx/onnx_data" + "${CURRENT_PACKAGES_DIR}/include/onnx/onnx_operators_ml" + "${CURRENT_PACKAGES_DIR}/include/onnx/reference/ops" + "${CURRENT_PACKAGES_DIR}/include/onnx/reference" +) diff --git a/cmake/vcpkg-ports/onnx/vcpkg.json b/cmake/vcpkg-ports/onnx/vcpkg.json new file mode 100644 index 0000000000000..7d2bbd84c05b3 --- /dev/null +++ b/cmake/vcpkg-ports/onnx/vcpkg.json @@ -0,0 +1,23 @@ +{ + "name": "onnx", + "version-semver": "1.17.0", + "description": "Open standard for machine learning interoperability", + "homepage": "https://onnx.ai", + "license": "Apache-2.0", + "supports": "!uwp", + "dependencies": [ + "protobuf", + { + "name": "protobuf", + "host": true + }, + { + "name": "vcpkg-cmake", + "host": true + }, + { + "name": "vcpkg-cmake-config", + "host": true + } + ] +} diff --git a/cmake/vcpkg-ports/pthreadpool/fix-cmakelists.patch b/cmake/vcpkg-ports/pthreadpool/fix-cmakelists.patch new file mode 100644 index 0000000000000..97fd1ac7a2bb1 --- /dev/null +++ b/cmake/vcpkg-ports/pthreadpool/fix-cmakelists.patch @@ -0,0 +1,82 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index f06aada..3c6c6e2 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -31,8 +31,6 @@ IF(CCACHE_BINARY) + ENDIF() + + # ---[ Options. +-SET(PTHREADPOOL_LIBRARY_TYPE "default" CACHE STRING "Type of library (shared, static, or default) to build") +-SET_PROPERTY(CACHE PTHREADPOOL_LIBRARY_TYPE PROPERTY STRINGS default static shared) + OPTION(PTHREADPOOL_ALLOW_DEPRECATED_API "Enable deprecated API functions" ON) + SET(PTHREADPOOL_SYNC_PRIMITIVE "default" CACHE STRING "Synchronization primitive (condvar, futex, gcd, event, or default) for worker threads") + SET_PROPERTY(CACHE PTHREADPOOL_SYNC_PRIMITIVE PROPERTY STRINGS default condvar futex gcd event) +@@ -41,7 +39,7 @@ IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^(i[3-6]86|AMD64|x86(_64)?)$") + ELSE() + OPTION(PTHREADPOOL_ENABLE_FASTPATH "Enable fast path using atomic decrement instead of atomic compare-and-swap" OFF) + ENDIF() +-IF("${CMAKE_SOURCE_DIR}" STREQUAL "${PROJECT_SOURCE_DIR}") ++IF(FALSE) + OPTION(PTHREADPOOL_BUILD_TESTS "Build pthreadpool unit tests" ON) + OPTION(PTHREADPOOL_BUILD_BENCHMARKS "Build pthreadpool micro-benchmarks" ON) + ELSE() +@@ -67,7 +65,8 @@ MACRO(PTHREADPOOL_TARGET_ENABLE_CXX11 target) + ENDMACRO() + + # ---[ Download deps +-IF(NOT DEFINED FXDIV_SOURCE_DIR) ++find_path(FXDIV_INCLUDE_DIRS "fxdiv.h") ++IF(FALSE) + MESSAGE(STATUS "Downloading FXdiv to ${CMAKE_BINARY_DIR}/FXdiv-source (define FXDIV_SOURCE_DIR to avoid it)") + CONFIGURE_FILE(cmake/DownloadFXdiv.cmake "${CMAKE_BINARY_DIR}/FXdiv-download/CMakeLists.txt") + EXECUTE_PROCESS(COMMAND "${CMAKE_COMMAND}" -G "${CMAKE_GENERATOR}" . 
+@@ -118,21 +117,13 @@ ELSE() + ENDIF() + + ADD_LIBRARY(pthreadpool_interface INTERFACE) +-TARGET_INCLUDE_DIRECTORIES(pthreadpool_interface INTERFACE include) ++TARGET_INCLUDE_DIRECTORIES(pthreadpool_interface INTERFACE $ $) + IF(NOT PTHREADPOOL_ALLOW_DEPRECATED_API) + TARGET_COMPILE_DEFINITIONS(pthreadpool_interface INTERFACE PTHREADPOOL_NO_DEPRECATED_API=1) + ENDIF() + INSTALL(FILES include/pthreadpool.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + +-IF(PTHREADPOOL_LIBRARY_TYPE STREQUAL "default") +- ADD_LIBRARY(pthreadpool ${PTHREADPOOL_SRCS}) +-ELSEIF(PTHREADPOOL_LIBRARY_TYPE STREQUAL "shared") +- ADD_LIBRARY(pthreadpool SHARED ${PTHREADPOOL_SRCS}) +-ELSEIF(PTHREADPOOL_LIBRARY_TYPE STREQUAL "static") +- ADD_LIBRARY(pthreadpool STATIC ${PTHREADPOOL_SRCS}) +-ELSE() +- MESSAGE(FATAL_ERROR "Unsupported library type ${PTHREADPOOL_LIBRARY_TYPE}") +-ENDIF() ++ADD_LIBRARY(pthreadpool ${PTHREADPOOL_SRCS}) + + IF(PTHREADPOOL_SYNC_PRIMITIVE STREQUAL "condvar") + TARGET_COMPILE_DEFINITIONS(pthreadpool PRIVATE PTHREADPOOL_USE_FUTEX=0) +@@ -181,18 +172,22 @@ IF(CMAKE_SYSTEM_NAME STREQUAL "Linux") + ENDIF() + + # ---[ Configure FXdiv +-IF(NOT TARGET fxdiv) ++IF(FALSE) + SET(FXDIV_BUILD_TESTS OFF CACHE BOOL "") + SET(FXDIV_BUILD_BENCHMARKS OFF CACHE BOOL "") + ADD_SUBDIRECTORY( + "${FXDIV_SOURCE_DIR}" + "${CMAKE_BINARY_DIR}/FXdiv") + ENDIF() +-TARGET_LINK_LIBRARIES(pthreadpool PRIVATE fxdiv) ++TARGET_INCLUDE_DIRECTORIES(pthreadpool PRIVATE ${FXDIV_INCLUDE_DIRS}) + +-INSTALL(TARGETS pthreadpool ++INSTALL(TARGETS pthreadpool pthreadpool_interface ++ EXPORT unofficial-pthreadpool-config ++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) ++install(EXPORT unofficial-pthreadpool-config NAMESPACE unofficial:: ++ DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/unofficial-${PROJECT_NAME}) # share/unofficial-pthreadpool + + IF(PTHREADPOOL_BUILD_TESTS) + # ---[ Build google test diff --git a/cmake/vcpkg-ports/pthreadpool/portfile.cmake b/cmake/vcpkg-ports/pthreadpool/portfile.cmake new file mode 100644 index 0000000000000..9400e5e886639 --- /dev/null +++ b/cmake/vcpkg-ports/pthreadpool/portfile.cmake @@ -0,0 +1,25 @@ +if(VCPKG_TARGET_IS_WINDOWS) + vcpkg_check_linkage(ONLY_STATIC_LIBRARY) +endif() + +vcpkg_from_github( + OUT_SOURCE_PATH SOURCE_PATH + REPO google/pthreadpool + REF 4e80ca24521aa0fb3a746f9ea9c3eaa20e9afbb0 + SHA512 776017cc5d2aa94337292f2f4fbd54d099ef29abf736ab8147f07f98f12b7654cbd2fe38d34646a479a519c261ac253bbaf19c6dcbb0ec4cc0859de70f7e6472 + PATCHES + fix-cmakelists.patch +) + +vcpkg_cmake_configure( + SOURCE_PATH "${SOURCE_PATH}" + OPTIONS + -DPTHREADPOOL_BUILD_TESTS=OFF + -DPTHREADPOOL_BUILD_BENCHMARKS=OFF +) +vcpkg_cmake_install() +vcpkg_copy_pdbs() +vcpkg_cmake_config_fixup(PACKAGE_NAME unofficial-${PORT}) + +#file(INSTALL "${SOURCE_PATH}/LICENSE" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}" RENAME copyright) +file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/include") diff --git a/cmake/vcpkg-ports/pthreadpool/vcpkg.json b/cmake/vcpkg-ports/pthreadpool/vcpkg.json new file mode 100644 index 0000000000000..16c0bea5b712c --- /dev/null +++ b/cmake/vcpkg-ports/pthreadpool/vcpkg.json @@ -0,0 +1,17 @@ +{ + "name": "pthreadpool", + "version-date": "2024-12-17", + "description": "Portable (POSIX/Windows/Emscripten) thread pool for C/C++", + "homepage": "https://github.com/google/pthreadpool", + "dependencies": [ + "fxdiv", + { + "name": "vcpkg-cmake", + "host": true + }, + { + "name": "vcpkg-cmake-config", + 
"host": true + } + ] +} diff --git a/cmake/vcpkg-ports/xnnpack/disable_gcc_warning.patch b/cmake/vcpkg-ports/xnnpack/disable_gcc_warning.patch new file mode 100644 index 0000000000000..a7c5e0e254aa1 --- /dev/null +++ b/cmake/vcpkg-ports/xnnpack/disable_gcc_warning.patch @@ -0,0 +1,12 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 4a9fad59a..2713cded3 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -334,6 +334,7 @@ ENDIF() + IF(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + # Disable "note: parameter passing for argument of type ... changed/will change in ..." + ADD_COMPILE_OPTIONS("-Wno-psabi") ++ ADD_COMPILE_OPTIONS("-Wno-incompatible-pointer-types") + ENDIF() + + # ---[ Build flags diff --git a/cmake/vcpkg-ports/xnnpack/fix-build.patch b/cmake/vcpkg-ports/xnnpack/fix-build.patch new file mode 100644 index 0000000000000..b867377d2ff9e --- /dev/null +++ b/cmake/vcpkg-ports/xnnpack/fix-build.patch @@ -0,0 +1,71 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index f0b3410ae..ba54c3bfe 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -1047,9 +1047,11 @@ ENDIF() + IF(XNNPACK_BUILD_ALL_MICROKERNELS) + TARGET_INCLUDE_DIRECTORIES(microkernels-all PRIVATE include src) + ENDIF() ++ + TARGET_INCLUDE_DIRECTORIES(datatype PRIVATE include src) + TARGET_INCLUDE_DIRECTORIES(microkernels-prod PRIVATE include src) +-TARGET_INCLUDE_DIRECTORIES(hardware-config PRIVATE include src ${CPUINFO_SOURCE_DIR}/include) ++TARGET_INCLUDE_DIRECTORIES(hardware-config PRIVATE include src) ++ + TARGET_INCLUDE_DIRECTORIES(indirection PRIVATE include src) + TARGET_INCLUDE_DIRECTORIES(microparams-init PRIVATE include src) + TARGET_INCLUDE_DIRECTORIES(normalization PRIVATE include src) +@@ -1104,14 +1106,9 @@ IF(NOT TARGET cpuinfo) + "${CPUINFO_SOURCE_DIR}" + "${CMAKE_BINARY_DIR}/cpuinfo") + ELSE() +- ADD_LIBRARY(cpuinfo SHARED IMPORTED) +- FIND_LIBRARY(CPUINFO_LIBRARY cpuinfo PATHS "${CPUINFO_SOURCE_DIR}/lib") +- IF(NOT CPUINFO_LIBRARY) +- MESSAGE(FATAL_ERROR "Cannot find cpuinfo") +- ENDIF() +- TARGET_INCLUDE_DIRECTORIES(cpuinfo INTERFACE "${CPUINFO_SOURCE_DIR}/include") +- SET_PROPERTY(TARGET cpuinfo PROPERTY IMPORTED_LOCATION "${CPUINFO_LIBRARY}") +- SET_PROPERTY(TARGET cpuinfo PROPERTY IMPORTED_IMPLIB "${CPUINFO_LIBRARY}") ++ ADD_LIBRARY(cpuinfo INTERFACE) ++ FIND_PACKAGE(cpuinfo CONFIG REQUIRED) ++ TARGET_LINK_LIBRARIES(cpuinfo INTERFACE cpuinfo::cpuinfo) + ENDIF() + ENDIF() + IF(XNNPACK_BUILD_LIBRARY) +@@ -1129,16 +1126,12 @@ IF(NOT TARGET pthreadpool) + "${PTHREADPOOL_SOURCE_DIR}" + "${CMAKE_BINARY_DIR}/pthreadpool") + ELSE() ++ find_package(unofficial-pthreadpool CONFIG REQUIRED) + ADD_LIBRARY(pthreadpool SHARED IMPORTED) +- FIND_LIBRARY(PTHREADPOOL_LIBRARY pthreadpool PATHS "${PTHREADPOOL_SOURCE_DIR}/lib") +- IF(NOT PTHREADPOOL_LIBRARY) +- MESSAGE(FATAL_ERROR "Cannot find pthreadpool") +- ENDIF() ++ FIND_LIBRARY(PTHREADPOOL_LIBRARY NAMES pthreadpool REQUIRED) + FIND_PACKAGE(Threads REQUIRED) +- TARGET_INCLUDE_DIRECTORIES(pthreadpool INTERFACE "${PTHREADPOOL_SOURCE_DIR}/include") +- TARGET_LINK_LIBRARIES(pthreadpool INTERFACE Threads::Threads) ++ TARGET_LINK_LIBRARIES(pthreadpool INTERFACE Threads::Threads unofficial::pthreadpool unofficial::pthreadpool_interface) + SET_PROPERTY(TARGET pthreadpool PROPERTY IMPORTED_LOCATION "${PTHREADPOOL_LIBRARY}") +- SET_PROPERTY(TARGET pthreadpool PROPERTY IMPORTED_IMPLIB "${PTHREADPOOL_LIBRARY}") + ENDIF() + ENDIF() + TARGET_LINK_LIBRARIES(xnnpack-base INTERFACE pthreadpool) +@@ -1152,12 +1145,12 @@ IF(NOT TARGET fxdiv) + "${FXDIV_SOURCE_DIR}" + 
"${CMAKE_BINARY_DIR}/FXdiv") + ELSE() +- FIND_FILE(FXDIV_HDR fxdiv.h PATH_SUFFIXES include PATHS "${FXDIV_SOURCE_DIR}") ++ FIND_PATH(FXDIV_HDR fxdiv.h PATH_SUFFIXES include) + IF(NOT FXDIV_HDR) + MESSAGE(FATAL_ERROR "Cannot find fxdiv") + ENDIF() +- ADD_LIBRARY(fxdiv STATIC "${FXDIV_HDR}") +- TARGET_INCLUDE_DIRECTORIES(fxdiv INTERFACE "${FXDIV_SOURCE_DIR}/include") ++ ADD_LIBRARY(fxdiv INTERFACE IMPORTED) ++ target_include_directories(fxdiv INTERFACE "${FXDIV_HDR}") + SET_PROPERTY(TARGET fxdiv PROPERTY LINKER_LANGUAGE C) + ENDIF() + ENDIF() diff --git a/cmake/vcpkg-ports/xnnpack/portfile.cmake b/cmake/vcpkg-ports/xnnpack/portfile.cmake new file mode 100644 index 0000000000000..b07da3186b4b4 --- /dev/null +++ b/cmake/vcpkg-ports/xnnpack/portfile.cmake @@ -0,0 +1,39 @@ +if(VCPKG_TARGET_IS_WINDOWS) + vcpkg_check_linkage(ONLY_STATIC_LIBRARY) +endif() + +vcpkg_from_github( + OUT_SOURCE_PATH SOURCE_PATH + REPO google/XNNPACK + REF 854b343f9cad36bd596e4390959ca3648208e048 + SHA512 f37384b43022cb74bf87bd99c2e82e51d48fe4e0e4642611fcbc10cbb86ff2468b67964027f13f82a715dc7201c490d88d5020fb565ad236187b9dd219f3f644 + HEAD_REF master + PATCHES + fix-build.patch + disable_gcc_warning.patch +) +vcpkg_find_acquire_program(PYTHON3) + +vcpkg_cmake_configure( + SOURCE_PATH "${SOURCE_PATH}" + WINDOWS_USE_MSBUILD + OPTIONS + "-DPython3_EXECUTABLE=${PYTHON3}" + "-DPython_EXECUTABLE=${PYTHON3}" + -DXNNPACK_USE_SYSTEM_LIBS=ON + -DXNNPACK_ENABLE_AVXVNNI=OFF + -DXNNPACK_ENABLE_ASSEMBLY=ON + -DXNNPACK_ENABLE_MEMOPT=ON + -DXNNPACK_ENABLE_SPARSE=ON + -DXNNPACK_ENABLE_KLEIDIAI=OFF + -DXNNPACK_BUILD_TESTS=OFF + -DXNNPACK_BUILD_BENCHMARKS=OFF +) +vcpkg_cmake_install() +vcpkg_copy_pdbs() + +file(INSTALL "${SOURCE_PATH}/LICENSE" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}" RENAME copyright) +file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/include" + "${CURRENT_PACKAGES_DIR}/debug/bin" + "${CURRENT_PACKAGES_DIR}/debug/share" +) diff --git a/cmake/vcpkg-ports/xnnpack/vcpkg.json b/cmake/vcpkg-ports/xnnpack/vcpkg.json new file mode 100644 index 0000000000000..5e383c0b37810 --- /dev/null +++ b/cmake/vcpkg-ports/xnnpack/vcpkg.json @@ -0,0 +1,17 @@ +{ + "name": "xnnpack", + "version-date": "2025-01-17", + "description": "High-efficiency floating-point neural network inference operators for mobile, server, and Web", + "homepage": "https://github.com/google/XNNPACK", + "license": "BSD-3-Clause", + "supports": "!(arm & windows) & !uwp & !arm32", + "dependencies": [ + "cpuinfo", + "fxdiv", + "pthreadpool", + { + "name": "vcpkg-cmake", + "host": true + } + ] +} diff --git a/cmake/vcpkg-triplets/asan/arm64-linux.cmake b/cmake/vcpkg-triplets/asan/arm64-linux.cmake index 6875a03064bfa..9f5c9997daedb 100644 --- a/cmake/vcpkg-triplets/asan/arm64-linux.cmake +++ b/cmake/vcpkg-triplets/asan/arm64-linux.cmake @@ -3,12 +3,15 @@ set(VCPKG_TARGET_ARCHITECTURE arm64) set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) -set(VCPKG_C_FLAGS "-fsanitize=address") -set(VCPKG_CXX_FLAGS "-fsanitize=address") +set(VCPKG_C_FLAGS "-g -fsanitize=address") +set(VCPKG_CXX_FLAGS "-g -fsanitize=address") +set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3") +set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3") set(VCPKG_CMAKE_SYSTEM_NAME Linux) set(CMAKE_POSITION_INDEPENDENT_CODE ON) list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF) -set(VCPKG_LINKER_FLAGS "-fsanitize=address") +set(VCPKG_LINKER_FLAGS "-fsanitize=address -g") +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES 
"onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/asan/arm64-osx.cmake b/cmake/vcpkg-triplets/asan/arm64-osx.cmake index 4ac6bd8097de6..ba56684949836 100644 --- a/cmake/vcpkg-triplets/asan/arm64-osx.cmake +++ b/cmake/vcpkg-triplets/asan/arm64-osx.cmake @@ -3,13 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE arm64) set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) -set(VCPKG_C_FLAGS "-fsanitize=address") -set(VCPKG_CXX_FLAGS "-fsanitize=address") +set(VCPKG_C_FLAGS "-g -fsanitize=address") +set(VCPKG_CXX_FLAGS "-g -fsanitize=address") +set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3") +set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3") set(VCPKG_CMAKE_SYSTEM_NAME Darwin) set(VCPKG_OSX_ARCHITECTURES "arm64") set(CMAKE_POSITION_INDEPENDENT_CODE ON) list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF) -set(VCPKG_LINKER_FLAGS "-fsanitize=address") +set(VCPKG_LINKER_FLAGS "-fsanitize=address -g") +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/asan/arm64-windows-static-md.cmake b/cmake/vcpkg-triplets/asan/arm64-windows-static-md.cmake index c03c9e718fc80..79e10ad9e4436 100644 --- a/cmake/vcpkg-triplets/asan/arm64-windows-static-md.cmake +++ b/cmake/vcpkg-triplets/asan/arm64-windows-static-md.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/asan/arm64-windows-static.cmake b/cmake/vcpkg-triplets/asan/arm64-windows-static.cmake index 184001d4238b0..d0a3305b1f74a 100644 --- a/cmake/vcpkg-triplets/asan/arm64-windows-static.cmake +++ b/cmake/vcpkg-triplets/asan/arm64-windows-static.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/asan/arm64ec-windows-static-md.cmake b/cmake/vcpkg-triplets/asan/arm64ec-windows-static-md.cmake index 36176fe04033e..05a9718835ffb 100644 --- a/cmake/vcpkg-triplets/asan/arm64ec-windows-static-md.cmake +++ b/cmake/vcpkg-triplets/asan/arm64ec-windows-static-md.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS 
diff --git a/cmake/vcpkg-triplets/asan/arm64ec-windows-static-md.cmake b/cmake/vcpkg-triplets/asan/arm64ec-windows-static-md.cmake
index 36176fe04033e..05a9718835ffb 100644
--- a/cmake/vcpkg-triplets/asan/arm64ec-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/asan/arm64ec-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan/arm64ec-windows-static.cmake b/cmake/vcpkg-triplets/asan/arm64ec-windows-static.cmake
index aa086c1220dfb..e0f4b2e1e4183 100644
--- a/cmake/vcpkg-triplets/asan/arm64ec-windows-static.cmake
+++ b/cmake/vcpkg-triplets/asan/arm64ec-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan/universal2-osx.cmake b/cmake/vcpkg-triplets/asan/universal2-osx.cmake
index de2c8cee48ed5..d74494d578cd9 100644
--- a/cmake/vcpkg-triplets/asan/universal2-osx.cmake
+++ b/cmake/vcpkg-triplets/asan/universal2-osx.cmake
@@ -3,13 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-fsanitize=address")
-set(VCPKG_CXX_FLAGS "-fsanitize=address")
+set(VCPKG_C_FLAGS "-g -fsanitize=address")
+set(VCPKG_CXX_FLAGS "-g -fsanitize=address")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "x86_64;arm64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-fsanitize=address")
+set(VCPKG_LINKER_FLAGS "-fsanitize=address -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan/x64-linux.cmake b/cmake/vcpkg-triplets/asan/x64-linux.cmake
index dd1d066eb373a..64ba6b2216394 100644
--- a/cmake/vcpkg-triplets/asan/x64-linux.cmake
+++ b/cmake/vcpkg-triplets/asan/x64-linux.cmake
@@ -3,12 +3,15 @@ set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-fsanitize=address")
-set(VCPKG_CXX_FLAGS "-fsanitize=address")
+set(VCPKG_C_FLAGS "-g -fsanitize=address")
+set(VCPKG_CXX_FLAGS "-g -fsanitize=address")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-fsanitize=address")
+set(VCPKG_LINKER_FLAGS "-fsanitize=address -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan/x64-osx.cmake b/cmake/vcpkg-triplets/asan/x64-osx.cmake
index 5f1442c1d5c4e..bbcaff4c39209 100644
--- a/cmake/vcpkg-triplets/asan/x64-osx.cmake
+++ b/cmake/vcpkg-triplets/asan/x64-osx.cmake
@@ -3,13 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-fsanitize=address")
-set(VCPKG_CXX_FLAGS "-fsanitize=address")
+set(VCPKG_C_FLAGS "-g -fsanitize=address")
+set(VCPKG_CXX_FLAGS "-g -fsanitize=address")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "x86_64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-fsanitize=address")
+set(VCPKG_LINKER_FLAGS "-fsanitize=address -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan/x64-windows-static-md.cmake b/cmake/vcpkg-triplets/asan/x64-windows-static-md.cmake
index 27f7a0190a33c..c0edb9ca31cb6 100644
--- a/cmake/vcpkg-triplets/asan/x64-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/asan/x64-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan/x64-windows-static.cmake b/cmake/vcpkg-triplets/asan/x64-windows-static.cmake
index 23b8082fbd5a3..3370987c55a12 100644
--- a/cmake/vcpkg-triplets/asan/x64-windows-static.cmake
+++ b/cmake/vcpkg-triplets/asan/x64-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan/x86-windows-static-md.cmake b/cmake/vcpkg-triplets/asan/x86-windows-static-md.cmake
index cb9c639049936..429a4ac7cea36 100644
--- a/cmake/vcpkg-triplets/asan/x86-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/asan/x86-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan/x86-windows-static.cmake b/cmake/vcpkg-triplets/asan/x86-windows-static.cmake
index 0667f5f0ea61e..404cb3fbd07fb 100644
--- a/cmake/vcpkg-triplets/asan/x86-windows-static.cmake
+++ b/cmake/vcpkg-triplets/asan/x86-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
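Every triplet in the asan family above receives the same three changes: -g so debug symbols are always emitted, explicit -DNDEBUG -O3 release flags (presumably to keep release optimization once custom per-configuration flags are in play), and a pinned C++ standard (17 on Linux and Windows, 20 on macOS). Assuming the overlay layout added by this change, a single port can be built under one of these triplets with an invocation along these lines (illustrative, run from the repository root):

    vcpkg install xnnpack --triplet x64-linux \
        --overlay-ports=cmake/vcpkg-ports \
        --overlay-triplets=cmake/vcpkg-triplets/asan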
"-DNDEBUG -O3") +set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3") set(VCPKG_CMAKE_SYSTEM_NAME Darwin) set(VCPKG_OSX_ARCHITECTURES "arm64") set(CMAKE_POSITION_INDEPENDENT_CODE ON) list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF) -set(VCPKG_LINKER_FLAGS "-fsanitize=address") +set(VCPKG_LINKER_FLAGS "-fsanitize=address -g") +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/asan_nortti/arm64-windows-static-md.cmake b/cmake/vcpkg-triplets/asan_nortti/arm64-windows-static-md.cmake index cb0957791f432..c4ba82b7cac2a 100644 --- a/cmake/vcpkg-triplets/asan_nortti/arm64-windows-static-md.cmake +++ b/cmake/vcpkg-triplets/asan_nortti/arm64-windows-static-md.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus /GR- /we4541") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/asan_nortti/arm64-windows-static.cmake b/cmake/vcpkg-triplets/asan_nortti/arm64-windows-static.cmake index 2d38883062bb1..3b028c4e40bcc 100644 --- a/cmake/vcpkg-triplets/asan_nortti/arm64-windows-static.cmake +++ b/cmake/vcpkg-triplets/asan_nortti/arm64-windows-static.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus /GR- /we4541") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/asan_nortti/arm64ec-windows-static-md.cmake b/cmake/vcpkg-triplets/asan_nortti/arm64ec-windows-static-md.cmake index 4cc7102bf3b1c..d2d4bda334e38 100644 --- a/cmake/vcpkg-triplets/asan_nortti/arm64ec-windows-static-md.cmake +++ b/cmake/vcpkg-triplets/asan_nortti/arm64ec-windows-static-md.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus /GR- /we4541") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS 
"-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/asan_nortti/arm64ec-windows-static.cmake b/cmake/vcpkg-triplets/asan_nortti/arm64ec-windows-static.cmake index d84533c8de35c..8e986eb139862 100644 --- a/cmake/vcpkg-triplets/asan_nortti/arm64ec-windows-static.cmake +++ b/cmake/vcpkg-triplets/asan_nortti/arm64ec-windows-static.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus /GR- /we4541") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/asan_nortti/universal2-osx.cmake b/cmake/vcpkg-triplets/asan_nortti/universal2-osx.cmake index cacbfa751677d..6181e6d1c161b 100644 --- a/cmake/vcpkg-triplets/asan_nortti/universal2-osx.cmake +++ b/cmake/vcpkg-triplets/asan_nortti/universal2-osx.cmake @@ -3,13 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE x64) set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) -set(VCPKG_C_FLAGS "-fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") -set(VCPKG_CXX_FLAGS "-fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti") +set(VCPKG_C_FLAGS "-g -fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") +set(VCPKG_CXX_FLAGS "-g -fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti") +set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3") +set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3") set(VCPKG_CMAKE_SYSTEM_NAME Darwin) set(VCPKG_OSX_ARCHITECTURES "x86_64;arm64") set(CMAKE_POSITION_INDEPENDENT_CODE ON) list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF) -set(VCPKG_LINKER_FLAGS "-fsanitize=address") +set(VCPKG_LINKER_FLAGS "-fsanitize=address -g") +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/asan_nortti/x64-linux.cmake b/cmake/vcpkg-triplets/asan_nortti/x64-linux.cmake index b53e668a64c09..d7103ff2508bf 100644 --- a/cmake/vcpkg-triplets/asan_nortti/x64-linux.cmake +++ b/cmake/vcpkg-triplets/asan_nortti/x64-linux.cmake @@ -3,12 +3,15 @@ set(VCPKG_TARGET_ARCHITECTURE x64) set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) -set(VCPKG_C_FLAGS "-fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") -set(VCPKG_CXX_FLAGS "-fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti") +set(VCPKG_C_FLAGS "-g -fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") +set(VCPKG_CXX_FLAGS "-g -fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti") +set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3") +set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3") set(VCPKG_CMAKE_SYSTEM_NAME Linux) set(CMAKE_POSITION_INDEPENDENT_CODE ON) list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF) -set(VCPKG_LINKER_FLAGS "-fsanitize=address") +set(VCPKG_LINKER_FLAGS "-fsanitize=address -g") +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") 
diff --git a/cmake/vcpkg-triplets/asan_nortti/universal2-osx.cmake b/cmake/vcpkg-triplets/asan_nortti/universal2-osx.cmake
index cacbfa751677d..6181e6d1c161b 100644
--- a/cmake/vcpkg-triplets/asan_nortti/universal2-osx.cmake
+++ b/cmake/vcpkg-triplets/asan_nortti/universal2-osx.cmake
@@ -3,13 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
-set(VCPKG_CXX_FLAGS "-fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS "-g -fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
+set(VCPKG_CXX_FLAGS "-g -fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "x86_64;arm64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-fsanitize=address")
+set(VCPKG_LINKER_FLAGS "-fsanitize=address -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan_nortti/x64-linux.cmake b/cmake/vcpkg-triplets/asan_nortti/x64-linux.cmake
index b53e668a64c09..d7103ff2508bf 100644
--- a/cmake/vcpkg-triplets/asan_nortti/x64-linux.cmake
+++ b/cmake/vcpkg-triplets/asan_nortti/x64-linux.cmake
@@ -3,12 +3,15 @@ set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
-set(VCPKG_CXX_FLAGS "-fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS "-g -fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
+set(VCPKG_CXX_FLAGS "-g -fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-fsanitize=address")
+set(VCPKG_LINKER_FLAGS "-fsanitize=address -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan_nortti/x64-osx.cmake b/cmake/vcpkg-triplets/asan_nortti/x64-osx.cmake
index 9f4adb513edd4..191dfb3d35d10 100644
--- a/cmake/vcpkg-triplets/asan_nortti/x64-osx.cmake
+++ b/cmake/vcpkg-triplets/asan_nortti/x64-osx.cmake
@@ -3,13 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
-set(VCPKG_CXX_FLAGS "-fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS "-g -fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
+set(VCPKG_CXX_FLAGS "-g -fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "x86_64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-fsanitize=address")
+set(VCPKG_LINKER_FLAGS "-fsanitize=address -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan_nortti/x64-windows-static-md.cmake b/cmake/vcpkg-triplets/asan_nortti/x64-windows-static-md.cmake
index 2812ed9419e43..ae3f00b851145 100644
--- a/cmake/vcpkg-triplets/asan_nortti/x64-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/asan_nortti/x64-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan_nortti/x64-windows-static.cmake b/cmake/vcpkg-triplets/asan_nortti/x64-windows-static.cmake
index ccdb919b3e3ee..d64f20d3ce7f6 100644
--- a/cmake/vcpkg-triplets/asan_nortti/x64-windows-static.cmake
+++ b/cmake/vcpkg-triplets/asan_nortti/x64-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan_nortti/x86-windows-static-md.cmake b/cmake/vcpkg-triplets/asan_nortti/x86-windows-static-md.cmake
index 7a6b45666a924..24ddfa43c0f59 100644
--- a/cmake/vcpkg-triplets/asan_nortti/x86-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/asan_nortti/x86-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan_nortti/x86-windows-static.cmake b/cmake/vcpkg-triplets/asan_nortti/x86-windows-static.cmake
index 96b2a2ad749b8..53fcb44313c26 100644
--- a/cmake/vcpkg-triplets/asan_nortti/x86-windows-static.cmake
+++ b/cmake/vcpkg-triplets/asan_nortti/x86-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
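The asan_nortti variants differ from the plain asan triplets only in how they disable RTTI: POSIX builds add -fno-rtti to the C++ flags (with EMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 defined for code that keys off it), while MSVC builds use /GR- together with /we4541, which promotes warning C4541 ("'typeid' used on polymorphic class with /GR-") to a hard error instead of allowing unpredictable runtime behavior. A consumer target linking these static libraries should follow the same convention; a minimal sketch (my_target is a placeholder, not from this change):

    if(MSVC)
      target_compile_options(my_target PRIVATE /GR- /we4541)  # no RTTI; typeid misuse is an error
    else()
      target_compile_options(my_target PRIVATE -fno-rtti)
    endif()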
diff --git a/cmake/vcpkg-triplets/binskim/arm64-linux.cmake b/cmake/vcpkg-triplets/binskim/arm64-linux.cmake
index 4b738553e0fbc..8a3cf645d7f5f 100644
--- a/cmake/vcpkg-triplets/binskim/arm64-linux.cmake
+++ b/cmake/vcpkg-triplets/binskim/arm64-linux.cmake
@@ -3,12 +3,15 @@ set(VCPKG_TARGET_ARCHITECTURE arm64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
-set(VCPKG_CXX_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
+set(VCPKG_C_FLAGS "-g")
+set(VCPKG_CXX_FLAGS "-g")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
 set(VCPKG_CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack")
+set(VCPKG_LINKER_FLAGS "-Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/binskim/arm64-osx.cmake b/cmake/vcpkg-triplets/binskim/arm64-osx.cmake
index 4b6999874b111..9892a3eac80e8 100644
--- a/cmake/vcpkg-triplets/binskim/arm64-osx.cmake
+++ b/cmake/vcpkg-triplets/binskim/arm64-osx.cmake
@@ -3,12 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE arm64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
-set(VCPKG_CXX_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
+set(VCPKG_C_FLAGS "-g")
+set(VCPKG_CXX_FLAGS "-g")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "arm64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
+set(VCPKG_LINKER_FLAGS "-g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/binskim/arm64-windows-static-md.cmake b/cmake/vcpkg-triplets/binskim/arm64-windows-static-md.cmake
index 89dfae4bcbf26..3818356b5c0ce 100644
--- a/cmake/vcpkg-triplets/binskim/arm64-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/binskim/arm64-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim/arm64-windows-static.cmake b/cmake/vcpkg-triplets/binskim/arm64-windows-static.cmake
index 28ef65c4d1227..ab38e9f9a9f18 100644
--- a/cmake/vcpkg-triplets/binskim/arm64-windows-static.cmake
+++ b/cmake/vcpkg-triplets/binskim/arm64-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim/arm64ec-windows-static-md.cmake b/cmake/vcpkg-triplets/binskim/arm64ec-windows-static-md.cmake
index 0c087aa1b59f7..6937aea847a8a 100644
--- a/cmake/vcpkg-triplets/binskim/arm64ec-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/binskim/arm64ec-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim/arm64ec-windows-static.cmake b/cmake/vcpkg-triplets/binskim/arm64ec-windows-static.cmake
index 8c7de3b8a97f9..84c0531033699 100644
--- a/cmake/vcpkg-triplets/binskim/arm64ec-windows-static.cmake
+++ b/cmake/vcpkg-triplets/binskim/arm64ec-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim/universal2-osx.cmake b/cmake/vcpkg-triplets/binskim/universal2-osx.cmake
index 60826f1ede770..da4c6abb39000 100644
--- a/cmake/vcpkg-triplets/binskim/universal2-osx.cmake
+++ b/cmake/vcpkg-triplets/binskim/universal2-osx.cmake
@@ -3,12 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
-set(VCPKG_CXX_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
+set(VCPKG_C_FLAGS "-g")
+set(VCPKG_CXX_FLAGS "-g")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "x86_64;arm64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
+set(VCPKG_LINKER_FLAGS "-g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/binskim/x64-linux.cmake b/cmake/vcpkg-triplets/binskim/x64-linux.cmake
index 8d7aeb2342e26..e3d4d34326409 100644
--- a/cmake/vcpkg-triplets/binskim/x64-linux.cmake
+++ b/cmake/vcpkg-triplets/binskim/x64-linux.cmake
@@ -3,12 +3,15 @@ set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
-set(VCPKG_CXX_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
+set(VCPKG_C_FLAGS "-g")
+set(VCPKG_CXX_FLAGS "-g")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
 set(VCPKG_CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack")
+set(VCPKG_LINKER_FLAGS "-Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/binskim/x64-osx.cmake b/cmake/vcpkg-triplets/binskim/x64-osx.cmake
index e391ab9eaee6d..426a35e33f747 100644
--- a/cmake/vcpkg-triplets/binskim/x64-osx.cmake
+++ b/cmake/vcpkg-triplets/binskim/x64-osx.cmake
@@ -3,12 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
-set(VCPKG_CXX_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
+set(VCPKG_C_FLAGS "-g")
+set(VCPKG_CXX_FLAGS "-g")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "x86_64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
+set(VCPKG_LINKER_FLAGS "-g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/binskim/x64-windows-static-md.cmake b/cmake/vcpkg-triplets/binskim/x64-windows-static-md.cmake
index ef67223cd0cd3..0f600d7931076 100644
--- a/cmake/vcpkg-triplets/binskim/x64-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/binskim/x64-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim/x64-windows-static.cmake b/cmake/vcpkg-triplets/binskim/x64-windows-static.cmake
index 62948a156c911..17d41775c9d06 100644
--- a/cmake/vcpkg-triplets/binskim/x64-windows-static.cmake
+++ b/cmake/vcpkg-triplets/binskim/x64-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim/x86-windows-static-md.cmake b/cmake/vcpkg-triplets/binskim/x86-windows-static-md.cmake
index 8ac022c7eee4c..cb981c264a2f1 100644
--- a/cmake/vcpkg-triplets/binskim/x86-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/binskim/x86-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim/x86-windows-static.cmake b/cmake/vcpkg-triplets/binskim/x86-windows-static.cmake
index 8fd2d29dc3d99..53342263d5ada 100644
--- a/cmake/vcpkg-triplets/binskim/x86-windows-static.cmake
+++ b/cmake/vcpkg-triplets/binskim/x86-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
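Across the binskim family the BinSkim-oriented hardening options (_FORTIFY_SOURCE=2, _GLIBCXX_ASSERTIONS, -fstack-protector-strong, plus -fstack-clash-protection -fcf-protection on x64) move from the all-configuration flags into the release-only flags. That split matches how _FORTIFY_SOURCE behaves: glibc only activates it when compiling with optimization, so it belongs next to the explicit -O3, while unoptimized debug builds simply keep -g. Assuming vcpkg concatenates the base and per-configuration flags (an inference from the triplet layout, not stated in the diff), a release compile under binskim/x64-linux would effectively see:

    -g -DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS \
        -fstack-protector-strong -fstack-clash-protection -fcf-protection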
diff --git a/cmake/vcpkg-triplets/binskim_nortti/arm64-linux.cmake b/cmake/vcpkg-triplets/binskim_nortti/arm64-linux.cmake
index c9787f460b78d..203c85fa3a59e 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/arm64-linux.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/arm64-linux.cmake
@@ -3,12 +3,15 @@ set(VCPKG_TARGET_ARCHITECTURE arm64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
-set(VCPKG_CXX_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
+set(VCPKG_CXX_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
 set(VCPKG_CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack")
+set(VCPKG_LINKER_FLAGS "-Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/binskim_nortti/arm64-osx.cmake b/cmake/vcpkg-triplets/binskim_nortti/arm64-osx.cmake
index f5866d6863cb7..c57a2401e4c0f 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/arm64-osx.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/arm64-osx.cmake
@@ -3,12 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE arm64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
-set(VCPKG_CXX_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
+set(VCPKG_CXX_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "arm64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
+set(VCPKG_LINKER_FLAGS "-g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/binskim_nortti/arm64-windows-static-md.cmake b/cmake/vcpkg-triplets/binskim_nortti/arm64-windows-static-md.cmake
index 927b110c98d45..9963cfb66f4b1 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/arm64-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/arm64-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim_nortti/arm64-windows-static.cmake b/cmake/vcpkg-triplets/binskim_nortti/arm64-windows-static.cmake
index b0419c9a0d0e0..0f4948ff076f1 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/arm64-windows-static.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/arm64-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf
/Qspectre") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus /GR- /we4541") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE") if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS diff --git a/cmake/vcpkg-triplets/binskim_nortti/arm64ec-windows-static-md.cmake b/cmake/vcpkg-triplets/binskim_nortti/arm64ec-windows-static-md.cmake index aa8b7a5f0e96b..6a5c8b9f1058a 100644 --- a/cmake/vcpkg-triplets/binskim_nortti/arm64ec-windows-static-md.cmake +++ b/cmake/vcpkg-triplets/binskim_nortti/arm64ec-windows-static-md.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus /GR- /we4541") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE") if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS diff --git a/cmake/vcpkg-triplets/binskim_nortti/arm64ec-windows-static.cmake b/cmake/vcpkg-triplets/binskim_nortti/arm64ec-windows-static.cmake index 96da5d9b1372f..668d4fb4dc420 100644 --- a/cmake/vcpkg-triplets/binskim_nortti/arm64ec-windows-static.cmake +++ b/cmake/vcpkg-triplets/binskim_nortti/arm64ec-windows-static.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus /GR- /we4541") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE") if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS diff --git a/cmake/vcpkg-triplets/binskim_nortti/universal2-osx.cmake b/cmake/vcpkg-triplets/binskim_nortti/universal2-osx.cmake index f4ef6f0c659d8..1956daf30e6d9 100644 --- a/cmake/vcpkg-triplets/binskim_nortti/universal2-osx.cmake +++ b/cmake/vcpkg-triplets/binskim_nortti/universal2-osx.cmake @@ -3,12 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE x64) set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) -set(VCPKG_C_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") -set(VCPKG_CXX_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti") +set(VCPKG_C_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") +set(VCPKG_CXX_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti") +set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong") +set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3 
diff --git a/cmake/vcpkg-triplets/binskim_nortti/universal2-osx.cmake b/cmake/vcpkg-triplets/binskim_nortti/universal2-osx.cmake
index f4ef6f0c659d8..1956daf30e6d9 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/universal2-osx.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/universal2-osx.cmake
@@ -3,12 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
-set(VCPKG_CXX_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
+set(VCPKG_CXX_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "x86_64;arm64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
+set(VCPKG_LINKER_FLAGS "-g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/binskim_nortti/x64-linux.cmake b/cmake/vcpkg-triplets/binskim_nortti/x64-linux.cmake
index 8fe977fb86e56..da17e0073980f 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/x64-linux.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/x64-linux.cmake
@@ -3,12 +3,15 @@ set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
-set(VCPKG_CXX_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
+set(VCPKG_CXX_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
 set(VCPKG_CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack")
+set(VCPKG_LINKER_FLAGS "-Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/binskim_nortti/x64-osx.cmake b/cmake/vcpkg-triplets/binskim_nortti/x64-osx.cmake
index 196018d7cf442..c74e60bc7c206 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/x64-osx.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/x64-osx.cmake
@@ -3,12 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
-set(VCPKG_CXX_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
+set(VCPKG_CXX_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "x86_64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
+set(VCPKG_LINKER_FLAGS "-g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/binskim_nortti/x64-windows-static-md.cmake b/cmake/vcpkg-triplets/binskim_nortti/x64-windows-static-md.cmake
index 38b5cbdde2d65..6491d31ae469b 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/x64-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/x64-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim_nortti/x64-windows-static.cmake b/cmake/vcpkg-triplets/binskim_nortti/x64-windows-static.cmake
index bea970b669f4f..011999df2ac99 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/x64-windows-static.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/x64-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim_nortti/x86-windows-static-md.cmake b/cmake/vcpkg-triplets/binskim_nortti/x86-windows-static-md.cmake
index e75d0c645c6a1..bf843c3e950e2 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/x86-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/x86-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim_nortti/x86-windows-static.cmake b/cmake/vcpkg-triplets/binskim_nortti/x86-windows-static.cmake
index 6de6f80d9705c..21e0858066ab8 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/x86-windows-static.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/x86-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
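binskim_nortti combines both conventions above: the release-only hardening flags of binskim plus the -fno-rtti (or /GR- /we4541) treatment of the nortti variants. One detail shared by every family is the standard pin passed to each port's configure step: -DCMAKE_CXX_STANDARD=17 on Linux and Windows, =20 on macOS, so all vcpkg-built dependencies on a platform compile against a single C++ standard. A triplet can be inspected outside of vcpkg by evaluating it in CMake script mode; a throwaway sketch (illustrative, assumes the repository root as working directory):

    # inspect-triplet.cmake -- not part of this change
    set(PORT "onnx")  # simulate the port name vcpkg defines when evaluating a triplet
    include(cmake/vcpkg-triplets/binskim_nortti/x64-linux.cmake)
    message(STATUS "release C++ flags: ${VCPKG_CXX_FLAGS_RELEASE}")
    message(STATUS "configure options: ${VCPKG_CMAKE_CONFIGURE_OPTIONS}")

Run it with `cmake -P inspect-triplet.cmake`.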
diff --git a/cmake/vcpkg-triplets/default/arm64-linux.cmake b/cmake/vcpkg-triplets/default/arm64-linux.cmake
index 581367931ba5e..120865a5b0b57 100644
--- a/cmake/vcpkg-triplets/default/arm64-linux.cmake
+++ b/cmake/vcpkg-triplets/default/arm64-linux.cmake
@@ -3,11 +3,15 @@ set(VCPKG_TARGET_ARCHITECTURE arm64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "")
-set(VCPKG_CXX_FLAGS "")
+set(VCPKG_C_FLAGS "-g")
+set(VCPKG_CXX_FLAGS "-g")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
+set(VCPKG_LINKER_FLAGS "-g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/default/arm64-osx.cmake b/cmake/vcpkg-triplets/default/arm64-osx.cmake
index 4d74306ba4e6a..02e8a3430475f 100644
--- a/cmake/vcpkg-triplets/default/arm64-osx.cmake
+++ b/cmake/vcpkg-triplets/default/arm64-osx.cmake
@@ -3,12 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE arm64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "")
-set(VCPKG_CXX_FLAGS "")
+set(VCPKG_C_FLAGS "-g")
+set(VCPKG_CXX_FLAGS "-g")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "arm64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
+set(VCPKG_LINKER_FLAGS "-g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
   list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
     "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/default/arm64-windows-static-md.cmake b/cmake/vcpkg-triplets/default/arm64-windows-static-md.cmake
index 135dc6ed6f3b5..6d5cf67665f4f 100644
--- a/cmake/vcpkg-triplets/default/arm64-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/default/arm64-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND
VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/default/arm64-windows-static.cmake b/cmake/vcpkg-triplets/default/arm64-windows-static.cmake index 56e1aebfe620b..19ca6f16cd890 100644 --- a/cmake/vcpkg-triplets/default/arm64-windows-static.cmake +++ b/cmake/vcpkg-triplets/default/arm64-windows-static.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/default/arm64ec-windows-static-md.cmake b/cmake/vcpkg-triplets/default/arm64ec-windows-static-md.cmake index 9256f07f5451d..d7982158f390c 100644 --- a/cmake/vcpkg-triplets/default/arm64ec-windows-static-md.cmake +++ b/cmake/vcpkg-triplets/default/arm64ec-windows-static-md.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/default/arm64ec-windows-static.cmake b/cmake/vcpkg-triplets/default/arm64ec-windows-static.cmake index bbdfed06fb2bc..fb14ad71c1d91 100644 --- a/cmake/vcpkg-triplets/default/arm64ec-windows-static.cmake +++ b/cmake/vcpkg-triplets/default/arm64ec-windows-static.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/default/universal2-osx.cmake b/cmake/vcpkg-triplets/default/universal2-osx.cmake index 64b19451dd64d..57386c423cf99 100644 --- a/cmake/vcpkg-triplets/default/universal2-osx.cmake +++ b/cmake/vcpkg-triplets/default/universal2-osx.cmake @@ -3,12 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE x64) set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) -set(VCPKG_C_FLAGS "") -set(VCPKG_CXX_FLAGS "") +set(VCPKG_C_FLAGS "-g") +set(VCPKG_CXX_FLAGS "-g") +set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3") 
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3") set(VCPKG_CMAKE_SYSTEM_NAME Darwin) set(VCPKG_OSX_ARCHITECTURES "x86_64;arm64") set(CMAKE_POSITION_INDEPENDENT_CODE ON) list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF) +set(VCPKG_LINKER_FLAGS "-g") +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/default/x64-linux.cmake b/cmake/vcpkg-triplets/default/x64-linux.cmake index 57114dd5fcb76..30c7b1b786302 100644 --- a/cmake/vcpkg-triplets/default/x64-linux.cmake +++ b/cmake/vcpkg-triplets/default/x64-linux.cmake @@ -3,11 +3,15 @@ set(VCPKG_TARGET_ARCHITECTURE x64) set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) -set(VCPKG_C_FLAGS "") -set(VCPKG_CXX_FLAGS "") +set(VCPKG_C_FLAGS "-g") +set(VCPKG_CXX_FLAGS "-g") +set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3") +set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3") set(VCPKG_CMAKE_SYSTEM_NAME Linux) set(CMAKE_POSITION_INDEPENDENT_CODE ON) list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF) +set(VCPKG_LINKER_FLAGS "-g") +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/default/x64-osx.cmake b/cmake/vcpkg-triplets/default/x64-osx.cmake index dd50e622677b7..7af622e1354b9 100644 --- a/cmake/vcpkg-triplets/default/x64-osx.cmake +++ b/cmake/vcpkg-triplets/default/x64-osx.cmake @@ -3,12 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE x64) set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) -set(VCPKG_C_FLAGS "") -set(VCPKG_CXX_FLAGS "") +set(VCPKG_C_FLAGS "-g") +set(VCPKG_CXX_FLAGS "-g") +set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3") +set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3") set(VCPKG_CMAKE_SYSTEM_NAME Darwin) set(VCPKG_OSX_ARCHITECTURES "x86_64") set(CMAKE_POSITION_INDEPENDENT_CODE ON) list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF) +set(VCPKG_LINKER_FLAGS "-g") +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/default/x64-windows-static-md.cmake b/cmake/vcpkg-triplets/default/x64-windows-static-md.cmake index 5339a033715bb..bec5f2724da13 100644 --- a/cmake/vcpkg-triplets/default/x64-windows-static-md.cmake +++ b/cmake/vcpkg-triplets/default/x64-windows-static-md.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/default/x64-windows-static.cmake b/cmake/vcpkg-triplets/default/x64-windows-static.cmake index 579740efb6ab7..3f62418071583 100644 --- a/cmake/vcpkg-triplets/default/x64-windows-static.cmake +++ 
b/cmake/vcpkg-triplets/default/x64-windows-static.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/default/x86-windows-static-md.cmake b/cmake/vcpkg-triplets/default/x86-windows-static-md.cmake index 34223c67e8f44..d93d87b3289f3 100644 --- a/cmake/vcpkg-triplets/default/x86-windows-static-md.cmake +++ b/cmake/vcpkg-triplets/default/x86-windows-static-md.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/default/x86-windows-static.cmake b/cmake/vcpkg-triplets/default/x86-windows-static.cmake index fc95d409f890e..727b35cd1f7cc 100644 --- a/cmake/vcpkg-triplets/default/x86-windows-static.cmake +++ b/cmake/vcpkg-triplets/default/x86-windows-static.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/gen.py b/cmake/vcpkg-triplets/gen.py index 615ca66fc8f64..bec1a87a0a20e 100644 --- a/cmake/vcpkg-triplets/gen.py +++ b/cmake/vcpkg-triplets/gen.py @@ -88,9 +88,11 @@ def add_copyright_header(f): # Disable RTTI and turn usage of dynamic_cast and typeid into errors cxxflags += ["/GR-", "/we4541"] # TODO: should it be a cmake list separated by semicolons? 
- f.write('set(VCPKG_C_FLAGS "{}")\n'.format(" ".join(cflags))) - f.write('set(VCPKG_CXX_FLAGS "{}")\n'.format(" ".join(cxxflags))) - f.write("list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)\n") + if len(cflags) >= 1: + f.write('set(VCPKG_C_FLAGS "{}")\n'.format(" ".join(cflags))) + if len(cxxflags) >= 1: + f.write('set(VCPKG_CXX_FLAGS "{}")\n'.format(" ".join(cxxflags))) + f.write("list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)\n") if ldflags: f.write('set(VCPKG_LINKER_FLAGS "{}")\n'.format(" ".join(ldflags))) add_port_configs(f) @@ -135,27 +137,34 @@ def add_copyright_header(f): f.write(f"set(VCPKG_TARGET_ARCHITECTURE {target_abi})\n") f.write(f"set(VCPKG_CRT_LINKAGE {crt_linkage})\n") f.write("set(VCPKG_LIBRARY_LINKAGE static)\n") + + if enable_binskim and os_name == "linux": ldflags = [ "-Wl,-Bsymbolic-functions", "-Wl,-z,relro", "-Wl,-z,now", - "-Wl,-z,noexecstack", + "-Wl,-z,noexecstack" ] else: ldflags = [] - cflags = [] + # Enable debug info for all build configs + cflags = ["-g"] + cflags_release = ["-DNDEBUG", "-O3"] if enable_binskim: - cflags += [ + # A warning may be generated from include/features.h if the _FORTIFY_SOURCE flag was used in a debug build + cflags_release += [ "-Wp,-D_FORTIFY_SOURCE=2", "-Wp,-D_GLIBCXX_ASSERTIONS", "-fstack-protector-strong", ] if target_abi == "x64": - cflags += ["-fstack-clash-protection", "-fcf-protection"] + cflags_release += ["-fstack-clash-protection", "-fcf-protection"] elif enable_asan: cflags += ["-fsanitize=address"] ldflags += ["-fsanitize=address"] + # Enable debug info for all build configs + ldflags.append('-g') # Avoid unboundTypeError for WebNN EP since unbound type names are illegal with RTTI disabled # in Embind API, relevant issue: https://github.com/emscripten-core/emscripten/issues/7001 if not enable_rtti: @@ -163,8 +172,13 @@ def add_copyright_header(f): cxxflags = cflags.copy() if not enable_rtti: cxxflags.append("-fno-rtti") - f.write('set(VCPKG_C_FLAGS "{}")\n'.format(" ".join(cflags))) - f.write('set(VCPKG_CXX_FLAGS "{}")\n'.format(" ".join(cxxflags))) + if len(cflags) >= 1: + f.write('set(VCPKG_C_FLAGS "{}")\n'.format(" ".join(cflags))) + if len(cxxflags) >= 1: + f.write('set(VCPKG_CXX_FLAGS "{}")\n'.format(" ".join(cxxflags))) + if len(cflags_release) >= 1: + f.write('set(VCPKG_C_FLAGS_RELEASE "{}")\n'.format(" ".join(cflags_release))) + f.write('set(VCPKG_CXX_FLAGS_RELEASE "{}")\n'.format(" ".join(cflags_release))) if os_name == "linux": f.write("set(VCPKG_CMAKE_SYSTEM_NAME Linux)\n") else: @@ -184,4 +198,8 @@ def add_copyright_header(f): if ldflags: f.write('set(VCPKG_LINKER_FLAGS "{}")\n'.format(" ".join(ldflags))) + if os_name == 'osx': + f.write('list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)\n') + else: + f.write('list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17)\n') add_port_configs(f) diff --git a/cmake/vcpkg-triplets/nortti/arm64-linux.cmake b/cmake/vcpkg-triplets/nortti/arm64-linux.cmake index 4bd974a112125..f9035fc299ce5 100644 --- a/cmake/vcpkg-triplets/nortti/arm64-linux.cmake +++ b/cmake/vcpkg-triplets/nortti/arm64-linux.cmake @@ -3,11 +3,15 @@ set(VCPKG_TARGET_ARCHITECTURE arm64) set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) -set(VCPKG_C_FLAGS "-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") -set(VCPKG_CXX_FLAGS "-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti") +set(VCPKG_C_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") +set(VCPKG_CXX_FLAGS "-g 
-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti") +set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3") +set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3") set(VCPKG_CMAKE_SYSTEM_NAME Linux) set(CMAKE_POSITION_INDEPENDENT_CODE ON) list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF) +set(VCPKG_LINKER_FLAGS "-g") +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/nortti/arm64-osx.cmake b/cmake/vcpkg-triplets/nortti/arm64-osx.cmake index fd8be60b7158a..d8971e8122f9d 100644 --- a/cmake/vcpkg-triplets/nortti/arm64-osx.cmake +++ b/cmake/vcpkg-triplets/nortti/arm64-osx.cmake @@ -3,12 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE arm64) set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) -set(VCPKG_C_FLAGS "-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") -set(VCPKG_CXX_FLAGS "-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti") +set(VCPKG_C_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") +set(VCPKG_CXX_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti") +set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3") +set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3") set(VCPKG_CMAKE_SYSTEM_NAME Darwin) set(VCPKG_OSX_ARCHITECTURES "arm64") set(CMAKE_POSITION_INDEPENDENT_CODE ON) list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF) +set(VCPKG_LINKER_FLAGS "-g") +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/nortti/arm64-windows-static-md.cmake b/cmake/vcpkg-triplets/nortti/arm64-windows-static-md.cmake index 45b24baa2c94b..9d3c86ce644d0 100644 --- a/cmake/vcpkg-triplets/nortti/arm64-windows-static-md.cmake +++ b/cmake/vcpkg-triplets/nortti/arm64-windows-static-md.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus /GR- /we4541") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/nortti/arm64-windows-static.cmake b/cmake/vcpkg-triplets/nortti/arm64-windows-static.cmake index 947fe9b61bd6c..238f7405ec492 100644 --- a/cmake/vcpkg-triplets/nortti/arm64-windows-static.cmake +++ b/cmake/vcpkg-triplets/nortti/arm64-windows-static.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus /GR- /we4541") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff 
--git a/cmake/vcpkg-triplets/nortti/arm64ec-windows-static-md.cmake b/cmake/vcpkg-triplets/nortti/arm64ec-windows-static-md.cmake index ea5741fa42fbe..da314824ca7cc 100644 --- a/cmake/vcpkg-triplets/nortti/arm64ec-windows-static-md.cmake +++ b/cmake/vcpkg-triplets/nortti/arm64ec-windows-static-md.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus /GR- /we4541") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/nortti/arm64ec-windows-static.cmake b/cmake/vcpkg-triplets/nortti/arm64ec-windows-static.cmake index 2b354ba511303..0c7fb60401f1d 100644 --- a/cmake/vcpkg-triplets/nortti/arm64ec-windows-static.cmake +++ b/cmake/vcpkg-triplets/nortti/arm64ec-windows-static.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus /GR- /we4541") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/nortti/universal2-osx.cmake b/cmake/vcpkg-triplets/nortti/universal2-osx.cmake index 7111bb87c910b..febc002c0488e 100644 --- a/cmake/vcpkg-triplets/nortti/universal2-osx.cmake +++ b/cmake/vcpkg-triplets/nortti/universal2-osx.cmake @@ -3,12 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE x64) set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) -set(VCPKG_C_FLAGS "-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") -set(VCPKG_CXX_FLAGS "-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti") +set(VCPKG_C_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") +set(VCPKG_CXX_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti") +set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3") +set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3") set(VCPKG_CMAKE_SYSTEM_NAME Darwin) set(VCPKG_OSX_ARCHITECTURES "x86_64;arm64") set(CMAKE_POSITION_INDEPENDENT_CODE ON) list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF) +set(VCPKG_LINKER_FLAGS "-g") +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/nortti/x64-linux.cmake b/cmake/vcpkg-triplets/nortti/x64-linux.cmake index 34fcc968e6c0e..c1dac19d33f2c 100644 --- a/cmake/vcpkg-triplets/nortti/x64-linux.cmake +++ b/cmake/vcpkg-triplets/nortti/x64-linux.cmake @@ -3,11 +3,15 @@ set(VCPKG_TARGET_ARCHITECTURE x64) set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) -set(VCPKG_C_FLAGS "-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") -set(VCPKG_CXX_FLAGS "-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 
-fno-rtti") +set(VCPKG_C_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") +set(VCPKG_CXX_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti") +set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3") +set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3") set(VCPKG_CMAKE_SYSTEM_NAME Linux) set(CMAKE_POSITION_INDEPENDENT_CODE ON) list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF) +set(VCPKG_LINKER_FLAGS "-g") +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/nortti/x64-osx.cmake b/cmake/vcpkg-triplets/nortti/x64-osx.cmake index 0a7fcc08e0c69..242d34a358170 100644 --- a/cmake/vcpkg-triplets/nortti/x64-osx.cmake +++ b/cmake/vcpkg-triplets/nortti/x64-osx.cmake @@ -3,12 +3,16 @@ set(VCPKG_TARGET_ARCHITECTURE x64) set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) -set(VCPKG_C_FLAGS "-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") -set(VCPKG_CXX_FLAGS "-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti") +set(VCPKG_C_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") +set(VCPKG_CXX_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti") +set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3") +set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3") set(VCPKG_CMAKE_SYSTEM_NAME Darwin) set(VCPKG_OSX_ARCHITECTURES "x86_64") set(CMAKE_POSITION_INDEPENDENT_CODE ON) list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF) +set(VCPKG_LINKER_FLAGS "-g") +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/nortti/x64-windows-static-md.cmake b/cmake/vcpkg-triplets/nortti/x64-windows-static-md.cmake index 5c62c4263f8eb..a8d2441583d0f 100644 --- a/cmake/vcpkg-triplets/nortti/x64-windows-static-md.cmake +++ b/cmake/vcpkg-triplets/nortti/x64-windows-static-md.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus /GR- /we4541") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/nortti/x64-windows-static.cmake b/cmake/vcpkg-triplets/nortti/x64-windows-static.cmake index deceefcd95910..688ed230fd17c 100644 --- a/cmake/vcpkg-triplets/nortti/x64-windows-static.cmake +++ b/cmake/vcpkg-triplets/nortti/x64-windows-static.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus /GR- /we4541") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND 
VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/nortti/x86-windows-static-md.cmake b/cmake/vcpkg-triplets/nortti/x86-windows-static-md.cmake index cb1b3cd887932..1d3de9c1420c4 100644 --- a/cmake/vcpkg-triplets/nortti/x86-windows-static-md.cmake +++ b/cmake/vcpkg-triplets/nortti/x86-windows-static-md.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus /GR- /we4541") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/cmake/vcpkg-triplets/nortti/x86-windows-static.cmake b/cmake/vcpkg-triplets/nortti/x86-windows-static.cmake index 032021745099a..3a856c26797a4 100644 --- a/cmake/vcpkg-triplets/nortti/x86-windows-static.cmake +++ b/cmake/vcpkg-triplets/nortti/x86-windows-static.cmake @@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000") set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus /GR- /we4541") -list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error) +list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17) if(PORT MATCHES "onnx") list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS "-DONNX_DISABLE_STATIC_REGISTRATION=ON" diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj index 63131d05c03d5..be0e8d2ee58a4 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj +++ b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj @@ -1,4 +1,4 @@ - + Microsoft.ML.OnnxRuntime @@ -127,6 +127,11 @@ $(OrtConstants);__ENABLE_TRAINING_APIS__ + + + true + + @@ -184,6 +189,10 @@ + + + + diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/OrtValue.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/OrtValue.shared.cs index d38748c2f97cc..7a5c3aaa19eac 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/OrtValue.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/OrtValue.shared.cs @@ -9,6 +9,14 @@ using System.Runtime.InteropServices; using System.Text; +#if NET8_0_OR_GREATER +using System.Diagnostics.CodeAnalysis; +using System.Reflection; +using System.Runtime.CompilerServices; +using SystemNumericsTensors = System.Numerics.Tensors; +using TensorPrimitives = System.Numerics.Tensors.TensorPrimitives; +#endif + namespace Microsoft.ML.OnnxRuntime { /// @@ -205,6 +213,33 @@ public ReadOnlySpan GetTensorDataAsSpan() where T : unmanaged return MemoryMarshal.Cast(byteSpan); } +#if NET8_0_OR_GREATER + /// + /// Returns a ReadOnlyTensorSpan over tensor native buffer that + /// provides a read-only view. + /// + /// Note, that the memory may be device allocated and, therefore, not accessible from the CPU. + /// To get memory descriptor use GetTensorMemoryInfo(). 
+    ///
+    /// OrtValue must contain a non-string tensor.
+    /// The span is valid as long as the OrtValue instance is alive (not disposed).
+    /// </summary>
+    /// <typeparam name="T"></typeparam>
+    /// <returns>ReadOnlyTensorSpan<T></returns>
+    [Experimental("SYSLIB5001")]
+    public SystemNumericsTensors.ReadOnlyTensorSpan<T> GetTensorDataAsTensorSpan<T>() where T : unmanaged
+    {
+        var byteSpan = GetTensorBufferRawData(typeof(T));
+
+        var typeSpan = MemoryMarshal.Cast<byte, T>(byteSpan);
+        var shape = GetTypeInfo().TensorTypeAndShapeInfo.Shape;
+        nint[] nArray = Array.ConvertAll(shape, new Converter<long, nint>(x => (nint)x));
+
+        return new SystemNumericsTensors.ReadOnlyTensorSpan<T>(typeSpan, nArray, []);
+    }
+#endif
+
     /// <summary>
     /// Returns a Span over tensor native buffer.
     /// This enables you to safely and efficiently modify the underlying
     ///
@@ -225,6 +260,32 @@ public Span<T> GetTensorMutableDataAsSpan<T>() where T : unmanaged
         return MemoryMarshal.Cast<byte, T>(byteSpan);
     }
 
+#if NET8_0_OR_GREATER
+    /// <summary>
+    /// Returns a TensorSpan over the tensor's native buffer.
+    ///
+    /// Note that the memory may be device allocated and, therefore, not accessible from the CPU.
+    /// To get the memory descriptor, use GetTensorMemoryInfo().
+    ///
+    /// OrtValue must contain a non-string tensor.
+    /// The span is valid as long as the OrtValue instance is alive (not disposed).
+    /// </summary>
+    /// <typeparam name="T"></typeparam>
+    /// <returns>TensorSpan<T></returns>
+    [Experimental("SYSLIB5001")]
+    public SystemNumericsTensors.TensorSpan<T> GetTensorMutableDataAsTensorSpan<T>() where T : unmanaged
+    {
+        var byteSpan = GetTensorBufferRawData(typeof(T));
+
+        var typeSpan = MemoryMarshal.Cast<byte, T>(byteSpan);
+        var shape = GetTypeInfo().TensorTypeAndShapeInfo.Shape;
+        nint[] nArray = Array.ConvertAll(shape, new Converter<long, nint>(x => (nint)x));
+
+        return new SystemNumericsTensors.TensorSpan<T>(typeSpan, nArray, []);
+    }
+#endif
+
     /// <summary>
     /// Provides mutable raw native buffer access.
     /// </summary>
@@ -234,6 +295,23 @@ public Span<byte> GetTensorMutableRawData()
         return GetTensorBufferRawData(typeof(byte));
     }
 
+#if NET8_0_OR_GREATER
+    /// <summary>
+    /// Provides mutable raw native buffer access.
+    /// </summary>
+    /// <returns>TensorSpan over the native buffer bytes</returns>
+    [Experimental("SYSLIB5001")]
+    public SystemNumericsTensors.TensorSpan<byte> GetTensorSpanMutableRawData<T>() where T : unmanaged
+    {
+        var byteSpan = GetTensorBufferRawData(typeof(T));
+
+        var shape = GetTypeInfo().TensorTypeAndShapeInfo.Shape;
+        nint[] nArray = Array.ConvertAll(shape, new Converter<long, nint>(x => (nint)x));
+
+        return new SystemNumericsTensors.TensorSpan<byte>(byteSpan, nArray, []);
+    }
+#endif
+
     /// <summary>
     /// Fetch string tensor element buffer pointer at the specified index,
     /// convert/copy to UTF-16 char[] and return a ReadOnlyMemory{char} instance.
@@ -605,6 +683,80 @@ public static OrtValue CreateTensorValueFromMemory<T>(T[] data, long[] shape) wh
         return OrtValue.CreateTensorValueFromMemory(OrtMemoryInfo.DefaultInstance, new Memory<T>(data), shape);
     }
 
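To make the new span-based API concrete, here is a minimal usage sketch (an illustrative example, not part of the patch; the payload values, the 2x3 shape, and the variable names are hypothetical). It wraps a System.Numerics.Tensors.Tensor<float> in an OrtValue via the factory method added just below, then reads the data back through the ReadOnlyTensorSpan getter added above:

// Minimal usage sketch (hypothetical values; requires net8.0+).
#pragma warning disable SYSLIB5001 // System.Numerics.Tensors is still experimental
float[] payload = { 1f, 2f, 3f, 4f, 5f, 6f };
var tensor = System.Numerics.Tensors.Tensor.Create(payload, new nint[] { 2, 3 });

// Dense layout: Lengths = [2, 3] implies Strides = [3, 1], so the density check
// below passes and the managed buffer is pinned without copying. A strided view
// (e.g. Strides = [6, 2]) would first be copied into a dense tensor.
using OrtValue ortValue = OrtValue.CreateTensorValueFromSystemNumericsTensorObject(tensor);

// Read-only, copy-free view over the same native buffer.
var view = ortValue.GetTensorDataAsTensorSpan<float>();
System.Console.WriteLine($"rank={view.Rank}, elements={view.FlattenedLength}");
#pragma warning restore SYSLIB5001

The density check is what makes the zero-copy path safe: the native OrtCreateTensorWithDataAsOrtValue call sees only a flat buffer plus a shape, so a non-contiguous Tensor view must be compacted before pinning.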
+#if NET8_0_OR_GREATER
+    /// <summary>
+    /// This is a factory method that creates a native Onnxruntime OrtValue containing a tensor on top of the existing tensor's managed memory.
+    /// The method will attempt to pin managed memory so no copying occurs when data is passed down
+    /// to native code.
+    /// </summary>
+    /// <param name="tensor">Tensor object</param>
+    /// <typeparam name="T">discovered tensor element type</typeparam>
+    /// <returns>An instance of OrtValue constructed on top of the object</returns>
+    [Experimental("SYSLIB5001")]
+    public static OrtValue CreateTensorValueFromSystemNumericsTensorObject<T>(SystemNumericsTensors.Tensor<T> tensor) where T : unmanaged
+    {
+        if (!IsContiguousAndDense(tensor))
+        {
+            var newTensor = SystemNumericsTensors.Tensor.Create<T>(tensor.Lengths);
+            tensor.CopyTo(newTensor);
+            tensor = newTensor;
+        }
+        unsafe
+        {
+            // Pin the managed backing array (obtained via reflection from the private
+            // "_values" field) so the native tensor can reference it without copying.
+            var backingData = (T[])tensor.GetType().GetField("_values", BindingFlags.Instance | BindingFlags.NonPublic).GetValue(tensor);
+            GCHandle handle = GCHandle.Alloc(backingData, GCHandleType.Pinned);
+            var memHandle = new MemoryHandle(Unsafe.AsPointer(ref tensor.GetPinnableReference()), handle);
+
+            try
+            {
+                IntPtr dataBufferPointer = IntPtr.Zero;
+                unsafe
+                {
+                    dataBufferPointer = (IntPtr)memHandle.Pointer;
+                }
+
+                var bufferLengthInBytes = tensor.FlattenedLength * sizeof(T);
+                long[] shape = Array.ConvertAll(tensor.Lengths.ToArray(), new Converter<nint, long>(x => (long)x));
+
+                var typeInfo = TensorBase.GetTypeInfo(typeof(T)) ??
+                    throw new OnnxRuntimeException(ErrorCode.InvalidArgument, $"Tensor of type: {typeof(T)} is not supported");
+
+                NativeApiStatus.VerifySuccess(NativeMethods.OrtCreateTensorWithDataAsOrtValue(
+                    OrtMemoryInfo.DefaultInstance.Pointer,
+                    dataBufferPointer,
+                    (UIntPtr)(bufferLengthInBytes),
+                    shape,
+                    (UIntPtr)tensor.Rank,
+                    typeInfo.ElementType,
+                    out IntPtr nativeValue));
+
+                return new OrtValue(nativeValue, memHandle);
+            }
+            catch (Exception)
+            {
+                memHandle.Dispose();
+                throw;
+            }
+        }
+    }
+
+    [Experimental("SYSLIB5001")]
+    private static bool IsContiguousAndDense<T>(SystemNumericsTensors.Tensor<T> tensor) where T : unmanaged
+    {
+        // The stride of the right-most dimension must be 1 for a dense tensor.
+        if (tensor.Strides[^1] != 1)
+            return false;
+
+        // For the other dimensions, the stride must equal the product of the dimensions to the right.
+        for (int i = tensor.Rank - 2; i >= 0; i--)
+        {
+            if (tensor.Strides[i] != TensorPrimitives.Product(tensor.Lengths.Slice(i + 1, tensor.Lengths.Length - i - 1)))
+                return false;
+        }
+        return true;
+    }
+#endif
+
     /// <summary>
     /// The factory API creates an OrtValue with memory allocated using the given allocator
     /// according to the specified shape and element type.
The memory will be released when OrtValue diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs index ff5fd2de54197..816511150a137 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs @@ -7,6 +7,10 @@ using System.Text.RegularExpressions; using Xunit; +#if NET8_0_OR_GREATER +using SystemNumericsTensors = System.Numerics.Tensors; +#endif + namespace Microsoft.ML.OnnxRuntime.Tests { /// @@ -67,6 +71,194 @@ public void CanCreateAndDisposeSessionWithModelPath() } } +#if NET8_0_OR_GREATER +#pragma warning disable SYSLIB5001 // System.Numerics.Tensors is only in preview so we can continue receiving API feedback + [Theory] + [InlineData(GraphOptimizationLevel.ORT_DISABLE_ALL, true)] + [InlineData(GraphOptimizationLevel.ORT_DISABLE_ALL, false)] + [InlineData(GraphOptimizationLevel.ORT_ENABLE_EXTENDED, true)] + [InlineData(GraphOptimizationLevel.ORT_ENABLE_EXTENDED, false)] + private void CanRunInferenceOnAModelDotnetTensors(GraphOptimizationLevel graphOptimizationLevel, bool enableParallelExecution) + { + var model = TestDataLoader.LoadModelFromEmbeddedResource("squeezenet.onnx"); + + using (var cleanUp = new DisposableListTest()) + { + // Set the graph optimization level for this session. + SessionOptions options = new SessionOptions(); + cleanUp.Add(options); + options.GraphOptimizationLevel = graphOptimizationLevel; + + var session = new InferenceSession(model, options); + cleanUp.Add(session); + + using var runOptions = new RunOptions(); + var inputMeta = session.InputMetadata; + var outputMeta = session.OutputMetadata; + + float[] expectedOutput = TestDataLoader.LoadTensorFromEmbeddedResource("bench.expected_out"); + long[] expectedDimensions = { 1, 1000, 1, 1 }; // hardcoded for now for the test data + ReadOnlySpan expectedOutputDimensions = expectedDimensions; + + float[] inputData = TestDataLoader.LoadTensorFromEmbeddedResource("bench.in"); // this is the data for only one input tensor for this model + + using var inputOrtValues = new DisposableListTest>(session.InputMetadata.Count); + + foreach (var name in inputMeta.Keys) + { + Assert.Equal(typeof(float), inputMeta[name].ElementType); + Assert.True(inputMeta[name].IsTensor); + var tensor = SystemNumericsTensors.Tensor.Create(inputData, inputMeta[name].Dimensions.Select(x => (nint)x).ToArray()); + inputOrtValues.Add(new DisposableTestPair(name, OrtValue.CreateTensorValueFromSystemNumericsTensorObject(tensor))); + + } + + runOptions.LogId = "CsharpTest"; + runOptions.Terminate = false; // TODO: Test terminate = true, it currently crashes + runOptions.LogSeverityLevel = OrtLoggingLevel.ORT_LOGGING_LEVEL_ERROR; + // Run inference with named inputs and outputs created with in Run() + using (var results = session.Run(runOptions, inputOrtValues.Select(x => x.Key).ToList(), inputOrtValues.Select(x => x.Value).ToList(), new List(["softmaxout_1"]))) // results is an IDisposableReadOnlyCollection container + { + // validate the results + foreach (var r in results) + { + Assert.Single(results); + + ValidateRunResult(r, expectedOutput, expectedDimensions); + } + } + } + } + + [Fact] + public void InferenceSessionDisposedDotnetTensors() + { + var model = TestDataLoader.LoadModelFromEmbeddedResource("squeezenet.onnx"); + + // Set the graph optimization level for this session. 
+ using (SessionOptions options = new SessionOptions()) + { + options.ProfileOutputPathPrefix = "Ort_P_"; + options.EnableProfiling = true; + using (var session = new InferenceSession(model, options)) + { + var inputMeta = session.InputMetadata; + var container = new List(); + + float[] inputData = TestDataLoader.LoadTensorFromEmbeddedResource("bench.in"); // this is the data for only one input tensor for this model + + using (var runOptions = new RunOptions()) + using (var inputOrtValues = new DisposableListTest>(session.InputMetadata.Count)) + using (var outputOrtValues = new DisposableListTest>(session.OutputMetadata.Count)) + { + + foreach (var name in inputMeta.Keys) + { + Assert.Equal(typeof(float), inputMeta[name].ElementType); + Assert.True(inputMeta[name].IsTensor); + var tensor = SystemNumericsTensors.Tensor.Create(inputData, inputMeta[name].Dimensions.Select(x => (nint) x).ToArray()); + inputOrtValues.Add(new DisposableTestPair(name, OrtValue.CreateTensorValueFromSystemNumericsTensorObject(tensor))); + } + + // Run inference with named inputs and outputs created with in Run() + using (var results = session.Run(runOptions, inputOrtValues.Select(x => x.Key).ToList(), inputOrtValues.Select(x => x.Value).ToList(), new List(["softmaxout_1"]))) // results is an IDisposableReadOnlyCollection container + { + // validate the results + foreach (var r in results) + { + Assert.Single(results); + + float[] expectedOutput = TestDataLoader.LoadTensorFromEmbeddedResource("bench.expected_out"); + long[] expectedDimensions = { 1, 1000, 1, 1 }; // hardcoded for now for the test data + ValidateRunResult(r, expectedOutput, expectedDimensions); + } + } + } + + string profile_file = session.EndProfiling(); + + // Profile file should have the output path prefix in it + Assert.Contains("Ort_P_", profile_file); + } + } + } + + [Fact] + private void ThrowWrongOutputNameDotnetTensors() + { + var tuple = OpenSessionSqueezeNet(); + var session = tuple.Item1; + var inputData = tuple.Item2; + var inputTensor = tuple.Item3; + + using (var runOptions = new RunOptions()) + using (var inputOrtValues = new DisposableListTest>(session.InputMetadata.Count)) + using (var outputOrtValues = new DisposableListTest>(session.OutputMetadata.Count)) + { + var tensor = SystemNumericsTensors.Tensor.Create(inputData, Array.ConvertAll(inputTensor.Dimensions.ToArray(), x => (nint)x)); + + inputOrtValues.Add(new DisposableTestPair("data_0", OrtValue.CreateTensorValueFromSystemNumericsTensorObject(tensor))); + outputOrtValues.Add(new DisposableTestPair("bad_output_name", OrtValue.CreateTensorValueFromSystemNumericsTensorObject(tensor))); + + var ex = Assert.Throws(() => session.Run(runOptions, ["data_0"], [inputOrtValues[0].Value], ["bad_output_name"], [outputOrtValues[0].Value])); + Assert.Contains("Output name: 'bad_output_name' is not in the metadata", ex.Message); + } + + session.Dispose(); + } + + [Fact] + private void ThrowWrongOutputDimensionDotnetTensors() + { + var tuple = OpenSessionSqueezeNet(); + var session = tuple.Item1; + var inputData = tuple.Item2; + var inputTensor = tuple.Item3; + var outputTensor = SystemNumericsTensors.Tensor.Create([1, 1001, 1, 1]); + + using (var runOptions = new RunOptions()) + using (var inputOrtValues = new DisposableListTest>(session.InputMetadata.Count)) + using (var outputOrtValues = new DisposableListTest>(session.OutputMetadata.Count)) + { + var tensor = SystemNumericsTensors.Tensor.Create(inputData, Array.ConvertAll(inputTensor.Dimensions.ToArray(), x => (nint)x)); + + 
inputOrtValues.Add(new DisposableTestPair("data_0", OrtValue.CreateTensorValueFromSystemNumericsTensorObject(tensor))); + outputOrtValues.Add(new DisposableTestPair("softmaxout_1", OrtValue.CreateTensorValueFromSystemNumericsTensorObject(outputTensor))); + + var ex = Assert.Throws(() => session.Run(runOptions, ["data_0"], [inputOrtValues[0].Value], ["softmaxout_1"], [outputOrtValues[0].Value])); + } + + session.Dispose(); + } + + [Fact] + private void ThrowInconsistentPinnedOutputsDotnetTensors() + { + var tuple = OpenSessionSqueezeNet(); + using var cleanUp = new DisposableListTest(); + cleanUp.Add(tuple.Item1); + var session = tuple.Item1; + var inputData = tuple.Item2; + var inputTensor = tuple.Item3; + var outputTensor = SystemNumericsTensors.Tensor.Create([1, 1001, 1, 1], [4]); + + using (var runOptions = new RunOptions()) + using (var inputOrtValues = new DisposableListTest>(session.InputMetadata.Count)) + using (var outputOrtValues = new DisposableListTest>(session.OutputMetadata.Count)) + { + var tensor = SystemNumericsTensors.Tensor.Create(inputData, Array.ConvertAll(inputTensor.Dimensions.ToArray(), x => (nint)x)); + + inputOrtValues.Add(new DisposableTestPair("data_0", OrtValue.CreateTensorValueFromSystemNumericsTensorObject(tensor))); + outputOrtValues.Add(new DisposableTestPair("softmaxout_1", OrtValue.CreateTensorValueFromSystemNumericsTensorObject(outputTensor))); + OrtValue[] outputs = []; + var ex = Assert.Throws(() => session.Run(runOptions, ["data_0"], [inputOrtValues[0].Value], ["softmaxout_1"], outputs)); + Assert.StartsWith("Length of outputNames (1) must match that of outputValues (0).", ex.Message); + } + } +#pragma warning restore SYSLIB5001 // System.Numerics.Tensors is only in preview so we can continue receiving API feedback +#endif + + #if USE_CUDA [Fact(DisplayName = "TestCUDAProviderOptions")] private void TestCUDAProviderOptions() @@ -1416,6 +1608,25 @@ private void VerifyNativeMethodsExist() } } +#if NET8_0_OR_GREATER +#pragma warning disable SYSLIB5001 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed. + private void ValidateRunResultData(SystemNumericsTensors.Tensor resultTensor, float[] expectedOutput, int[] expectedDimensions) + { + Assert.Equal(expectedDimensions.Length, resultTensor.Rank); + + var resultDimensions = resultTensor.Lengths; + for (int i = 0; i < expectedDimensions.Length; i++) + { + Assert.Equal(expectedDimensions[i], resultDimensions[i]); + } + + var resultArray = resultTensor.ToArray(); + Assert.Equal(expectedOutput.Length, resultArray.Length); + Assert.Equal(expectedOutput, resultArray, new FloatComparer()); + } +#pragma warning restore SYSLIB5001 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed. +#endif + static string GetTestModelsDir() { // get build directory, append downloaded models location diff --git a/csharp/tools/linux_pack/LinuxPackNativeNuget.csproj b/csharp/tools/linux_pack/LinuxPackNativeNuget.csproj index 098078d2e3683..b814f99b05ae1 100644 --- a/csharp/tools/linux_pack/LinuxPackNativeNuget.csproj +++ b/csharp/tools/linux_pack/LinuxPackNativeNuget.csproj @@ -7,7 +7,7 @@ If you need a more sophisticated package for testing, you can run the production packaging pipeline against your branch and download the resulting nuget package from the build artifacts. 
--> - + netstandard2.0 $(OnnxRuntimeBuildDirectory)/NativeNuget.nuspec diff --git a/include/onnxruntime/core/graph/node_arg.h b/include/onnxruntime/core/graph/node_arg.h index 921bff59fb6d4..0ddf1a2b9d3de 100644 --- a/include/onnxruntime/core/graph/node_arg.h +++ b/include/onnxruntime/core/graph/node_arg.h @@ -3,7 +3,7 @@ #pragma once -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "core/graph/basic_types.h" #include "core/common/status.h" diff --git a/onnxruntime/core/graph/onnx_protobuf.h b/include/onnxruntime/core/graph/onnx_protobuf.h similarity index 100% rename from onnxruntime/core/graph/onnx_protobuf.h rename to include/onnxruntime/core/graph/onnx_protobuf.h diff --git a/java/src/main/java/ai/onnxruntime/OnnxRuntime.java b/java/src/main/java/ai/onnxruntime/OnnxRuntime.java index b80debdde47c4..c28c79f1e723e 100644 --- a/java/src/main/java/ai/onnxruntime/OnnxRuntime.java +++ b/java/src/main/java/ai/onnxruntime/OnnxRuntime.java @@ -76,6 +76,9 @@ final class OnnxRuntime { /** The short name of the ONNX runtime TensorRT provider library */ static final String ONNXRUNTIME_LIBRARY_TENSORRT_NAME = "onnxruntime_providers_tensorrt"; + /** The short name of the ONNX runtime QNN provider library */ + static final String ONNXRUNTIME_LIBRARY_QNN_NAME = "onnxruntime_providers_qnn"; + /** The OS & CPU architecture string */ private static final String OS_ARCH_STR = initOsArch(); @@ -159,8 +162,11 @@ static synchronized void init() throws IOException { // the ONNX Runtime native library will load it extractProviderLibrary(ONNXRUNTIME_LIBRARY_SHARED_NAME); - load(ONNXRUNTIME_LIBRARY_NAME); + if (!isAndroid()) { + load(ONNXRUNTIME_LIBRARY_NAME); + } load(ONNXRUNTIME_JNI_LIBRARY_NAME); + ortApiHandle = initialiseAPIBase(ORT_API_VERSION_14); if (ortApiHandle == 0L) { throw new IllegalStateException( @@ -252,6 +258,16 @@ static boolean extractTensorRT() { return extractProviderLibrary(ONNXRUNTIME_LIBRARY_TENSORRT_NAME); } + /** + * Extracts the QNN provider library from the classpath resources if present, or checks to see if + * the QNN provider library is in the directory specified by {@link #ONNXRUNTIME_NATIVE_PATH}. + * + * @return True if the QNN provider library is ready for loading, false otherwise. + */ + static boolean extractQNN() { + return extractProviderLibrary(ONNXRUNTIME_LIBRARY_QNN_NAME); + } + /** * Extracts a shared provider library from the classpath resources if present, or checks to see if * that library is in the directory specified by {@link #ONNXRUNTIME_NATIVE_PATH}. @@ -260,7 +276,7 @@ static boolean extractTensorRT() { * @return True if the library is ready for loading by ORT's native code, false otherwise. */ static synchronized boolean extractProviderLibrary(String libraryName) { - // Android does not need to extract library and it has no shared provider library + // Android does not need to extract provider libraries. 
if (isAndroid()) { return false; } @@ -312,7 +328,7 @@ static boolean isAndroid() { private static void load(String library) throws IOException { // On Android, we simply use System.loadLibrary if (isAndroid()) { - System.loadLibrary("onnxruntime4j_jni"); + System.loadLibrary(library); return; } diff --git a/java/src/main/java/ai/onnxruntime/OrtSession.java b/java/src/main/java/ai/onnxruntime/OrtSession.java index 32dc9d9f84aaa..bd988e2bb7468 100644 --- a/java/src/main/java/ai/onnxruntime/OrtSession.java +++ b/java/src/main/java/ai/onnxruntime/OrtSession.java @@ -1320,6 +1320,10 @@ public void addXnnpack(Map providerOptions) throws OrtException */ public void addQnn(Map providerOptions) throws OrtException { String qnnProviderName = "QNN"; + + // QNN can either be built as a shared or static library. extractQNN() will extract the + // (lib)onnxruntime_providers_qnn(.so/.dll) from classpath resources if present. + OnnxRuntime.extractQNN(); addExecutionProvider(qnnProviderName, providerOptions); } diff --git a/js/web/docs/webnn-operators.md b/js/web/docs/webnn-operators.md index 5ad2311ef80de..a6a2ecdf6f467 100644 --- a/js/web/docs/webnn-operators.md +++ b/js/web/docs/webnn-operators.md @@ -6,108 +6,110 @@ operators and the supported opset domain/versions in **WebNN EP** by ONNX Runtim (**Note**: ONNX Runtime only *guarantees* support for models stamped with opset version 7 or above for opset domain 'ai.onnx'.) -[WebNN API](https://webmachinelearning.github.io/webnn) provides two device types `cpu` and `gpu` to leverage different on-device accelerators. WebNN API implementation in Chromium uses TFLite XNNPack delegate backend for `cpu` device type and DirectML backend for `gpu` device type. [The op support status](https://webmachinelearning.github.io/webnn-status/) behind these two backends is inconsistent. +The [WebNN API](https://webmachinelearning.github.io/webnn) is available in the latest versions of Chrome and Edge on Windows, +Linux, macOS, Android, and ChromeOS behind an "Enables WebNN API" flag. The operator support status may vary across these +platforms. Check the [WebNN status](https://webmachinelearning.github.io/webnn-status/) for the latest implementation details. -| Operator | Opset | WebNN API | WebNN CPU | WebNN GPU | Comments | -|:------:|:------:|:------:|:-:|:-:|:------| -| Abs | ai.onnx(7-12, 13+) | abs | ✓ | ✓ | | -| Add | ai.onnx(7-12, 13, 14+) | add | ✓ | ✓ | | -| And | ai.onnx(7+) | logicalAnd | ✗ | ✓ | | -| ArgMax | ai.onnx(7-10, 11, 12, 13+) | argMax | ✓ | ✓ | | -| ArgMin | ai.onnx(7-10, 11, 12, 13+) | argMin | ✓ | ✓ | | -| AveragePool | ai.onnx(7-9, 10, 11, 12-18, 19+) | averagePool2d | ✓ | ✓ | Only supports 4-D input, 2-D 'kernel_shape', 'count_include_pad' value is 0 | -| BatchNormalization | ai.onnx(7-8, 9-13, 14, 15+) | batchNormalization | ✓ | ✓ | Only supports 'training_mode' value is 0, one output | -| Cast | ai.onnx(7-8, 9-12, 13-18, 19-20, 21+) | cast | ✓ | ✓ | WebNN CPU backend doesn't support casting to uint64 data type | -| Ceil | ai.onnx(7-12, 13+) | ceil | ✓ | ✓ | | -| Clip | ai.onnx(7-10, 11, 12, 13+) | clamp | ✓ | ✓ | WebNN CPU backend only supports 3 specific ranges: [0.0, infinity], [-1.0, 1.0], [0.0, 6.0] (Chromium issue: https://issues.chromium.org/issues/326156496) | -| Concat | ai.onnx(7-10, 11-12, 13+) | concat | ✓ | ✓ | | -| Conv | ai.onnx(7-10, 11+) | conv2d | ✓ | ✓ | Only supports 3-D or 4-D input and 'W' (weight) | -| ConvTranspose | ai.onnx(7-10, 11+) | convTranspose2d | ✓ | ✓ | Only supports 3-D or 4-D input and 'W' (weight). 
WebNN CPU backend only supports default dilations and group | -| Cos | ai.onnx(7+) | cos | ✓ | ✓ | | -| CumSum | ai.onnx(11-13, 14+) | cumulativeSum | ✓ | ✓ | 'axis' input should be a constant | -| Div | ai.onnx(7-12, 13, 14+) | div | ✓ | ✓ | | -| DequantizeLinear | ai.onnx(10-12, 13-18, 19-20, 21-22, 23+) | dequantizeLinear | ✓ | ✓ | The shape of x_scale should be a subsample of the shape of input | -| Dropout | ai.onnx(7-9, 10-11, 12, 13-21, 22+) | identity | ✓ | ✓ | Only supports test mode | -| Einsum | ai.onnx(12+) | reshape, transpose, matmul, reduceSum, mul, triangular | ✓ | ✓ | | -| Elu | ai.onnx(7+) | elu | ✓ | ✓ | WebNN CPU backend only supports 'alpha' value is 1.0 | -| Equal | ai.onnx(7-10, 11-12, 13-18, 19+) | equal | ✓ | ✓ | | -| Erf | ai.onnx(7-9, 10-12, 13+) | erf | ✓ | ✓ | | -| Exp | ai.onnx(7-12, 13+) | exp | ✓ | ✓ | | -| Expand | ai.onnx(8-12, 13+) | expand | ✓ | ✓ | 'shape' input should be a constant | -| Flatten | ai.onnx(7-8, 9-10, 11-12, 13-20, 21+) | reshape | ✓ | ✓ | | -| Floor | ai.onnx(7-12, 13+) | floor | ✓ | ✓ | | -| Gather | ai.onnx(7-10, 11-12, 13+) | gather | ✓ | ✓ | | -| GatherElements | ai.onnx(11-12, 13+) | gatherElements | ✗ | ✓ | | -| GatherND | ai.onnx(11, 12, 13+) | gatherND | ✓ | ✓ | Only supports 'batch_dims' == 0 | -| Gelu | ai.onnx(20+) | gelu | ✓ | ✓ | | -| Gemm | ai.onnx(7-8, 9-10, 11-12, 13+) | gemm | ✓ | ✓ | Only supports 1-D 'C' input | -| GlobalAveragePool | ai.onnx(7+) | averagePool2d | ✓ | ✓ | Only supports 4-D input | -| GlobalMaxPool | ai.onnx(7+) | maxPool2d | ✓ | ✓ | Only supports 4-D input | -| GlobalLpPool| ai.onnx(7+) | l2Pool2d | ✗ | ✓ | Only supports 4-D input, 'p' value is 2 | -| Greater | ai.onnx(7-8, 9-12, 13+) | greater | ✓ | ✓ | | -| GreaterOrEqual | ai.onnx(12-15, 16+) | greaterOrEqual | ✓ | ✓ | | -| GRU | ai.onnx(7-13, 14-21, 22+) | gru | ✓ | ✓ | Only supports 'layout' == 0. 'clip' is not supported. The activation functions in 'activations' must be one of 'Relu', 'Tanh', 'Sigmoid'. Forward and backward activations must be the same if bidirectional. 'sequence_lens' if present should be constant with values equal to the first dimension length of input 'X' | -| HardSigmoid | ai.onnx(7+) | hardSigmoid | ✓ | ✓ | | -| HardSwish | ai.onnx(14+) | hardSwish | ✓ | ✓ | | -| Identity | ai.onnx(7-13, 14-15, 16-18, 19-20, 21+) | identity | ✓ | ✓ | | -| InstanceNormalization | ai.onnx(7+) | instanceNormalization | ✓ | ✓ | | -| LayerNormalization | ai.onnx(7-16, 17+) | layerNormalization | ✓ | ✓ | | -| LeakyRelu | ai.onnx(7-15, 16+) | leakyRelu | ✓ | ✓ | | -| Less | ai.onnx(7-8, 9-12, 13+) | lesser | ✓ | ✓ | | -| LessOrEqual | ai.onnx(12-15, 16+) | lesserOrEqual | ✓ | ✓ | | -| Log | ai.onnx(7-12, 13+) | log | ✓ | ✓ | | -| LpPool | ai.onnx(7-10, 11-17, 18+) | l2Pool2d | ✗ | ✓ | Only supports 4-D input, 2-D 'kernel_shape', 'p' value is 2 | -| LRN | ai.onnx(7-12, 13+) | pad, averagePool2d, transpose, add, mul, pow, div | ✓ | ✓ | | -| LSTM | ai.onnx(7-13, 14-21, 22+) | lstm | ✓ | ✓ | Only supports 'layout' == 0, 'input_forget' == 0. 'clip' is not supported. The activation functions in 'activations' must be one of 'Relu', 'Tanh', 'Sigmoid'. Forward and backward activations must be the same if bidirectional. 
'sequence_lens' if present should be constant with values equal to the first dimension length of input 'X' | -| MatMul | ai.onnx(7-8, 9-12, 13+) | matmul | ✓ | ✓ | | -| Max | ai.onnx(7, 8-11, 12, 13+) | max | ✓ | ✓ | | -| MaxPool | ai.onnx(7, 8-9, 10, 11, 12+) | maxPool2d | ✓ | ✓ | Only supports 4-D input, 2-D 'kernel_shape', 'storage_order' != 1, one output | -| Min | ai.onnx(7, 8-11, 12, 13+) | min | ✓ | ✓ | | -| Mul | ai.onnx(7-12, 13, 14+) | mul | ✓ | ✓ | | -| Neg | ai.onnx(7-12, 13+) | neg | ✓ | ✓ | | -| Not | ai.onnx(7+) | logicalNot | ✓ | ✓ | | -| Or | ai.onnx(7+) | logicalOr | ✗ | ✓ | | -| Pad | ai.onnx(7-10, 11-12, 13-17, 18, 19-20, 21+) | pad | ✓ | ✓ | modes == 'wrap' is not supported | -| Pow | ai.onnx(7-11, 12, 13-14, 15+) | pow | ✓ | ✓ | | -| PRelu | ai.onnx(7-8, 9-15, 16+) | prelu | ✓ | ✓ | WebNN CPU backend restricts the last dimension of input and slope to be same (Chromium issue: https://issues.chromium.org/issues/335517470) | -| QuantizeLinear | ai.onnx(10-12, 13-18, 19-20, 21-22, 23+) | quantizeLinear | ✓ | ✓ | The shape of x_scale should be a subsample of the shape of input | -| Reciprocal | ai.onnx(7-12, 13+) | reciprocal | ✓ | ✓ | | -| ReduceL1 | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceL1 | ✓ | ✓ | Input 'axes' if present should be a constant | -| ReduceL2 | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceL2 | ✓ | ✓ | Input 'axes' if present should be a constant | -| ReduceLogSum| ai.onnx(7-10, 11-12, 13-17, 18+) | reduceLogSum| ✓ | ✓ | Input 'axes' if present should be a constant | -| ReduceLogSumExp | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceLogSumExp | ✓ | ✓ | Input 'axes' if present should be a constant | -| ReduceMax | ai.onnx(7-10, 11, 12, 13-17, 18-19, 20+) | reduceMax | ✓ | ✓ | Input 'axes' if present should be a constant | -| ReduceMean | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceMean | ✓ | ✓ | Input 'axes' if present should be a constant | -| ReduceMin | ai.onnx(7-10, 11, 12, 13-17, 18-19, 20+) | reduceMin | ✓ | ✓ | Input 'axes' if present should be a constant | -| ReduceProd | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceProduct | ✓ | ✓ | Input 'axes' if present should be a constant | -| ReduceSum | ai.onnx(7-10, 11-12, 13+) | reduceSum | ✓ | ✓ | Input 'axes' if present should be a constant | -| ReduceSumSquare | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceSumSquare | ✓ | ✓ | Input 'axes' if present should be a constant | -| Relu | ai.onnx(7-12, 13, 14+) | relu | ✓ | ✓ | | -| Reshape | ai.onnx(7-12, 13, 14-18, 19-20, 21+) | reshape | ✓ | ✓ | Input 'shape' should be a constant, 0 dimension value in 'shape' is not supported | -| Resize | ai.onnx(11-12, 13-17, 18, 19+) | resample2d | ✓ | ✓ | Only supports 4-D input, antialias == 0, exclude_outside == 0, keep_aspect_ratio_policy == 'stretch', 'linear' and 'nearest' modes, input 'scales' and 'sizes' if present must be a constant | -| RotaryEmbedding | com.microsoft(1+) | add, concat, gather, mul, reshape, split | ✓ | ✓ | | -| ScatterElements | ai.onnx(11-12, 13-15, 16-17, 18+) | scatterElements | ✗ | ✓ | Only supports 'reduction' == 'none' | -| ScatterND | ai.onnx(11-12, 13-15, 16-17, 18+) | scatterND | ✗ | ✓ | Only supports 'reduction' == 'none' | -| Shape | ai.onnx(7-12, 13-14, 15-18, 19-20, 21+) | slice | ✓ | ✓ | | -| SimplifiedLayerNormalization | ai.onnx(1+) | pow, reduceMean, add, sqrt, div, mul | ✓ | ✓ | | -| Sigmoid | ai.onnx(7-12, 13+) | sigmoid | ✓ | ✓ | | -| Sign | ai.onnx(9-12, 13+) | sign | ✓ | ✓ | | -| SkipSimplifiedLayerNormalization | com.microsoft(1+) | pow, reduceMean, add, sqrt, div, mul | ✓ | ✓ | | -| 
Softplus | ai.onnx(7+) | softplus | ✓ | ✓ | | -| Softsign | ai.onnx(7+) | softsign | ✓ | ✓ | | -| Sin | ai.onnx(7+) | sin | ✓ | ✓ | | -| Slice | ai.onnx(7-9, 10, 11-12, 13+) | slice, reverse | ✓ | ✓ | Input 'starts', 'ends', 'axes', and 'steps' if present must be a constant | -| Softmax | ai.onnx(7-10, 11-12, 13+) | softmax | ✓ | ✓ | | -| Split | ai.onnx(7-10, 11-12, 13-17, 18+) | split | ✓ | ✓ | Input 'split' if present should be a constant | -| Sqrt | ai.onnx(7-12, 13+) | sqrt | ✓ | ✓ | | -| Squeeze | ai.onnx(7-10, 11-12, 13-20, 21+) | reshape | ✓ | ✓ | Input 'axes' if present should be a constant | -| Sub | ai.onnx(7-12, 13, 14+) | sub | ✓ | ✓ | | -| Tan | ai.onnx(7+) | tan | ✓ | ✓ | | -| Tanh | ai.onnx(7-12, 13+) | tanh | ✓ | ✓ | | -| Tile | ai.onnx(7-12, 13+) | tile | ✗ | ✓ | Input 'repeats' should be a constant | -| Transpose | ai.onnx(7-12, 13-20, 21+) | transpose | ✓ | ✓ | | -| Trilu | ai.onnx(14+) | triangular | ✓ | ✓ | Input 'k' (option 'diagonal' for WebNN) if present should be a constant | -| Unsqueeze | ai.onnx(7-10, 11-12, 13-20, 21+) | reshape | ✓ | ✓ | | -| Where | ai.onnx(7-8, 9-15, 16+) | where | ✓ | ✓ | | -| Xor | ai.onnx(7+) | logicalXor | ✗ | ✓ | | +| Operator | Opset | WebNN API | Comments | +|:------:|:------:|:------:|:------| +| Abs | ai.onnx(7-12, 13+) | abs | | +| Add | ai.onnx(7-12, 13, 14+) | add | | +| And | ai.onnx(7+) | logicalAnd | | +| ArgMax | ai.onnx(7-10, 11, 12, 13+) | argMax | | +| ArgMin | ai.onnx(7-10, 11, 12, 13+) | argMin | | +| AveragePool | ai.onnx(7-9, 10, 11, 12-18, 19+) | averagePool2d | Only supports 4-D input, 2-D 'kernel_shape', 'count_include_pad' value is 0 | +| BatchNormalization | ai.onnx(7-8, 9-13, 14, 15+) | batchNormalization | Only supports 'training_mode' value is 0, one output | +| Cast | ai.onnx(7-8, 9-12, 13-18, 19-20, 21+) | cast | | +| Ceil | ai.onnx(7-12, 13+) | ceil | | +| Clip | ai.onnx(7-10, 11, 12, 13+) | clamp | | +| Concat | ai.onnx(7-10, 11-12, 13+) | concat | | +| Conv | ai.onnx(7-10, 11+) | conv2d | Only supports 3-D or 4-D input and 'W' (weight) | +| ConvTranspose | ai.onnx(7-10, 11+) | convTranspose2d | Only supports 3-D or 4-D input and 'W' (weight) | +| Cos | ai.onnx(7+) | cos | | +| CumSum | ai.onnx(11-13, 14+) | cumulativeSum | 'axis' input should be a constant | +| Div | ai.onnx(7-12, 13, 14+) | div | | +| DequantizeLinear | ai.onnx(10-12, 13-18, 19-20, 21-22, 23+) | dequantizeLinear | The shape of x_scale should be a subsample of the shape of input | +| Dropout | ai.onnx(7-9, 10-11, 12, 13-21, 22+) | identity | Only supports test mode | +| Einsum | ai.onnx(12+) | reshape, transpose, matmul, reduceSum, mul, triangular | | +| Elu | ai.onnx(7+) | elu | | +| Equal | ai.onnx(7-10, 11-12, 13-18, 19+) | equal | | +| Erf | ai.onnx(7-9, 10-12, 13+) | erf | | +| Exp | ai.onnx(7-12, 13+) | exp | | +| Expand | ai.onnx(8-12, 13+) | expand | 'shape' input should be a constant | +| Flatten | ai.onnx(7-8, 9-10, 11-12, 13-20, 21+) | reshape | | +| Floor | ai.onnx(7-12, 13+) | floor | | +| Gather | ai.onnx(7-10, 11-12, 13+) | gather | | +| GatherElements | ai.onnx(11-12, 13+) | gatherElements | | +| GatherND | ai.onnx(11, 12, 13+) | gatherND | Only supports 'batch_dims' == 0 | +| Gelu | ai.onnx(20+) | gelu | | +| Gemm | ai.onnx(7-8, 9-10, 11-12, 13+) | gemm | Only supports 1-D 'C' input | +| GlobalAveragePool | ai.onnx(7+) | averagePool2d | Only supports 4-D input | +| GlobalMaxPool | ai.onnx(7+) | maxPool2d | Only supports 4-D input | +| GlobalLpPool| ai.onnx(7+) | l2Pool2d | Only supports 4-D input, 'p' value is 2 | +| 
Greater | ai.onnx(7-8, 9-12, 13+) | greater | | +| GreaterOrEqual | ai.onnx(12-15, 16+) | greaterOrEqual | | +| GRU | ai.onnx(7-13, 14-21, 22+) | gru | Only supports 'layout' == 0. 'clip' is not supported. The activation functions in 'activations' must be one of 'Relu', 'Tanh', 'Sigmoid'. Forward and backward activations must be the same if bidirectional. 'sequence_lens' if present should be constant with values equal to the first dimension length of input 'X' | +| HardSigmoid | ai.onnx(7+) | hardSigmoid | | +| HardSwish | ai.onnx(14+) | hardSwish | | +| Identity | ai.onnx(7-13, 14-15, 16-18, 19-20, 21+) | identity | | +| InstanceNormalization | ai.onnx(7+) | instanceNormalization | | +| LayerNormalization | ai.onnx(7-16, 17+) | layerNormalization | | +| LeakyRelu | ai.onnx(7-15, 16+) | leakyRelu | | +| Less | ai.onnx(7-8, 9-12, 13+) | lesser | | +| LessOrEqual | ai.onnx(12-15, 16+) | lesserOrEqual | | +| Log | ai.onnx(7-12, 13+) | log | | +| LpPool | ai.onnx(7-10, 11-17, 18+) | l2Pool2d | Only supports 4-D input, 2-D 'kernel_shape', 'p' value is 2 | +| LRN | ai.onnx(7-12, 13+) | pad, averagePool2d, transpose, add, mul, pow, div | | +| LSTM | ai.onnx(7-13, 14-21, 22+) | lstm | Only supports 'layout' == 0, 'input_forget' == 0. 'clip' is not supported. The activation functions in 'activations' must be one of 'Relu', 'Tanh', 'Sigmoid'. Forward and backward activations must be the same if bidirectional. 'sequence_lens' if present should be constant with values equal to the first dimension length of input 'X' | +| MatMul | ai.onnx(7-8, 9-12, 13+) | matmul | | +| Max | ai.onnx(7, 8-11, 12, 13+) | max | | +| MaxPool | ai.onnx(7, 8-9, 10, 11, 12+) | maxPool2d | Only supports 4-D input, 2-D 'kernel_shape', 'storage_order' != 1, one output | +| Min | ai.onnx(7, 8-11, 12, 13+) | min | | +| Mul | ai.onnx(7-12, 13, 14+) | mul | | +| Neg | ai.onnx(7-12, 13+) | neg | | +| Not | ai.onnx(7+) | logicalNot | | +| Or | ai.onnx(7+) | logicalOr | | +| Pad | ai.onnx(7-10, 11-12, 13-17, 18, 19-20, 21+) | pad | modes == 'wrap' is not supported | +| Pow | ai.onnx(7-11, 12, 13-14, 15+) | pow | | +| PRelu | ai.onnx(7-8, 9-15, 16+) | prelu | | +| QuantizeLinear | ai.onnx(10-12, 13-18, 19-20, 21-22, 23+) | quantizeLinear | The shape of x_scale should be a subsample of the shape of input | +| Reciprocal | ai.onnx(7-12, 13+) | reciprocal | | +| ReduceL1 | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceL1 | Input 'axes' if present should be a constant | +| ReduceL2 | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceL2 | Input 'axes' if present should be a constant | +| ReduceLogSum| ai.onnx(7-10, 11-12, 13-17, 18+) | reduceLogSum | Input 'axes' if present should be a constant | +| ReduceLogSumExp | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceLogSumExp | Input 'axes' if present should be a constant | +| ReduceMax | ai.onnx(7-10, 11, 12, 13-17, 18-19, 20+) | reduceMax | Input 'axes' if present should be a constant | +| ReduceMean | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceMean | Input 'axes' if present should be a constant | +| ReduceMin | ai.onnx(7-10, 11, 12, 13-17, 18-19, 20+) | reduceMin | Input 'axes' if present should be a constant | +| ReduceProd | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceProduct | Input 'axes' if present should be a constant | +| ReduceSum | ai.onnx(7-10, 11-12, 13+) | reduceSum | Input 'axes' if present should be a constant | +| ReduceSumSquare | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceSumSquare | Input 'axes' if present should be a constant | +| Relu | ai.onnx(7-12, 13, 14+) | relu | | +| Reshape | ai.onnx(7-12, 
13, 14-18, 19-20, 21+) | reshape | Input 'shape' should be a constant, 0 dimension value in 'shape' is not supported | +| Resize | ai.onnx(11-12, 13-17, 18, 19+) | resample2d | Only supports 4-D input, antialias == 0, exclude_outside == 0, keep_aspect_ratio_policy == 'stretch', 'linear' and 'nearest' modes, input 'scales' and 'sizes' if present must be a constant | +| RotaryEmbedding | com.microsoft(1+) | add, concat, gather, mul, reshape, split | | +| ScatterElements | ai.onnx(11-12, 13-15, 16-17, 18+) | scatterElements | Only supports 'reduction' == 'none' | +| ScatterND | ai.onnx(11-12, 13-15, 16-17, 18+) | scatterND | Only supports 'reduction' == 'none' | +| Shape | ai.onnx(7-12, 13-14, 15-18, 19-20, 21+) | slice | | +| SimplifiedLayerNormalization | ai.onnx(1+) | pow, reduceMean, add, sqrt, div, mul | | +| Sigmoid | ai.onnx(7-12, 13+) | sigmoid | | +| Sign | ai.onnx(9-12, 13+) | sign | | +| SkipSimplifiedLayerNormalization | com.microsoft(1+) | pow, reduceMean, add, sqrt, div, mul | | +| Softplus | ai.onnx(7+) | softplus | | +| Softsign | ai.onnx(7+) | softsign | | +| Sin | ai.onnx(7+) | sin | | +| Slice | ai.onnx(7-9, 10, 11-12, 13+) | slice, reverse | Input 'starts', 'ends', 'axes', and 'steps' if present must be a constant | +| Softmax | ai.onnx(7-10, 11-12, 13+) | softmax | | +| Split | ai.onnx(7-10, 11-12, 13-17, 18+) | split | Input 'split' if present should be a constant | +| Sqrt | ai.onnx(7-12, 13+) | sqrt | | +| Squeeze | ai.onnx(7-10, 11-12, 13-20, 21+) | reshape | Input 'axes' if present should be a constant | +| Sub | ai.onnx(7-12, 13, 14+) | sub | | +| Tan | ai.onnx(7+) | tan | | +| Tanh | ai.onnx(7-12, 13+) | tanh | | +| Tile | ai.onnx(7-12, 13+) | tile | Input 'repeats' should be a constant | +| Transpose | ai.onnx(7-12, 13-20, 21+) | transpose | | +| Trilu | ai.onnx(14+) | triangular | Input 'k' (option 'diagonal' for WebNN) if present should be a constant | +| Unsqueeze | ai.onnx(7-10, 11-12, 13-20, 21+) | reshape | | +| Where | ai.onnx(7-8, 9-15, 16+) | where | | +| Xor | ai.onnx(7+) | logicalXor | | diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts index 0aa3ad6c4c267..097e2552569c8 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts @@ -46,6 +46,11 @@ export const createConvTranspose2DProgramInfo = ( const inputChannelsPerGroup = wShape[2] / group; const outputChannelsPerGroup = wShape[3]; const aComponents = isChannelsLast ? getMaxComponents(inputChannelsPerGroup) : 1; + const packInputAs4 = isChannelsLast && outputChannelsPerGroup === 1; + const inputChannelsPerGroupInt = packInputAs4 + ? Math.floor(inputChannelsPerGroup / 4) * 4 + : Math.floor(inputChannelsPerGroup / aComponents) * aComponents; + const inputChannelsRemainder = inputChannelsPerGroup - inputChannelsPerGroupInt; const components = isChannelsLast ? getMaxComponents(outputChannelsPerGroup) : 1; const bComponents = isChannelsLast ? (outputChannelsPerGroup === 1 ? 
aComponents : components) : 1; const outputSize = ShapeUtil.size(outputShape) / components; @@ -78,7 +83,7 @@ export const createConvTranspose2DProgramInfo = ( { type: DataType.uint32, data: dilations }, { type: DataType.uint32, data: effectiveFilterDims }, { type: DataType.int32, data: pads }, - { type: DataType.uint32, data: inputChannelsPerGroup }, + { type: DataType.uint32, data: inputChannelsPerGroupInt }, { type: DataType.uint32, data: outputChannelsPerGroup }, ...createTensorShapeVariables(inputs[0].dims, inputs[1].dims), ]; @@ -114,16 +119,40 @@ export const createConvTranspose2DProgramInfo = ( const calculateResult = (): string => { let calcStr = ''; - if (aComponents === 1) { - calcStr += ` - let w_offset = ${w.indicesToOffset(`${w.type.indices}(u32(wRPerm), u32(wCPerm), inputChannel, wOutChannel)`)}; - let wValue = ${w.getByOffset(`w_offset / ${bComponents}`)}; - dotProd = dotProd + xValue * wValue;`; + if (packInputAs4) { + if (aComponents === 4) { + calcStr += ` + let xValue = ${dy.getByOffset('x_offset')}; + let wValue = ${w.getByOffset('w_offset')}; + dotProd = dotProd + dot(xValue, wValue); + x_offset += 1u; + w_offset += 1u;`; + } else if (aComponents === 2) { + calcStr += ` + dotProd = dotProd + dot(vec4<${dataType}>(${dy.getByOffset('x_offset')}, ${dy.getByOffset('x_offset + 1u')}), vec4<${dataType}>(${w.getByOffset('w_offset')}, ${w.getByOffset('w_offset + 1u')})); + x_offset += 2u; + w_offset += 2u;`; + } else if (aComponents === 1) { + calcStr += ` + dotProd = dotProd + dot(vec4<${dataType}>(${dy.getByOffset('x_offset')}, ${dy.getByOffset('x_offset + 1u')}, ${dy.getByOffset('x_offset + 2u')}, ${dy.getByOffset('x_offset + 3u')}), vec4<${dataType}>(${w.getByOffset('w_offset')}, ${w.getByOffset('w_offset + 1u')}, ${w.getByOffset('w_offset + 2u')}, ${w.getByOffset('w_offset + 3u')})); + x_offset += 4u; + w_offset += 4u;`; + } } else { - if (outputChannelsPerGroup === 1) { + calcStr += ` + let xValue = ${ + isChannelsLast + ? 
dy.getByOffset( + `${dy.indicesToOffset(`${dy.type.indices}(batch, idyR, idyC, inputChannel)`)} / ${aComponents}`, + ) + : dy.get('batch', 'inputChannel', 'idyR', 'idyC') + }; + `; + if (aComponents === 1) { calcStr += ` - let wValue = ${w.getByOffset(`${w.indicesToOffset(`${w.type.indices}(u32(wRPerm), u32(wCPerm), inputChannel, wOutChannel)`)} / ${bComponents}`)}; - dotProd = dotProd + dot(xValue, wValue);`; + let w_offset = ${w.indicesToOffset(`${w.type.indices}(u32(wRPerm), u32(wCPerm), inputChannel, wOutChannel)`)}; + let wValue = ${w.getByOffset(`w_offset / ${bComponents}`)}; + dotProd = dotProd + xValue * wValue;`; } else { for (let c = 0; c < aComponents; c++) { calcStr += ` @@ -134,6 +163,32 @@ export const createConvTranspose2DProgramInfo = ( } return calcStr; }; + const calculateRemainder = (): string => { + if (inputChannelsRemainder === 0) { + return ''; + } + if (!packInputAs4) { + throw new Error(`packInputAs4 ${packInputAs4} is not true.`); + } + let calcStr = ''; + if (aComponents === 1) { + calcStr += 'dotProd = dotProd'; + for (let i = 0; i < inputChannelsRemainder; i++) { + calcStr += ` + + ${dy.getByOffset(`x_offset + ${i}`)} * ${w.getByOffset(`w_offset + ${i}`)}`; + } + calcStr += ';'; + } else if (aComponents === 2) { + if (inputChannelsRemainder !== 2) { + throw new Error(`Invalid inputChannelsRemainder ${inputChannelsRemainder}.`); + } + calcStr += ` + let xValue = ${dy.getByOffset('x_offset')}; + let wValue = ${w.getByOffset('w_offset')}; + dotProd = dotProd + dot(xValue, wValue);`; + } + return calcStr; + }; const codeSnippet = ` let outputIndices = ${output.offsetToIndices(`global_idx * ${components}`)}; let batch = ${output.indicesGet('outputIndices', 0)}; @@ -148,7 +203,12 @@ export const createConvTranspose2DProgramInfo = ( // Convolve dy(?, ?, d2) with w(:, :, d1, d2) to compute dx(xR, xC, d1). // ? = to be determined. : = across all values in that axis. var dotProd = ${output.type.value}(0.0); - for (var wR: u32 = 0; wR < uniforms.effective_filter_dims.x; wR = wR + 1) { + var wR: u32 = 0; + if (uniforms.dilations.x == 1) { + // Minimum wR >= 0 that satisfies (dyRCorner + wR) % (uniforms.strides.x) == 0 + wR = u32(((dyRCorner + i32(uniforms.strides.x) - 1) / i32(uniforms.strides.x)) * i32(uniforms.strides.x) - dyRCorner); + } + for (; wR < uniforms.effective_filter_dims.x; wR = wR + 1) { if (wR % uniforms.dilations.x != 0) { continue; } @@ -158,10 +218,13 @@ export const createConvTranspose2DProgramInfo = ( wRPerm < 0) { continue; } - wR = wR + uniforms.strides[0] - 1; let idyR: u32 = u32(dyR); - - for (var wC: u32 = 0; wC < uniforms.effective_filter_dims.y; wC = wC + 1) { + var wC: u32 = 0; + if (uniforms.dilations.y == 1) { + // Minimum wC >= 0 that satisfies (dyCCorner + wC) % (uniforms.strides.y) == 0 + wC = u32(((dyCCorner + i32(uniforms.strides.y) - 1) / i32(uniforms.strides.y)) * i32(uniforms.strides.y) - dyCCorner); + } + for (; wC < uniforms.effective_filter_dims.y; wC = wC + 1) { if (wC % uniforms.dilations.y != 0) { continue; } @@ -171,21 +234,24 @@ export const createConvTranspose2DProgramInfo = ( fract(dyC) > 0.0 || wCPerm < 0) { continue; } - wC = wC + uniforms.strides.y - 1; let idyC: u32 = u32(dyC); var inputChannel = groupId * uniforms.input_channels_per_group; - for (var d2: u32 = 0; d2 < uniforms.input_channels_per_group; d2 = d2 + ${aComponents}) { - let xValue = ${ - isChannelsLast - ? 
dy.getByOffset( - `${dy.indicesToOffset(`${dy.type.indices}(batch, idyR, idyC, inputChannel)`)} / ${aComponents}`, - ) - : dy.get('batch', 'inputChannel', 'idyR', 'idyC') - }; + ${ + packInputAs4 + ? ` + var x_offset = ${dy.indicesToOffset(`${dy.type.indices}(batch, idyR, idyC, inputChannel)`)} / ${aComponents}; + var w_offset = ${w.indicesToOffset(`${w.type.indices}(wRPerm, wCPerm, inputChannel, wOutChannel)`)} / ${bComponents}; + ` + : '' + } + for (var d2: u32 = 0; d2 < uniforms.input_channels_per_group; d2 = d2 + ${packInputAs4 ? 4 : aComponents}) { ${calculateResult()} - inputChannel = inputChannel + ${aComponents}; + inputChannel = inputChannel + ${packInputAs4 ? 4 : aComponents}; } + ${calculateRemainder()} + wC = wC + uniforms.strides.y - 1; } + wR = wR + uniforms.strides[0] - 1; } let value = dotProd${hasBias ? ` + bias[d1 / ${components}]` : ''}; ${output.setByOffset('global_idx', 'value')}; @@ -201,7 +267,7 @@ export const createConvTranspose2DProgramInfo = ( return { name: 'ConvTranspose2D', shaderCache: { - hint: `${attributes.cacheKey};${aComponents}${bComponents}${components}${outputChannelsPerGroup === 1}`, + hint: `${attributes.cacheKey};${aComponents}${bComponents}${components}${outputChannelsPerGroup === 1}${inputChannelsRemainder}`, inputDependencies, }, getRunData: () => ({ diff --git a/js/web/test/data/ops/conv-transpose.jsonc b/js/web/test/data/ops/conv-transpose.jsonc index f827601b3a89c..a6a799dccee86 100644 --- a/js/web/test/data/ops/conv-transpose.jsonc +++ b/js/web/test/data/ops/conv-transpose.jsonc @@ -458,6 +458,152 @@ } ] }, + { + "name": "ConvTranspose with output channels = 1", + "operator": "ConvTranspose", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [ + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "strides", "data": [2, 2], "type": "ints" } + ], + "cases": [ + { + "name": "inChannels = 5", + "inputs": [ + { + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45 + ], + "dims": [1, 5, 3, 3], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8], + "dims": [5, 1, 2, 2], + "type": "float32" + }, + { + "data": [2], + "dims": [1], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 437, 532, 458, 558, 479, 584, 627, 722, 658, 758, 689, 794, 500, 610, 521, 636, 542, 662, 720, 830, 751, + 866, 782, 902, 563, 688, 584, 714, 605, 740, 813, 938, 844, 974, 875, 1010 + ], + "dims": [1, 1, 6, 6], + "type": "float32" + } + ] + }, + { + "name": "inChannels = 6", + "inputs": [ + { + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 1, 2, 3, 4, 5, 6, 7, 8, 9 + ], + "dims": [1, 6, 3, 3], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4], + "dims": [6, 1, 2, 2], + "type": "float32" + }, + { + "data": [2], + "dims": [1], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 438, 534, 460, 562, 482, 590, 630, 726, 664, 766, 698, 806, 504, 618, 526, 646, 548, 674, 732, 846, 766, + 886, 800, 926, 570, 702, 592, 730, 614, 758, 834, 966, 868, 1006, 902, 1046 + ], + "dims": [1, 1, 6, 6], + "type": "float32" + } + ] + }, + { + "name": "inChannels = 7", + "inputs": [ + { + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18 + ], + "dims": [1, 7, 3, 3], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8], + "dims": [7, 1, 2, 2], + "type": "float32" + }, + { + "data": [2], + "dims": [1], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 488, 594, 515, 628, 542, 662, 700, 806, 741, 854, 782, 902, 569, 696, 596, 730, 623, 764, 823, 950, 864, + 998, 905, 1046, 650, 798, 677, 832, 704, 866, 946, 1094, 987, 1142, 1028, 1190 + ], + "dims": [1, 1, 6, 6], + "type": "float32" + } + ] + }, + { + "name": "inChannels = 8", + "inputs": [ + { + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 1, 2, 3, 4, 5, 6, 7, 8, 9 + ], + "dims": [1, 8, 3, 3], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4], + "dims": [8, 1, 2, 2], + "type": "float32" + }, + { + "data": [2], + "dims": [1], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 489, 596, 517, 632, 545, 668, 703, 810, 747, 862, 791, 914, 573, 704, 601, 740, 629, 776, 835, 966, 879, + 1018, 923, 1070, 657, 812, 685, 848, 713, 884, 967, 1122, 1011, 1174, 1055, 1226 + ], + "dims": [1, 1, 6, 6], + "type": "float32" + } + ] + } + ] + }, { "name": "ConvTranspose without bias addition C", "operator": "ConvTranspose", diff --git a/js/web/test/e2e/exports/testcases/vite-default/package-lock.json b/js/web/test/e2e/exports/testcases/vite-default/package-lock.json index 96c19af9479e4..891b40710ff99 100644 --- a/js/web/test/e2e/exports/testcases/vite-default/package-lock.json +++ b/js/web/test/e2e/exports/testcases/vite-default/package-lock.json @@ -12,7 +12,7 @@ }, "devDependencies": { "@vitejs/plugin-vue": "^5.2.1", - "vite": "^6.0.5" + "vite": "^6.0.11" } }, "node_modules/@babel/helper-string-parser": { @@ -1069,9 +1069,9 @@ } }, "node_modules/vite": { - "version": "6.0.7", - "resolved": "https://registry.npmjs.org/vite/-/vite-6.0.7.tgz", - "integrity": "sha512-RDt8r/7qx9940f8FcOIAH9PTViRrghKaK2K1jY3RaAURrEUbm9Du1mJ72G+jlhtG3WwodnfzY8ORQZbBavZEAQ==", + "version": "6.0.11", + "resolved": "https://registry.npmjs.org/vite/-/vite-6.0.11.tgz", + "integrity": "sha512-4VL9mQPKoHy4+FE0NnRE/kbY51TOfaknxAjt3fJbGJxhIpBZiqVzlZDEesWWsuREXHwNdAoOFZ9MkPEVXczHwg==", "dev": true, "license": "MIT", "dependencies": { diff --git a/js/web/test/e2e/exports/testcases/vite-default/package.json b/js/web/test/e2e/exports/testcases/vite-default/package.json index 7a1f370885bf4..9e204875a1d01 100644 --- a/js/web/test/e2e/exports/testcases/vite-default/package.json +++ b/js/web/test/e2e/exports/testcases/vite-default/package.json @@ -13,6 +13,6 @@ }, "devDependencies": { "@vitejs/plugin-vue": "^5.2.1", - "vite": "^6.0.5" + "vite": "^6.0.11" } } diff --git a/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h b/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h index ccaeb6654e286..abb24e20a6178 100644 --- a/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h +++ b/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h @@ -75,6 +75,7 @@ class GQAAttentionBase { int seqlen_present_kv_cache = 
static_cast<int>(present_key->Shape().GetDims()[2]);
 
   // Compute the attention score.
+  // TODO(fajin): type depends on kernel supportability
   size_t bytes = SafeInt<size_t>(batch_size) * num_heads_ * sequence_length * seqlen_present_kv_cache * sizeof(float);
   auto attention_probs = allocator->Alloc(bytes);
   BufferUniquePtr scratch_buffer(attention_probs, BufferDeleter(allocator));
@@ -198,6 +199,11 @@ class GQAAttentionBase {
         math::GemmEx<float, ThreadPool>(CblasNoTrans, CblasTrans, sequence_length, total_seqlen, head_size, alpha, q,
                                         static_cast<int>(head_size), k, static_cast<int>(head_size), 0.0f /*beta*/, output,
                                         static_cast<int>(present_buffer_sequence_length), nullptr);
+        // TODO(fajin): update later
+        // } else if (MlasHGemmSupported(CblasNoTrans, CblasTrans)) {
+        //   MlasGemm(CblasNoTrans, CblasTrans, sequence_length, total_seqlen, head_size,
+        //            q, static_cast<size_t>(head_size), k, static_cast<size_t>(head_size), output,
+        //            static_cast<size_t>(present_buffer_sequence_length), alpha, 0.0f /*beta*/, nullptr);
       } else {
         size_t bytes = head_size * (sequence_length + total_seqlen) * sizeof(float);
         auto q_k_fp32 = allocator->Alloc(bytes);
diff --git a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc
index 28e4ccec09b32..90e6516ff45d1 100644
--- a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc
+++ b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc
@@ -530,6 +530,222 @@ Status MatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const {
   return Status::OK();
 }
 
+Status DP4AMatMulQuantizeProgram::GenerateShaderCode(ShaderHelper& shader) const {
+  shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias);
+  shader.AddOutput("output", ShaderUsage::UseUniform);
+  shader.AddOutput("scales", ShaderUsage::UseUniform);
+
+  shader.AdditionalImplementation() << R"ADDNL_FN(
+    var<workgroup> max_values : array<input_a_element_t, 4>;
+  )ADDNL_FN";
+
+  shader.MainFunctionBody() << R"MAIN_FN(
+  var local_a = input_a[global_idx];
+  var max_val = subgroupMax(abs(local_a));
+  var max_temp = max(max_val.xy, max_val.zw);
+  var scale = max(max_temp[0], max_temp[1]);
+  if (local_idx % sg_size == 0) {
+    max_values[local_idx / sg_size] = scale;
+  }
+  workgroupBarrier();
+
+  if (sg_size == 8)
+  {
+    scale = max(max_values[0], max_values[1]);
+    scale = max(scale, max_values[2]);
+    scale = max(scale, max_values[3]);
+  }
+  else if (sg_size == 16)
+  {
+    scale = max(max_values[0], max_values[1]);
+  }
+  else
+  {
+    scale = max_values[0];
+  }
+
+  var norm_a = local_a/scale;
+  output[global_idx] = pack4x8snorm(vec4<f32>(norm_a));
+  if (local_idx == 0)
+  {
+    // 127 is the max value of signed int8 [-127,127] used by pack4x8snorm for 1.0f.
+    scales[workgroup_idx] = scale/127;
+  }
+)MAIN_FN";
+  return Status::OK();
+}
+
+Status DP4AMatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const {
+  shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias);
+  shader.AddInput("scales_a", ShaderUsage::UseUniform);
+  shader.AddInput("input_b", ShaderUsage::UseUniform);
+  shader.AddInput("scales_b", ShaderUsage::UseUniform);
+  shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseElementTypeAlias);
+
+  // This shader implements co-operative matrix multiply. The key idea here is to
+  // assume there is a medium-size matrix multiply primitive that a subgroup can perform,
+  // using all its lanes and pooling all its registers to keep the values in registers.
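For reference, the quantize pass above reduces to a symmetric per-block int8 scheme: scale = max(|x|)/127 over a block, q = round(x/scale). A minimal scalar sketch (the function name and block handling are illustrative; the shader performs the max-reduction with subgroupMax and packs with pack4x8snorm):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Scalar model of the per-block symmetric int8 quantization of A.
    // Dequantization is q[i] * scale, matching the shader's scale/127 output.
    void QuantizeBlockInt8(const float* x, std::size_t block_size,
                           std::vector<std::int8_t>& q, float& scale) {
      float max_abs = 0.0f;
      for (std::size_t i = 0; i < block_size; ++i) {
        max_abs = std::max(max_abs, std::fabs(x[i]));
      }
      scale = max_abs / 127.0f;  // 127 is the max magnitude of signed int8 used here
      q.resize(block_size);
      for (std::size_t i = 0; i < block_size; ++i) {
        const float v = scale > 0.0f ? x[i] / scale : 0.0f;
        q[i] = static_cast<std::int8_t>(std::lrintf(std::clamp(v, -127.0f, 127.0f)));
      }
    }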
+  //
+  // The entire workgroup, which has N subgroups, first loads a tile into shared memory.
+  // Then each subgroup loads a subtile from shared memory into registers and uses
+  // the medium-size matrix multiply primitive to perform the math.
+  // The values for tile/subtile size are chosen to conform to the resource limits
+  // of an Alder Lake/Tiger Lake GPU. A tile is 64x64, a workgroup is 256 threads -
+  // therefore there are 16 subgroups and 16 lanes in each subgroup.
+  // K, the hidden dimension, is paged in from RAM at the K tile size, which is 64.
+  // All this puts the shared memory requirement slightly above 16KB.
+  // The WebGPU limit is 16KB, so the output is kept in registers instead of SHM to make
+  // everything fit in shared memory.
+  //
+  // Each subgroup performs a 16 x 64 x 16 multiply, which is implemented with
+  // subgroup shuffle as a placeholder for the day the medium matrix mul primitive
+  // becomes available in WGSL. The register requirement is ~2KB per subgroup; on
+  // Alder Lake/Tiger Lake a subgroup has 8KB of register space, pooling the
+  // 512B of registers from each lane.
+  //
+  // The medium-size matmul is implemented using dot4I8Packed, so the inputs for
+  // this shader require A to be int8 quantized with block size 64. B is regular
+  // matmulnbits input with block size 32.
+
+  shader.AdditionalImplementation() << R"ADDNL_FN(
+  const tile_size = 64;
+  const subtile_size = 16;
+  const tile_size_k = 32;
+  const vec_factor = 4;
+  const u32_factor = 4;
+  const tile_size_k_vec = 4;
+  const block_size = 32;
+
+  // Shared memory
+  var<workgroup> tile_A : array<array<vec2<u32>, tile_size_k_vec>, tile_size>;  // 64 x 32
+  var<workgroup> scale_A : array<output_element_t, tile_size>;                  // 64 x 1
+  var<workgroup> tile_B : array<array<vec2<u32>, tile_size_k_vec>, tile_size>;  // 64 x 32
+  var<workgroup> scale_B : array<output_element_t, tile_size>;                  // 64 x 1
+
+  // Private memory
+  var<private> lane_output: array<output_element_t, 16>;
+
+  fn loadSHMA(a_global_base:u32, kidx_v:u32, row: u32, col: u32)
+  {
+    let a_global = a_global_base + row;
+    if (a_global >= uniforms.M)
+    {
+      return;
+    }
+    tile_A[row][col] = input_a[a_global*uniforms.K8+kidx_v+col];
+    if (col == 0)
+    {
+      // kidx_v - covers 8 values of k
+      scale_A[row] = scales_a[a_global*(uniforms.K/128) + kidx_v/16];
+    }
+  }
+
+  fn loadSHMB(b_global_base:u32, kidx_v:u32, row: u32, col: u32)
+  {
+    let b_global = b_global_base + row;
+    if (b_global >= uniforms.N)
+    {
+      return;
+    }
+
+    let b_value = input_b[b_global*uniforms.K8+kidx_v+col];
+    var b_value_lower = vec4<i32>(unpack4xU8(b_value & 0x0F0F0F0Fu)) - vec4<i32>(8);
+    var b_value_upper = vec4<i32>(unpack4xU8((b_value >> 4) & 0x0F0F0F0Fu)) - vec4<i32>(8);
+    tile_B[row][col][0] = pack4xI8(vec4<i32>(b_value_lower[0], b_value_upper[0], b_value_lower[1], b_value_upper[1]));
+    tile_B[row][col][1] = pack4xI8(vec4<i32>(b_value_lower[2], b_value_upper[2], b_value_lower[3], b_value_upper[3]));
+    if (col == 0)
+    {
+      // kidx_v - each kidx_v covers 8 values of k
+      scale_B[row] = scales_b[b_global*(uniforms.K/32) + kidx_v/4];
+    }
+  }
+
+  fn DP4AI(a : vec4<u32>, b : vec4<u32>) -> i32
+  {
+    var local_sum = dot4I8Packed(a[0], b[0]);
+    local_sum += dot4I8Packed(a[1], b[1]);
+    local_sum += dot4I8Packed(a[2], b[2]);
+    local_sum += dot4I8Packed(a[3], b[3]);
+    return local_sum;
+  }
+
+)ADDNL_FN";
+
+  shader.MainFunctionBody() << R"MAIN_FN(
+  // During the load phase we use all 256 threads to load 64 rows of A/B.
+  // For each row we load 4 vectorized elements, which are 32 elements of K.
+ let a_global_base = workgroup_id.x * tile_size; + let b_global_base = workgroup_id.y * tile_size; + let load_row = u32(local_idx/4); + let load_col = u32(local_idx%4); + + // During the compute phase, we have the 64x64 tile split into + // subtiles of 16x16. We have a grid of 4x4 subtiles. + let subtile_id = u32(local_idx / subtile_size); + let subtile_idx = u32(subtile_id / 4); + let subtile_idy = u32(subtile_id % 4); + let base_A = subtile_idx * 16; + let base_B = subtile_idy * 16; + // For each subtile we have 16 threads assigned. + let a_idx = u32(local_idx % subtile_size); + + // K's vectrorization is 8 items per index. See input_a/input_b. + // tile_size_k_vec - is the k tile size in vectorized k units/space (1/8). + for (var kidx_v:u32 = 0; kidx_v < uniforms.K8; kidx_v+=tile_size_k_vec) + { + // Populate shared memory for the workgroup + loadSHMA(a_global_base, kidx_v, load_row, load_col); + loadSHMB(b_global_base, kidx_v, load_row, load_col); + workgroupBarrier(); + + var own_a0: vec4 = vec4(tile_A[base_A + a_idx][0], tile_A[base_A + a_idx][1]); + var own_a1: vec4 = vec4(tile_A[base_A + a_idx][2], tile_A[base_A + a_idx][3]); + var own_scale_a = scale_A[base_A + a_idx]; + if (sg_size == 16) + { + var own_b0: vec4 = vec4(tile_B[base_B + sg_id][0], tile_B[base_B + sg_id][1]); + var own_b1: vec4 = vec4(tile_B[base_B + sg_id][2], tile_B[base_B + sg_id][3]); + var own_scale_b = scale_B[base_B + sg_id]; + for (var col:u32 = 0; col < 16; col++) + { + var local_scale_b = subgroupShuffle(own_scale_b, col); + local_scale_b = local_scale_b * own_scale_a; + var local_sum = DP4AI(own_a0, subgroupShuffle(own_b0, col)); + local_sum += DP4AI(own_a1, subgroupShuffle(own_b1, col)); + lane_output[col] += (output_element_t(local_sum) * local_scale_b); + } + } + else + { + for (var col:u32 = 0; col < 16; col++) + { + var b0: vec4 = vec4(tile_B[base_B + col][0], tile_B[base_B + col][1]); + var b1: vec4 = vec4(tile_B[base_B + col][2], tile_B[base_B + col][3]); + var local_sum = DP4AI(own_a0, b0); + local_sum += DP4AI(own_a1, b1); + lane_output[col] += (output_element_t(local_sum) * own_scale_a * scale_B[base_B + col]); + } + } + workgroupBarrier(); + } + + let a_global = a_global_base + base_A + a_idx; + let b_global = b_global_base + base_B; + let output_idx = ((a_global) * uniforms.N + b_global)/4; + // This creates a shader requirement that uniforms.N % 16 == 0 + if (a_global < uniforms.M && b_global < uniforms.N) + { + for (var i:u32 = 0; i < 4; i++) + { + let lidx = i * 4; + output[output_idx+i] = vec4(lane_output[lidx], lane_output[lidx+1] , lane_output[lidx+2], lane_output[lidx+3]); + } + } +)MAIN_FN"; + + return Status::OK(); +} + Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) const { const Tensor* a = context.Input(0); const Tensor* b = context.Input(1); @@ -565,11 +781,54 @@ Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context uint32_t components = GetMaxComponents(N); const bool has_zero_points = zero_points != nullptr; + const bool has_subgroup = context.Device().HasFeature(wgpu::FeatureName::Subgroups); + // macOS - Avoid using dp4a on Metal, as it does not appear to have native dp4a support. 
+ // https://github.com/gpuweb/gpuweb/issues/2677#issuecomment-1713292226 + const bool use_dp4a = has_subgroup && context.AdapterInfo().backendType != wgpu::BackendType::Metal; + if (accuracy_level_ == 4 && block_size == 32 && + batch_count == 1 && components_a == 4 && K % 64 == 0 && N % 16 == 0 && + !has_zero_points && use_dp4a && M >= kMinMForTileOptimization) { + constexpr uint32_t kVec4Components = 4; + constexpr uint32_t kVec2Components = 2; + constexpr uint32_t kU32Components = 4; + + constexpr uint32_t kBlockSizeA = 128; + DP4AMatMulQuantizeProgram quantize_program; + quantize_program.SetWorkgroupSize(32); + quantize_program.SetDispatchGroupSize(M * K / kBlockSizeA, 1, 1); + TensorShape a_quant_shape{1, M, K / kU32Components}; + Tensor a_quant = context.CreateGPUTensor(DataTypeImpl::GetType(), a_quant_shape); + TensorShapeVector a_scales_dims({1, 1, M, K / kBlockSizeA}); + Tensor a_scale = context.CreateGPUTensor(a->DataType(), a_scales_dims); + quantize_program.AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow(kVec4Components)}}) + .AddOutputs({{&a_quant, ProgramTensorMetadataDependency::Rank, a_quant.Shape(), gsl::narrow(1)}, + {&a_scale, ProgramTensorMetadataDependency::Rank, a_scale.Shape(), gsl::narrow(1)}}); + ORT_RETURN_IF_ERROR(context.RunProgram(quantize_program)); + + constexpr uint32_t kTileSize = 64; + TensorShape reshaped_y_shape{1, M, N / kVec4Components}; + DP4AMatMulNBitsProgram mul_program; + mul_program.SetWorkgroupSize(256); + mul_program.SetDispatchGroupSize( + (M + kTileSize - 1) / kTileSize, + (N + kTileSize - 1) / kTileSize, 1); + mul_program.AddInputs({{&a_quant, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow(kVec2Components)}, + {&a_scale, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow(1)}, + {b, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow(kU32Components)}, + {scales, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow(1)}}) + .AddUniformVariables({{static_cast(M)}, + {static_cast(N)}, + {static_cast(K)}, + {static_cast(K / 8)}, + {static_cast(K / 16)}}) + .AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank, reshaped_y_shape, gsl::narrow(kVec4Components)}); + return context.RunProgram(mul_program); + } // TODO: Support output_number > 1. Some cases are failed when output_number > 1. constexpr uint32_t output_number = 1; const uint32_t tile_m = M > kMinMForTileOptimization ? 
4 : 1; - const bool use_subgroup = context.Device().HasFeature(wgpu::FeatureName::Subgroups) && context.AdapterInfo().vendor == std::string_view{"intel"} && components_a == 4 && block_size == 32; + const bool use_subgroup = has_subgroup && context.AdapterInfo().vendor == std::string_view{"intel"} && components_a == 4 && block_size == 32; MatMulNBitsProgram program{output_number, block_size, tile_m, gsl::narrow(components_b), has_zero_points, use_subgroup}; if (M > kMinMForTileOptimization && block_size == 32) { components = 1; diff --git a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.h b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.h index ca3c485566d50..a2470d9268907 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.h +++ b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.h @@ -35,6 +35,24 @@ class MatMulNBitsProgram final : public Program { bool use_subgroup_; }; +class DP4AMatMulQuantizeProgram final : public Program { + public: + DP4AMatMulQuantizeProgram() : Program{"DP4AMatMulQuantize"} {} + Status GenerateShaderCode(ShaderHelper& sh) const override; +}; + +class DP4AMatMulNBitsProgram final : public Program { + public: + DP4AMatMulNBitsProgram() : Program{"DP4AMatMulNBits"} {} + Status GenerateShaderCode(ShaderHelper& sh) const override; + WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES( + {"M", ProgramUniformVariableDataType::Uint32}, + {"N", ProgramUniformVariableDataType::Uint32}, + {"K", ProgramUniformVariableDataType::Uint32}, + {"K8", ProgramUniformVariableDataType::Uint32}, + {"K16", ProgramUniformVariableDataType::Uint32}); +}; + class MatMulNBits final : public WebGpuKernel { public: MatMulNBits(const OpKernelInfo& info) : WebGpuKernel(info) { @@ -42,6 +60,7 @@ class MatMulNBits final : public WebGpuKernel { N_ = info.GetAttr("N"); block_size_ = info.GetAttr("block_size"); int64_t bits = info.GetAttr("bits"); + accuracy_level_ = info.GetAttrOrDefault("accuracy_level", 4); ORT_ENFORCE(bits == 4, "Only 4b quantization is supported for MatMulNBits op, additional bits support is planned."); } @@ -52,6 +71,7 @@ class MatMulNBits final : public WebGpuKernel { int64_t K_; int64_t N_; int64_t block_size_; + int64_t accuracy_level_; }; } // namespace webgpu diff --git a/onnxruntime/core/graph/function_template.h b/onnxruntime/core/graph/function_template.h index 978174d943f14..0d3fee18d5d59 100644 --- a/onnxruntime/core/graph/function_template.h +++ b/onnxruntime/core/graph/function_template.h @@ -2,7 +2,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" namespace onnxruntime { diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h index 207c058d899b4..7e0335cc66ef0 100644 --- a/onnxruntime/core/mlas/inc/mlas.h +++ b/onnxruntime/core/mlas/inc/mlas.h @@ -1458,7 +1458,107 @@ MlasRotaryEmbedOneRow( T* output ); - /** +/** + * @brief Supply matrices data information to half precision gemm functions + */ +struct MLAS_HGEMM_DATA_PARAMS { + const MLAS_FP16* A; /**< Supplies the address of matrix A */ + size_t lda; /**< Supplies the first dimension of matrix A. */ + const MLAS_FP16* B; /**< Supplies the address of matrix B */ + size_t ldb; /**< Supplies the first dimension of matrix B. */ + MLAS_FP16* C; /**< Supplies the address of matrix C */ + size_t ldc; /**< Supplies the first dimension of matrix C. */ + uint16_t alpha; /**< Supplies the scalar alpha multiplier (see GEMM definition). FP16 encoding. 
*/ + uint16_t beta; /**< Supplies the scalar beta multiplier (see GEMM definition). FP16 encoding. */ +}; + +/** + * @brief Check whether current CPU supports half precision gemm. + */ +bool +MLASCALL +MlasHGemmSupported( + CBLAS_TRANSPOSE TransA, + CBLAS_TRANSPOSE TransB + ); + +/** + * @brief Batched half precision matrix/matrix multiply operation (HGEMM) + * + * @param TransA Supplies the transpose operation for matrix A. + * @param TransB Supplies the transpose operation for matrix B. + * @param M Supplies the number of rows of matrix A and matrix C. + * @param N Supplies the number of columns of matrix B and matrix C. + * @param K Supplies the number of columns of matrix A and the number of rows of matrix B. + * @param Data A array of matrices data parameters + * @param BatchSize Supplies number of multiplications in this batch + * @param ThreadPool Supplies the thread pool object to use, else nullptr if the + base library threading support should be used. + */ +void +MLASCALL +MlasGemmBatch( + CBLAS_TRANSPOSE TransA, + CBLAS_TRANSPOSE TransB, + size_t M, + size_t N, + size_t K, + const MLAS_HGEMM_DATA_PARAMS* Data, + size_t BatchSize, + MLAS_THREADPOOL* ThreadPool + ); + +/** + * @brief half precision matrix/matrix multiply operation (HGEMM) + * C = alpha * op(A) * op(B) + beta * C + * + * @param TransA Supplies the transpose operation for matrix A. Currently only support CblasNoTrans. + * @param TransB Supplies the transpose operation for matrix B. Currently only support CblasTrans. + * @param M Supplies the number of rows of matrix A and matrix C. + * @param N Supplies the number of columns of matrix B and matrix C. + * @param K Supplies the number of columns of matrix A and the number of rows of matrix B. + * @param A Supplies the address of matrix A + * @param lda Supplies the first dimension of matrix A. + * @param B Supplies the address of matrix B + * @param ldb Supplies the first dimension of matrix B. + * @param C Supplies the address of matrix C + * @param ldc Supplies the first dimension of matrix C. + * @param alpha Supplies the scalar alpha multiplier (see GEMM definition) + * @param beta Supplies the scalar beta multiplier (see GEMM definition) + * @param ThreadPool Supplies the thread pool object to use, else nullptr if the base library threading support + * should be used. + */ +inline +void +MlasGemm( + CBLAS_TRANSPOSE TransA, + CBLAS_TRANSPOSE TransB, + size_t M, + size_t N, + size_t K, + const MLAS_FP16* A, + size_t lda, + const MLAS_FP16* B, + size_t ldb, + MLAS_FP16* C, + size_t ldc, + uint16_t alpha, + uint16_t beta, + MLAS_THREADPOOL* ThreadPool +) { + MLAS_HGEMM_DATA_PARAMS Data; + Data.A = A; + Data.lda = lda; + Data.B = B; + Data.ldb = ldb; + Data.C = C; + Data.ldc = ldc; + Data.alpha = alpha; + Data.beta = beta; + MlasGemmBatch(TransA, TransB, M, N, K, &Data, 1, ThreadPool); +} + +/** * @brief Whether current CPU supports FP16 acceleration. 
*/ bool MLASCALL diff --git a/onnxruntime/core/mlas/lib/fp16_common.h b/onnxruntime/core/mlas/lib/fp16_common.h index f4c49905ebbd7..acee567162b9d 100644 --- a/onnxruntime/core/mlas/lib/fp16_common.h +++ b/onnxruntime/core/mlas/lib/fp16_common.h @@ -349,4 +349,103 @@ MlasBitwiseSelectFloat16x4(MLAS_UINT16X4 select, MLAS_FLOAT16X4 ones, MLAS_FLOAT return vbsl_f16(select, ones, zeros); } +MLAS_FORCEINLINE +void +Transpose8x8(MLAS_FLOAT16X8& v0, MLAS_FLOAT16X8& v1, MLAS_FLOAT16X8& v2, MLAS_FLOAT16X8& v3, + MLAS_FLOAT16X8& v4, MLAS_FLOAT16X8& v5, MLAS_FLOAT16X8& v6, MLAS_FLOAT16X8& v7) +{ + // |v00|v01|v02|v03|v04|v05|v06|v07| + // |v10|v11|v12|v13|v14|v15|v16|v17| + // |v20|v21|v22|v23|v24|v25|v26|v27| + // |v30|v31|v32|v33|v34|v35|v36|v37| + // |v40|v41|v42|v43|v44|v45|v46|v47| + // |v50|v51|v52|v53|v54|v55|v56|v57| + // |v60|v61|v62|v63|v64|v65|v66|v67| + // |v70|v71|v72|v73|v74|v75|v76|v77| + float16x8x2_t t01 = vtrnq_f16(v0, v1); + float16x8x2_t t23 = vtrnq_f16(v2, v3); + float16x8x2_t t45 = vtrnq_f16(v4, v5); + float16x8x2_t t67 = vtrnq_f16(v6, v7); + // |v00|v10|v02|v12|v04|v14|v06|v16| + // |v01|v11|v03|v13|v05|v15|v07|v17| + // |v20|v30|v22|v32|v24|v34|v26|v36| + // |v21|v31|v23|v33|v25|v35|v27|v37| + // |v40|v50|v42|v52|v44|v54|v46|v56| + // |v41|v51|v43|v53|v45|v55|v47|v57| + // |v60|v70|v62|v72|v64|v74|v66|v76| + // |v61|v71|v63|v73|v65|v75|v67|v77| + float32x4x2_t t02 = vtrnq_f32(vreinterpretq_f32_f16(t01.val[0]), vreinterpretq_f32_f16(t23.val[0])); + float32x4x2_t t13 = vtrnq_f32(vreinterpretq_f32_f16(t01.val[1]), vreinterpretq_f32_f16(t23.val[1])); + float32x4x2_t t46 = vtrnq_f32(vreinterpretq_f32_f16(t45.val[0]), vreinterpretq_f32_f16(t67.val[0])); + float32x4x2_t t57 = vtrnq_f32(vreinterpretq_f32_f16(t45.val[1]), vreinterpretq_f32_f16(t67.val[1])); + // |v00|v10|v20|v30|v04|v14|v24|v34| + // |v01|v11|v21|v31|v05|v15|v25|v35| + // |v02|v12|v22|v32|v06|v16|v26|v36| + // |v03|v13|v23|v33|v07|v17|v27|v37| + // |v40|v50|v60|v70|v44|v54|v64|v74| + // |v41|v51|v61|v71|v45|v55|v65|v75| + // |v42|v52|v62|v72|v46|v56|v66|v76| + // |v43|v53|v63|v73|v47|v57|v67|v77| + v0 = vreinterpretq_f16_f64(vtrn1q_f64(vreinterpretq_f64_f32(t02.val[0]), vreinterpretq_f64_f32(t46.val[0]))); + v4 = vreinterpretq_f16_f64(vtrn2q_f64(vreinterpretq_f64_f32(t02.val[0]), vreinterpretq_f64_f32(t46.val[0]))); + v2 = vreinterpretq_f16_f64(vtrn1q_f64(vreinterpretq_f64_f32(t02.val[1]), vreinterpretq_f64_f32(t46.val[1]))); + v6 = vreinterpretq_f16_f64(vtrn2q_f64(vreinterpretq_f64_f32(t02.val[1]), vreinterpretq_f64_f32(t46.val[1]))); + v1 = vreinterpretq_f16_f64(vtrn1q_f64(vreinterpretq_f64_f32(t13.val[0]), vreinterpretq_f64_f32(t57.val[0]))); + v5 = vreinterpretq_f16_f64(vtrn2q_f64(vreinterpretq_f64_f32(t13.val[0]), vreinterpretq_f64_f32(t57.val[0]))); + v3 = vreinterpretq_f16_f64(vtrn1q_f64(vreinterpretq_f64_f32(t13.val[1]), vreinterpretq_f64_f32(t57.val[1]))); + v7 = vreinterpretq_f16_f64(vtrn2q_f64(vreinterpretq_f64_f32(t13.val[1]), vreinterpretq_f64_f32(t57.val[1]))); + // |v00|v10|v20|v30|v40|v50|v60|v70| + // |v01|v11|v21|v31|v41|v51|v61|v71| + // |v02|v12|v22|v32|v42|v52|v62|v72| + // |v03|v13|v23|v33|v43|v53|v63|v73| + // |v04|v14|v24|v34|v44|v54|v64|v74| + // |v05|v15|v25|v35|v45|v55|v65|v75| + // |v06|v16|v26|v36|v46|v56|v66|v76| + // |v07|v17|v27|v37|v47|v57|v67|v77| +} + +MLAS_FORCEINLINE +void +Transpose4x8(MLAS_FLOAT16X8& v0, MLAS_FLOAT16X8& v1, MLAS_FLOAT16X8& v2, MLAS_FLOAT16X8& v3) +{ + // |v00|v01|v02|v03|v04|v05|v06|v07| + // |v10|v11|v12|v13|v14|v15|v16|v17| + // |v20|v21|v22|v23|v24|v25|v26|v27| 
+ // |v30|v31|v32|v33|v34|v35|v36|v37| + // => + // |v00|v10|v20|v30|v04|v14|v24|v34| + // |v01|v11|v21|v31|v05|v15|v25|v35| + // |v02|v12|v22|v32|v06|v16|v26|v36| + // |v03|v13|v23|v33|v07|v17|v27|v37| + float16x8x2_t t01 = vtrnq_f16(v0, v1); + float16x8x2_t t23 = vtrnq_f16(v2, v3); + + v0 = vreinterpretq_f16_f32(vtrn1q_f32(vreinterpretq_f32_f16(t01.val[0]), vreinterpretq_f32_f16(t23.val[0]))); + v2 = vreinterpretq_f16_f32(vtrn2q_f32(vreinterpretq_f32_f16(t01.val[0]), vreinterpretq_f32_f16(t23.val[0]))); + v1 = vreinterpretq_f16_f32(vtrn1q_f32(vreinterpretq_f32_f16(t01.val[1]), vreinterpretq_f32_f16(t23.val[1]))); + v3 = vreinterpretq_f16_f32(vtrn2q_f32(vreinterpretq_f32_f16(t01.val[1]), vreinterpretq_f32_f16(t23.val[1]))); +} + +MLAS_FORCEINLINE +void +Transpose4x4(MLAS_FLOAT16X4& v0, MLAS_FLOAT16X4& v1, MLAS_FLOAT16X4& v2, MLAS_FLOAT16X4& v3) +{ + // |v00|v01|v02|v03| + // |v10|v11|v12|v13| + // |v20|v21|v22|v23| + // |v30|v31|v32|v33| + // => + // |v00|v10|v20|v30| + // |v01|v11|v21|v31| + // |v02|v12|v22|v32| + // |v03|v13|v23|v33| + float16x4x2_t t01 = vtrn_f16(v0, v1); + float16x4x2_t t23 = vtrn_f16(v2, v3); + + v0 = vreinterpret_f16_f32(vtrn1_f32(vreinterpret_f32_f16(t01.val[0]), vreinterpret_f32_f16(t23.val[0]))); + v1 = vreinterpret_f16_f32(vtrn1_f32(vreinterpret_f32_f16(t01.val[1]), vreinterpret_f32_f16(t23.val[1]))); + v2 = vreinterpret_f16_f32(vtrn2_f32(vreinterpret_f32_f16(t01.val[0]), vreinterpret_f32_f16(t23.val[0]))); + v3 = vreinterpret_f16_f32(vtrn2_f32(vreinterpret_f32_f16(t01.val[1]), vreinterpret_f32_f16(t23.val[1]))); +} + #endif // fp16 vector intrinsic supported diff --git a/onnxruntime/core/mlas/lib/halfgemm.cpp b/onnxruntime/core/mlas/lib/halfgemm.cpp index 49387d2fc998f..65ab0e9ce4630 100644 --- a/onnxruntime/core/mlas/lib/halfgemm.cpp +++ b/onnxruntime/core/mlas/lib/halfgemm.cpp @@ -324,6 +324,176 @@ MlasHalfGemmKernel( } } +bool +MLASCALL +MlasHGemmSupported( + CBLAS_TRANSPOSE TransA, + CBLAS_TRANSPOSE TransB +) { + auto* dispatch = GetMlasPlatform().HGemmDispatch; + if (TransA == CblasNoTrans && TransB == CblasTrans) { + return dispatch && + dispatch->HGemmKernel_TransposedB && + dispatch->HPackBKernel_TransposedB && + dispatch->HGemmKernel_TransposedPackedB; + } + + return false; +} + +void +HGemmOperation( + CBLAS_TRANSPOSE TransA, + CBLAS_TRANSPOSE TransB, + size_t K, // full K slice + const MLAS_HGEMM_DATA_PARAMS* DataParams, + const size_t RangeStartM, + const size_t RangeCountM, + const size_t RangeStartN, + const size_t RangeCountN +) { + const size_t lda = DataParams->lda; + const size_t ldb = DataParams->ldb; + const size_t ldc = DataParams->ldc; + const _mlas_fp16_ alpha = DataParams->alpha; + const _mlas_fp16_ beta = DataParams->beta; + auto* dispatch = GetMlasPlatform().HGemmDispatch; + constexpr size_t StrideM = 2; + const auto beta_add = MLAS_FP16(1.0f); + constexpr size_t buffer_size = MLAS_HGEMM_STRIDEN * MLAS_HGEMM_STRIDEK; + MLAS_DECLSPEC_ALIGN(MLAS_FP16 PackedB[buffer_size], 16 * sizeof(_mlas_fp16_)); + + if (TransA == CblasNoTrans && TransB == CblasTrans) { + const auto* A = DataParams->A + RangeStartM * lda; + const auto* B = DataParams->B + RangeStartN * ldb; + auto* C = DataParams->C + RangeStartM * ldc + RangeStartN; + + if (RangeCountM <= StrideM) { + if (!dispatch || !dispatch->HGemmKernel_TransposedB) { + MLAS_THROW_EX(std::runtime_error, "hgemm does not have A x Transposed(B) kernels"); + } + // When M is small, B is visited once. The overhead of Pack(B') exceeds the benefits + // from A x Pack(B'). 
Therefore directly calculate A x B'. + // Without PackB, to utilize memory locality, iterate full K. + constexpr size_t StrideN = 16; + for (size_t n = 0, countN; n < RangeCountN; n += countN) { + countN = std::min(StrideN, RangeCountN - n); + dispatch->HGemmKernel_TransposedB(A, B, C, RangeCountM, countN, K, lda, ldb, ldc, alpha, beta); + B += countN * ldb; + C += countN; + } + } else { + if (!dispatch || !dispatch->HPackBKernel_TransposedB || !dispatch->HGemmKernel_TransposedPackedB) { + MLAS_THROW_EX(std::runtime_error, "hgemm does not have A x Transposed(B) kernels"); + } + // 16N is the smallest pack unit. + const size_t StrideK = std::min(K, size_t(MLAS_HGEMM_STRIDEK)); + const size_t StrideN = buffer_size/StrideK & (~15); // >= MLAS_HGEMM_STRIDEN + for (size_t n = 0, countN; n < RangeCountN; n += countN) { + countN = std::min(StrideN, RangeCountN - n); + const MLAS_FP16* a = A; + const MLAS_FP16* b = B; + MLAS_FP16* c = C; + for (size_t k = 0, countK; k < K; k += countK) { + countK = std::min(StrideK, K - k); + dispatch->HPackBKernel_TransposedB(b, PackedB, countN, countK, ldb); + const MLAS_FP16* aa = a; + MLAS_FP16* cc = c; + for (size_t m = 0, countM; m < RangeCountM; m += countM) { + countM = std::min(StrideM, RangeCountM - m); + // First K iteration, beta is applied to the whole C. In rest K iterations, use add mode. + dispatch->HGemmKernel_TransposedPackedB( + aa, PackedB, cc, countM, countN, countK, lda, ldc, alpha, k == 0 ? beta : beta_add.val); + aa += countM * lda; + cc += countM * ldc; + } + a += countK; + b += countK; + } + B += countN * ldb; + C += countN; + } + } + } else { + MLAS_THROW_EX(std::runtime_error, "hgemm currently only support A x Transpoe(B)"); + } +} + +void +MLASCALL +MlasGemmBatch( + CBLAS_TRANSPOSE TransA, + CBLAS_TRANSPOSE TransB, + size_t M, + size_t N, + size_t K, + const MLAS_HGEMM_DATA_PARAMS* Data, + size_t BatchSize, + MLAS_THREADPOOL* ThreadPool +) { + if (!ThreadPool) { + for (size_t gemm_i = 0; gemm_i < BatchSize; gemm_i++) { + HGemmOperation(TransA, TransB, K, &Data[gemm_i], 0, M, 0, N); + } + return; + } + + const double Complexity = double(M) * double(N) * double(K) * double(BatchSize); + ptrdiff_t TargetThreadCount; + + if (Complexity < double(MLAS_HGEMM_THREAD_COMPLEXITY) * GetMlasPlatform().MaximumThreadCount) { + TargetThreadCount = ptrdiff_t(Complexity / double(MLAS_HGEMM_THREAD_COMPLEXITY)) + 1; + } else { + TargetThreadCount = GetMlasPlatform().MaximumThreadCount; + } + + ptrdiff_t MaximumThreadCount = MlasGetMaximumThreadCount(ThreadPool); + if (TargetThreadCount >= MaximumThreadCount) { + TargetThreadCount = MaximumThreadCount; + } + + // Segment the operation across multiple threads. 
+ + ptrdiff_t ThreadsPerGemm = TargetThreadCount / BatchSize; + if (ThreadsPerGemm < 1) { + ThreadsPerGemm = 1; + } + + constexpr size_t StrideM = 128; + + size_t nc = N; + if (ThreadsPerGemm > 1) { + // more than one thread per GEMM + + const size_t BlockedM = MlasDivRoundup(M, StrideM); + const size_t max_nc = MlasDivRoundup(N * BlockedM, ThreadsPerGemm); + if (max_nc < nc) { + nc = std::min( + nc, MlasDivRoundup(max_nc, MLAS_HGEMM_STRIDEN_THREAD_ALIGN) * MLAS_HGEMM_STRIDEN_THREAD_ALIGN); + } + } + const size_t StrideN = nc; + + const size_t ThreadCountM = MlasDivRoundup(M, StrideM); + const size_t ThreadCountN = MlasDivRoundup(N, StrideN); + ThreadsPerGemm = ThreadCountM * ThreadCountN; + + MlasTrySimpleParallel(ThreadPool, ThreadsPerGemm * static_cast(BatchSize), [&](ptrdiff_t tid) { + const auto gemm_i = tid / ThreadsPerGemm; + const auto blk_i = tid % ThreadsPerGemm; + + const ptrdiff_t ThreadIdN = blk_i / ThreadCountM; + const ptrdiff_t ThreadIdM = blk_i % ThreadCountM; + + const size_t RangeStartM = ThreadIdM * StrideM; + const size_t RangeCountM = std::min(M - RangeStartM, (size_t)StrideM); + + const size_t RangeStartN = ThreadIdN * StrideN; + const size_t RangeCountN = std::min(N - RangeStartN, (size_t)StrideN); + + HGemmOperation(TransA, TransB, K, &Data[gemm_i], RangeStartM, RangeCountM, RangeStartN, RangeCountN); + }); +} const MLAS_HALFGEMM_DISPATCH MlasHalfGemmDispatchDefault = { MlasHalfGemmOperation, diff --git a/onnxruntime/core/mlas/lib/halfgemm.h b/onnxruntime/core/mlas/lib/halfgemm.h index 61e2fbb0afc6a..e280e6d40973f 100644 --- a/onnxruntime/core/mlas/lib/halfgemm.h +++ b/onnxruntime/core/mlas/lib/halfgemm.h @@ -513,3 +513,125 @@ MlasHalfGemmGetDispatch() return &MlasHalfGemmDispatchDefault; #endif } + +namespace hgemm_neon { + +void HPackB_TransposedB_Kernel( + const MLAS_FP16* B, + MLAS_FP16* PackedB, + size_t CountN, + size_t CountK, + size_t ldb +); + +void HGemm_TransposedB_Kernel( + const MLAS_FP16* A, + const MLAS_FP16* B, + MLAS_FP16* C, + size_t CountM, + size_t CountN, + size_t CountK, + size_t lda, + size_t ldb, + size_t ldc, + _mlas_fp16_ alpha, + _mlas_fp16_ beta +); + +void HGemm_TransposedPackedB_Kernel( + const MLAS_FP16* A, + const MLAS_FP16* PackedB, + MLAS_FP16* C, + size_t CountM, + size_t CountN, + size_t CountK, + size_t lda, + size_t ldc, + _mlas_fp16_ alpha, + _mlas_fp16_ beta +); + +} // namespace hgemm_neon + +struct MLAS_HGEMM_DISPATCH { + /** + * @brief Pack the B matrix segment. B is column-major. Elements from CountK rows x N columns are packed + * continuously in row-major. + * First pack CountK rows x 16 columns, then pack CountK rows x 8 columns. + * If there are < 8 columns left, pad the columns with 0. + * @param B the first element of the B matrix segment. Column major. + * @param[out] PackedB the first element of the packed B matrix segment. + * @param CountN the number of columns of B chunk. + * @param CountK the number of rows of B chunk. + */ + typedef void(HPackBKernel_TransposedB_Fn) ( + const MLAS_FP16* B, + MLAS_FP16* PackedB, + size_t CountN, + size_t CountK, + size_t ldb + ); + + HPackBKernel_TransposedB_Fn* HPackBKernel_TransposedB = nullptr; + + /** + * @brief C = alpha * A * Transpose(B) + beta * C. CountM <= 2. B is not packed. Used when M is small. + * + * @param A first row of the A matrix segment. Row major. + * @param B first column of the B matrix segment. Column major. + * @param[out] C first element of the output matrix segment. Row major. + * @param CountM the number of rows of A chunk. 
+ * @param CountN the number of columns of B chunk. + * @param CountK the number of columns of A chunk and the number of rows of B chunk. + * @param lda the leading dimension of A. + * @param ldb the leading dimension of B. + * @param ldc the leading dimension of C. + * @param alpha the alpha scalar value. + * @param beta the beta scalar value. + */ + typedef void(HGemmKernel_TransposedB_Fn)( + const MLAS_FP16* A, + const MLAS_FP16* B, + MLAS_FP16* C, + size_t CountM, + size_t CountN, + size_t CountK, + size_t lda, + size_t ldb, + size_t ldc, + _mlas_fp16_ alpha, + _mlas_fp16_ beta + ); + + HGemmKernel_TransposedB_Fn* HGemmKernel_TransposedB = nullptr; + + /** + * @brief C = alpha * A * Transpose(B) + beta * C. CountM <= 2. B has been packed using HPackBKernel_TransposedB_Fn. + * Use when M is large. + * + * @param A first row of the A matrix segment. Row major. + * @param PackedB first element of the packed B buffer. + * @param[out] C first element of the output matrix segment. Row major. + * @param CountM the number of rows of A chunk. + * @param CountN the number of columns of B chunk. + * @param CountK the number of columns of A chunk and the number of rows of B chunk. + * @param lda the leading dimension of A. + * @param ldc the leading dimension of C. + * @param alpha the alpha scalar value. + * @param beta the beta scalar value. + */ + typedef void(HGemmKernel_TransposedPackedB_Fn)( + const MLAS_FP16* A, + const MLAS_FP16* PackedB, + MLAS_FP16* C, + size_t CountM, + size_t CountN, + size_t CountK, + size_t lda, + size_t ldc, + _mlas_fp16_ alpha, + _mlas_fp16_ beta + ); + + HGemmKernel_TransposedPackedB_Fn* HGemmKernel_TransposedPackedB = nullptr; +}; diff --git a/onnxruntime/core/mlas/lib/halfgemm_kernel_neon_fp16.cpp b/onnxruntime/core/mlas/lib/halfgemm_kernel_neon_fp16.cpp new file mode 100644 index 0000000000000..02ce38fcb21d6 --- /dev/null +++ b/onnxruntime/core/mlas/lib/halfgemm_kernel_neon_fp16.cpp @@ -0,0 +1,1572 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + halfgemm_kernel_neon_fp16.cpp + +Abstract: + + This module implements half precision GEMM kernel for neon. 
+ +--*/ + +#include + +#include "halfgemm.h" +#include "fp16_common.h" + +namespace hgemm_neon { + +void HPackB_TransposedB_Kernel( + const MLAS_FP16* B, + MLAS_FP16* PackedB, + size_t CountN, + size_t CountK, + size_t ldb +) { + const _mlas_fp16_* B_data = reinterpret_cast(B); + _mlas_fp16_* PackedB_data = reinterpret_cast<_mlas_fp16_*>(PackedB); + + for (; CountN >= 16; CountN -= 16, B_data += 16 * ldb) { + const _mlas_fp16_* b = B_data; + size_t k = CountK; + constexpr size_t step = 8 * 16; // pack 8 * 16 + for (; k >= 8; k -= 8, b += 8, PackedB_data += step) { + float16x8_t v0 = MlasLoadFloat16x8(b); + float16x8_t v1 = MlasLoadFloat16x8(b + ldb); + float16x8_t v2 = MlasLoadFloat16x8(b + 2 * ldb); + float16x8_t v3 = MlasLoadFloat16x8(b + 3 * ldb); + float16x8_t v4 = MlasLoadFloat16x8(b + 4 * ldb); + float16x8_t v5 = MlasLoadFloat16x8(b + 5 * ldb); + float16x8_t v6 = MlasLoadFloat16x8(b + 6 * ldb); + float16x8_t v7 = MlasLoadFloat16x8(b + 7 * ldb); + float16x8_t v8 = MlasLoadFloat16x8(b + 8 * ldb); + float16x8_t v9 = MlasLoadFloat16x8(b + 9 * ldb); + float16x8_t vA = MlasLoadFloat16x8(b + 10 * ldb); + float16x8_t vB = MlasLoadFloat16x8(b + 11 * ldb); + float16x8_t vC = MlasLoadFloat16x8(b + 12 * ldb); + float16x8_t vD = MlasLoadFloat16x8(b + 13 * ldb); + float16x8_t vE = MlasLoadFloat16x8(b + 14 * ldb); + float16x8_t vF = MlasLoadFloat16x8(b + 15 * ldb); + Transpose8x8(v0, v1, v2, v3, v4, v5, v6, v7); + Transpose8x8(v8, v9, vA, vB, vC, vD, vE, vF); + + MlasStoreFloat16x8(PackedB_data, v0); + MlasStoreFloat16x8(PackedB_data + 8, v8); + MlasStoreFloat16x8(PackedB_data + 16, v1); + MlasStoreFloat16x8(PackedB_data + 24, v9); + MlasStoreFloat16x8(PackedB_data + 32, v2); + MlasStoreFloat16x8(PackedB_data + 40, vA); + MlasStoreFloat16x8(PackedB_data + 48, v3); + MlasStoreFloat16x8(PackedB_data + 56, vB); + MlasStoreFloat16x8(PackedB_data + 64, v4); + MlasStoreFloat16x8(PackedB_data + 72, vC); + MlasStoreFloat16x8(PackedB_data + 80, v5); + MlasStoreFloat16x8(PackedB_data + 88, vD); + MlasStoreFloat16x8(PackedB_data + 96, v6); + MlasStoreFloat16x8(PackedB_data + 104, vE); + MlasStoreFloat16x8(PackedB_data + 112, v7); + MlasStoreFloat16x8(PackedB_data + 120, vF); + } + + if (k & 4) { + float16x4_t v0 = MlasLoadFloat16x4(b); + float16x4_t v1 = MlasLoadFloat16x4(b + ldb); + float16x4_t v2 = MlasLoadFloat16x4(b + 2 * ldb); + float16x4_t v3 = MlasLoadFloat16x4(b + 3 * ldb); + float16x4_t v4 = MlasLoadFloat16x4(b + 4 * ldb); + float16x4_t v5 = MlasLoadFloat16x4(b + 5 * ldb); + float16x4_t v6 = MlasLoadFloat16x4(b + 6 * ldb); + float16x4_t v7 = MlasLoadFloat16x4(b + 7 * ldb); + float16x4_t v8 = MlasLoadFloat16x4(b + 8 * ldb); + float16x4_t v9 = MlasLoadFloat16x4(b + 9 * ldb); + float16x4_t vA = MlasLoadFloat16x4(b + 10 * ldb); + float16x4_t vB = MlasLoadFloat16x4(b + 11 * ldb); + float16x4_t vC = MlasLoadFloat16x4(b + 12 * ldb); + float16x4_t vD = MlasLoadFloat16x4(b + 13 * ldb); + float16x4_t vE = MlasLoadFloat16x4(b + 14 * ldb); + float16x4_t vF = MlasLoadFloat16x4(b + 15 * ldb); + Transpose4x4(v0, v1, v2, v3); + Transpose4x4(v4, v5, v6, v7); + Transpose4x4(v8, v9, vA, vB); + Transpose4x4(vC, vD, vE, vF); + MlasStoreFloat16x4(PackedB_data, v0); + MlasStoreFloat16x4(PackedB_data + 4, v4); + MlasStoreFloat16x4(PackedB_data + 8, v8); + MlasStoreFloat16x4(PackedB_data + 12, vC); + MlasStoreFloat16x4(PackedB_data + 16, v1); + MlasStoreFloat16x4(PackedB_data + 20, v5); + MlasStoreFloat16x4(PackedB_data + 24, v9); + MlasStoreFloat16x4(PackedB_data + 28, vD); + MlasStoreFloat16x4(PackedB_data + 32, v2); + 
MlasStoreFloat16x4(PackedB_data + 36, v6); + MlasStoreFloat16x4(PackedB_data + 40, vA); + MlasStoreFloat16x4(PackedB_data + 44, vE); + MlasStoreFloat16x4(PackedB_data + 48, v3); + MlasStoreFloat16x4(PackedB_data + 52, v7); + MlasStoreFloat16x4(PackedB_data + 56, vB); + MlasStoreFloat16x4(PackedB_data + 60, vF); + + k -= 4, b += 4, PackedB_data += 4 * 16; + } + + if (k > 0) { + float16x4_t v0 = MlasLoadPartialFloat16x4(b, k); + float16x4_t v1 = MlasLoadPartialFloat16x4(b + ldb, k); + float16x4_t v2 = MlasLoadPartialFloat16x4(b + 2 * ldb, k); + float16x4_t v3 = MlasLoadPartialFloat16x4(b + 3 * ldb, k); + float16x4_t v4 = MlasLoadPartialFloat16x4(b + 4 * ldb, k); + float16x4_t v5 = MlasLoadPartialFloat16x4(b + 5 * ldb, k); + float16x4_t v6 = MlasLoadPartialFloat16x4(b + 6 * ldb, k); + float16x4_t v7 = MlasLoadPartialFloat16x4(b + 7 * ldb, k); + float16x4_t v8 = MlasLoadPartialFloat16x4(b + 8 * ldb, k); + float16x4_t v9 = MlasLoadPartialFloat16x4(b + 9 * ldb, k); + float16x4_t vA = MlasLoadPartialFloat16x4(b + 10 * ldb, k); + float16x4_t vB = MlasLoadPartialFloat16x4(b + 11 * ldb, k); + float16x4_t vC = MlasLoadPartialFloat16x4(b + 12 * ldb, k); + float16x4_t vD = MlasLoadPartialFloat16x4(b + 13 * ldb, k); + float16x4_t vE = MlasLoadPartialFloat16x4(b + 14 * ldb, k); + float16x4_t vF = MlasLoadPartialFloat16x4(b + 15 * ldb, k); + Transpose4x4(v0, v1, v2, v3); + Transpose4x4(v4, v5, v6, v7); + Transpose4x4(v8, v9, vA, vB); + Transpose4x4(vC, vD, vE, vF); + MlasStoreFloat16x4(PackedB_data, v0); + MlasStoreFloat16x4(PackedB_data + 4, v4); + MlasStoreFloat16x4(PackedB_data + 8, v8); + MlasStoreFloat16x4(PackedB_data + 12, vC); + if (k > 1) { + MlasStoreFloat16x4(PackedB_data + 16, v1); + MlasStoreFloat16x4(PackedB_data + 20, v5); + MlasStoreFloat16x4(PackedB_data + 24, v9); + MlasStoreFloat16x4(PackedB_data + 28, vD); + } + if (k > 2) { + MlasStoreFloat16x4(PackedB_data + 32, v2); + MlasStoreFloat16x4(PackedB_data + 36, v6); + MlasStoreFloat16x4(PackedB_data + 40, vA); + MlasStoreFloat16x4(PackedB_data + 44, vE); + } + + PackedB_data += k * 16; + } + } + + if (CountN & 8) { + const _mlas_fp16_* b = B_data; + size_t k = CountK; + constexpr size_t step = 8 * 8; // pack 8 * 8 + for (; k >= 8; k -= 8, b += 8, PackedB_data += step) { + float16x8_t v0 = MlasLoadFloat16x8(b); + float16x8_t v1 = MlasLoadFloat16x8(b + ldb); + float16x8_t v2 = MlasLoadFloat16x8(b + 2 * ldb); + float16x8_t v3 = MlasLoadFloat16x8(b + 3 * ldb); + float16x8_t v4 = MlasLoadFloat16x8(b + 4 * ldb); + float16x8_t v5 = MlasLoadFloat16x8(b + 5 * ldb); + float16x8_t v6 = MlasLoadFloat16x8(b + 6 * ldb); + float16x8_t v7 = MlasLoadFloat16x8(b + 7 * ldb); + Transpose8x8(v0, v1, v2, v3, v4, v5, v6, v7); + + MlasStoreFloat16x8(PackedB_data, v0); + MlasStoreFloat16x8(PackedB_data + 8, v1); + MlasStoreFloat16x8(PackedB_data + 16, v2); + MlasStoreFloat16x8(PackedB_data + 24, v3); + MlasStoreFloat16x8(PackedB_data + 32, v4); + MlasStoreFloat16x8(PackedB_data + 40, v5); + MlasStoreFloat16x8(PackedB_data + 48, v6); + MlasStoreFloat16x8(PackedB_data + 56, v7); + } + + if (k & 4) { + float16x4_t v0 = MlasLoadFloat16x4(b); + float16x4_t v1 = MlasLoadFloat16x4(b + ldb); + float16x4_t v2 = MlasLoadFloat16x4(b + 2 * ldb); + float16x4_t v3 = MlasLoadFloat16x4(b + 3 * ldb); + float16x4_t v4 = MlasLoadFloat16x4(b + 4 * ldb); + float16x4_t v5 = MlasLoadFloat16x4(b + 5 * ldb); + float16x4_t v6 = MlasLoadFloat16x4(b + 6 * ldb); + float16x4_t v7 = MlasLoadFloat16x4(b + 7 * ldb); + Transpose4x4(v0, v1, v2, v3); + Transpose4x4(v4, v5, v6, v7); + 
MlasStoreFloat16x4(PackedB_data, v0); + MlasStoreFloat16x4(PackedB_data + 4, v4); + MlasStoreFloat16x4(PackedB_data + 8, v1); + MlasStoreFloat16x4(PackedB_data + 12, v5); + MlasStoreFloat16x4(PackedB_data + 16, v2); + MlasStoreFloat16x4(PackedB_data + 20, v6); + MlasStoreFloat16x4(PackedB_data + 24, v3); + MlasStoreFloat16x4(PackedB_data + 28, v7); + k -= 4, b += 4, PackedB_data += 4 * 8; + } + + if (k > 0) { + float16x4_t v0 = MlasLoadPartialFloat16x4(b, k); + float16x4_t v1 = MlasLoadPartialFloat16x4(b + ldb, k); + float16x4_t v2 = MlasLoadPartialFloat16x4(b + 2 * ldb, k); + float16x4_t v3 = MlasLoadPartialFloat16x4(b + 3 * ldb, k); + float16x4_t v4 = MlasLoadPartialFloat16x4(b + 4 * ldb, k); + float16x4_t v5 = MlasLoadPartialFloat16x4(b + 5 * ldb, k); + float16x4_t v6 = MlasLoadPartialFloat16x4(b + 6 * ldb, k); + float16x4_t v7 = MlasLoadPartialFloat16x4(b + 7 * ldb, k); + Transpose4x4(v0, v1, v2, v3); + Transpose4x4(v4, v5, v6, v7); + MlasStoreFloat16x4(PackedB_data, v0); + MlasStoreFloat16x4(PackedB_data + 4, v4); + if (k > 1) { + MlasStoreFloat16x4(PackedB_data + 8, v1); + MlasStoreFloat16x4(PackedB_data + 12, v5); + } + if (k > 2) { + MlasStoreFloat16x4(PackedB_data + 16, v2); + MlasStoreFloat16x4(PackedB_data + 20, v6); + } + + PackedB_data += k * 8; + } + + B_data += 8 * ldb; + CountN -= 8; + } + + if (CountN > 0) { + const _mlas_fp16_* b = B_data; + size_t k = CountK; + constexpr size_t step = 8 * 8; // pack extended 8 * 8 + for (; k >= 8; k -= 8, b += 8, PackedB_data += step) { + float16x8_t v[8]; + size_t i = 0; + for (; i < CountN; ++i) { + v[i] = MlasLoadFloat16x8(b + i * ldb); + } + for (; i < 8; ++i) { + v[i] = MlasZeroFloat16x8(); + } + Transpose8x8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]); + MlasStoreFloat16x8(PackedB_data, v[0]); + MlasStoreFloat16x8(PackedB_data + 8, v[1]); + MlasStoreFloat16x8(PackedB_data + 16, v[2]); + MlasStoreFloat16x8(PackedB_data + 24, v[3]); + MlasStoreFloat16x8(PackedB_data + 32, v[4]); + MlasStoreFloat16x8(PackedB_data + 40, v[5]); + MlasStoreFloat16x8(PackedB_data + 48, v[6]); + MlasStoreFloat16x8(PackedB_data + 56, v[7]); + } + + if (k & 4) { + float16x4_t v[8]; + size_t i = 0; + for (; i < CountN; ++i) { + v[i] = MlasLoadFloat16x4(b + i * ldb); + } + for (; i < 8; ++i) { + v[i] = MlasZeroFloat16x4(); + } + Transpose4x4(v[0], v[1], v[2], v[3]); + Transpose4x4(v[4], v[5], v[6], v[7]); + MlasStoreFloat16x4(PackedB_data, v[0]); + MlasStoreFloat16x4(PackedB_data + 4, v[4]); + MlasStoreFloat16x4(PackedB_data + 8, v[1]); + MlasStoreFloat16x4(PackedB_data + 12, v[5]); + MlasStoreFloat16x4(PackedB_data + 16, v[2]); + MlasStoreFloat16x4(PackedB_data + 20, v[6]); + MlasStoreFloat16x4(PackedB_data + 24, v[3]); + MlasStoreFloat16x4(PackedB_data + 28, v[7]); + k -= 4, b += 4, PackedB_data += 4 * 8; + } + + if (k > 0) { + float16x4_t v[8]; + size_t i = 0; + for (; i < CountN; ++i) { + v[i] = MlasLoadPartialFloat16x4(b + i * ldb, k); + } + for (; i < 8; ++i) { + v[i] = MlasZeroFloat16x4(); + } + Transpose4x4(v[0], v[1], v[2], v[3]); + Transpose4x4(v[4], v[5], v[6], v[7]); + MlasStoreFloat16x4(PackedB_data, v[0]); + MlasStoreFloat16x4(PackedB_data + 4, v[4]); + if (k > 1) { + MlasStoreFloat16x4(PackedB_data + 8, v[1]); + MlasStoreFloat16x4(PackedB_data + 12, v[5]); + } + if (k > 2) { + MlasStoreFloat16x4(PackedB_data + 16, v[2]); + MlasStoreFloat16x4(PackedB_data + 20, v[6]); + } + } + } +} + +MLAS_FORCEINLINE +float16x8_t addq_f16x4(float16x8_t v0, float16x8_t v1, float16x8_t v2, float16x8_t v3) { + v0 = vaddq_f16(v0, v1); + v2 = vaddq_f16(v2, v3); 
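+ // The two pair sums above are independent and can issue in parallel; the add below combines them.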
+ v0 = vaddq_f16(v0, v2); + return v0; +} + +MLAS_FORCEINLINE +float16x8_t addq_f16x8(float16x8_t v0, float16x8_t v1, float16x8_t v2, float16x8_t v3, + float16x8_t v4, float16x8_t v5, float16x8_t v6, float16x8_t v7) { + return vaddq_f16(addq_f16x4(v0, v1, v2, v3), addq_f16x4(v4, v5, v6, v7)); +} + +MLAS_FORCEINLINE +float16x8_t maq_lane_f16_accu(float16x8_t accu0, float16x8_t v0, float16x8_t v1, float16x8_t v2, float16x8_t v3, + float16x4_t a0) { + accu0 = vfmaq_lane_f16(accu0, v0, a0, 0); + accu0 = vfmaq_lane_f16(accu0, v1, a0, 1); + accu0 = vfmaq_lane_f16(accu0, v2, a0, 2); + accu0 = vfmaq_lane_f16(accu0, v3, a0, 3); + return accu0; +} + +MLAS_FORCEINLINE +float16x8_t maq_laneq_f16_accu(float16x8_t accu0, float16x8_t v0, float16x8_t v1, float16x8_t v2, float16x8_t v3, + float16x8_t v4, float16x8_t v5, float16x8_t v6, float16x8_t v7, float16x8_t a0) { + accu0 = vfmaq_laneq_f16(accu0, v0, a0, 0); + accu0 = vfmaq_laneq_f16(accu0, v1, a0, 1); + accu0 = vfmaq_laneq_f16(accu0, v2, a0, 2); + accu0 = vfmaq_laneq_f16(accu0, v3, a0, 3); + accu0 = vfmaq_laneq_f16(accu0, v4, a0, 4); + accu0 = vfmaq_laneq_f16(accu0, v5, a0, 5); + accu0 = vfmaq_laneq_f16(accu0, v6, a0, 6); + accu0 = vfmaq_laneq_f16(accu0, v7, a0, 7); + return accu0; +} + +MLAS_FORCEINLINE +float16x4_t ma_lane_f16_accu(float16x4_t accu, float16x4_t v0, float16x4_t v1, float16x4_t v2, float16x4_t v3, + float16x4_t a0) { + accu = vfma_lane_f16(accu, v0, a0, 0); + accu = vfma_lane_f16(accu, v1, a0, 1); + accu = vfma_lane_f16(accu, v2, a0, 2); + accu = vfma_lane_f16(accu, v3, a0, 3); + return accu; +} + +template <int beta_behavior> // 0: beta == 0.0f16, 1: beta == 1.0f16, 2: beta != 0.0f16 && beta != 1.0f16 +void HGemm_TransposedB_Kernel_M1( + const _mlas_fp16_* A_data, + const _mlas_fp16_* B_data, + _mlas_fp16_* C_data, + size_t CountN, + size_t CountK, + size_t ldb, + _mlas_fp16_ alpha, + _mlas_fp16_ beta +) { + for (; CountN >= 8; CountN -= 8, B_data += 8 * ldb, C_data += 8) { + const auto* a = A_data; + const auto* b = B_data; + size_t k = CountK; + float16x8_t accu0 = MlasZeroFloat16x8(); + float16x8_t accu1 = MlasZeroFloat16x8(); + float16x8_t accu2 = MlasZeroFloat16x8(); + float16x8_t accu3 = MlasZeroFloat16x8(); + float16x8_t accu4 = MlasZeroFloat16x8(); + float16x8_t accu5 = MlasZeroFloat16x8(); + float16x8_t accu6 = MlasZeroFloat16x8(); + float16x8_t accu7 = MlasZeroFloat16x8(); + for (; k >= 8; k -= 8, a += 8, b += 8) { + float16x8_t b0 = MlasLoadFloat16x8(b); + float16x8_t b1 = MlasLoadFloat16x8(b + ldb); + float16x8_t b2 = MlasLoadFloat16x8(b + 2 * ldb); + float16x8_t b3 = MlasLoadFloat16x8(b + 3 * ldb); + float16x8_t b4 = MlasLoadFloat16x8(b + 4 * ldb); + float16x8_t b5 = MlasLoadFloat16x8(b + 5 * ldb); + float16x8_t b6 = MlasLoadFloat16x8(b + 6 * ldb); + float16x8_t b7 = MlasLoadFloat16x8(b + 7 * ldb); + float16x8_t a0 = MlasLoadFloat16x8(a); + accu0 = vfmaq_f16(accu0, b0, a0); + accu1 = vfmaq_f16(accu1, b1, a0); + accu2 = vfmaq_f16(accu2, b2, a0); + accu3 = vfmaq_f16(accu3, b3, a0); + accu4 = vfmaq_f16(accu4, b4, a0); + accu5 = vfmaq_f16(accu5, b5, a0); + accu6 = vfmaq_f16(accu6, b6, a0); + accu7 = vfmaq_f16(accu7, b7, a0); + } + Transpose8x8(accu0, accu1, accu2, accu3, accu4, accu5, accu6, accu7); + accu0 = addq_f16x8(accu0, accu1, accu2, accu3, accu4, accu5, accu6, accu7); // accumulator of 8 columns + + if (k & 4) { + float16x4_t b0 = MlasLoadFloat16x4(b); + float16x4_t b1 = MlasLoadFloat16x4(b + ldb); + float16x4_t b2 = MlasLoadFloat16x4(b + 2 * ldb); + float16x4_t b3 = MlasLoadFloat16x4(b + 3 * ldb); + float16x4_t b4 = 
MlasLoadFloat16x4(b + 4 * ldb); + float16x4_t b5 = MlasLoadFloat16x4(b + 5 * ldb); + float16x4_t b6 = MlasLoadFloat16x4(b + 6 * ldb); + float16x4_t b7 = MlasLoadFloat16x4(b + 7 * ldb); + Transpose4x4(b0, b1, b2, b3); + Transpose4x4(b4, b5, b6, b7); + float16x8_t v0 = vcombine_f16(b0, b4); + float16x8_t v1 = vcombine_f16(b1, b5); + float16x8_t v2 = vcombine_f16(b2, b6); + float16x8_t v3 = vcombine_f16(b3, b7); + float16x4_t a0 = MlasLoadFloat16x4(a); + accu0 = maq_lane_f16_accu(accu0, v0, v1, v2, v3, a0); + k -= 4, a += 4, b += 4; + } + + if (k > 0) { + float16x4_t b0 = MlasLoadPartialFloat16x4(b, k); + float16x4_t b1 = MlasLoadPartialFloat16x4(b + ldb, k); + float16x4_t b2 = MlasLoadPartialFloat16x4(b + 2 * ldb, k); + float16x4_t b3 = MlasLoadPartialFloat16x4(b + 3 * ldb, k); + float16x4_t b4 = MlasLoadPartialFloat16x4(b + 4 * ldb, k); + float16x4_t b5 = MlasLoadPartialFloat16x4(b + 5 * ldb, k); + float16x4_t b6 = MlasLoadPartialFloat16x4(b + 6 * ldb, k); + float16x4_t b7 = MlasLoadPartialFloat16x4(b + 7 * ldb, k); + Transpose4x4(b0, b1, b2, b3); + Transpose4x4(b4, b5, b6, b7); + float16x8_t v0 = vcombine_f16(b0, b4), v1, v2; + float16x4_t a0 = MlasLoadPartialFloat16x4(a, k); + accu0 = vfmaq_lane_f16(accu0, v0, a0, 0); + if (k > 1) { + v1 = vcombine_f16(b1, b5); + accu0 = vfmaq_lane_f16(accu0, v1, a0, 1); + } + if (k > 2) { + v2 = vcombine_f16(b2, b6); + accu0 = vfmaq_lane_f16(accu0, v2, a0, 2); + } + } + + if constexpr (beta_behavior == 1) { + float16x8_t c = MlasLoadFloat16x8(C_data); + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + accu0 = vfmaq_f16(c, accu0, alpha_v); + MlasStoreFloat16x8(C_data, accu0); + } else if constexpr (beta_behavior == 2) { + float16x8_t c = MlasLoadFloat16x8(C_data); + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + float16x8_t beta_v = MlasBroadcastFloat16x8(beta); + accu0 = vfmaq_f16(vmulq_f16(c, beta_v), accu0, alpha_v); + MlasStoreFloat16x8(C_data, accu0); + } else { + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + accu0 = vmulq_f16(accu0, alpha_v); + MlasStoreFloat16x8(C_data, accu0); + } + } + + if (CountN & 4) { + const auto* a = A_data; + const auto* b = B_data; + size_t k = CountK; + float16x8_t accu0 = MlasZeroFloat16x8(); + float16x8_t accu1 = MlasZeroFloat16x8(); + float16x8_t accu2 = MlasZeroFloat16x8(); + float16x8_t accu3 = MlasZeroFloat16x8(); + for (; k >= 8; k -= 8, a += 8, b += 8) { + float16x8_t b0 = MlasLoadFloat16x8(b); + float16x8_t b1 = MlasLoadFloat16x8(b + ldb); + float16x8_t b2 = MlasLoadFloat16x8(b + 2 * ldb); + float16x8_t b3 = MlasLoadFloat16x8(b + 3 * ldb); + float16x8_t a0 = MlasLoadFloat16x8(a); + accu0 = vfmaq_f16(accu0, b0, a0); + accu1 = vfmaq_f16(accu1, b1, a0); + accu2 = vfmaq_f16(accu2, b2, a0); + accu3 = vfmaq_f16(accu3, b3, a0); + } + Transpose4x8(accu0, accu1, accu2, accu3); + accu0 = addq_f16x4(accu0, accu1, accu2, accu3); // accumulator of 4 columns + float16x4_t accu = vadd_f16(vget_low_f16(accu0), vget_high_f16(accu0)); + + if (k & 4) { + float16x4_t b0 = MlasLoadFloat16x4(b); + float16x4_t b1 = MlasLoadFloat16x4(b + ldb); + float16x4_t b2 = MlasLoadFloat16x4(b + 2 * ldb); + float16x4_t b3 = MlasLoadFloat16x4(b + 3 * ldb); + Transpose4x4(b0, b1, b2, b3); + float16x4_t a0 = MlasLoadFloat16x4(a); + accu = ma_lane_f16_accu(accu, b0, b1, b2, b3, a0); + k -= 4, a += 4, b += 4; + } + + if (k > 0) { + float16x4_t b0 = MlasLoadPartialFloat16x4(b, k); + float16x4_t b1 = MlasLoadPartialFloat16x4(b + ldb, k); + float16x4_t b2 = MlasLoadPartialFloat16x4(b + 2 * ldb, k); + float16x4_t b3 = 
MlasLoadPartialFloat16x4(b + 3 * ldb, k); + Transpose4x4(b0, b1, b2, b3); + float16x4_t a0 = MlasLoadPartialFloat16x4(a, k); + accu = vfma_lane_f16(accu, b0, a0, 0); + if (k > 1) { + accu = vfma_lane_f16(accu, b1, a0, 1); + } + if (k > 2) { + accu = vfma_lane_f16(accu, b2, a0, 2); + } + } + + if constexpr (beta_behavior == 1) { + float16x4_t c = MlasLoadFloat16x4(C_data); + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + accu = vfma_f16(c, accu, alpha_v); + MlasStoreFloat16x4(C_data, accu); + } else if constexpr (beta_behavior == 2) { + float16x4_t c = MlasLoadFloat16x4(C_data); + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + float16x4_t beta_v = MlasBroadcastFloat16x4(beta); + accu = vfma_f16(vmul_f16(c, beta_v), accu, alpha_v); + MlasStoreFloat16x4(C_data, accu); + } else { + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + accu = vmul_f16(accu, alpha_v); + MlasStoreFloat16x4(C_data, accu); + } + + CountN -= 4, B_data += 4 * ldb, C_data += 4; + } + + if (CountN > 0) { + const auto* a = A_data; + const auto* b = B_data; + size_t k = CountK; + float16x8_t accus[4]; + size_t i = 0; + for (i = 0; i < 4; ++i) { + accus[i] = MlasZeroFloat16x8(); + } + for (; k >= 8; k -= 8, a += 8, b += 8) { + float16x8_t a0 = MlasLoadFloat16x8(a); + for (i = 0; i < CountN; ++i) { + accus[i] = vfmaq_f16(accus[i], MlasLoadFloat16x8(b + i * ldb), a0); + } + } + Transpose4x8(accus[0], accus[1], accus[2], accus[3]); + float16x8_t accu0 = addq_f16x4(accus[0], accus[1], accus[2], accus[3]); // accumulator of 4 columns + float16x4_t accu = vadd_f16(vget_low_f16(accu0), vget_high_f16(accu0)); + + if (k & 4) { + float16x4_t bs[4]; + for (i = 0; i < CountN; ++i) { + bs[i] = MlasLoadFloat16x4(b + i * ldb); + } + for (; i < 4; ++i) { + bs[i] = MlasZeroFloat16x4(); + } + Transpose4x4(bs[0], bs[1], bs[2], bs[3]); + float16x4_t a0 = MlasLoadFloat16x4(a); + accu = ma_lane_f16_accu(accu, bs[0], bs[1], bs[2], bs[3], a0); + k -= 4, a += 4, b += 4; + } + + if (k > 0) { + float16x4_t bs[4]; + for (i = 0; i < CountN; ++i) { + bs[i] = MlasLoadPartialFloat16x4(b + i * ldb, k); + } + for (; i < 4; ++i) { + bs[i] = MlasZeroFloat16x4(); + } + Transpose4x4(bs[0], bs[1], bs[2], bs[3]); + float16x4_t a0 = MlasLoadPartialFloat16x4(a, k); + accu = vfma_lane_f16(accu, bs[0], a0, 0); + if (k > 1) { + accu = vfma_lane_f16(accu, bs[1], a0, 1); + } + if (k > 2) { + accu = vfma_lane_f16(accu, bs[2], a0, 2); + } + } + + if constexpr (beta_behavior == 1) { + float16x4_t c = MlasLoadPartialFloat16x4(C_data, CountN); + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + accu = vfma_f16(c, accu, alpha_v); + MlasStorePartialFloat16x4(C_data, accu, CountN); + } else if constexpr (beta_behavior == 2) { + float16x4_t c = MlasLoadPartialFloat16x4(C_data, CountN); + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + float16x4_t beta_v = MlasBroadcastFloat16x4(beta); + accu = vfma_f16(vmul_f16(c, beta_v), accu, alpha_v); + MlasStorePartialFloat16x4(C_data, accu, CountN); + } else { + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + accu = vmul_f16(accu, alpha_v); + MlasStorePartialFloat16x4(C_data, accu, CountN); + } + } +} + +template <int beta_behavior> // 0: beta == 0.0f16, 1: beta == 1.0f16, 2: beta != 0.0f16 && beta != 1.0f16 +void HGemm_TransposedB_Kernel_M2( + const _mlas_fp16_* A_data, + const _mlas_fp16_* B_data, + _mlas_fp16_* C_data, + size_t CountN, + size_t CountK, + size_t lda, + size_t ldb, + size_t ldc, + _mlas_fp16_ alpha, + _mlas_fp16_ beta +) { + for (; CountN >= 8; CountN -= 8, B_data += 8 * ldb, C_data += 8) { + const auto* 
a = A_data; + const auto* b = B_data; + size_t k = CountK; + float16x8_t accu00 = MlasZeroFloat16x8(); + float16x8_t accu01 = MlasZeroFloat16x8(); + float16x8_t accu02 = MlasZeroFloat16x8(); + float16x8_t accu03 = MlasZeroFloat16x8(); + float16x8_t accu04 = MlasZeroFloat16x8(); + float16x8_t accu05 = MlasZeroFloat16x8(); + float16x8_t accu06 = MlasZeroFloat16x8(); + float16x8_t accu07 = MlasZeroFloat16x8(); + float16x8_t accu10 = MlasZeroFloat16x8(); + float16x8_t accu11 = MlasZeroFloat16x8(); + float16x8_t accu12 = MlasZeroFloat16x8(); + float16x8_t accu13 = MlasZeroFloat16x8(); + float16x8_t accu14 = MlasZeroFloat16x8(); + float16x8_t accu15 = MlasZeroFloat16x8(); + float16x8_t accu16 = MlasZeroFloat16x8(); + float16x8_t accu17 = MlasZeroFloat16x8(); + for (; k >= 8; k -= 8, a += 8, b += 8) { + float16x8_t b0 = MlasLoadFloat16x8(b); + float16x8_t b1 = MlasLoadFloat16x8(b + ldb); + float16x8_t b2 = MlasLoadFloat16x8(b + 2 * ldb); + float16x8_t b3 = MlasLoadFloat16x8(b + 3 * ldb); + float16x8_t b4 = MlasLoadFloat16x8(b + 4 * ldb); + float16x8_t b5 = MlasLoadFloat16x8(b + 5 * ldb); + float16x8_t b6 = MlasLoadFloat16x8(b + 6 * ldb); + float16x8_t b7 = MlasLoadFloat16x8(b + 7 * ldb); + float16x8_t a0 = MlasLoadFloat16x8(a); + float16x8_t a1 = MlasLoadFloat16x8(a + lda); + accu00 = vfmaq_f16(accu00, b0, a0); + accu01 = vfmaq_f16(accu01, b1, a0); + accu02 = vfmaq_f16(accu02, b2, a0); + accu03 = vfmaq_f16(accu03, b3, a0); + accu04 = vfmaq_f16(accu04, b4, a0); + accu05 = vfmaq_f16(accu05, b5, a0); + accu06 = vfmaq_f16(accu06, b6, a0); + accu07 = vfmaq_f16(accu07, b7, a0); + accu10 = vfmaq_f16(accu10, b0, a1); + accu11 = vfmaq_f16(accu11, b1, a1); + accu12 = vfmaq_f16(accu12, b2, a1); + accu13 = vfmaq_f16(accu13, b3, a1); + accu14 = vfmaq_f16(accu14, b4, a1); + accu15 = vfmaq_f16(accu15, b5, a1); + accu16 = vfmaq_f16(accu16, b6, a1); + accu17 = vfmaq_f16(accu17, b7, a1); + } + Transpose8x8(accu00, accu01, accu02, accu03, accu04, accu05, accu06, accu07); + Transpose8x8(accu10, accu11, accu12, accu13, accu14, accu15, accu16, accu17); + accu00 = addq_f16x8(accu00, accu01, accu02, accu03, accu04, accu05, accu06, accu07); + accu10 = addq_f16x8(accu10, accu11, accu12, accu13, accu14, accu15, accu16, accu17); + + if (k & 4) { + float16x4_t b0 = MlasLoadFloat16x4(b); + float16x4_t b1 = MlasLoadFloat16x4(b + ldb); + float16x4_t b2 = MlasLoadFloat16x4(b + 2 * ldb); + float16x4_t b3 = MlasLoadFloat16x4(b + 3 * ldb); + float16x4_t b4 = MlasLoadFloat16x4(b + 4 * ldb); + float16x4_t b5 = MlasLoadFloat16x4(b + 5 * ldb); + float16x4_t b6 = MlasLoadFloat16x4(b + 6 * ldb); + float16x4_t b7 = MlasLoadFloat16x4(b + 7 * ldb); + Transpose4x4(b0, b1, b2, b3); + Transpose4x4(b4, b5, b6, b7); + float16x8_t v0 = vcombine_f16(b0, b4); + float16x8_t v1 = vcombine_f16(b1, b5); + float16x8_t v2 = vcombine_f16(b2, b6); + float16x8_t v3 = vcombine_f16(b3, b7); + float16x4_t a0 = MlasLoadFloat16x4(a); + float16x4_t a1 = MlasLoadFloat16x4(a + lda); + accu00 = maq_lane_f16_accu(accu00, v0, v1, v2, v3, a0); + accu10 = maq_lane_f16_accu(accu10, v0, v1, v2, v3, a1); + k -= 4, a += 4, b += 4; + } + + if (k > 0) { + float16x4_t b0 = MlasLoadPartialFloat16x4(b, k); + float16x4_t b1 = MlasLoadPartialFloat16x4(b + ldb, k); + float16x4_t b2 = MlasLoadPartialFloat16x4(b + 2 * ldb, k); + float16x4_t b3 = MlasLoadPartialFloat16x4(b + 3 * ldb, k); + float16x4_t b4 = MlasLoadPartialFloat16x4(b + 4 * ldb, k); + float16x4_t b5 = MlasLoadPartialFloat16x4(b + 5 * ldb, k); + float16x4_t b6 = MlasLoadPartialFloat16x4(b + 6 * ldb, k); + float16x4_t b7 
= MlasLoadPartialFloat16x4(b + 7 * ldb, k); + Transpose4x4(b0, b1, b2, b3); + Transpose4x4(b4, b5, b6, b7); + float16x8_t v0 = vcombine_f16(b0, b4); + float16x4_t a0 = MlasLoadPartialFloat16x4(a, k); + float16x4_t a1 = MlasLoadPartialFloat16x4(a + lda, k); + accu00 = vfmaq_lane_f16(accu00, v0, a0, 0); + accu10 = vfmaq_lane_f16(accu10, v0, a1, 0); + if (k > 1) { + float16x8_t v1 = vcombine_f16(b1, b5); + accu00 = vfmaq_lane_f16(accu00, v1, a0, 1); + accu10 = vfmaq_lane_f16(accu10, v1, a1, 1); + } + if (k > 2) { + float16x8_t v2 = vcombine_f16(b2, b6); + accu00 = vfmaq_lane_f16(accu00, v2, a0, 2); + accu10 = vfmaq_lane_f16(accu10, v2, a1, 2); + } + } + + if constexpr (beta_behavior == 1) { + float16x8_t c0 = MlasLoadFloat16x8(C_data); + float16x8_t c1 = MlasLoadFloat16x8(C_data + ldc); + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + accu00 = vfmaq_f16(c0, accu00, alpha_v); + accu10 = vfmaq_f16(c1, accu10, alpha_v); + MlasStoreFloat16x8(C_data, accu00); + MlasStoreFloat16x8(C_data + ldc, accu10); + } else if constexpr (beta_behavior == 2) { + float16x8_t c0 = MlasLoadFloat16x8(C_data); + float16x8_t c1 = MlasLoadFloat16x8(C_data + ldc); + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + float16x8_t beta_v = MlasBroadcastFloat16x8(beta); + accu00 = vfmaq_f16(vmulq_f16(c0, beta_v), accu00, alpha_v); + accu10 = vfmaq_f16(vmulq_f16(c1, beta_v), accu10, alpha_v); + MlasStoreFloat16x8(C_data, accu00); + MlasStoreFloat16x8(C_data + ldc, accu10); + } else { + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + accu00 = vmulq_f16(accu00, alpha_v); + accu10 = vmulq_f16(accu10, alpha_v); + MlasStoreFloat16x8(C_data, accu00); + MlasStoreFloat16x8(C_data + ldc, accu10); + } + } + + if (CountN & 4) { + const auto* a = A_data; + const auto* b = B_data; + size_t k = CountK; + float16x8_t accu00 = MlasZeroFloat16x8(); + float16x8_t accu01 = MlasZeroFloat16x8(); + float16x8_t accu02 = MlasZeroFloat16x8(); + float16x8_t accu03 = MlasZeroFloat16x8(); + float16x8_t accu10 = MlasZeroFloat16x8(); + float16x8_t accu11 = MlasZeroFloat16x8(); + float16x8_t accu12 = MlasZeroFloat16x8(); + float16x8_t accu13 = MlasZeroFloat16x8(); + for (; k >= 8; k -= 8, a += 8, b += 8) { + float16x8_t b0 = MlasLoadFloat16x8(b); + float16x8_t b1 = MlasLoadFloat16x8(b + ldb); + float16x8_t b2 = MlasLoadFloat16x8(b + 2 * ldb); + float16x8_t b3 = MlasLoadFloat16x8(b + 3 * ldb); + float16x8_t a0 = MlasLoadFloat16x8(a); + float16x8_t a1 = MlasLoadFloat16x8(a + lda); + accu00 = vfmaq_f16(accu00, b0, a0); + accu01 = vfmaq_f16(accu01, b1, a0); + accu02 = vfmaq_f16(accu02, b2, a0); + accu03 = vfmaq_f16(accu03, b3, a0); + accu10 = vfmaq_f16(accu10, b0, a1); + accu11 = vfmaq_f16(accu11, b1, a1); + accu12 = vfmaq_f16(accu12, b2, a1); + accu13 = vfmaq_f16(accu13, b3, a1); + } + Transpose4x8(accu00, accu01, accu02, accu03); + Transpose4x8(accu10, accu11, accu12, accu13); + accu00 = addq_f16x4(accu00, accu01, accu02, accu03); + accu10 = addq_f16x4(accu10, accu11, accu12, accu13); + float16x4_t accu0 = vadd_f16(vget_low_f16(accu00), vget_high_f16(accu00)); + float16x4_t accu1 = vadd_f16(vget_low_f16(accu10), vget_high_f16(accu10)); + + if (k & 4) { + float16x4_t b0 = MlasLoadFloat16x4(b); + float16x4_t b1 = MlasLoadFloat16x4(b + ldb); + float16x4_t b2 = MlasLoadFloat16x4(b + 2 * ldb); + float16x4_t b3 = MlasLoadFloat16x4(b + 3 * ldb); + Transpose4x4(b0, b1, b2, b3); + float16x4_t a0 = MlasLoadFloat16x4(a); + float16x4_t a1 = MlasLoadFloat16x4(a + lda); + accu0 = ma_lane_f16_accu(accu0, b0, b1, b2, b3, a0); + accu1 = 
ma_lane_f16_accu(accu1, b0, b1, b2, b3, a1); + k -= 4, a += 4, b += 4; + } + + if (k > 0) { + float16x4_t b0 = MlasLoadPartialFloat16x4(b, k); + float16x4_t b1 = MlasLoadPartialFloat16x4(b + ldb, k); + float16x4_t b2 = MlasLoadPartialFloat16x4(b + 2 * ldb, k); + float16x4_t b3 = MlasLoadPartialFloat16x4(b + 3 * ldb, k); + Transpose4x4(b0, b1, b2, b3); + float16x4_t a0 = MlasLoadPartialFloat16x4(a, k); + float16x4_t a1 = MlasLoadPartialFloat16x4(a + lda, k); + accu0 = vfma_lane_f16(accu0, b0, a0, 0); + accu1 = vfma_lane_f16(accu1, b0, a1, 0); + if (k > 1) { + accu0 = vfma_lane_f16(accu0, b1, a0, 1); + accu1 = vfma_lane_f16(accu1, b1, a1, 1); + } + if (k > 2) { + accu0 = vfma_lane_f16(accu0, b2, a0, 2); + accu1 = vfma_lane_f16(accu1, b2, a1, 2); + } + } + + if constexpr (beta_behavior == 1) { + float16x4_t c0 = MlasLoadFloat16x4(C_data); + float16x4_t c1 = MlasLoadFloat16x4(C_data + ldc); + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + accu0 = vfma_f16(c0, accu0, alpha_v); + accu1 = vfma_f16(c1, accu1, alpha_v); + MlasStoreFloat16x4(C_data, accu0); + MlasStoreFloat16x4(C_data + ldc, accu1); + } else if constexpr (beta_behavior == 2) { + float16x4_t c0 = MlasLoadFloat16x4(C_data); + float16x4_t c1 = MlasLoadFloat16x4(C_data + ldc); + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + float16x4_t beta_v = MlasBroadcastFloat16x4(beta); + accu0 = vfma_f16(vmul_f16(c0, beta_v), accu0, alpha_v); + accu1 = vfma_f16(vmul_f16(c1, beta_v), accu1, alpha_v); + MlasStoreFloat16x4(C_data, accu0); + MlasStoreFloat16x4(C_data + ldc, accu1); + } else { + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + accu0 = vmul_f16(accu0, alpha_v); + accu1 = vmul_f16(accu1, alpha_v); + MlasStoreFloat16x4(C_data, accu0); + MlasStoreFloat16x4(C_data + ldc, accu1); + } + + CountN -= 4, B_data += 4 * ldb, C_data += 4; + } + + if (CountN > 0) { + const auto* a = A_data; + const auto* b = B_data; + size_t k = CountK; + float16x8_t accu0[4]; + float16x8_t accu1[4]; + size_t i = 0; + for (i = 0; i < 4; ++i) { + accu0[i] = MlasZeroFloat16x8(); + accu1[i] = MlasZeroFloat16x8(); + } + for (; k >= 8; k -= 8, a += 8, b += 8) { + float16x8_t a0 = MlasLoadFloat16x8(a); + float16x8_t a1 = MlasLoadFloat16x8(a + lda); + for (i = 0; i < CountN; ++i) { + float16x8_t bi = MlasLoadFloat16x8(b + i * ldb); + accu0[i] = vfmaq_f16(accu0[i], bi, a0); + accu1[i] = vfmaq_f16(accu1[i], bi, a1); + } + } + Transpose4x8(accu0[0], accu0[1], accu0[2], accu0[3]); + Transpose4x8(accu1[0], accu1[1], accu1[2], accu1[3]); + float16x8_t accu00 = addq_f16x4(accu0[0], accu0[1], accu0[2], accu0[3]); + float16x4_t accu_0 = vadd_f16(vget_low_f16(accu00), vget_high_f16(accu00)); + float16x8_t accu10 = addq_f16x4(accu1[0], accu1[1], accu1[2], accu1[3]); + float16x4_t accu_1 = vadd_f16(vget_low_f16(accu10), vget_high_f16(accu10)); + + if (k & 4) { + float16x4_t bs[4]; + for (i = 0; i < CountN; ++i) { + bs[i] = MlasLoadFloat16x4(b + i * ldb); + } + for (; i < 4; ++i) { + bs[i] = MlasZeroFloat16x4(); + } + Transpose4x4(bs[0], bs[1], bs[2], bs[3]); + float16x4_t a0 = MlasLoadFloat16x4(a); + float16x4_t a1 = MlasLoadFloat16x4(a + lda); + accu_0 = ma_lane_f16_accu(accu_0, bs[0], bs[1], bs[2], bs[3], a0); + accu_1 = ma_lane_f16_accu(accu_1, bs[0], bs[1], bs[2], bs[3], a1); + k -= 4, a += 4, b += 4; + } + + if (k > 0) { + float16x4_t bs[4]; + for (i = 0; i < CountN; ++i) { + bs[i] = MlasLoadPartialFloat16x4(b + i * ldb, k); + } + for (; i < 4; ++i) { + bs[i] = MlasZeroFloat16x4(); + } + Transpose4x4(bs[0], bs[1], bs[2], bs[3]); + float16x4_t a0 = 
MlasLoadPartialFloat16x4(a, k); + float16x4_t a1 = MlasLoadPartialFloat16x4(a + lda, k); + accu_0 = vfma_lane_f16(accu_0, bs[0], a0, 0); + accu_1 = vfma_lane_f16(accu_1, bs[0], a1, 0); + if (k > 1) { + accu_0 = vfma_lane_f16(accu_0, bs[1], a0, 1); + accu_1 = vfma_lane_f16(accu_1, bs[1], a1, 1); + } + if (k > 2) { + accu_0 = vfma_lane_f16(accu_0, bs[2], a0, 2); + accu_1 = vfma_lane_f16(accu_1, bs[2], a1, 2); + } + } + + if constexpr (beta_behavior == 1) { + float16x4_t c0 = MlasLoadPartialFloat16x4(C_data, CountN); + float16x4_t c1 = MlasLoadPartialFloat16x4(C_data + ldc, CountN); + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + accu_0 = vfma_f16(c0, accu_0, alpha_v); + accu_1 = vfma_f16(c1, accu_1, alpha_v); + MlasStorePartialFloat16x4(C_data, accu_0, CountN); + MlasStorePartialFloat16x4(C_data + ldc, accu_1, CountN); + } else if constexpr (beta_behavior == 2) { + float16x4_t c0 = MlasLoadPartialFloat16x4(C_data, CountN); + float16x4_t c1 = MlasLoadPartialFloat16x4(C_data + ldc, CountN); + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + float16x4_t beta_v = MlasBroadcastFloat16x4(beta); + accu_0 = vfma_f16(vmul_f16(c0, beta_v), accu_0, alpha_v); + accu_1 = vfma_f16(vmul_f16(c1, beta_v), accu_1, alpha_v); + MlasStorePartialFloat16x4(C_data, accu_0, CountN); + MlasStorePartialFloat16x4(C_data + ldc, accu_1, CountN); + } else { + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + accu_0 = vmul_f16(accu_0, alpha_v); + accu_1 = vmul_f16(accu_1, alpha_v); + MlasStorePartialFloat16x4(C_data, accu_0, CountN); + MlasStorePartialFloat16x4(C_data + ldc, accu_1, CountN); + } + } +} + +// Full K. Directly save to C. +void HGemm_TransposedB_Kernel( + const MLAS_FP16* A, + const MLAS_FP16* B, + MLAS_FP16* C, + size_t CountM, + size_t CountN, + size_t CountK, + size_t lda, + size_t ldb, + size_t ldc, + _mlas_fp16_ alpha, + _mlas_fp16_ beta +) { + if (CountM > 2) { + MLAS_THROW_EX(std::runtime_error, "HGemm_TransposedB_Kernel only supports <= 2 rows"); + } + const auto* A_data = reinterpret_cast<const _mlas_fp16_*>(A); + const auto* B_data = reinterpret_cast<const _mlas_fp16_*>(B); + auto* C_data = reinterpret_cast<_mlas_fp16_*>(C); + const auto f16_0 = MLAS_FP16(0.0f); + const auto f16_1 = MLAS_FP16(1.0f); + if (CountM == 1) { + if (beta == f16_0.val) { + HGemm_TransposedB_Kernel_M1<0>(A_data, B_data, C_data, CountN, CountK, ldb, alpha, beta); + } else if (beta == f16_1.val) { + HGemm_TransposedB_Kernel_M1<1>(A_data, B_data, C_data, CountN, CountK, ldb, alpha, beta); + } else { + HGemm_TransposedB_Kernel_M1<2>(A_data, B_data, C_data, CountN, CountK, ldb, alpha, beta); + } + } else { + if (beta == f16_0.val) { + HGemm_TransposedB_Kernel_M2<0>(A_data, B_data, C_data, CountN, CountK, lda, ldb, ldc, alpha, beta); + } else if (beta == f16_1.val) { + HGemm_TransposedB_Kernel_M2<1>(A_data, B_data, C_data, CountN, CountK, lda, ldb, ldc, alpha, beta); + } else { + HGemm_TransposedB_Kernel_M2<2>(A_data, B_data, C_data, CountN, CountK, lda, ldb, ldc, alpha, beta); + } + } +} + +template <int beta_behavior> // 0: beta == 0, 1: beta == 1, 2: beta != 0 && beta != 1 +void HGemm_TransposedPackedB_Kernel_M1( + const _mlas_fp16_* A, + const _mlas_fp16_* PackedB, + _mlas_fp16_* C, + size_t CountN, + size_t CountK, + _mlas_fp16_ alpha, + _mlas_fp16_ beta +) { + for (; CountN >= 16; CountN -= 16, C += 16) { + const auto* a = A; + size_t k = CountK; + float16x8_t accu0 = MlasZeroFloat16x8(); + float16x8_t accu1 = MlasZeroFloat16x8(); + for (; k >= 8; k -= 8, a += 8, PackedB += 8 * 16) { + float16x8_t b00 = MlasLoadFloat16x8(PackedB); + float16x8_t b01 = 
MlasLoadFloat16x8(PackedB + 8); + float16x8_t b10 = MlasLoadFloat16x8(PackedB + 16); + float16x8_t b11 = MlasLoadFloat16x8(PackedB + 24); + float16x8_t b20 = MlasLoadFloat16x8(PackedB + 32); + float16x8_t b21 = MlasLoadFloat16x8(PackedB + 40); + float16x8_t b30 = MlasLoadFloat16x8(PackedB + 48); + float16x8_t b31 = MlasLoadFloat16x8(PackedB + 56); + float16x8_t b40 = MlasLoadFloat16x8(PackedB + 64); + float16x8_t b41 = MlasLoadFloat16x8(PackedB + 72); + float16x8_t b50 = MlasLoadFloat16x8(PackedB + 80); + float16x8_t b51 = MlasLoadFloat16x8(PackedB + 88); + float16x8_t b60 = MlasLoadFloat16x8(PackedB + 96); + float16x8_t b61 = MlasLoadFloat16x8(PackedB + 104); + float16x8_t b70 = MlasLoadFloat16x8(PackedB + 112); + float16x8_t b71 = MlasLoadFloat16x8(PackedB + 120); + float16x8_t a0 = MlasLoadFloat16x8(a); + accu0 = maq_laneq_f16_accu(accu0, b00, b10, b20, b30, b40, b50, b60, b70, a0); + accu1 = maq_laneq_f16_accu(accu1, b01, b11, b21, b31, b41, b51, b61, b71, a0); + } + + if (k & 4) { + float16x8_t b00 = MlasLoadFloat16x8(PackedB); + float16x8_t b01 = MlasLoadFloat16x8(PackedB + 8); + float16x8_t b10 = MlasLoadFloat16x8(PackedB + 16); + float16x8_t b11 = MlasLoadFloat16x8(PackedB + 24); + float16x8_t b20 = MlasLoadFloat16x8(PackedB + 32); + float16x8_t b21 = MlasLoadFloat16x8(PackedB + 40); + float16x8_t b30 = MlasLoadFloat16x8(PackedB + 48); + float16x8_t b31 = MlasLoadFloat16x8(PackedB + 56); + float16x4_t a0 = MlasLoadFloat16x4(a); + accu0 = maq_lane_f16_accu(accu0, b00, b10, b20, b30, a0); + accu1 = maq_lane_f16_accu(accu1, b01, b11, b21, b31, a0); + k -= 4, a += 4, PackedB += 4 * 16; + } + + if (k > 0) { + float16x4_t a0 = MlasLoadPartialFloat16x4(a, k); + float16x8_t b00 = MlasLoadFloat16x8(PackedB); + float16x8_t b01 = MlasLoadFloat16x8(PackedB + 8); + accu0 = vfmaq_lane_f16(accu0, b00, a0, 0); + accu1 = vfmaq_lane_f16(accu1, b01, a0, 0); + if (k > 1) { + float16x8_t b10 = MlasLoadFloat16x8(PackedB + 16); + float16x8_t b11 = MlasLoadFloat16x8(PackedB + 24); + accu0 = vfmaq_lane_f16(accu0, b10, a0, 1); + accu1 = vfmaq_lane_f16(accu1, b11, a0, 1); + } + if (k > 2) { + float16x8_t b20 = MlasLoadFloat16x8(PackedB + 32); + float16x8_t b21 = MlasLoadFloat16x8(PackedB + 40); + accu0 = vfmaq_lane_f16(accu0, b20, a0, 2); + accu1 = vfmaq_lane_f16(accu1, b21, a0, 2); + } + + PackedB += k * 16; + } + + if constexpr (beta_behavior == 1) { + float16x8_t c0 = MlasLoadFloat16x8(C); + float16x8_t c1 = MlasLoadFloat16x8(C + 8); + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + accu0 = vfmaq_f16(c0, accu0, alpha_v); + accu1 = vfmaq_f16(c1, accu1, alpha_v); + MlasStoreFloat16x8(C, accu0); + MlasStoreFloat16x8(C + 8, accu1); + } else if constexpr (beta_behavior == 2) { + float16x8_t c0 = MlasLoadFloat16x8(C); + float16x8_t c1 = MlasLoadFloat16x8(C + 8); + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + float16x8_t beta_v = MlasBroadcastFloat16x8(beta); + accu0 = vfmaq_f16(vmulq_f16(c0, beta_v), accu0, alpha_v); + accu1 = vfmaq_f16(vmulq_f16(c1, beta_v), accu1, alpha_v); + MlasStoreFloat16x8(C, accu0); + MlasStoreFloat16x8(C + 8, accu1); + } else { + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + accu0 = vmulq_f16(accu0, alpha_v); + accu1 = vmulq_f16(accu1, alpha_v); + MlasStoreFloat16x8(C, accu0); + MlasStoreFloat16x8(C + 8, accu1); + } + } + + if (CountN & 8) { + const auto* a = A; + size_t k = CountK; + float16x8_t accu0 = MlasZeroFloat16x8(); + for (; k >= 8; k -= 8, a += 8, PackedB += 8 * 8) { + float16x8_t b0 = MlasLoadFloat16x8(PackedB); + float16x8_t b1 = 
MlasLoadFloat16x8(PackedB + 8); + float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16); + float16x8_t b3 = MlasLoadFloat16x8(PackedB + 24); + float16x8_t b4 = MlasLoadFloat16x8(PackedB + 32); + float16x8_t b5 = MlasLoadFloat16x8(PackedB + 40); + float16x8_t b6 = MlasLoadFloat16x8(PackedB + 48); + float16x8_t b7 = MlasLoadFloat16x8(PackedB + 56); + float16x8_t a0 = MlasLoadFloat16x8(a); + accu0 = maq_laneq_f16_accu(accu0, b0, b1, b2, b3, b4, b5, b6, b7, a0); + } + + if (k & 4) { + float16x8_t b0 = MlasLoadFloat16x8(PackedB); + float16x8_t b1 = MlasLoadFloat16x8(PackedB + 8); + float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16); + float16x8_t b3 = MlasLoadFloat16x8(PackedB + 24); + float16x4_t a0 = MlasLoadFloat16x4(a); + accu0 = maq_lane_f16_accu(accu0, b0, b1, b2, b3, a0); + k -= 4, a += 4, PackedB += 4 * 8; + } + + if (k > 0) { + float16x4_t a0 = MlasLoadPartialFloat16x4(a, k); + float16x8_t b0 = MlasLoadFloat16x8(PackedB); + accu0 = vfmaq_lane_f16(accu0, b0, a0, 0); + if (k > 1) { + float16x8_t b1 = MlasLoadFloat16x8(PackedB + 8); + accu0 = vfmaq_lane_f16(accu0, b1, a0, 1); + } + if (k > 2) { + float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16); + accu0 = vfmaq_lane_f16(accu0, b2, a0, 2); + } + PackedB += k * 8; + } + + if constexpr (beta_behavior == 1) { + float16x8_t c0 = MlasLoadFloat16x8(C); + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + accu0 = vfmaq_f16(c0, accu0, alpha_v); + MlasStoreFloat16x8(C, accu0); + } else if constexpr (beta_behavior == 2) { + float16x8_t c0 = MlasLoadFloat16x8(C); + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + float16x8_t beta_v = MlasBroadcastFloat16x8(beta); + accu0 = vfmaq_f16(vmulq_f16(c0, beta_v), accu0, alpha_v); + MlasStoreFloat16x8(C, accu0); + } else { + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + accu0 = vmulq_f16(accu0, alpha_v); + MlasStoreFloat16x8(C, accu0); + } + + CountN -= 8, C += 8; + } + + if (CountN > 0) { + const auto* a = A; + size_t k = CountK; + float16x8_t accu0 = MlasZeroFloat16x8(); + for (; k >= 8; k -= 8, a += 8, PackedB += 8 * 8) { + float16x8_t b0 = MlasLoadFloat16x8(PackedB); + float16x8_t b1 = MlasLoadFloat16x8(PackedB + 8); + float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16); + float16x8_t b3 = MlasLoadFloat16x8(PackedB + 24); + float16x8_t b4 = MlasLoadFloat16x8(PackedB + 32); + float16x8_t b5 = MlasLoadFloat16x8(PackedB + 40); + float16x8_t b6 = MlasLoadFloat16x8(PackedB + 48); + float16x8_t b7 = MlasLoadFloat16x8(PackedB + 56); + float16x8_t a0 = MlasLoadFloat16x8(a); + accu0 = maq_laneq_f16_accu(accu0, b0, b1, b2, b3, b4, b5, b6, b7, a0); + } + + if (k & 4) { + float16x8_t b0 = MlasLoadFloat16x8(PackedB); + float16x8_t b1 = MlasLoadFloat16x8(PackedB + 8); + float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16); + float16x8_t b3 = MlasLoadFloat16x8(PackedB + 24); + float16x4_t a0 = MlasLoadFloat16x4(a); + accu0 = maq_lane_f16_accu(accu0, b0, b1, b2, b3, a0); + k -= 4, a += 4, PackedB += 4 * 8; + } + + if (k > 0) { + float16x4_t a0 = MlasLoadPartialFloat16x4(a, k); + float16x8_t b0 = MlasLoadFloat16x8(PackedB); + accu0 = vfmaq_lane_f16(accu0, b0, a0, 0); + if (k > 1) { + float16x8_t b1 = MlasLoadFloat16x8(PackedB + 8); + accu0 = vfmaq_lane_f16(accu0, b1, a0, 1); + } + if (k > 2) { + float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16); + accu0 = vfmaq_lane_f16(accu0, b2, a0, 2); + } + PackedB += k * 8; + } + + float16x4_t accu_low = vget_low_f16(accu0); + float16x4_t accu_high = vget_high_f16(accu0); + + if (CountN & 4) { + if constexpr (beta_behavior == 1) { + float16x4_t c0 = MlasLoadFloat16x4(C); + 
float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + MlasStoreFloat16x4(C, vfma_f16(c0, accu_low, alpha_v)); + } else if constexpr (beta_behavior == 2) { + float16x4_t c0 = MlasLoadFloat16x4(C); + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + float16x4_t beta_v = MlasBroadcastFloat16x4(beta); + MlasStoreFloat16x4(C, vfma_f16(vmul_f16(c0, beta_v), accu_low, alpha_v)); + } else { + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + MlasStoreFloat16x4(C, vmul_f16(accu_low, alpha_v)); + } + + CountN -= 4, C += 4; + accu_low = accu_high; + } + + if (CountN) { + if constexpr (beta_behavior == 1) { + float16x4_t c0 = MlasLoadPartialFloat16x4(C, CountN); + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + MlasStorePartialFloat16x4(C, vfma_f16(c0, accu_low, alpha_v), CountN); + } else if constexpr (beta_behavior == 2) { + float16x4_t c0 = MlasLoadPartialFloat16x4(C, CountN); + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + float16x4_t beta_v = MlasBroadcastFloat16x4(beta); + MlasStorePartialFloat16x4(C, vfma_f16(vmul_f16(c0, beta_v), accu_low, alpha_v), CountN); + } else { + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + MlasStorePartialFloat16x4(C, vmul_f16(accu_low, alpha_v), CountN); + } + } + } +} + +template <int beta_behavior> // 0: beta == 0, 1: beta == 1, 2: beta != 0 && beta != 1 +void HGemm_TransposedPackedB_Kernel_M2( + const _mlas_fp16_* A, + const _mlas_fp16_* PackedB, + _mlas_fp16_* C, + size_t CountN, + size_t CountK, + size_t lda, + size_t ldc, + _mlas_fp16_ alpha, + _mlas_fp16_ beta +) { + for (; CountN >= 16; CountN -= 16, C += 16) { + const auto* a = A; + size_t k = CountK; + float16x8_t accu00 = MlasZeroFloat16x8(); + float16x8_t accu01 = MlasZeroFloat16x8(); + float16x8_t accu10 = MlasZeroFloat16x8(); + float16x8_t accu11 = MlasZeroFloat16x8(); + for (; k >= 8; k -= 8, a += 8, PackedB += 8 * 16) { + float16x8_t b00 = MlasLoadFloat16x8(PackedB); + float16x8_t b01 = MlasLoadFloat16x8(PackedB + 8); + float16x8_t b10 = MlasLoadFloat16x8(PackedB + 16); + float16x8_t b11 = MlasLoadFloat16x8(PackedB + 24); + float16x8_t b20 = MlasLoadFloat16x8(PackedB + 32); + float16x8_t b21 = MlasLoadFloat16x8(PackedB + 40); + float16x8_t b30 = MlasLoadFloat16x8(PackedB + 48); + float16x8_t b31 = MlasLoadFloat16x8(PackedB + 56); + float16x8_t b40 = MlasLoadFloat16x8(PackedB + 64); + float16x8_t b41 = MlasLoadFloat16x8(PackedB + 72); + float16x8_t b50 = MlasLoadFloat16x8(PackedB + 80); + float16x8_t b51 = MlasLoadFloat16x8(PackedB + 88); + float16x8_t b60 = MlasLoadFloat16x8(PackedB + 96); + float16x8_t b61 = MlasLoadFloat16x8(PackedB + 104); + float16x8_t b70 = MlasLoadFloat16x8(PackedB + 112); + float16x8_t b71 = MlasLoadFloat16x8(PackedB + 120); + float16x8_t a0 = MlasLoadFloat16x8(a); + float16x8_t a1 = MlasLoadFloat16x8(a + lda); + accu00 = maq_laneq_f16_accu(accu00, b00, b10, b20, b30, b40, b50, b60, b70, a0); + accu01 = maq_laneq_f16_accu(accu01, b01, b11, b21, b31, b41, b51, b61, b71, a0); + accu10 = maq_laneq_f16_accu(accu10, b00, b10, b20, b30, b40, b50, b60, b70, a1); + accu11 = maq_laneq_f16_accu(accu11, b01, b11, b21, b31, b41, b51, b61, b71, a1); + } + + if (k & 4) { + float16x8_t b00 = MlasLoadFloat16x8(PackedB); + float16x8_t b01 = MlasLoadFloat16x8(PackedB + 8); + float16x8_t b10 = MlasLoadFloat16x8(PackedB + 16); + float16x8_t b11 = MlasLoadFloat16x8(PackedB + 24); + float16x8_t b20 = MlasLoadFloat16x8(PackedB + 32); + float16x8_t b21 = MlasLoadFloat16x8(PackedB + 40); + float16x8_t b30 = MlasLoadFloat16x8(PackedB + 48); + float16x8_t b31 = MlasLoadFloat16x8(PackedB 
+ 56); + float16x4_t a0 = MlasLoadFloat16x4(a); + float16x4_t a1 = MlasLoadFloat16x4(a + lda); + accu00 = maq_lane_f16_accu(accu00, b00, b10, b20, b30, a0); + accu01 = maq_lane_f16_accu(accu01, b01, b11, b21, b31, a0); + accu10 = maq_lane_f16_accu(accu10, b00, b10, b20, b30, a1); + accu11 = maq_lane_f16_accu(accu11, b01, b11, b21, b31, a1); + k -= 4, a += 4, PackedB += 4 * 16; + } + + if (k > 0) { + float16x4_t a0 = MlasLoadPartialFloat16x4(a, k); + float16x4_t a1 = MlasLoadPartialFloat16x4(a + lda, k); + float16x8_t b00 = MlasLoadFloat16x8(PackedB); + float16x8_t b01 = MlasLoadFloat16x8(PackedB + 8); + accu00 = vfmaq_lane_f16(accu00, b00, a0, 0); + accu01 = vfmaq_lane_f16(accu01, b01, a0, 0); + accu10 = vfmaq_lane_f16(accu10, b00, a1, 0); + accu11 = vfmaq_lane_f16(accu11, b01, a1, 0); + if (k > 1) { + float16x8_t b10 = MlasLoadFloat16x8(PackedB + 16); + float16x8_t b11 = MlasLoadFloat16x8(PackedB + 24); + accu00 = vfmaq_lane_f16(accu00, b10, a0, 1); + accu01 = vfmaq_lane_f16(accu01, b11, a0, 1); + accu10 = vfmaq_lane_f16(accu10, b10, a1, 1); + accu11 = vfmaq_lane_f16(accu11, b11, a1, 1); + } + if (k > 2) { + float16x8_t b20 = MlasLoadFloat16x8(PackedB + 32); + float16x8_t b21 = MlasLoadFloat16x8(PackedB + 40); + accu00 = vfmaq_lane_f16(accu00, b20, a0, 2); + accu01 = vfmaq_lane_f16(accu01, b21, a0, 2); + accu10 = vfmaq_lane_f16(accu10, b20, a1, 2); + accu11 = vfmaq_lane_f16(accu11, b21, a1, 2); + } + PackedB += k * 16; + } + + if constexpr (beta_behavior == 1) { + float16x8_t c00 = MlasLoadFloat16x8(C); + float16x8_t c01 = MlasLoadFloat16x8(C + 8); + float16x8_t c10 = MlasLoadFloat16x8(C + ldc); + float16x8_t c11 = MlasLoadFloat16x8(C + ldc + 8); + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + accu00 = vfmaq_f16(c00, accu00, alpha_v); + accu01 = vfmaq_f16(c01, accu01, alpha_v); + accu10 = vfmaq_f16(c10, accu10, alpha_v); + accu11 = vfmaq_f16(c11, accu11, alpha_v); + MlasStoreFloat16x8(C, accu00); + MlasStoreFloat16x8(C + 8, accu01); + MlasStoreFloat16x8(C + ldc, accu10); + MlasStoreFloat16x8(C + ldc + 8, accu11); + } else if constexpr (beta_behavior == 2) { + float16x8_t c00 = MlasLoadFloat16x8(C); + float16x8_t c01 = MlasLoadFloat16x8(C + 8); + float16x8_t c10 = MlasLoadFloat16x8(C + ldc); + float16x8_t c11 = MlasLoadFloat16x8(C + ldc + 8); + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + float16x8_t beta_v = MlasBroadcastFloat16x8(beta); + accu00 = vfmaq_f16(vmulq_f16(c00, beta_v), accu00, alpha_v); + accu01 = vfmaq_f16(vmulq_f16(c01, beta_v), accu01, alpha_v); + accu10 = vfmaq_f16(vmulq_f16(c10, beta_v), accu10, alpha_v); + accu11 = vfmaq_f16(vmulq_f16(c11, beta_v), accu11, alpha_v); + MlasStoreFloat16x8(C, accu00); + MlasStoreFloat16x8(C + 8, accu01); + MlasStoreFloat16x8(C + ldc, accu10); + MlasStoreFloat16x8(C + ldc + 8, accu11); + } else { + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + accu00 = vmulq_f16(accu00, alpha_v); + accu01 = vmulq_f16(accu01, alpha_v); + accu10 = vmulq_f16(accu10, alpha_v); + accu11 = vmulq_f16(accu11, alpha_v); + MlasStoreFloat16x8(C, accu00); + MlasStoreFloat16x8(C + 8, accu01); + MlasStoreFloat16x8(C + ldc, accu10); + MlasStoreFloat16x8(C + ldc + 8, accu11); + } + } + + if (CountN & 8) { + const auto* a = A; + size_t k = CountK; + float16x8_t accu00 = MlasZeroFloat16x8(); + float16x8_t accu10 = MlasZeroFloat16x8(); + for (; k >= 8; k -= 8, a += 8, PackedB += 8 * 8) { + float16x8_t b0 = MlasLoadFloat16x8(PackedB); + float16x8_t b1 = MlasLoadFloat16x8(PackedB + 8); + float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16); + float16x8_t 
b3 = MlasLoadFloat16x8(PackedB + 24); + float16x8_t b4 = MlasLoadFloat16x8(PackedB + 32); + float16x8_t b5 = MlasLoadFloat16x8(PackedB + 40); + float16x8_t b6 = MlasLoadFloat16x8(PackedB + 48); + float16x8_t b7 = MlasLoadFloat16x8(PackedB + 56); + float16x8_t a0 = MlasLoadFloat16x8(a); + float16x8_t a1 = MlasLoadFloat16x8(a + lda); + accu00 = maq_laneq_f16_accu(accu00, b0, b1, b2, b3, b4, b5, b6, b7, a0); + accu10 = maq_laneq_f16_accu(accu10, b0, b1, b2, b3, b4, b5, b6, b7, a1); + } + + if (k & 4) { + float16x8_t b0 = MlasLoadFloat16x8(PackedB); + float16x8_t b1 = MlasLoadFloat16x8(PackedB + 8); + float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16); + float16x8_t b3 = MlasLoadFloat16x8(PackedB + 24); + float16x4_t a0 = MlasLoadFloat16x4(a); + float16x4_t a1 = MlasLoadFloat16x4(a + lda); + accu00 = maq_lane_f16_accu(accu00, b0, b1, b2, b3, a0); + accu10 = maq_lane_f16_accu(accu10, b0, b1, b2, b3, a1); + k -= 4, a += 4, PackedB += 4 * 8; + } + + if (k > 0) { + float16x4_t a0 = MlasLoadPartialFloat16x4(a, k); + float16x4_t a1 = MlasLoadPartialFloat16x4(a + lda, k); + float16x8_t b0 = MlasLoadFloat16x8(PackedB); + accu00 = vfmaq_lane_f16(accu00, b0, a0, 0); + accu10 = vfmaq_lane_f16(accu10, b0, a1, 0); + if (k > 1) { + float16x8_t b1 = MlasLoadFloat16x8(PackedB + 8); + accu00 = vfmaq_lane_f16(accu00, b1, a0, 1); + accu10 = vfmaq_lane_f16(accu10, b1, a1, 1); + } + if (k > 2) { + float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16); + accu00 = vfmaq_lane_f16(accu00, b2, a0, 2); + accu10 = vfmaq_lane_f16(accu10, b2, a1, 2); + } + PackedB += k * 8; + } + + if constexpr (beta_behavior == 1) { + float16x8_t c0 = MlasLoadFloat16x8(C); + float16x8_t c1 = MlasLoadFloat16x8(C + ldc); + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + accu00 = vfmaq_f16(c0, accu00, alpha_v); + accu10 = vfmaq_f16(c1, accu10, alpha_v); + MlasStoreFloat16x8(C, accu00); + MlasStoreFloat16x8(C + ldc, accu10); + } else if constexpr (beta_behavior == 2) { + float16x8_t c0 = MlasLoadFloat16x8(C); + float16x8_t c1 = MlasLoadFloat16x8(C + ldc); + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + float16x8_t beta_v = MlasBroadcastFloat16x8(beta); + accu00 = vfmaq_f16(vmulq_f16(c0, beta_v), accu00, alpha_v); + accu10 = vfmaq_f16(vmulq_f16(c1, beta_v), accu10, alpha_v); + MlasStoreFloat16x8(C, accu00); + MlasStoreFloat16x8(C + ldc, accu10); + } else { + float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha); + accu00 = vmulq_f16(accu00, alpha_v); + accu10 = vmulq_f16(accu10, alpha_v); + MlasStoreFloat16x8(C, accu00); + MlasStoreFloat16x8(C + ldc, accu10); + } + + CountN -= 8, C += 8; + } + + if (CountN > 0) { + const auto* a = A; + size_t k = CountK; + float16x8_t accu0 = MlasZeroFloat16x8(); + float16x8_t accu1 = MlasZeroFloat16x8(); + for (; k >= 8; k -= 8, a += 8, PackedB += 8 * 8) { + float16x8_t b0 = MlasLoadFloat16x8(PackedB); + float16x8_t b1 = MlasLoadFloat16x8(PackedB + 8); + float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16); + float16x8_t b3 = MlasLoadFloat16x8(PackedB + 24); + float16x8_t b4 = MlasLoadFloat16x8(PackedB + 32); + float16x8_t b5 = MlasLoadFloat16x8(PackedB + 40); + float16x8_t b6 = MlasLoadFloat16x8(PackedB + 48); + float16x8_t b7 = MlasLoadFloat16x8(PackedB + 56); + float16x8_t a0 = MlasLoadFloat16x8(a); + float16x8_t a1 = MlasLoadFloat16x8(a + lda); + accu0 = maq_laneq_f16_accu(accu0, b0, b1, b2, b3, b4, b5, b6, b7, a0); + accu1 = maq_laneq_f16_accu(accu1, b0, b1, b2, b3, b4, b5, b6, b7, a1); + } + + if (k & 4) { + float16x8_t b0 = MlasLoadFloat16x8(PackedB); + float16x8_t b1 = MlasLoadFloat16x8(PackedB + 
8); + float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16); + float16x8_t b3 = MlasLoadFloat16x8(PackedB + 24); + float16x4_t a0 = MlasLoadFloat16x4(a); + float16x4_t a1 = MlasLoadFloat16x4(a + lda); + accu0 = maq_lane_f16_accu(accu0, b0, b1, b2, b3, a0); + accu1 = maq_lane_f16_accu(accu1, b0, b1, b2, b3, a1); + k -= 4, a += 4, PackedB += 4 * 8; + } + + if (k > 0) { + float16x4_t a0 = MlasLoadPartialFloat16x4(a, k); + float16x4_t a1 = MlasLoadPartialFloat16x4(a + lda, k); + float16x8_t b0 = MlasLoadFloat16x8(PackedB); + accu0 = vfmaq_lane_f16(accu0, b0, a0, 0); + accu1 = vfmaq_lane_f16(accu1, b0, a1, 0); + if (k > 1) { + float16x8_t b1 = MlasLoadFloat16x8(PackedB + 8); + accu0 = vfmaq_lane_f16(accu0, b1, a0, 1); + accu1 = vfmaq_lane_f16(accu1, b1, a1, 1); + } + if (k > 2) { + float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16); + accu0 = vfmaq_lane_f16(accu0, b2, a0, 2); + accu1 = vfmaq_lane_f16(accu1, b2, a1, 2); + } + PackedB += k * 8; + } + + float16x4_t accu0_low = vget_low_f16(accu0); + float16x4_t accu0_high = vget_high_f16(accu0); + float16x4_t accu1_low = vget_low_f16(accu1); + float16x4_t accu1_high = vget_high_f16(accu1); + + if (CountN & 4) { + if constexpr (beta_behavior == 1) { + float16x4_t c0 = MlasLoadFloat16x4(C); + float16x4_t c1 = MlasLoadFloat16x4(C + ldc); + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + MlasStoreFloat16x4(C, vfma_f16(c0, accu0_low, alpha_v)); + MlasStoreFloat16x4(C + ldc, vfma_f16(c1, accu1_low, alpha_v)); + } else if constexpr (beta_behavior == 2) { + float16x4_t c0 = MlasLoadFloat16x4(C); + float16x4_t c1 = MlasLoadFloat16x4(C + ldc); + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + float16x4_t beta_v = MlasBroadcastFloat16x4(beta); + MlasStoreFloat16x4(C, vfma_f16(vmul_f16(c0, beta_v), accu0_low, alpha_v)); + MlasStoreFloat16x4(C + ldc, vfma_f16(vmul_f16(c1, beta_v), accu1_low, alpha_v)); + } else { + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + MlasStoreFloat16x4(C, vmul_f16(accu0_low, alpha_v)); + MlasStoreFloat16x4(C + ldc, vmul_f16(accu1_low, alpha_v)); + } + CountN -= 4, C += 4; + accu0_low = accu0_high; + accu1_low = accu1_high; + } + + if (CountN) { + if constexpr (beta_behavior == 1) { + float16x4_t c0 = MlasLoadPartialFloat16x4(C, CountN); + float16x4_t c1 = MlasLoadPartialFloat16x4(C + ldc, CountN); + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + MlasStorePartialFloat16x4(C, vfma_f16(c0, accu0_low, alpha_v), CountN); + MlasStorePartialFloat16x4(C + ldc, vfma_f16(c1, accu1_low, alpha_v), CountN); + } else if constexpr (beta_behavior == 2) { + float16x4_t c0 = MlasLoadPartialFloat16x4(C, CountN); + float16x4_t c1 = MlasLoadPartialFloat16x4(C + ldc, CountN); + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + float16x4_t beta_v = MlasBroadcastFloat16x4(beta); + MlasStorePartialFloat16x4(C, vfma_f16(vmul_f16(c0, beta_v), accu0_low, alpha_v), CountN); + MlasStorePartialFloat16x4(C + ldc, vfma_f16(vmul_f16(c1, beta_v), accu1_low, alpha_v), CountN); + } else { + float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha); + MlasStorePartialFloat16x4(C, vmul_f16(accu0_low, alpha_v), CountN); + MlasStorePartialFloat16x4(C + ldc, vmul_f16(accu1_low, alpha_v), CountN); + } + } + } +} + +void HGemm_TransposedPackedB_Kernel( + const MLAS_FP16* A, + const MLAS_FP16* PackedB, + MLAS_FP16* C, + size_t CountM, + size_t CountN, + size_t CountK, + size_t lda, + size_t ldc, + _mlas_fp16_ alpha, + _mlas_fp16_ beta +) { + if (CountM > 2) { + MLAS_THROW_EX(std::runtime_error, "HGemm_TransposedPackedB_Kernel only support <= 2 rows"); + } 
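+ + // The dispatch below selects a compile-time epilogue: beta == 0 stores alpha * (A x PackedB) over C, beta == 1 adds it to the existing C, and any other beta scales C by beta before adding.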
+ + const auto* A_data = reinterpret_cast<const _mlas_fp16_*>(A); + const auto* PackedB_data = reinterpret_cast<const _mlas_fp16_*>(PackedB); + auto* C_data = reinterpret_cast<_mlas_fp16_*>(C); + const auto f16_0 = MLAS_FP16(0.0f); + const auto f16_1 = MLAS_FP16(1.0f); + if (CountM == 1) { + if (beta == f16_0.val) { + HGemm_TransposedPackedB_Kernel_M1<0>(A_data, PackedB_data, C_data, CountN, CountK, alpha, beta); + } else if (beta == f16_1.val) { + HGemm_TransposedPackedB_Kernel_M1<1>(A_data, PackedB_data, C_data, CountN, CountK, alpha, beta); + } else { + HGemm_TransposedPackedB_Kernel_M1<2>(A_data, PackedB_data, C_data, CountN, CountK, alpha, beta); + } + } else { + if (beta == f16_0.val) { + HGemm_TransposedPackedB_Kernel_M2<0>(A_data, PackedB_data, C_data, CountN, CountK, lda, ldc, alpha, beta); + } else if (beta == f16_1.val) { + HGemm_TransposedPackedB_Kernel_M2<1>(A_data, PackedB_data, C_data, CountN, CountK, lda, ldc, alpha, beta); + } else { + HGemm_TransposedPackedB_Kernel_M2<2>(A_data, PackedB_data, C_data, CountN, CountK, lda, ldc, alpha, beta); + } + } +} + +} // namespace hgemm_neon diff --git a/onnxruntime/core/mlas/lib/hgemm_kernel_neon.cpp b/onnxruntime/core/mlas/lib/hgemm_kernel_neon.cpp new file mode 100644 index 0000000000000..5b131a8e41f21 --- /dev/null +++ b/onnxruntime/core/mlas/lib/hgemm_kernel_neon.cpp @@ -0,0 +1,28 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + hgemm_kernel_neon.cpp + +Abstract: + + This module implements half precision GEMM kernel for neon. + +--*/ + +#include "mlasi.h" +#include "halfgemm.h" + +const MLAS_HGEMM_DISPATCH MlasHGemmDispatchNeon = [](){ + MLAS_HGEMM_DISPATCH d; +#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64) + d.HPackBKernel_TransposedB = hgemm_neon::HPackB_TransposedB_Kernel; + d.HGemmKernel_TransposedB = hgemm_neon::HGemm_TransposedB_Kernel; + d.HGemmKernel_TransposedPackedB = hgemm_neon::HGemm_TransposedPackedB_Kernel; +#endif + return d; +}(); diff --git a/onnxruntime/core/mlas/lib/hqnbitgemm_kernel_neon_fp16.cpp b/onnxruntime/core/mlas/lib/hqnbitgemm_kernel_neon_fp16.cpp index 69e37d2b916d1..5b1f9d7d4a2dc 100644 --- a/onnxruntime/core/mlas/lib/hqnbitgemm_kernel_neon_fp16.cpp +++ b/onnxruntime/core/mlas/lib/hqnbitgemm_kernel_neon_fp16.cpp @@ -93,39 +93,6 @@ Transpose8x8(uint8x8_t& v0, uint8x8_t& v1, uint8x8_t& v2, uint8x8_t& v3, v7 = vreinterpret_u8_u32(c3.val[1]); } -MLAS_FORCEINLINE void -Transpose4x8(float16x8_t& v0, float16x8_t& v1, float16x8_t& v2, float16x8_t& v3) -{ - // |v00|v01|v02|v03|v04|v05|v06|v07| - // |v10|v11|v12|v13|v14|v15|v16|v17| - // |v20|v21|v22|v23|v24|v25|v26|v27| - // |v30|v31|v32|v33|v34|v35|v36|v37| - // => - // |v00|v10|v20|v30|v04|v14|v24|v34| - // |v01|v11|v21|v31|v05|v15|v25|v35| - // |v02|v12|v22|v32|v06|v16|v26|v36| - // |v03|v13|v23|v33|v07|v17|v27|v37| - float16x8x2_t t01 = vtrnq_f16(v0, v1); - float16x8x2_t t23 = vtrnq_f16(v2, v3); - - v0 = vreinterpretq_f16_f32(vtrn1q_f32(vreinterpretq_f32_f16(t01.val[0]), vreinterpretq_f32_f16(t23.val[0]))); - v1 = vreinterpretq_f16_f32(vtrn1q_f32(vreinterpretq_f32_f16(t01.val[1]), vreinterpretq_f32_f16(t23.val[1]))); - v2 = vreinterpretq_f16_f32(vtrn2q_f32(vreinterpretq_f32_f16(t01.val[0]), vreinterpretq_f32_f16(t23.val[0]))); - v3 = vreinterpretq_f16_f32(vtrn2q_f32(vreinterpretq_f32_f16(t01.val[1]), vreinterpretq_f32_f16(t23.val[1]))); -} - -MLAS_FORCEINLINE void -Transpose4x4(float16x4_t& v0, float16x4_t& v1, float16x4_t& v2, float16x4_t& v3) -{ - float16x4x2_t t01 = vtrn_f16(v0, v1); - 
float16x4x2_t t23 = vtrn_f16(v2, v3); - - v0 = vreinterpret_f16_f32(vtrn1_f32(vreinterpret_f32_f16(t01.val[0]), vreinterpret_f32_f16(t23.val[0]))); - v1 = vreinterpret_f16_f32(vtrn1_f32(vreinterpret_f32_f16(t01.val[1]), vreinterpret_f32_f16(t23.val[1]))); - v2 = vreinterpret_f16_f32(vtrn2_f32(vreinterpret_f32_f16(t01.val[0]), vreinterpret_f32_f16(t23.val[0]))); - v3 = vreinterpret_f16_f32(vtrn2_f32(vreinterpret_f32_f16(t01.val[1]), vreinterpret_f32_f16(t23.val[1]))); -} - void HQ4BitGemmPackQuantBData_CompFp16( size_t N, diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h index 100d7d47751aa..56fad6bb3412a 100644 --- a/onnxruntime/core/mlas/lib/mlasi.h +++ b/onnxruntime/core/mlas/lib/mlasi.h @@ -301,6 +301,8 @@ static_assert(sizeof(MLAS_FP16) == FP16_SIZE); // Define the default strides to step through slices of the input matrices. // +#define MLAS_HGEMM_STRIDEN 32 +#define MLAS_HGEMM_STRIDEK 512 #define MLAS_SGEMM_STRIDEN 128 #define MLAS_SGEMM_STRIDEK 128 #define MLAS_SGEMM_PACKED_STRIDEN 128 @@ -317,6 +319,7 @@ static_assert(sizeof(MLAS_FP16) == FP16_SIZE); // the effort at this time. // +#define MLAS_HGEMM_STRIDEN_THREAD_ALIGN 16 #define MLAS_SGEMM_STRIDEN_THREAD_ALIGN 16 #define MLAS_DGEMM_STRIDEN_THREAD_ALIGN 8 #define MLAS_QGEMM_STRIDEN_THREAD_ALIGN 16 @@ -944,6 +947,7 @@ extern "C" { #define MLAS_SGEMM_THREAD_COMPLEXITY (size_t(64) * size_t(1024)) #define MLAS_DGEMM_THREAD_COMPLEXITY (size_t(64) * size_t(1024)) #define MLAS_QGEMM_THREAD_COMPLEXITY 65536 +#define MLAS_HGEMM_THREAD_COMPLEXITY 65536 #if defined(__aarch64__) && defined(__linux__) #define MLAS_SBGEMM_THREAD_COMPLEXITY (size_t(64) * size_t(1024)) @@ -1055,6 +1059,12 @@ extern const MLAS_QNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchAvx512vnni; struct MLAS_ROPE_DISPATCH; extern const MLAS_ROPE_DISPATCH MlasRopeDispatchNeon; +// +// half gemm dispatch structure +// +struct MLAS_HGEMM_DISPATCH; +extern const MLAS_HGEMM_DISPATCH MlasHGemmDispatchNeon; + // // Quantized depthwise convolution kernels. @@ -1217,6 +1227,7 @@ struct MLAS_PLATFORM { MLAS_CAST_F32_TO_F16_KERNEL* CastF32ToF16Kernel; const MLAS_ROPE_DISPATCH* RopeDispatch{nullptr}; + const MLAS_HGEMM_DISPATCH* HGemmDispatch{nullptr}; }; inline diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp index ec572a4150292..026a954bbc6c2 100644 --- a/onnxruntime/core/mlas/lib/platform.cpp +++ b/onnxruntime/core/mlas/lib/platform.cpp @@ -544,6 +544,7 @@ Return Value: this->ConvSymS8S8Dispatch = &MlasConvSymS8DispatchNeon; this->QNBitGemmDispatch = &MlasSQNBitGemmDispatchNeon; this->RopeDispatch = &MlasRopeDispatchNeon; + this->HGemmDispatch = &MlasHGemmDispatchNeon; // // Check if the processor supports ASIMD dot product instructions. 
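The HGEMM constants introduced in mlasi.h above follow the pattern of the existing SGEMM/DGEMM/QGEMM entries: work is stepped through in N-by-K slices of at most MLAS_HGEMM_STRIDEN x MLAS_HGEMM_STRIDEK elements, N slices stay aligned to MLAS_HGEMM_STRIDEN_THREAD_ALIGN so threads never split a packed panel, and the M*N*K complexity relative to MLAS_HGEMM_THREAD_COMPLEXITY caps the thread count. A minimal sketch of that slicing logic, assuming the half-precision driver mirrors the single-precision one (TileHGemm is a hypothetical name, not the PR's actual entry point):

#include <algorithm>
#include <cstddef>

// Illustrative tiling driver for the stride constants defined in mlasi.h.
void TileHGemm(size_t M, size_t N, size_t K) {
    constexpr size_t StrideN = 32;   // MLAS_HGEMM_STRIDEN
    constexpr size_t StrideK = 512;  // MLAS_HGEMM_STRIDEK
    for (size_t n = 0; n < N; n += StrideN) {
        const size_t countN = std::min(StrideN, N - n);
        for (size_t k = 0; k < K; k += StrideK) {
            const size_t countK = std::min(StrideK, K - k);
            // A real driver would pack the countK x countN tile of B here and
            // run the <= 2-row kernels over A. The caller's beta applies only
            // to the first K slice (k == 0); later slices pass beta == 1 so
            // partial products accumulate into C.
            (void)M; (void)countN; (void)countK;
        }
    }
}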
diff --git a/onnxruntime/core/optimizer/selectors_actions/selector_action_transformer.cc b/onnxruntime/core/optimizer/selectors_actions/selector_action_transformer.cc index b68cbaf85bcff..b1d6c51f693fd 100644 --- a/onnxruntime/core/optimizer/selectors_actions/selector_action_transformer.cc +++ b/onnxruntime/core/optimizer/selectors_actions/selector_action_transformer.cc @@ -147,7 +147,7 @@ static Status MatchAndProcess( RuntimeOptimizationRecord::ProducedOpIdVector produced_op_ids{}; produced_op_ids.reserve(action_saved_state.produced_node_op_schemas.size()); - for (const auto op_schema : action_saved_state.produced_node_op_schemas) { + for (const auto& op_schema : action_saved_state.produced_node_op_schemas) { produced_op_ids.push_back(utils::MakeOpId(*op_schema)); if (save_context->record_produced_node_op_schema) { status = save_context->record_produced_node_op_schema(*op_schema); diff --git a/onnxruntime/core/platform/windows/logging/etw_sink.cc b/onnxruntime/core/platform/windows/logging/etw_sink.cc index 950ac247a2046..489cd19b11302 100644 --- a/onnxruntime/core/platform/windows/logging/etw_sink.cc +++ b/onnxruntime/core/platform/windows/logging/etw_sink.cc @@ -64,6 +64,10 @@ EtwRegistrationManager& EtwRegistrationManager::Instance() { return instance; } +bool EtwRegistrationManager::SupportsETW() { + return true; +} + bool EtwRegistrationManager::IsEnabled() const { std::lock_guard lock(provider_change_mutex_); return is_enabled_; @@ -248,5 +252,19 @@ void EtwSink::SendImpl(const Timestamp& timestamp, const std::string& logger_id, } } // namespace logging } // namespace onnxruntime +#else +// ETW is not supported on this platform but should still define a dummy EtwRegistrationManager +// so that it can be used in the EP provider bridge. +namespace onnxruntime { +namespace logging { +EtwRegistrationManager& EtwRegistrationManager::Instance() { + static EtwRegistrationManager instance; + return instance; +} +bool EtwRegistrationManager::SupportsETW() { + return false; +} +} // namespace logging +} // namespace onnxruntime #endif // ETW_TRACE_LOGGING_SUPPORTED diff --git a/onnxruntime/core/platform/windows/logging/etw_sink.h b/onnxruntime/core/platform/windows/logging/etw_sink.h index 2a798a28f13de..62b762886ca82 100644 --- a/onnxruntime/core/platform/windows/logging/etw_sink.h +++ b/onnxruntime/core/platform/windows/logging/etw_sink.h @@ -60,6 +60,9 @@ class EtwRegistrationManager { // Singleton instance access static EtwRegistrationManager& Instance(); + // Returns true if ETW is supported at all. + static bool SupportsETW(); + // Check if ETW logging is enabled bool IsEnabled() const; @@ -110,5 +113,33 @@ class EtwRegistrationManager { } // namespace logging } // namespace onnxruntime +#else +// ETW is not supported on this platform but should still define a dummy EtwRegistrationManager +// so that it can be used in the EP provider bridge. 
+#include "core/common/logging/severity.h" +namespace onnxruntime { +namespace logging { +class EtwRegistrationManager { + public: + using EtwInternalCallback = std::function; + + static EtwRegistrationManager& Instance(); + static bool SupportsETW(); + bool IsEnabled() const { return false; } + UCHAR Level() const { return 0; } + Severity MapLevelToSeverity() { return Severity::kFATAL; } + uint64_t Keyword() const { return 0; } + HRESULT Status() const { return 0; } + void RegisterInternalCallback(const EtwInternalCallback& callback) {} + void UnregisterInternalCallback(const EtwInternalCallback& callback) {} + + private: + EtwRegistrationManager() = default; + ~EtwRegistrationManager() = default; +}; +} // namespace logging +} // namespace onnxruntime #endif // ETW_TRACE_LOGGING_SUPPORTED diff --git a/onnxruntime/core/providers/cpu/nn/pool_attributes.h b/onnxruntime/core/providers/cpu/nn/pool_attributes.h index 118cb4a3ba4bd..fbbd4273757d5 100644 --- a/onnxruntime/core/providers/cpu/nn/pool_attributes.h +++ b/onnxruntime/core/providers/cpu/nn/pool_attributes.h @@ -150,14 +150,14 @@ struct PoolAttributes { case AutoPadType::VALID: *pad_head = 0; *pad_tail = 0; - *out_size = ComputeOutputSize(in_size, stride, kernel, 0, dilation); + *out_size = ComputeOutputSize(in_size, stride, kernel, 0, 0, dilation); break; case AutoPadType::SAME_LOWER: { int64_t legacy_target_size = (in_size + stride - 1) / stride; int64_t pad_needed = (legacy_target_size - 1) * stride + kernel - in_size; *pad_head = (pad_needed + 1) / 2; *pad_tail = pad_needed - *pad_head; - *out_size = ComputeOutputSize(in_size, stride, kernel, pad_needed, dilation); + *out_size = ComputeOutputSize(in_size, stride, kernel, *pad_head, *pad_tail, dilation); break; } case AutoPadType::SAME_UPPER: { @@ -165,7 +165,7 @@ struct PoolAttributes { int64_t pad_needed = (legacy_target_size - 1) * stride + kernel - in_size; *pad_head = pad_needed / 2; *pad_tail = pad_needed - *pad_head; - *out_size = ComputeOutputSize(in_size, stride, kernel, pad_needed, dilation); + *out_size = ComputeOutputSize(in_size, stride, kernel, *pad_head, *pad_tail, dilation); break; } default: { @@ -173,7 +173,7 @@ struct PoolAttributes { } } } else { - *out_size = ComputeOutputSize(in_size, stride, kernel, *pad_head + *pad_tail, dilation); + *out_size = ComputeOutputSize(in_size, stride, kernel, *pad_head, *pad_tail, dilation); } } #if defined(_MSC_VER) && !defined(__clang__) @@ -184,13 +184,21 @@ struct PoolAttributes { int64_t ComputeOutputSize(int64_t in_size, int64_t stride, int64_t kernel, - int64_t pad_needed, + int64_t pad_head, + int64_t pad_tail, int64_t dilation) const { - if (ceil_mode == 0) { - return static_cast(static_cast(in_size + pad_needed - dilation * (kernel - 1) - 1) / stride + 1); + int64_t numerator = in_size + pad_head + pad_tail - dilation * (kernel - 1) - 1; + int64_t out_size = numerator / stride + 1; + + if (ceil_mode == 1) { + out_size = static_cast(std::ceil(static_cast(numerator) / stride)) + 1; + // Ensure that the last pooling starts inside the image (at least 1 pixel) + // Reference: https://github.com/onnx/onnx/pull/5741 + if ((out_size - 1) * stride >= in_size + pad_head) { + --out_size; + } } - return static_cast( - std::ceil(static_cast(in_size + pad_needed - dilation * (kernel - 1) - 1) / stride + 1)); + return out_size; } #if defined(_MSC_VER) && !defined(__clang__) #pragma warning(pop) diff --git a/onnxruntime/core/providers/cpu/nn/pool_functors.h b/onnxruntime/core/providers/cpu/nn/pool_functors.h index 
d3205278b72f6..476a9a0338969 100644 --- a/onnxruntime/core/providers/cpu/nn/pool_functors.h +++ b/onnxruntime/core/providers/cpu/nn/pool_functors.h @@ -406,6 +406,7 @@ struct AveragePool1DTask final { for (int64_t ph = 0; ph < pooled_height; ++ph) { int64_t hstart = ph * stride_h - pads[0]; int64_t hend = hstart + kernel_shape[0] * dilation_h; + hend = std::min(hend, height + pads[1]); y_d[ph] = 0; int total_elements = 0; for (int64_t h = hstart; h < hend; h += dilation_h) { @@ -461,9 +462,11 @@ struct AveragePool2DTask final { for (int64_t ph = 0; ph < pooled_height; ++ph) { int64_t hstart = ph * stride_h - pads[0]; int64_t hend = hstart + kernel_shape[0] * dilation_h; + hend = std::min(hend, height + pads[1]); for (int64_t pw = 0; pw < pooled_width; ++pw) { int64_t wstart = pw * stride_w - pads[1]; int64_t wend = wstart + kernel_shape[1] * dilation_w; + wend = std::min(wend, width + pads[3]); const int64_t pool_index = ph * pooled_width + pw; y_d[pool_index] = 0; int total_elements = 0; @@ -532,12 +535,15 @@ struct AveragePool3DTask { for (int64_t ph = 0; ph < pooled_height; ++ph) { int64_t hstart = ph * stride_h - pads[0]; int64_t hend = hstart + kernel_shape[0] * dilation_h; + hend = std::min(hend, height + pads[1]); for (int64_t pw = 0; pw < pooled_width; ++pw) { int64_t wstart = pw * stride_w - pads[1]; int64_t wend = wstart + kernel_shape[1] * dilation_w; + wend = std::min(wend, width + pads[3]); for (int64_t pd = 0; pd < pooled_depth; ++pd) { int64_t dstart = pd * stride_d - pads[2]; int64_t dend = dstart + kernel_shape[2] * dilation_d; + dend = std::min(dend, depth + pads[5]); const int64_t pool_index = ph * pooled_width * pooled_depth + pw * pooled_depth + pd; y_d[pool_index] = 0; int total_elements = 0; diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/LRN_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/LRN_op_builder.cc index 91cad034d8854..fd1720d69eebd 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/LRN_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/LRN_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/batchnorm_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/batchnorm_op_builder.cc index 75a66d3a14643..5874eb1e7dc3b 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/batchnorm_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/batchnorm_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/binary_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/binary_op_builder.cc index 5599fbdc69bdd..91d1a38e71e6f 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/binary_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/binary_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
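The pooling fixes above change two coupled things: ComputeOutputSize now receives pad_head and pad_tail separately and, for ceil_mode == 1, drops a final window that would start at or beyond the end of the head-padded input (the clamp referenced from onnx/onnx#5741), while the AveragePool functors clamp each window end (hend/wend/dend) to the padded extent so positions past it are never counted. Below is a standalone restatement of the output-size rule with a worked example; it is my own re-derivation mirroring the formula in the diff, not a call into ONNX Runtime.

    #include <cassert>
    #include <cmath>
    #include <cstdint>

    int64_t ComputeOutSize(int64_t in_size, int64_t stride, int64_t kernel,
                           int64_t pad_head, int64_t pad_tail, int64_t dilation,
                           int ceil_mode) {
        const int64_t numerator = in_size + pad_head + pad_tail - dilation * (kernel - 1) - 1;
        int64_t out_size = numerator / stride + 1;
        if (ceil_mode == 1) {
            out_size = static_cast<int64_t>(std::ceil(static_cast<double>(numerator) / stride)) + 1;
            // The last window must start inside the head-padded input (onnx/onnx#5741).
            if ((out_size - 1) * stride >= in_size + pad_head) {
                --out_size;
            }
        }
        return out_size;
    }

    int main() {
        // in_size=2, kernel=1, stride=2, pad_tail=1: the plain ceil formula gives 2
        // windows, but the second would start at index 2, entirely inside the tail
        // padding, so the clamp removes it.
        assert(ComputeOutSize(2, 2, 1, 0, 1, 1, /*ceil_mode*/ 1) == 1);
        assert(ComputeOutSize(3, 2, 2, 0, 0, 1, /*ceil_mode*/ 1) == 2);  // clamp does not fire
        assert(ComputeOutSize(3, 2, 2, 0, 0, 1, /*ceil_mode*/ 0) == 1);  // floor mode
        return 0;
    }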
-#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/cast_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/cast_op_builder.cc index 9059de817e210..03329b9159c06 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/cast_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/cast_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/clip_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/clip_op_builder.cc index 9821d9267c71f..becd677e32ac1 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/clip_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/clip_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/concat_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/concat_op_builder.cc index a8394faec51be..fa5e292be0ecd 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/concat_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/concat_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/conv_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/conv_op_builder.cc index 5477cd16f9c01..a7a837ae210b4 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/conv_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/conv_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/depthtospace_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/depthtospace_op_builder.cc index ef8709641e2d0..039d8510bb8d2 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/depthtospace_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/depthtospace_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/dequantizelinear_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/dequantizelinear_op_builder.cc index 7d0e04fbd7b0e..ed9062f894660 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/dequantizelinear_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/dequantizelinear_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/elu_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/elu_op_builder.cc index 218c41d6f07c0..fc2348951edb7 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/elu_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/elu_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/flatten_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/flatten_op_builder.cc index b5e9c011990ce..986ce78fb1acc 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/flatten_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/flatten_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/gather_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/gather_op_builder.cc index d6da9181b5a3d..ccd3f8b571fcb 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/gather_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/gather_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/gemm_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/gemm_op_builder.cc index 66eefcd6e4840..cff96c2f1ff99 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/gemm_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/gemm_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/identity_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/identity_op_builder.cc index d7b35572e6cd1..250b190091a52 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/identity_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/identity_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/leakyrelu_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/leakyrelu_op_builder.cc index 6a633c443c9e5..e3dcee1e3d597 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/leakyrelu_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/leakyrelu_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/framework/tensorprotoutils.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/minmax_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/minmax_op_builder.cc index aeadbd17053cf..a80742aef9cb6 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/minmax_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/minmax_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/pad_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/pad_op_builder.cc index b0404ebec0583..8127de0a0f05f 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/pad_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/pad_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/pool_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/pool_op_builder.cc index a2a4786b72ec7..10c5efb84ed8f 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/pool_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/pool_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/quantizelinear_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/quantizelinear_op_builder.cc index d13b81c2a14b8..eb81f5e3f59ee 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/quantizelinear_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/quantizelinear_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/reduction_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/reduction_op_builder.cc index a6da290753b74..fbb353f949f48 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/reduction_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/reduction_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/relu_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/relu_op_builder.cc index c8641093ee7eb..d65c069851c1f 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/relu_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/relu_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/reshape_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/reshape_op_builder.cc index f2f9165d2f3cc..fad5d8289c6b0 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/reshape_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/reshape_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/resize_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/resize_op_builder.cc index 44403010c936c..af5aeba6c8236 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/resize_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/resize_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/slice_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/slice_op_builder.cc index facdc7132dc00..52b075b0271ef 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/slice_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/slice_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/softmax_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/softmax_op_builder.cc index a2a8b4512b028..8fa915de95a72 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/softmax_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/softmax_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/split_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/split_op_builder.cc index edee298ad1ccf..7509fd15f1c5e 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/split_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/split_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include #include "core/common/logging/logging.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/squeeze_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/squeeze_op_builder.cc index fb3ca5e6175fa..44510c33c004c 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/squeeze_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/squeeze_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/transpose_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/transpose_op_builder.cc index 6fe5ca32fe044..4a9e3eb00a787 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/transpose_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/transpose_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/unary_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/unary_op_builder.cc index dbd960ee5536c..77df9d2fd771c 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/unary_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/unary_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/unsqueeze_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/unsqueeze_op_builder.cc index 95cd813800c9a..b9ebbace8d391 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/unsqueeze_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/unsqueeze_op_builder.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include "core/graph/onnx_protobuf.h" #include "core/common/logging/logging.h" #include "core/common/safeint.h" diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h index 4db335afa98b0..3cbf7d1ee40e8 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h @@ -2,7 +2,7 @@ // Licensed under the MIT License. #pragma once -#include +#include "core/graph/onnx_protobuf.h" #include #include "core/common/inlined_containers_fwd.h" diff --git a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc index 79674fd706151..3df231e53e7c0 100644 --- a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc +++ b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc @@ -2,13 +2,15 @@ // Licensed under the MIT License. 
#include "core/providers/qnn/builder/onnx_ctx_model_helper.h" -#include "core/graph/constants.h" -#include "core/providers/qnn/builder/qnn_model.h" #include #include #include +#include "core/providers/qnn/ort_api.h" +#include "core/providers/qnn/builder/qnn_utils.h" +#include "core/providers/qnn/builder/qnn_model.h" + namespace onnxruntime { namespace qnn { @@ -51,9 +53,9 @@ Status GetMainContextNode(const std::vectorOpType(), "Should only filter in the EPContext node."); - NodeAttrHelper node_helper(*ep_context_node); + const Node& ep_context_node = *graph_viewer.Nodes().begin(); + ORT_RETURN_IF_NOT(EPCONTEXT_OP == ep_context_node.OpType(), "Should only filter in the EPContext node."); + NodeAttrHelper node_helper(ep_context_node); int64_t is_main_context = node_helper.Get(MAIN_CONTEXT, static_cast(0)); if (1 == is_main_context) { main_context_pos.push_back(static_cast(i)); @@ -68,17 +70,16 @@ Status CreateNodeArgs(const std::vector& names, const std::unordered_map& tensor_info_table, std::vector& node_args, onnxruntime::Graph& graph) { - using namespace ONNX_NAMESPACE; for (size_t i = 0; i < names.size(); ++i) { std::string name = names[i]; ORT_RETURN_IF(tensor_info_table.find(name) == tensor_info_table.end(), "Tensor name: ", name, " not found in tensor_info_table"); const OnnxTensorInfo& tensor_info = tensor_info_table.at(name); - TypeProto tensor_type; - tensor_type.mutable_tensor_type()->set_elem_type(tensor_info.data_type_); + std::unique_ptr tensor_type = Factory::Create(); + tensor_type->mutable_tensor_type()->set_elem_type(tensor_info.data_type_); for (size_t j = 0; j < tensor_info.shape_.size(); ++j) { - tensor_type.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(tensor_info.shape_[j]); + tensor_type->mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(tensor_info.shape_[j]); } - auto& input_arg = graph.GetOrCreateNodeArg(name, &tensor_type); + auto& input_arg = graph.GetOrCreateNodeArg(name, tensor_type.get()); node_args.push_back(&input_arg); } return Status::OK(); @@ -161,8 +162,8 @@ Status TryGetMaxSpillFillSize(const std::vector(0)); if (max_size > max_spill_fill_size) { max_spill_fill_size = max_size; diff --git a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h index 92c5391b40f09..3dfa0ae21001b 100644 --- a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h +++ b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h @@ -6,12 +6,8 @@ #include #include -#include "qnn_def.h" -#include "core/common/logging/logging.h" -#include "core/graph/graph_viewer.h" -#include "core/providers/shared/utils/utils.h" -#include "core/graph/model.h" -#include "core/framework/execution_provider.h" +#include "core/providers/qnn/builder/qnn_def.h" +#include "core/providers/qnn/ort_api.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/op_builder.h b/onnxruntime/core/providers/qnn/builder/op_builder.h index 05398c3f22ea2..0846275496ebf 100644 --- a/onnxruntime/core/providers/qnn/builder/op_builder.h +++ b/onnxruntime/core/providers/qnn/builder/op_builder.h @@ -3,9 +3,7 @@ #pragma once -#include "core/graph/graph_viewer.h" -#include "core/framework/node_unit.h" -#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/ort_api.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc b/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc index 
e411c2a6bf536..3d66003fb2bca 100644 --- a/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc +++ b/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc @@ -5,8 +5,6 @@ #include #include -#include - #include "op_builder_factory.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/argmax_min_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/argmax_min_op_builder.cc index c685fa065e2ba..e3a6141c292dd 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/argmax_min_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/argmax_min_op_builder.cc @@ -1,14 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" -#include "core/framework/tensorprotoutils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" +#include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "base_op_builder.h" - namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc index ed70111087e19..cd1ee72e00d4f 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc @@ -2,15 +2,9 @@ // Licensed under the MIT License. #include "core/providers/qnn/builder/opbuilder/base_op_builder.h" +#include #include "core/providers/qnn/builder/qnn_utils.h" -#include - -#include "core/providers/shared/utils/utils.h" -#include "core/framework/tensorprotoutils.h" -#include "core/providers/cpu/tensor/transpose.h" -#include "core/common/safeint.h" - namespace onnxruntime { namespace qnn { @@ -271,37 +265,189 @@ Status BaseOpBuilder::SetOutputQParamEqualToInputIfNearlyEqual(QnnModelWrapper& return Status::OK(); } -Status BaseOpBuilder::TransposeInitializer(const QnnModelWrapper& qnn_model_wrapper, - const onnx::TensorProto& initializer, - const std::vector<size_t>& perm, - std::vector<uint8_t>& transposed_data) const { - const DataTypeImpl* tensor_dtype = DataTypeImpl::TensorTypeFromONNXEnum(initializer.data_type())->GetElementType(); - const auto tensor_shape_dims = onnxruntime::utils::GetTensorShapeFromTensorProto(initializer); - TensorShape tensor_shape{tensor_shape_dims}; - AllocatorPtr cpu_allocator = std::make_shared<CPUAllocator>(); - Tensor in_tensor = Tensor(tensor_dtype, tensor_shape, cpu_allocator); - - auto rank = perm.size(); - std::vector<int64_t> new_tensor_shape_dims; - std::vector<size_t> permutations; - new_tensor_shape_dims.reserve(rank); - permutations.reserve(rank); - for (int64_t p : perm) { - permutations.push_back(p); - new_tensor_shape_dims.push_back(tensor_shape_dims[p]); +static Status GetTransposeStrides(const TensorShape& input_shape, + gsl::span<const size_t> perm, + gsl::span<size_t> input_strides, + gsl::span<size_t> output_strides) { + const size_t rank = input_shape.NumDimensions(); + ORT_RETURN_IF_NOT(perm.size() == rank, "Expected perm size of ", rank); + ORT_RETURN_IF_NOT(input_strides.size() == rank, "Expected input_strides size of ", rank); + ORT_RETURN_IF_NOT(output_strides.size() == rank, "Expected output_strides size of ", rank); + std::vector<int64_t> output_shape_dims(rank); + ORT_RETURN_IF_ERROR((qnn::utils::PermuteShape(input_shape.GetDims(), perm, output_shape_dims))); + const TensorShape output_shape =
TensorShape::FromExistingBuffer(output_shape_dims); + + for (size_t i = 0; i < rank; ++i) { + int64_t stride = (i < rank - 1) ? input_shape.SizeFromDimension(i + 1) : 1; + ORT_RETURN_IF_NOT(stride > 0, "Expected positive shape dims when computing strides."); + input_strides[i] = static_cast<size_t>(stride); + } + + for (size_t i = 0; i < rank; ++i) { + int64_t stride = (i < rank - 1) ? output_shape.SizeFromDimension(i + 1) : 1; + ORT_RETURN_IF_NOT(stride > 0, "Expected positive shape dims when computing strides."); + output_strides[i] = static_cast<size_t>(stride); } - TensorShape new_tensor_shape(new_tensor_shape_dims); - Tensor out_tensor = Tensor(tensor_dtype, new_tensor_shape, cpu_allocator); - ORT_RETURN_IF_ERROR(onnxruntime::utils::TensorProtoToTensor( - Env::Default(), qnn_model_wrapper.GetGraphViewer().ModelPath(), initializer, in_tensor)); - ORT_RETURN_IF_ERROR(Transpose::DoTranspose(permutations, in_tensor, out_tensor)); - onnx::TensorProto new_tensor_proto = onnxruntime::utils::TensorToTensorProto(out_tensor, "test"); - ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(new_tensor_proto, transposed_data)); + return Status::OK(); +} + +// Internal function to transpose data of rank 5 with the given permutation. +// Example: transpose input from either (N,C,H,W,D) or (C,N,H,W,D) to (H,W,D,C,N). +static Status TransposeDataRank5(const TensorShape& input_shape, + gsl::span<const size_t> perm, + size_t elem_byte_size, + gsl::span<const uint8_t> input_buffer, + gsl::span<uint8_t> output_buffer) { + std::array<size_t, 5> input_strides = {}; + std::array<size_t, 5> output_strides = {}; + ORT_RETURN_IF_ERROR(GetTransposeStrides(input_shape, perm, input_strides, output_strides)); + + std::vector<size_t> perm_inverse(perm.size()); + ORT_RETURN_IF_ERROR(qnn::utils::InvertPerm(perm, perm_inverse)); + + for (int64_t d0 = 0; d0 < input_shape[0]; ++d0) { + for (int64_t d1 = 0; d1 < input_shape[1]; ++d1) { + for (int64_t d2 = 0; d2 < input_shape[2]; ++d2) { + for (int64_t d3 = 0; d3 < input_shape[3]; ++d3) { + for (int64_t d4 = 0; d4 < input_shape[4]; ++d4) { + const size_t src_elem_index = ((d0 * input_strides[0]) + + (d1 * input_strides[1]) + + (d2 * input_strides[2]) + + (d3 * input_strides[3]) + + (d4 * input_strides[4])); + const size_t dst_elem_index = ((d0 * output_strides[perm_inverse[0]]) + + (d1 * output_strides[perm_inverse[1]]) + + (d2 * output_strides[perm_inverse[2]]) + + (d3 * output_strides[perm_inverse[3]]) + + (d4 * output_strides[perm_inverse[4]])); + + const size_t src_byte_index = src_elem_index * elem_byte_size; + const size_t dst_byte_index = dst_elem_index * elem_byte_size; + assert(src_byte_index < input_buffer.size()); + assert(dst_byte_index < output_buffer.size()); + + std::memcpy(&output_buffer[dst_byte_index], &input_buffer[src_byte_index], elem_byte_size); + } + } + } + } + } return Status::OK(); } +Status BaseOpBuilder::TwoDimensionTranspose(const QnnModelWrapper& qnn_model_wrapper, + std::vector<uint32_t>& data_shape, + const onnx::TensorProto& initializer, + std::vector<uint8_t>& transposed_data) const { + ORT_RETURN_IF_NOT(data_shape.size() == 2, "Expected shape of rank 2"); + + std::array<size_t, 2> perm = {1, 0}; + std::vector<uint32_t> output_shape(data_shape.size()); + ORT_RETURN_IF_ERROR((qnn::utils::PermuteShape(data_shape, perm, output_shape))); + + auto onnx_type = static_cast<ONNX_NAMESPACE::TensorProto_DataType>(initializer.data_type()); + const size_t elem_byte_size = qnn::utils::GetElementSizeByType(onnx_type); + ORT_RETURN_IF_NOT(elem_byte_size != 0, "Can't get element byte size from given ONNX type"); + + std::vector<uint8_t> input_buffer; +
ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(initializer, input_buffer)); + transposed_data.resize(input_buffer.size()); + + for (size_t row = 0; row < data_shape[0]; row++) { + for (size_t col = 0; col < data_shape[1]; col++) { + const size_t src_elem_index = (row * data_shape[1] + col); + const size_t dst_elem_index = (col * output_shape[1] + row); + const size_t src_byte_index = src_elem_index * elem_byte_size; + const size_t dst_byte_index = dst_elem_index * elem_byte_size; + assert(src_byte_index < input_buffer.size()); + assert(dst_byte_index < transposed_data.size()); + + std::memcpy(&transposed_data[dst_byte_index], &input_buffer[src_byte_index], elem_byte_size); + } + } + + data_shape = std::move(output_shape); // Update parameter with final transposed shape + return Status::OK(); +} + +Status BaseOpBuilder::TransposeFromNchwToHwcn(const QnnModelWrapper& qnn_model_wrapper, + const onnx::TensorProto& initializer, + std::vector& transposed_data, + bool is_3d) const { + auto onnx_type = static_cast(initializer.data_type()); + const size_t elem_byte_size = qnn::utils::GetElementSizeByType(onnx_type); + std::vector input_shape = qnn::utils::GetInitializerShape(initializer); + std::vector input_buffer; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(initializer, input_buffer)); + transposed_data.resize(input_buffer.size()); + return TransposeFromNchwToHwcn(std::move(input_shape), elem_byte_size, input_buffer, transposed_data, is_3d); +} + +Status BaseOpBuilder::TransposeFromNchwToHwcn(std::vector&& original_input_shape_dims, + size_t elem_byte_size, + gsl::span input_buffer, + gsl::span output_buffer, + bool is_3d) const { + std::vector input_shape_dims = std::move(original_input_shape_dims); + const size_t rank = input_shape_dims.size(); + ORT_RETURN_IF_NOT((is_3d && rank == 5) || (!is_3d && rank == 4), "Only support input of rank 4 or 5 but got rank ", + rank); + ORT_RETURN_IF_NOT(output_buffer.size() == input_buffer.size(), + "Expected output buffer's size to equal the input buffer's size: ", + output_buffer.size(), " != ", input_buffer.size()); + ORT_RETURN_IF_NOT(elem_byte_size != 0, "Invalid element byte size due to potentially unsupported type"); + + if (!is_3d) { + input_shape_dims.push_back(1); // Make it 3D by making shape (N,C,H,W,1) + } + + return TransposeDataRank5(TensorShape::FromExistingBuffer(input_shape_dims), + nchw2hwcn_perm_3d, + elem_byte_size, + input_buffer, + output_buffer); +} + +Status BaseOpBuilder::TransposeFromCnhwToHwcn(const QnnModelWrapper& qnn_model_wrapper, + const onnx::TensorProto& initializer, + std::vector& transposed_data, + bool is_3d) const { + auto onnx_type = static_cast(initializer.data_type()); + const size_t elem_byte_size = qnn::utils::GetElementSizeByType(onnx_type); + std::vector input_shape = qnn::utils::GetInitializerShape(initializer); + std::vector input_buffer; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(initializer, input_buffer)); + transposed_data.resize(input_buffer.size()); + return TransposeFromCnhwToHwcn(std::move(input_shape), elem_byte_size, input_buffer, transposed_data, is_3d); +} + +Status BaseOpBuilder::TransposeFromCnhwToHwcn(std::vector&& original_input_shape_dims, + size_t elem_byte_size, + gsl::span input_buffer, + gsl::span output_buffer, + bool is_3d) const { + std::vector input_shape_dims = std::move(original_input_shape_dims); + const size_t rank = input_shape_dims.size(); + ORT_RETURN_IF_NOT((is_3d && rank == 5) || (!is_3d && rank == 4), "Only support input of rank 
4 or 5 but got rank ", + rank); + ORT_RETURN_IF_NOT(output_buffer.size() == input_buffer.size(), + "Expected output buffer's size to equal the input buffer's size: ", + output_buffer.size(), " != ", input_buffer.size()); + ORT_RETURN_IF_NOT(elem_byte_size != 0, "Invalid element byte size due to potentially unsupported type"); + + if (!is_3d) { + input_shape_dims.push_back(1); // Make it 3D by making shape (C,N,H,W,1) + } + + return TransposeDataRank5(TensorShape::FromExistingBuffer(input_shape_dims), + cnhw2hwcn_perm_3d, + elem_byte_size, + input_buffer, + output_buffer); +} + Status BaseOpBuilder::ProcessAxisAttribute(const QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, Qnn_Scalar_t& axis_qnn_scalar, diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h index 055c0f6ccf2fa..8e34b5d87cc68 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h @@ -3,11 +3,11 @@ #pragma once -#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/ort_api.h" +#include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder.h" #include "core/providers/qnn/builder/qnn_quant_params_wrapper.h" -#include "core/framework/allocator.h" #include "QnnOpDef.h" @@ -215,7 +215,8 @@ class BaseOpBuilder : public IOpBuilder { } // NCHW shape to channel last - Status NchwShapeToNhwc(const std::vector& nchw_shape, std::vector& nhwc_shape) const { + template + Status NchwShapeToNhwc(gsl::span nchw_shape, gsl::span nhwc_shape) const { ORT_RETURN_IF_NOT(nchw_shape.size() == 4, "shape should have 4 dimension NCHW."); nhwc_shape[0] = nchw_shape[0]; nhwc_shape[1] = nchw_shape[2]; @@ -226,7 +227,8 @@ class BaseOpBuilder : public IOpBuilder { } // NCHW shape to HWCN shape, required for Conv weight - Status NchwShapeToHwcn(const std::vector& nchw_shape, std::vector& hwcn_shape) const { + template + Status NchwShapeToHwcn(gsl::span nchw_shape, gsl::span hwcn_shape) const { if (nchw_shape.size() == 4) { hwcn_shape[0] = nchw_shape[2]; hwcn_shape[1] = nchw_shape[3]; @@ -246,7 +248,8 @@ class BaseOpBuilder : public IOpBuilder { } // CNHW shape to HWCN shape, required for Conv weight - Status CnhwShapeToHwcn(const std::vector& cnhw_shape, std::vector& hwcn_shape) const { + template + Status CnhwShapeToHwcn(gsl::span cnhw_shape, gsl::span hwcn_shape) const { if (cnhw_shape.size() == 4) { hwcn_shape[0] = cnhw_shape[2]; hwcn_shape[1] = cnhw_shape[3]; @@ -264,37 +267,31 @@ class BaseOpBuilder : public IOpBuilder { return Status::OK(); } - Status TransposeInitializer(const QnnModelWrapper& qnn_model_wrapper, - const onnx::TensorProto& initializer, - const std::vector& perm, - std::vector& transposed_data) const; Status TransposeFromNchwToHwcn(const QnnModelWrapper& qnn_model_wrapper, const onnx::TensorProto& initializer, std::vector& transposed_data, - bool is_3d = false) const { - auto& perm = is_3d ? 
nchw2hwcn_perm_3d : nchw2hwcn_perm; - return TransposeInitializer(qnn_model_wrapper, initializer, perm, transposed_data); - } + bool is_3d = false) const; + Status TransposeFromNchwToHwcn(std::vector&& input_shape_dims, + size_t elem_byte_size, + gsl::span input_buffer, + gsl::span output_buffer, + bool is_3d = false) const; Status TransposeFromCnhwToHwcn(const QnnModelWrapper& qnn_model_wrapper, const onnx::TensorProto& initializer, std::vector& transposed_data, - bool is_3d = false) const { - auto& perm = is_3d ? cnhw2hwcn_perm_3d : cnhw2hwcn_perm; - return TransposeInitializer(qnn_model_wrapper, initializer, perm, transposed_data); - } + bool is_3d = false) const; + Status TransposeFromCnhwToHwcn(std::vector&& input_shape_dims, + size_t elem_byte_size, + gsl::span input_buffer, + gsl::span output_buffer, + bool is_3d = false) const; Status TwoDimensionTranspose(const QnnModelWrapper& qnn_model_wrapper, std::vector& data_shape, const onnx::TensorProto& initializer, - std::vector& transposed_data) const { - auto tmp = data_shape[0]; - data_shape[0] = data_shape[1]; - data_shape[1] = tmp; - std::vector two_dim_trans_perm{1, 0}; - return TransposeInitializer(qnn_model_wrapper, initializer, two_dim_trans_perm, transposed_data); - } + std::vector& transposed_data) const; // Onnx Pads is [x1_begin, x2_begin, x1_end, x2_end], QNN requires [x1_begin, x1_end, x2_begin, x2_end] void ReArranagePads(std::vector& pads) const { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/batch_norm_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/batch_norm_op_builder.cc index 07abcf1c7bf84..14f50fa78c1a9 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/batch_norm_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/batch_norm_op_builder.cc @@ -5,16 +5,11 @@ #include #include -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" -#include "core/framework/float16.h" -#include "core/framework/tensorprotoutils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "base_op_builder.h" - namespace onnxruntime { namespace qnn { class BatchNormOpBuilder : public BaseOpBuilder { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/cast_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/cast_op_builder.cc index d3bdee02437e4..3139c05378171 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/cast_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/cast_op_builder.cc @@ -4,12 +4,11 @@ #include #include +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_utils.h" -#include "base_op_builder.h" - namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc index e5dc4d04afefd..23b3dfb063ba2 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc @@ -4,14 +4,11 @@ #include #include -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" +#include 
"core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_utils.h" -#include "base_op_builder.h" - namespace onnxruntime { namespace qnn { class ClipOpBuilder : public BaseOpBuilder { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc index 12887f0fb72d6..0f92778252d48 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc @@ -1,16 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" -#include "core/framework/tensorprotoutils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" #include "core/providers/qnn/builder/qnn_utils.h" -#include "base_op_builder.h" - namespace onnxruntime { namespace qnn { @@ -211,9 +206,9 @@ Status ConvOpBuilder::ProcessConv2D3DInputs(QnnModelWrapper& qnn_model_wrapper, // Change shape to HWCN, it could be initializer or normal input if (conv_type == OnnxConvType::kConv) { - ORT_RETURN_IF_ERROR(NchwShapeToHwcn(input_info.shape, actual_shape)); + ORT_RETURN_IF_ERROR(NchwShapeToHwcn(input_info.shape, actual_shape)); } else if (conv_type == OnnxConvType::kConvTranspose) { - ORT_RETURN_IF_ERROR(CnhwShapeToHwcn(input_info.shape, actual_shape)); + ORT_RETURN_IF_ERROR(CnhwShapeToHwcn(input_info.shape, actual_shape)); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN EP: Unexpected convolution op type: ", node_unit.OpType().c_str()); } @@ -413,9 +408,9 @@ Status ConvOpBuilder::ProcessConv1DInputs(QnnModelWrapper& qnn_model_wrapper, // Create the final shape after the weights are transposed to HWCN. if (conv_type == OnnxConvType::kConv) { - ORT_RETURN_IF_ERROR(NchwShapeToHwcn(shape_2d, final_shape)); + ORT_RETURN_IF_ERROR(NchwShapeToHwcn(shape_2d, final_shape)); } else if (conv_type == OnnxConvType::kConvTranspose) { - ORT_RETURN_IF_ERROR(CnhwShapeToHwcn(shape_2d, final_shape)); + ORT_RETURN_IF_ERROR(CnhwShapeToHwcn(shape_2d, final_shape)); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN EP: Unexpected convolution op type: ", node_unit.OpType().c_str()); } @@ -434,16 +429,6 @@ Status ConvOpBuilder::ProcessConv1DInputs(QnnModelWrapper& qnn_model_wrapper, return static_cast(dim); }); - const TensorShape tensor_shape = TensorShape::FromExistingBuffer(shape_2d_int64); // Does not own shape data. - const DataTypeImpl* tensor_dtype = DataTypeImpl::TensorTypeFromONNXEnum( - input_info.initializer_tensor->data_type()) - ->GetElementType(); - ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_info.initializer_tensor, unpacked_tensor)); - - Tensor tensor_2d(tensor_dtype, tensor_shape, unpacked_tensor.data(), OrtMemoryInfo{}); // Does not own data. - ONNX_NAMESPACE::TensorProto reshaped_initializer = onnxruntime::utils::TensorToTensorProto(tensor_2d, - reshape_output); - // The reshape (unsqueeze) may require us to shift the quant parameter's axis. 
if (input_info.quant_param.IsPerChannel()) { ORT_RETURN_IF_ERROR(input_info.quant_param.HandleUnsqueeze(input_info.shape, shape_2d)); @@ -452,10 +437,21 @@ Status ConvOpBuilder::ProcessConv1DInputs(QnnModelWrapper& qnn_model_wrapper, // // Get transposed initializer bytes. // + std::vector original_tensor_bytes; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_info.initializer_tensor, + original_tensor_bytes)); + unpacked_tensor.resize(original_tensor_bytes.size()); + const size_t elem_byte_size = qnn::utils::GetElementSizeByType( + static_cast(input_info.initializer_tensor->data_type())); + ORT_RETURN_IF(elem_byte_size == 0, "Can't get element byte size from given ONNX type for initializer ", + input1_name.c_str()); + if (conv_type == OnnxConvType::kConv) { - ORT_RETURN_IF_ERROR(TransposeFromNchwToHwcn(qnn_model_wrapper, reshaped_initializer, unpacked_tensor)); + ORT_RETURN_IF_ERROR(TransposeFromNchwToHwcn(std::move(shape_2d_int64), elem_byte_size, original_tensor_bytes, + unpacked_tensor, /*is_3d*/ false)); } else if (conv_type == OnnxConvType::kConvTranspose) { - ORT_RETURN_IF_ERROR(TransposeFromCnhwToHwcn(qnn_model_wrapper, reshaped_initializer, unpacked_tensor)); + ORT_RETURN_IF_ERROR(TransposeFromCnhwToHwcn(std::move(shape_2d_int64), elem_byte_size, original_tensor_bytes, + unpacked_tensor, /*is_3d*/ false)); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN EP: Unexpected convolution op type: ", node_unit.OpType().c_str()); } diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc index 64f676aaa9875..2bae3452199a5 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc @@ -1,14 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_utils.h" -#include "core/common/safeint.h" - -#include "base_op_builder.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/gather_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/gather_op_builder.cc index 5549716751d4b..d25ec3f333bf1 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/gather_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/gather_op_builder.cc @@ -2,14 +2,10 @@ // Licensed under the MIT License. 
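The conv builder change above removes the Tensor/TensorProto round trip when transposing weights: the initializer is unpacked to raw bytes once, and the new base-op helpers copy each element directly to its permuted offset using the element byte size. A minimal sketch of that byte-level approach for the rank-2 case follows (standalone; Transpose2D is an illustrative name, not an ONNX Runtime API); the real helpers generalize this to rank 5 with precomputed strides.

    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Transpose a rows x cols row-major buffer of opaque elements into a
    // cols x rows row-major buffer, elem_byte_size bytes per element.
    void Transpose2D(const std::vector<uint8_t>& src, std::vector<uint8_t>& dst,
                     size_t rows, size_t cols, size_t elem_byte_size) {
        dst.resize(src.size());
        for (size_t r = 0; r < rows; ++r) {
            for (size_t c = 0; c < cols; ++c) {
                const size_t src_byte = (r * cols + c) * elem_byte_size;  // (r, c) in input
                const size_t dst_byte = (c * rows + r) * elem_byte_size;  // (c, r) in output
                std::memcpy(&dst[dst_byte], &src[src_byte], elem_byte_size);
            }
        }
    }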
#include -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_utils.h" -#include "core/common/safeint.h" - -#include "base_op_builder.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/gemm_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/gemm_op_builder.cc index eeee26c177281..76bc766d2b04d 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/gemm_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/gemm_op_builder.cc @@ -1,14 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_utils.h" -#include "core/common/safeint.h" - -#include "base_op_builder.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/instance_norm_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/instance_norm_op_builder.cc index 4b8d079c0062a..d77d9534bf1c4 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/instance_norm_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/instance_norm_op_builder.cc @@ -1,16 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" -#include "core/framework/tensorprotoutils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" -#include "onnx/defs/data_type_utils.h" - -#include "base_op_builder.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc index d1a0e88686f39..fc92f42b376bc 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc @@ -2,16 +2,10 @@ // Licensed under the MIT License. 
#include -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" -#include "core/framework/tensorprotoutils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" -#include "onnx/defs/data_type_utils.h" - -#include "base_op_builder.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/lrn_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/lrn_op_builder.cc index 2f66069b6609e..3c9bdf0e7f8aa 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/lrn_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/lrn_op_builder.cc @@ -2,11 +2,9 @@ // Licensed under the MIT License. #include "core/providers/qnn/builder/opbuilder/base_op_builder.h" -#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" -#include "onnx/defs/data_type_utils.h" #include "QnnOpDef.h" // From QNN SDK: contains QNN constants (e.g., op names, param values). diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc index 850fd2875818e..5a158af8d542a 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc @@ -1,13 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/common/safeint.h" -#include "core/providers/common.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/qnn_utils.h" -#include "core/providers/shared/utils/utils.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc index 5fc6d42a8a179..40e0ccdd4a6dd 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc @@ -1,15 +1,9 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/providers/cpu/tensor/slice_helper.h" -#include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" - -#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/pool_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/pool_op_builder.cc index ef1990ad8e69a..795886fa255ed 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/pool_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/pool_op_builder.cc @@ -1,16 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" -#include "core/framework/tensorprotoutils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" -#include "onnx/defs/data_type_utils.h" - -#include "base_op_builder.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc index 77bc58bd6f833..a98110bc96fb2 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc @@ -2,16 +2,13 @@ // Licensed under the MIT License. #include -#include #include +#include +#include #include -#include "core/common/safeint.h" -#include "onnx/defs/data_type_utils.h" -#include "core/providers/common.h" -#include "core/framework/endian_utils.h" -#include "core/providers/shared/utils/utils.h" #include "core/providers/qnn/builder/opbuilder/base_op_builder.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/qnn_utils.h" @@ -71,7 +68,7 @@ class ReduceOpBuilder : public BaseOpBuilder { using AxesQnnIntType = uint32_t; Status GetAxesSet(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, - InlinedHashSet& axes_set) const; + std::set& axes_set) const; // Maps an operator type to the opset in which "axes" became an input instead of an attribute. 
static const std::array opset_with_axes_as_input; @@ -87,7 +84,7 @@ const std::array ReduceOpBuilder::opset_with_axes_as_ }; Status ReduceOpBuilder::GetAxesSet(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, - InlinedHashSet<AxesOnnxIntType>& axes_set) const { + std::set<AxesOnnxIntType>& axes_set) const { ReduceOpType reduce_op_type = GetReduceOpType(node_unit.OpType()); if (reduce_op_type == ReduceOpType::REDUCE_OP_TYPE_UNKNOWN) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN EP: Unknown reduce operator ", node_unit.OpType()); @@ -146,10 +143,7 @@ Status ReduceOpBuilder::GetAxesSet(QnnModelWrapper& qnn_model_wrapper, const Nod auto src_span = gsl::make_span(axes_bytes.data(), axes_bytes.size()); auto dst_span = gsl::make_span(reduce_axes.data(), reduce_axes.size()); - // Copy initializer bytes (stored in little-endian order) to vector of int64_t. - // ReadLittleEndian returns a status error if the source and destination spans do not have - // matching byte sizes. - ORT_RETURN_IF_ERROR(onnxruntime::utils::ReadLittleEndian(src_span, dst_span)); + std::memcpy(dst_span.data(), src_span.data(), src_span.size_bytes()); } } @@ -218,7 +212,7 @@ Status ReduceOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_w // // Handle axes param. // - InlinedHashSet<AxesOnnxIntType> axes_set; + std::set<AxesOnnxIntType> axes_set; ORT_RETURN_IF_ERROR(GetAxesSet(qnn_model_wrapper, node_unit, axes_set)); const size_t num_axes = axes_set.size(); diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/reshape_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/reshape_op_builder.cc index b6f414da950d8..6fd67a72b64e1 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/reshape_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/reshape_op_builder.cc @@ -1,15 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License.
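In the reduce hunk above, ReadLittleEndian is replaced by a plain std::memcpy. That is sound only because ONNX stores initializer bytes in little-endian order and the platforms QNN EP targets are little-endian, so no byte swapping is needed. A minimal sketch of the decode under that assumption (hypothetical helper name):

#include <cstdint>
#include <cstring>
#include <vector>

// Reinterprets raw little-endian initializer bytes as int64 axes values.
// Assumes bytes.size() is a multiple of sizeof(int64_t) and a little-endian host.
static std::vector<int64_t> DecodeAxes(const std::vector<uint8_t>& bytes) {
  std::vector<int64_t> axes(bytes.size() / sizeof(int64_t));
  std::memcpy(axes.data(), bytes.data(), axes.size() * sizeof(int64_t));
  return axes;
}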
-#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" -#include "core/framework/tensorprotoutils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "base_op_builder.h" - namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc index c62fca88b6ec2..5e173b7aff030 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc @@ -5,17 +5,10 @@ #include #include -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" -#include "core/framework/tensorprotoutils.h" -#include "core/providers/qnn/builder/qnn_model_wrapper.h" -#include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/providers/cpu/tensor/slice_helper.h" -#include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" - #include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" +#include "core/providers/qnn/builder/qnn_model_wrapper.h" +#include "core/providers/qnn/builder/op_builder_factory.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc index cf8726675434f..48c637cd2e951 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc @@ -1,16 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" -#include "core/framework/tensorprotoutils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_utils.h" -#include "core/common/safeint.h" -#include "core/util/qmath.h" - -#include "base_op_builder.h" namespace onnxruntime { namespace qnn { @@ -170,15 +164,16 @@ Status ProcessAlphaAttributeAsInput(QnnModelWrapper& qnn_model_wrapper, // Check LeakyRelu input 0 to see if it's quantized tensor bool is_quantized_tensor = node_unit.Outputs()[0].quant_param.has_value(); if (is_quantized_tensor) { - float scale; - uint8_t zero_point; - int64_t num_of_elements = 1; - concurrency::ThreadPool* thread_pool = nullptr; - GetQuantizationParameter(&tensor_data.alpha, num_of_elements, scale, zero_point, thread_pool); - unpacked_data.resize(1); - ParQuantizeLinearStd(&tensor_data.alpha, unpacked_data.data(), num_of_elements, scale, zero_point, thread_pool); - quantize_param = QnnQuantParamsWrapper(scale, static_cast(zero_point)); qnn_data_type = QNN_DATATYPE_UFIXED_POINT_8; + std::array scales = {1.0f}; + std::array offsets = {0}; + std::array shape = {1}; + auto float_data = gsl::make_span(&tensor_data.alpha, 1); + ORT_RETURN_IF_ERROR(qnn::utils::GetDataQuantParams(float_data, shape, scales, offsets, qnn_data_type)); + + unpacked_data.resize(1); + ORT_RETURN_IF_ERROR(qnn::utils::QuantizeData(float_data, shape, scales, offsets, unpacked_data, qnn_data_type)); + quantize_param = QnnQuantParamsWrapper(scales[0], static_cast(offsets[0])); } else { const auto& inputs = node_unit.Inputs(); TensorInfo input_info = {}; diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc index b033c8723ea86..fcc7d27c3ada4 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc @@ -1,17 +1,12 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/cpu/tensor/slice_helper.h" -#include "core/framework/tensorprotoutils.h" - -#include "base_op_builder.h" - namespace onnxruntime { namespace qnn { @@ -86,26 +81,22 @@ static Status GetInitializerInputData(const NodeUnitIODef& input, const QnnModel ORT_RETURN_IF_NOT(initializer_proto->has_data_type(), "Expected initializer ", input_name.c_str(), " to have a proto data type."); - // Create empty Tensor. - const auto* dtype = DataTypeImpl::TensorTypeFromONNXEnum(initializer_proto->data_type())->GetElementType(); - TensorShape shape = onnxruntime::utils::GetTensorShapeFromTensorProto(*initializer_proto); - Tensor tensor(dtype, shape, std::make_shared()); - - // Deserialize initializer into Tensor. 
- ORT_RETURN_IF_ERROR(onnxruntime::utils::TensorProtoToTensor( - onnxruntime::Env::Default(), qnn_model_wrapper.GetGraphViewer().ModelPath(), *initializer_proto, tensor)); + // Deserialize initializer into byte buffer + std::vector<uint8_t> initializer_bytes; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*initializer_proto, initializer_bytes)); Status status; // Copy Tensor of int32_t or int64_t elems into output (int64_ts). - if (tensor.IsDataType<int64_t>()) { - gsl::span<const int64_t> tensor_elems = tensor.DataAsSpan<int64_t>(); + auto onnx_type = static_cast<ONNX_NAMESPACE::TensorProto_DataType>(initializer_proto->data_type()); + if (onnx_type == ONNX_NAMESPACE::TensorProto_DataType_INT64) { + gsl::span<const int64_t> tensor_elems = ReinterpretAsSpan<const int64_t>(initializer_bytes); output.insert(output.end(), tensor_elems.begin(), tensor_elems.end()); - } else if (tensor.IsDataType<int32_t>()) { - gsl::span<const int32_t> tensor_elems = tensor.DataAsSpan<int32_t>(); + } else if (onnx_type == ONNX_NAMESPACE::TensorProto_DataType_INT32) { + gsl::span<const int32_t> tensor_elems = ReinterpretAsSpan<const int32_t>(initializer_bytes); output.insert(output.end(), tensor_elems.begin(), tensor_elems.end()); } else { - status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Data type ", DataTypeImpl::ToString(dtype), + status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Data type ", onnx_type, " is not supported for Slice initializer input ", input.node_arg.Name().c_str()); } diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/softmax_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/softmax_op_builder.cc index b62534bacf426..7326523737383 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/softmax_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/softmax_op_builder.cc @@ -1,15 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" -#include "core/framework/tensorprotoutils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" - -#include "base_op_builder.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/split_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/split_op_builder.cc index ba5ad2cf03cef..1db9a8f1e3e15 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/split_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/split_op_builder.cc @@ -1,16 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License.
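The slice hunk above views the unpacked initializer bytes in place instead of deserializing them into a Tensor. A sketch of the zero-copy reinterpretation it relies on (hypothetical helper; the real ReinterpretAsSpan is an ORT utility):

#include <cstdint>
#include <vector>
#include <gsl/gsl>

// Views a byte buffer as int64 elements without copying. Valid only when the
// buffer size is a multiple of sizeof(int64_t) and the data is suitably aligned.
static gsl::span<const int64_t> ViewAsInt64(const std::vector<uint8_t>& bytes) {
  return {reinterpret_cast<const int64_t*>(bytes.data()), bytes.size() / sizeof(int64_t)};
}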
-#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/cpu/tensor/slice_helper.h" -#include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" - -#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/tile_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/tile_op_builder.cc index 851ca84dce075..1d518c3ed5359 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/tile_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/tile_op_builder.cc @@ -1,16 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/cpu/tensor/slice_helper.h" -#include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" - -#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/topk.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/topk.cc index d22c0811682d0..adaa13912ae50 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/topk.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/topk.cc @@ -1,8 +1,8 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" -#include "core/framework/utils.h" #include "core/providers/qnn/builder/op_builder_factory.h" +#include "core/providers/qnn/builder/qnn_utils.h" namespace onnxruntime { namespace qnn { const int TOPK_MIN_INPUT = 2; diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/transpose_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/transpose_op_builder.cc index a42d7312f0203..bcd8a6d0f78f6 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/transpose_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/transpose_op_builder.cc @@ -4,12 +4,11 @@ #include #include +#include "core/providers/qnn/ort_api.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_utils.h" -#include "core/common/safeint.h" - -#include "base_op_builder.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc index 077e89a6c701c..8df4e5bb3ba39 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc @@ -17,18 +17,14 @@ #include "HTP/QnnHtpSystemContext.h" #include "Saver/QnnSaver.h" #include -#include "core/framework/endian_utils.h" -#include "core/common/logging/capture.h" + +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/qnn_allocator.h" +#include "core/providers/qnn/qnn_telemetry.h" #include "core/providers/qnn/builder/onnx_ctx_model_helper.h" #include "core/providers/qnn/builder/qnn_configs_helper.h" #include "core/providers/qnn/builder/qnn_utils.h" -#ifdef _WIN32 -#include -#include "core/platform/tracing.h" -#endif - // Flag to determine if Backend should do node validation for each opNode added #define DO_GRAPH_NODE_VALIDATIONS 1 @@ -262,12 +258,12 @@ void QnnLogging(const char* format, const auto data_type = ::onnxruntime::logging::DataType::SYSTEM; if (logger.OutputIsEnabled(severity, data_type)) { - ::onnxruntime::logging::Capture(logger, - severity, - ::onnxruntime::logging::Category::onnxruntime, - data_type, - ORT_WHERE) - .ProcessPrintf(format, argument_parameter); + auto log_capture = Factory::Create(logger, + severity, + logging::Category::onnxruntime, + data_type, + ORT_WHERE); + log_capture->ProcessPrintf(format, argument_parameter); } } @@ -408,25 +404,25 @@ Status QnnBackendManager::CreateDevice() { // Set SoC Model. The *enum* Qnn_SocModel_t is deprecated and will not be updated in the future. Therefore, // must use the latest SDK documentation to get the SoC model of the latest HW. 
if (soc_model_ != QNN_SOC_MODEL_UNKNOWN) { - QnnHtpDevice_CustomConfig_t& custom_config = device_configs_builder.PushCustomConfig(); - custom_config.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; - custom_config.socModel = soc_model_; + gsl::not_null<QnnHtpDevice_CustomConfig_t*> custom_config = device_configs_builder.PushCustomConfig(); + custom_config->option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; + custom_config->socModel = soc_model_; - QnnDevice_Config_t& device_config = device_configs_builder.PushConfig(); - device_config.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; - device_config.customConfig = &custom_config; + gsl::not_null<QnnDevice_Config_t*> device_config = device_configs_builder.PushConfig(); + device_config->option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + device_config->customConfig = custom_config; } // Set the minimum HTP architecture. The driver will use ops that are compatible with this minimum architecture. if (htp_arch_ != QNN_HTP_DEVICE_ARCH_NONE) { - QnnHtpDevice_CustomConfig_t& custom_config = device_configs_builder.PushCustomConfig(); - custom_config.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; - custom_config.arch.arch = htp_arch_; - custom_config.arch.deviceId = device_id_; - - QnnDevice_Config_t& device_config = device_configs_builder.PushConfig(); - device_config.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; - device_config.customConfig = &custom_config; + gsl::not_null<QnnHtpDevice_CustomConfig_t*> custom_config = device_configs_builder.PushCustomConfig(); + custom_config->option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; + custom_config->arch.arch = htp_arch_; + custom_config->arch.deviceId = device_id_; + + gsl::not_null<QnnDevice_Config_t*> device_config = device_configs_builder.PushConfig(); + device_config->option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + device_config->customConfig = custom_config; } } @@ -1163,15 +1159,16 @@ Status QnnBackendManager::ExtractBackendProfilingInfo() { } bool tracelogging_provider_ep_enabled = false; - const Env& env = Env::Default(); - auto& provider = env.GetTelemetryProvider(); - auto level = provider.Level(); +#ifdef _WIN32 + auto& provider = QnnTelemetry::Instance(); if (provider.IsEnabled()) { + auto level = provider.Level(); auto keyword = provider.Keyword(); if ((keyword & static_cast<uint64_t>(onnxruntime::logging::ORTTraceLoggingKeyword::Profiling)) != 0 && level >= 5) { tracelogging_provider_ep_enabled = true; } } +#endif // defined(_WIN32) // ETW disabled previously, but enabled now if (ProfilingLevel::INVALID == profiling_level_etw_ && tracelogging_provider_ep_enabled) { @@ -1389,18 +1386,8 @@ void QnnBackendManager::LogQnnProfileEventAsTraceLogging( const std::string& timingSource, const std::string& eventLevel, const char* eventIdentifier) { - TraceLoggingWrite( - telemetry_provider_handle, - "QNNProfilingEvent", - TraceLoggingKeyword(static_cast<uint64_t>(onnxruntime::logging::ORTTraceLoggingKeyword::Profiling)), - TraceLoggingLevel(WINEVENT_LEVEL_VERBOSE), - TraceLoggingValue(timestamp, "Timestamp"), - TraceLoggingString(message.c_str(), "Message"), - TraceLoggingString(qnnScalarValue.c_str(), "Value"), - TraceLoggingString(unit.c_str(), "Unit of Measurement"), - TraceLoggingString(timingSource.c_str(), "Timing Source"), - TraceLoggingString(eventLevel.c_str(), "Event Level"), - TraceLoggingString(eventIdentifier, "Event Identifier")); + QnnTelemetry& qnn_telemetry = QnnTelemetry::Instance(); + qnn_telemetry.LogQnnProfileEvent(timestamp, message, qnnScalarValue, unit, timingSource, eventLevel, eventIdentifier); } #endif @@ -1552,7 +1539,8 @@ void* QnnBackendManager::LoadLib(const char* file_name, int flags, std::string& auto file_path =
std::filesystem::path(file_name); if (!file_path.is_absolute()) { // construct an absolute path from ORT runtime path + file_name and check whether it exists. - auto pathstring = Env::Default().GetRuntimePath() + ToPathString(file_name); + const Env& env = GetDefaultEnv(); + auto pathstring = env.GetRuntimePath() + ToPathString(file_name); auto absolute_path = pathstring.c_str(); if (std::filesystem::exists(std::filesystem::path(absolute_path))) { // load library from absolute path and search for dependencies there. diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h index 685e03f17cdd3..4a69859a7e841 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h @@ -22,9 +22,8 @@ #include "QnnLog.h" #include "QnnTypes.h" #include "System/QnnSystemInterface.h" -#include "core/common/status.h" -#include "core/common/logging/logging.h" -#include "core/common/path_string.h" + +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_context_mem_handle_manager.h" #include "core/providers/qnn/builder/qnn_def.h" diff --git a/onnxruntime/core/providers/qnn/builder/qnn_configs_helper.h b/onnxruntime/core/providers/qnn/builder/qnn_configs_helper.h index 9dd9bbaa08d64..b581cd90537d9 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_configs_helper.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_configs_helper.h @@ -3,7 +3,8 @@ #pragma once -#include +#include +#include namespace onnxruntime { namespace qnn { @@ -49,9 +50,9 @@ class QnnConfigsBuilder { * * \return A reference to a default CustomConfigType object. */ - CustomConfigType& PushCustomConfig() { - custom_configs_.push_back(custom_config_init_); - return custom_configs_.back(); + gsl::not_null<CustomConfigType*> PushCustomConfig() { + custom_configs_.push_back(std::make_unique<CustomConfigType>(custom_config_init_)); + return custom_configs_.back().get(); } /** @@ -60,15 +61,15 @@ class QnnConfigsBuilder { * * \return A reference to a default BaseConfigType object. */ - BaseConfigType& PushConfig() { - configs_.push_back(base_config_init_); - BaseConfigType& config = configs_.back(); + gsl::not_null<BaseConfigType*> PushConfig() { + configs_.push_back(std::make_unique<BaseConfigType>(base_config_init_)); + BaseConfigType* config = configs_.back().get(); // Add pointer to this new config to the list of config pointers. if (IsNullTerminated()) { - config_ptrs_.back() = &config; // Replace last nullptr entry. + config_ptrs_.back() = config; // Replace last nullptr entry. } else { - config_ptrs_.push_back(&config); + config_ptrs_.push_back(config); } return config; @@ -81,9 +82,14 @@ BaseConfigType base_config_init_; CustomConfigType custom_config_init_; - InlinedVector<CustomConfigType> custom_configs_; - InlinedVector<BaseConfigType> configs_; - InlinedVector<BaseConfigType*> config_ptrs_; + + // Store elements as unique_ptrs instead of by value because std::vector reallocation would change the + // location of elements in memory. BaseConfigType objects may contain pointers to CustomConfigType objects, + // so we need to make sure that pointers to these objects are stable in memory.
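// Illustrative sketch of the hazard being avoided (not code from this file): with
// by-value storage, growth can reallocate and move elements, so a previously
// handed-out pointer dangles.
//
//   std::vector<QnnDevice_Config_t> by_value;
//   by_value.push_back(config_a);
//   QnnDevice_Config_t* p = &by_value.back();
//   by_value.push_back(config_b);  // may reallocate; p now dangles
//
// Heap-allocating each element behind a unique_ptr keeps every element's address
// stable no matter how the owning vector grows.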
+ std::vector<std::unique_ptr<CustomConfigType>> custom_configs_; + std::vector<std::unique_ptr<BaseConfigType>> configs_; + + std::vector<BaseConfigType*> config_ptrs_; }; } // namespace qnn diff --git a/onnxruntime/core/providers/qnn/builder/qnn_context_mem_handle_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_context_mem_handle_manager.cc index 22bbc2d48e8e4..4d868c6ab96f6 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_context_mem_handle_manager.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_context_mem_handle_manager.cc @@ -5,7 +5,7 @@ #include "HTP/QnnHtpMem.h" -#include "core/common/common.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_def.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/qnn_allocator.h" diff --git a/onnxruntime/core/providers/qnn/builder/qnn_context_mem_handle_manager.h b/onnxruntime/core/providers/qnn/builder/qnn_context_mem_handle_manager.h index 397ea8bad6d9a..0dd8a8466d1cf 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_context_mem_handle_manager.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_context_mem_handle_manager.h @@ -9,10 +9,7 @@ #include "QnnInterface.h" -#include "core/common/common.h" -#include "core/common/inlined_containers.h" -#include "core/common/logging/logging.h" -#include "core/common/status.h" +#include "core/providers/qnn/ort_api.h" namespace onnxruntime::qnn { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_def.h b/onnxruntime/core/providers/qnn/builder/qnn_def.h index f0619eb218245..148fa115d40e5 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_def.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_def.h @@ -9,8 +9,7 @@ #include #include #include -#include "core/graph/basic_types.h" -#include "core/common/common.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_quant_params_wrapper.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model.cc b/onnxruntime/core/providers/qnn/builder/qnn_model.cc index 5f8b7f35eea8b..a9ccb9cc15206 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_model.cc @@ -7,15 +7,12 @@ #include #include "QnnOpDef.h" -#include "core/framework/utils.h" -#include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h" -#include "core/optimizer/qdq_transformer/selectors_actions/shared/utils.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_node_group.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/qnn_allocator.h" #include "core/providers/qnn/shared_context.h" -#include "core/providers/shared/utils/utils.h" namespace onnxruntime { namespace qnn { @@ -104,7 +101,7 @@ Status QnnModel::ComposeGraph(const GraphViewer& graph_viewer, // valid throughout the lifetime of the ModelBuilder std::vector<std::unique_ptr<NodeUnit>> node_unit_holder; std::unordered_map<const Node*, const NodeUnit*> node_unit_map; - std::tie(node_unit_holder, node_unit_map) = QDQ::GetAllNodeUnits(graph_viewer, logger); + std::tie(node_unit_holder, node_unit_map) = GetQDQNodeUnits(graph_viewer, logger); // This name must be same with the EPContext node name const auto& graph_name = fused_node.Name(); diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model.h b/onnxruntime/core/providers/qnn/builder/qnn_model.h index 2f220e708c50e..3a2a080aa391f 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_model.h @@ -6,14 +6,11 @@ #include #include
-#include "core/common/status.h" -#include "core/framework/node_unit.h" -#include "core/graph/graph_viewer.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_def.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/qnn_backend_manager.h" #include "core/providers/qnn/rpcmem_library.h" -#include "core/session/onnxruntime_cxx_api.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc index 79f8f176a2e76..6bd12959afbdf 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc @@ -1,6 +1,8 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include "core/providers/qnn/builder/qnn_model_wrapper.h" + #include #include #include @@ -8,10 +10,7 @@ #include #include -#include "qnn_model_wrapper.h" -#include "core/common/safeint.h" -#include "core/framework/tensorprotoutils.h" -#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_utils.h" namespace onnxruntime { @@ -461,7 +460,7 @@ Status QnnModelWrapper::IsPerChannelQuantized(const onnxruntime::NodeUnitIODef& ORT_RETURN_IF(iter == graph_initializers.end(), "Unable to find initializer for scale(s): ", scale_name.c_str()); gsl::not_null scale_tensor_proto = iter->second; - TensorShape scale_shape = onnxruntime::utils::GetTensorShapeFromTensorProto(*scale_tensor_proto); + TensorShape scale_shape(qnn::utils::GetInitializerShape(*scale_tensor_proto)); // Check the number of scale values to determine if the tensor is per-channel. // This is consistent with CPU EP's Quant/Dequant logic. We can't use the presence of an axis because even a @@ -636,29 +635,13 @@ Status QnnModelWrapper::UnpackInitializerData(const ONNX_NAMESPACE::TensorProto& // If this is an int4, we need to unpack it because QNN treats int4 as a full int8. if (onnx_data_type == ONNX_NAMESPACE::TensorProto_DataType_INT4) { - TensorShape shape = onnxruntime::utils::GetTensorShapeFromTensorProto(initializer); - const size_t num_elems = shape.Size(); - std::vector packed_int4_bytes = std::move(unpacked_tensor); - unpacked_tensor = std::vector(num_elems); - - auto dst = gsl::make_span(reinterpret_cast(unpacked_tensor.data()), unpacked_tensor.size()); - auto src = gsl::make_span(reinterpret_cast(packed_int4_bytes.data()), packed_int4_bytes.size()); - ORT_RETURN_IF_NOT(Int4x2::Unpack(dst, src), "Failed to unpack Tensor for QNN"); - - // NOTE: Masking off top 4 bits to workaround a QNN INT4 accuracy bug. - // Docs explicitly state that masking off top 4 bits should not be required. 
- for (size_t i = 0; i < dst.size(); i++) { - dst[i] &= 0x0F; // -3 (0b1111_1101) becomes 13 (0b0000_1101) - } + TensorShape shape(qnn::utils::GetInitializerShape(initializer)); + const size_t num_int4_elems = shape.Size(); + ORT_RETURN_IF_ERROR(qnn::utils::UnpackInt4ToInt8(num_int4_elems, unpacked_tensor)); } else if (onnx_data_type == ONNX_NAMESPACE::TensorProto_DataType_UINT4) { - TensorShape shape = onnxruntime::utils::GetTensorShapeFromTensorProto(initializer); - const size_t num_elems = shape.Size(); - std::vector packed_int4_bytes = std::move(unpacked_tensor); - unpacked_tensor = std::vector(num_elems); - - auto dst = gsl::make_span(reinterpret_cast(unpacked_tensor.data()), unpacked_tensor.size()); - auto src = gsl::make_span(reinterpret_cast(packed_int4_bytes.data()), packed_int4_bytes.size()); - ORT_RETURN_IF_NOT(UInt4x2::Unpack(dst, src), "Failed to unpack Tensor for QNN"); + TensorShape shape(qnn::utils::GetInitializerShape(initializer)); + const size_t num_uint4_elems = shape.Size(); + ORT_RETURN_IF_ERROR(qnn::utils::UnpackInt4ToInt8(num_uint4_elems, unpacked_tensor)); } return Status::OK(); diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h index d018ca12d6451..203250204d7f8 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h @@ -7,13 +7,10 @@ #include #include -#include "core/common/status.h" #include "QnnInterface.h" #include "qnn_def.h" -#include "core/common/logging/logging.h" -#include "core/framework/node_unit.h" -#include "core/graph/graph_viewer.h" -#include "core/providers/shared/utils/utils.h" + +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_quant_params_wrapper.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group.h b/onnxruntime/core/providers/qnn/builder/qnn_node_group.h index f9ef01411310f..276fbaae3b3c9 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group.h @@ -8,8 +8,7 @@ #include #include -#include "core/common/logging/logging.h" -#include "core/framework/node_unit.h" +#include "core/providers/qnn/ort_api.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.cc b/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.cc index caf4725626338..3af2fdd1f0276 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.cc @@ -6,9 +6,8 @@ #include #include #include -#include "core/graph/graph_utils.h" -#include "core/framework/node_unit.h" -#include "core/providers/shared/utils/utils.h" + +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_node_group/utils.h" diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.h b/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.h index 90fe44c3af059..d3d552bc172ec 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.h @@ -7,8 +7,7 @@ #include #include -#include "core/common/common.h" -#include "core/framework/node_unit.h" +#include "core/providers/qnn/ort_api.h" #include 
"core/providers/qnn/builder/qnn_node_group.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.cc b/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.cc index 76b1726646486..5094ad96724f5 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.cc @@ -6,9 +6,8 @@ #include #include #include -#include "core/graph/graph_utils.h" -#include "core/framework/node_unit.h" -#include "core/providers/shared/utils/utils.h" + +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.h b/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.h index 3b67f13492a46..0a1b16d24ffcd 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.h @@ -7,8 +7,7 @@ #include #include -#include "core/common/common.h" -#include "core/framework/node_unit.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_node_group.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.cc b/onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.cc index c398d1fae5097..e947da1a60e7a 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.cc @@ -10,8 +10,7 @@ #include #include #include -#include "core/graph/graph_utils.h" -#include "core/framework/node_unit.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.cc b/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.cc index 5548d7d37c378..93b2fca296389 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.cc @@ -4,8 +4,7 @@ #include #include -#include "core/graph/graph_viewer.h" -#include "core/framework/node_unit.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_node_group.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.h b/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.h index 0d11d21906ccb..c4cf4e8a20a92 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.h @@ -7,8 +7,7 @@ #include #include -#include "core/graph/graph_viewer.h" -#include "core/framework/node_unit.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_node_group.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h index 23330f5616d73..01c15cf4bebe6 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h +++ 
b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h @@ -4,10 +4,10 @@ #pragma once #include #include -#include "QnnTypes.h" -#include "core/common/common.h" #include -#include "core/framework/node_unit.h" + +#include "core/providers/qnn/ort_api.h" +#include "QnnTypes.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc index 08d3120260cea..56c3d3e803d9b 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc @@ -5,14 +5,13 @@ #include #include +#include #include #include #include #include -#include "core/common/common.h" -#include "core/common/safeint.h" -#include "core/framework/data_types.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_def.h" namespace onnxruntime { @@ -66,6 +65,42 @@ size_t GetElementSizeByType(ONNXTensorElementDataType elem_type) { return pos->second; } +size_t GetElementSizeByType(ONNX_NAMESPACE::TensorProto_DataType onnx_type) { + switch (onnx_type) { + case ONNX_NAMESPACE::TensorProto_DataType_INT4: + return sizeof(Int4x2); + case ONNX_NAMESPACE::TensorProto_DataType_UINT4: + return sizeof(UInt4x2); + case ONNX_NAMESPACE::TensorProto_DataType_INT8: + return sizeof(int8_t); + case ONNX_NAMESPACE::TensorProto_DataType_UINT8: + return sizeof(uint8_t); + case ONNX_NAMESPACE::TensorProto_DataType_INT16: + return sizeof(int16_t); + case ONNX_NAMESPACE::TensorProto_DataType_UINT16: + return sizeof(uint16_t); + case ONNX_NAMESPACE::TensorProto_DataType_INT32: + return sizeof(int32_t); + case ONNX_NAMESPACE::TensorProto_DataType_UINT32: + return sizeof(uint32_t); + case ONNX_NAMESPACE::TensorProto_DataType_INT64: + return sizeof(int64_t); + case ONNX_NAMESPACE::TensorProto_DataType_UINT64: + return sizeof(uint64_t); + case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16: + return 2; + case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: + return sizeof(float); + case ONNX_NAMESPACE::TensorProto_DataType_DOUBLE: + return sizeof(double); + case ONNX_NAMESPACE::TensorProto_DataType_BOOL: + return sizeof(bool); + default: + return 0; + } + // Unreachable +} + size_t GetQnnTensorDataSizeInBytes(gsl::span shape, Qnn_DataType_t element_type) { ORT_ENFORCE(!shape.empty(), "Empty shape not allowed."); // TODO can we just treat empty shape as a scalar? 
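// Worked example: shape {2, 3, 4} with QNN_DATATYPE_FLOAT_32 gives
// 4 bytes/elem * (2 * 3 * 4) elems = 96 bytes. The SafeInt accumulator below
// turns a multiplication overflow into an exception instead of silent wraparound.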
SafeInt<size_t> data_length = GetElementSizeByType(element_type); @@ -507,39 +542,22 @@ bool OnnxDataTypeToQnnDataType(const int32_t onnx_data_type, Qnn_DataType_t& qnn } std::pair<float, float> CheckMinMax(float rmin, float rmax) { - // Ensure a minimum range of 0.0001 (required by QNN) - rmax = std::max(rmax, rmin + 0.0001f); - // Both QNN and ORT require the range to include 0.0f rmin = std::min(rmin, 0.0f); rmax = std::max(rmax, 0.0f); + // Ensure a minimum range of 0.0001 (required by QNN) + rmax = std::max(rmax, rmin + 0.0001f); + return std::make_pair(rmin, rmax); } -template <typename T> -Status GetQminQmax(const Qnn_DataType_t qnn_data_type, - T& qmin, - T& qmax) { - if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_8) { - qmin = static_cast<T>(std::numeric_limits<int8_t>::min()); - qmax = static_cast<T>(std::numeric_limits<int8_t>::max()); - } else if (qnn_data_type == QNN_DATATYPE_UFIXED_POINT_8) { - qmin = static_cast<T>(std::numeric_limits<uint8_t>::min()); - qmax = static_cast<T>(std::numeric_limits<uint8_t>::max()); - } else if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_16) { - qmin = static_cast<T>(std::numeric_limits<int16_t>::min()); - qmax = static_cast<T>(std::numeric_limits<int16_t>::max()); - } else if (qnn_data_type == QNN_DATATYPE_UFIXED_POINT_16) { - qmin = static_cast<T>(std::numeric_limits<uint16_t>::min()); - qmax = static_cast<T>(std::numeric_limits<uint16_t>::max()); - } else if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_32) { - qmin = static_cast<T>(std::numeric_limits<int32_t>::min()); - qmax = static_cast<T>(std::numeric_limits<int32_t>::max()); - } else { - ORT_RETURN_IF(true, "Qnn Data Type: %d not supported yet.", qnn_data_type); +inline float RoundHalfToEven(float input) { + if (!std::isfinite(input)) { + return input; } - return Status::OK(); + // std::remainder returns x - n, where n is the integral value nearest to x. When |x - n| = 0.5, n is chosen to be even + return input - std::remainderf(input, 1.f); } Status GetQuantParams(float rmin, @@ -555,20 +573,22 @@ Status GetQuantParams(float rmin, rmin = -abs_max; } - float qmin = 0.0f; - float qmax = 255.0f; - ORT_RETURN_IF_ERROR(GetQminQmax(qnn_data_type, qmin, qmax)); + double rmin_dbl = static_cast<double>(rmin); + double rmax_dbl = static_cast<double>(rmax); + double qmin = 0.0; + double qmax = 0.0; + ORT_RETURN_IF_ERROR(GetQminQmax(qnn_data_type, qmin, qmax, symmetric)); - scale = (rmax - rmin) / (qmax - qmin); - float initial_zero_point = 0.0f; + double scale_dbl = (rmax_dbl - rmin_dbl) / (qmax - qmin); + double initial_zero_point = 0.0; if (symmetric) { - initial_zero_point = std::round(rmin + rmax) / 2; + initial_zero_point = std::round(rmin_dbl + rmax_dbl) / 2; } else { - initial_zero_point = qmin - (rmin / scale); + initial_zero_point = qmin - (rmin_dbl / scale_dbl); } - zero_point = static_cast<int32_t>(RoundHalfToEven(Saturate(qmax, qmin, initial_zero_point))); - // To match QNN quantization definition - zero_point = 0 - zero_point; + zero_point = static_cast<int32_t>(RoundHalfToEven(static_cast<float>(Saturate(qmax, qmin, initial_zero_point)))); + zero_point = -zero_point; // Negate to match QNN quantization definition.
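// Worked example (asymmetric uint8): rmin = -1.0, rmax = 3.0 gives
// scale = (3 - (-1)) / (255 - 0) = 4/255 ~= 0.0157, and
// initial_zero_point = 0 - (-1.0 / 0.0157) ~= 63.75, which rounds half-to-even
// to 64. After the negation above, the stored QNN offset is -64.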
+ scale = static_cast<float>(scale_dbl); return Status::OK(); } @@ -590,6 +610,126 @@ Status Quantize(const double double_value, return Status::OK(); } +size_t ShapeSizeCalc(gsl::span<const size_t> shape, size_t start, size_t end) { + size_t size = 1; + for (size_t i = start; i < end; i++) { + size *= shape[i]; + } + return size; +} + +Status GetDataQuantParams(gsl::span<const float> data, gsl::span<const size_t> shape, + /*out*/ gsl::span<float> scales, /*out*/ gsl::span<int32_t> offsets, + Qnn_DataType_t data_type, bool symmetric, std::optional<int64_t> axis) { + const size_t num_dims = shape.size(); + const size_t num_elems = ShapeSizeCalc(shape, 0, num_dims); + ORT_RETURN_IF_NOT(num_elems == data.size(), "Shape mismatch with data to quantize"); + + size_t block_count = 1; + size_t broadcast_dim = 1; + size_t block_size = num_elems; + + if (axis.has_value()) { + size_t axis_no_neg = *axis < 0 ? static_cast<size_t>(*axis) + num_dims : static_cast<size_t>(*axis); + block_count = ShapeSizeCalc(shape, 0, axis_no_neg); + broadcast_dim = shape[axis_no_neg]; + block_size = ShapeSizeCalc(shape, axis_no_neg + 1, num_dims); + } + + ORT_RETURN_IF_NOT(scales.size() == broadcast_dim, "Unexpected size of scales output buffer"); + ORT_RETURN_IF_NOT(offsets.size() == broadcast_dim, "Unexpected size of offsets output buffer"); + + size_t i = 0; + for (size_t n = 0; n < block_count; n++) { + for (size_t bd = 0; bd < broadcast_dim; bd++) { + float rmin = std::numeric_limits<float>::max(); + float rmax = std::numeric_limits<float>::lowest(); + for (size_t j = 0; j < block_size; j++) { + rmin = std::min(rmin, data[i]); + rmax = std::max(rmax, data[i]); + i++; + } + + scales[bd] = 1.0f; + offsets[bd] = 0; + ORT_RETURN_IF_ERROR(GetQuantParams(rmin, rmax, data_type, scales[bd], offsets[bd], symmetric)); + } + } + + assert(i == data.size()); + return Status::OK(); +} + +Status QuantizeData(gsl::span<const float> data, gsl::span<const size_t> shape, + gsl::span<const float> scales, gsl::span<const int32_t> offsets, + /*out*/ gsl::span<uint8_t> quant_bytes, Qnn_DataType_t data_type, + std::optional<int64_t> axis) { + const size_t num_dims = shape.size(); + const size_t num_elems = ShapeSizeCalc(shape, 0, num_dims); + ORT_RETURN_IF_NOT(num_elems == data.size(), "Shape mismatch with data to quantize"); + size_t expected_num_quant_bytes = GetElementSizeByType(data_type) * data.size(); + ORT_RETURN_IF_NOT(quant_bytes.size() == expected_num_quant_bytes, + "Cannot quantize data because output buffer is not the correct size"); + + size_t block_count = 1; + size_t broadcast_dim = 1; + size_t block_size = num_elems; + + if (axis.has_value()) { + size_t axis_no_neg = *axis < 0 ? static_cast<size_t>(*axis) + num_dims : static_cast<size_t>(*axis); + block_count = ShapeSizeCalc(shape, 0, axis_no_neg); + broadcast_dim = shape[axis_no_neg]; + block_size = ShapeSizeCalc(shape, axis_no_neg + 1, num_dims); + } + + ORT_RETURN_IF_NOT(scales.size() == broadcast_dim, "Unexpected size of scales output buffer"); + ORT_RETURN_IF_NOT(offsets.size() == broadcast_dim, "Unexpected size of offsets output buffer"); + + size_t i = 0; + for (size_t n = 0; n < block_count; n++) { + for (size_t bd = 0; bd < broadcast_dim; bd++) { + switch (data_type) { + case QNN_DATATYPE_SFIXED_POINT_8: { + auto input_span = gsl::make_span(&data[i], block_size); + auto output_span = gsl::make_span(&quant_bytes[i * sizeof(int8_t)], sizeof(int8_t) * block_size); + ORT_RETURN_IF_ERROR(QuantizeData<int8_t>(input_span, scales[bd], offsets[bd], output_span)); + break; + } + case QNN_DATATYPE_UFIXED_POINT_8: { + auto input_span = gsl::make_span(&data[i], block_size); + auto output_span = gsl::make_span(&quant_bytes[i * sizeof(uint8_t)], sizeof(uint8_t) * block_size); + ORT_RETURN_IF_ERROR(QuantizeData<uint8_t>(input_span, scales[bd], offsets[bd], output_span)); + break; + } + case QNN_DATATYPE_SFIXED_POINT_16: { + auto input_span = gsl::make_span(&data[i], block_size); + auto output_span = gsl::make_span(&quant_bytes[i * sizeof(int16_t)], sizeof(int16_t) * block_size); + ORT_RETURN_IF_ERROR(QuantizeData<int16_t>(input_span, scales[bd], offsets[bd], output_span)); + break; + } + case QNN_DATATYPE_UFIXED_POINT_16: { + auto input_span = gsl::make_span(&data[i], block_size); + auto output_span = gsl::make_span(&quant_bytes[i * sizeof(uint16_t)], sizeof(uint16_t) * block_size); + ORT_RETURN_IF_ERROR(QuantizeData<uint16_t>(input_span, scales[bd], offsets[bd], output_span)); + break; + } + case QNN_DATATYPE_SFIXED_POINT_32: { + auto input_span = gsl::make_span(&data[i], block_size); + auto output_span = gsl::make_span(&quant_bytes[i * sizeof(int32_t)], sizeof(int32_t) * block_size); + ORT_RETURN_IF_ERROR(QuantizeData<int32_t>(input_span, scales[bd], offsets[bd], output_span)); + break; + } + default: + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unsupported quantization data type for QuantizeData"); + } + i += block_size; + } + } + assert(i == data.size()); + + return Status::OK(); +} + std::string_view GetQnnErrorMessage(const QNN_INTERFACE_VER_TYPE& qnn_interface, Qnn_ErrorHandle_t qnn_error_handle) { // From QNN SDK: The memory is statically owned and should not be freed by the caller. const char* error_msg = nullptr; diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.h b/onnxruntime/core/providers/qnn/builder/qnn_utils.h index 950f349c5006f..853debb61a12f 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_utils.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.h @@ -2,11 +2,13 @@ // Licensed under the MIT License.
#pragma once +#include #include #include #include #include #include +#include #include #include @@ -14,9 +16,7 @@ #include "QnnInterface.h" #include "QnnTypes.h" -#include "core/session/onnxruntime_cxx_api.h" -#include "core/framework/node_unit.h" -#include "core/util/qmath.h" +#include "core/providers/qnn/ort_api.h" namespace onnxruntime { namespace qnn { @@ -27,6 +27,8 @@ size_t GetElementSizeByType(const Qnn_DataType_t& data_type); size_t GetElementSizeByType(ONNXTensorElementDataType elem_type); +size_t GetElementSizeByType(ONNX_NAMESPACE::TensorProto_DataType onnx_type); + size_t GetQnnTensorDataSizeInBytes(gsl::span<const uint32_t> shape, Qnn_DataType_t element_data_type); bool QnnTensorHasDynamicShape(const Qnn_Tensor_t& tensor); @@ -83,7 +85,30 @@ static bool ArrayHasString(const std::array& strings, std:: std::pair<float, float> CheckMinMax(float rmin, float rmax); template <typename T> -Status GetQminQmax(const Qnn_DataType_t qnn_data_type, T& qmin, T& qmax); +Status GetQminQmax(const Qnn_DataType_t qnn_data_type, + T& qmin, + T& qmax, + bool symmetric = false) { + if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_8) { + qmin = static_cast<T>(std::numeric_limits<int8_t>::min() + static_cast<int8_t>(symmetric)); + qmax = static_cast<T>(std::numeric_limits<int8_t>::max()); + } else if (qnn_data_type == QNN_DATATYPE_UFIXED_POINT_8) { + qmin = static_cast<T>(std::numeric_limits<uint8_t>::min()); + qmax = static_cast<T>(std::numeric_limits<uint8_t>::max()); + } else if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_16) { + qmin = static_cast<T>(std::numeric_limits<int16_t>::min() + static_cast<int16_t>(symmetric)); + qmax = static_cast<T>(std::numeric_limits<int16_t>::max()); + } else if (qnn_data_type == QNN_DATATYPE_UFIXED_POINT_16) { + qmin = static_cast<T>(std::numeric_limits<uint16_t>::min()); + qmax = static_cast<T>(std::numeric_limits<uint16_t>::max()); + } else if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_32) { + qmin = static_cast<T>(std::numeric_limits<int32_t>::min() + static_cast<int32_t>(symmetric)); + qmax = static_cast<T>(std::numeric_limits<int32_t>::max()); + } else { + ORT_RETURN_IF(true, "Qnn Data Type: %d not supported yet.", qnn_data_type); + } + return Status::OK(); +} template <typename T> inline T Saturate(const T qmax, @@ -113,6 +138,104 @@ Status Quantize(const double double_value, const Qnn_DataType_t qnn_data_type, int& quant_value); +size_t ShapeSizeCalc(gsl::span<const size_t> shape, size_t start, size_t end); + +// Computes the quantization parameters (scales and offsets) for the given data. +// Supports both per-tensor and per-channel quantization. Must provide an axis argument +// for per-channel quantization. +// The offsets use the QNN convention where offset = -zero_point. +Status GetDataQuantParams(gsl::span<const float> data, gsl::span<const size_t> shape, + /*out*/ gsl::span<float> scales, /*out*/ gsl::span<int32_t> offsets, + Qnn_DataType_t data_type, bool symmetric = false, + std::optional<int64_t> axis = std::nullopt); + +// Quantizes the given float data using the provided quantization parameters (scales and offsets). +// Supports both per-tensor and per-channel quantization. Must provide an axis argument +// for per-channel quantization. +// The provided offsets must use the QNN convention where offset = -zero_point. +Status QuantizeData(gsl::span<const float> data, gsl::span<const size_t> shape, + gsl::span<const float> scales, gsl::span<const int32_t> offsets, + /*out*/ gsl::span<uint8_t> quant_bytes, Qnn_DataType_t data_type, + std::optional<int64_t> axis = std::nullopt); + +// Quantizes (per-tensor) the given float data using the provided scale and offset. +// The provided offset must use the QNN convention where offset = -zero_point.
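// Example usage (illustrative values): per-tensor quantization of three floats
// to uint8 with scale ~= 1/127.5 and QNN offset -128 produces approximately
// {0, 128, 255}:
//
//   std::vector<float> vals = {-1.0f, 0.0f, 1.0f};
//   std::vector<uint8_t> out(vals.size() * sizeof(uint8_t));
//   ORT_RETURN_IF_ERROR(QuantizeData<uint8_t>(vals, 0.0078431f, -128, out));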
+template <typename QuantType> +inline Status QuantizeData(gsl::span<const float> data, float scale, int32_t offset, + /*out*/ gsl::span<uint8_t> quant_bytes) { + const size_t num_elems = data.size(); + const size_t expected_output_bytes = sizeof(QuantType) * num_elems; + ORT_RETURN_IF_NOT(expected_output_bytes == quant_bytes.size(), + "Output buffer is not large enough to hold quantized bytes."); + const double clip_min = static_cast<double>(std::numeric_limits<QuantType>::lowest()); + const double clip_max = static_cast<double>(std::numeric_limits<QuantType>::max()); + + QuantType* output = reinterpret_cast<QuantType*>(quant_bytes.data()); + for (size_t i = 0; i < num_elems; ++i) { + const double scale_dbl = static_cast<double>(scale); + const double offset_dbl = static_cast<double>(offset); + double float_val = std::nearbyint(static_cast<double>(data[i]) / scale_dbl) - offset_dbl; + float_val = std::max(float_val, clip_min); + float_val = std::min(float_val, clip_max); + output[i] = static_cast<QuantType>(float_val); + } + return Status::OK(); +} + +// Re-writes a buffer of packed 4-bit elements to a buffer of unpacked 8-bit elements. +// QNN requires that 4-bit weights are unpacked to 8-bit. +template <bool Signed> +Status UnpackInt4ToInt8(size_t num_int4_elems, std::vector<uint8_t>& data_bytes) { + if constexpr (Signed) { // INT4 + std::vector<uint8_t> packed_int4_bytes = std::move(data_bytes); + data_bytes = std::vector<uint8_t>(num_int4_elems); + + auto dst = gsl::make_span(reinterpret_cast<int8_t*>(data_bytes.data()), data_bytes.size()); + auto src = gsl::make_span(reinterpret_cast<const Int4x2*>(packed_int4_bytes.data()), packed_int4_bytes.size()); + ORT_RETURN_IF_NOT(Int4x2::Unpack(dst, src), "Failed to unpack Tensor for QNN"); + + // NOTE: Masking off top 4 bits to workaround a QNN INT4 accuracy bug. + // Docs explicitly state that masking off top 4 bits should not be required, but we have to do it. + for (size_t i = 0; i < dst.size(); i++) { + dst[i] &= 0x0F; // -3 (0b1111_1101) becomes 13 (0b0000_1101) + } + } else { // UINT4 + std::vector<uint8_t> packed_uint4_bytes = std::move(data_bytes); + data_bytes = std::vector<uint8_t>(num_int4_elems); + + auto dst = gsl::make_span(reinterpret_cast<uint8_t*>(data_bytes.data()), data_bytes.size()); + auto src = gsl::make_span(reinterpret_cast<const UInt4x2*>(packed_uint4_bytes.data()), packed_uint4_bytes.size()); + ORT_RETURN_IF_NOT(UInt4x2::Unpack(dst, src), "Failed to unpack Tensor for QNN"); + } + + return Status::OK(); +} + +template <typename T> +std::vector<T> GetInitializerShape(const ONNX_NAMESPACE::TensorProto& tensor_proto) { + const auto& dims = tensor_proto.dims(); + std::vector<T> tensor_shape_vec(static_cast<size_t>(dims.size())); + for (int i = 0; i < dims.size(); ++i) { + tensor_shape_vec[i] = static_cast<T>(dims[i]); + } + + return tensor_shape_vec; +} + +template <typename T, typename P> +Status PermuteShape(gsl::span<const T> input_shape, gsl::span<const P> perm, gsl::span<T> output_shape) { + const size_t rank = input_shape.size(); + ORT_RETURN_IF_NOT(rank == perm.size() && rank == output_shape.size(), + "PermuteShape(): expect all arguments to have the same rank."); + + for (size_t i = 0; i < rank; ++i) { + size_t p = static_cast<size_t>(perm[i]); + output_shape[i] = input_shape[p]; + } + + return Status::OK(); +} + // Gets error message associated with QNN error handle value. std::string_view GetQnnErrorMessage(const QNN_INTERFACE_VER_TYPE& qnn_interface, Qnn_ErrorHandle_t qnn_error_handle); diff --git a/onnxruntime/core/providers/qnn/ort_api.cc b/onnxruntime/core/providers/qnn/ort_api.cc new file mode 100644 index 0000000000000..809593b409dad --- /dev/null +++ b/onnxruntime/core/providers/qnn/ort_api.cc @@ -0,0 +1,211 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License.
+ +#include "core/providers/qnn/ort_api.h" + +#include +#include +#include + +namespace onnxruntime { + +#if BUILD_QNN_EP_STATIC_LIB +static std::unique_ptr>> s_run_on_unload_; + +void RunOnUnload(std::function function) { + static std::mutex mutex; + std::lock_guard guard(mutex); + if (!s_run_on_unload_) { + s_run_on_unload_ = std::make_unique>>(); + } + s_run_on_unload_->push_back(std::move(function)); +} + +struct OnUnload { + ~OnUnload() { + if (!s_run_on_unload_) + return; + + for (auto& function : *s_run_on_unload_) + function(); + + s_run_on_unload_.reset(); + } + +} g_on_unload; +#endif // BUILD_QNN_EP_STATIC_LIB + +std::vector Graph__Nodes(const Graph& graph) { +#if BUILD_QNN_EP_STATIC_LIB + std::vector nodes; + nodes.reserve(graph.NumberOfNodes()); + + for (const Node& node : graph.Nodes()) { + nodes.push_back(&node); + } + + return nodes; +#else + return graph.Nodes(); +#endif +} + +#if BUILD_QNN_EP_STATIC_LIB +#define NODE_ATTR_ITER_VAL(iter) (iter)->second +#else +#define NODE_ATTR_ITER_VAL(iter) (iter)->second() +#endif + +NodeAttrHelper::NodeAttrHelper(const onnxruntime::Node& node) + : node_attributes_(node.GetAttributes()) {} + +NodeAttrHelper::NodeAttrHelper(const NodeUnit& node_unit) + : node_attributes_(node_unit.GetNode().GetAttributes()) {} + +float NodeAttrHelper::Get(const std::string& key, float def_val) const { + if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { + return NODE_ATTR_ITER_VAL(entry).f(); + } + + return def_val; +} + +int32_t NodeAttrHelper::Get(const std::string& key, int32_t def_val) const { + if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { + return narrow(NODE_ATTR_ITER_VAL(entry).i()); + } + + return def_val; +} + +uint32_t NodeAttrHelper::Get(const std::string& key, uint32_t def_val) const { + if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { + return narrow(NODE_ATTR_ITER_VAL(entry).i()); + } + + return def_val; +} + +int64_t NodeAttrHelper::Get(const std::string& key, int64_t def_val) const { + if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { + return NODE_ATTR_ITER_VAL(entry).i(); + } + + return def_val; +} + +const std::string& NodeAttrHelper::Get(const std::string& key, const std::string& def_val) const { + if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { + return NODE_ATTR_ITER_VAL(entry).s(); + } + + return def_val; +} + +std::vector NodeAttrHelper::Get(const std::string& key, const std::vector& def_val) const { + if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { + const auto& values = NODE_ATTR_ITER_VAL(entry).ints(); + const int64_t* cbegin = values.data(); + const int64_t* cend = values.data() + values.size(); + std::vector v; + v.reserve(static_cast(values.size())); + std::transform(cbegin, cend, std::back_inserter(v), + [](int64_t val) -> int32_t { return narrow(val); }); + return v; + } + + return def_val; +} + +std::vector NodeAttrHelper::Get(const std::string& key, const std::vector& def_val) const { + if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { + const auto& values = NODE_ATTR_ITER_VAL(entry).ints(); + const int64_t* cbegin = values.data(); + const int64_t* cend = values.data() + values.size(); + std::vector v; + v.reserve(static_cast(values.size())); + std::transform(cbegin, cend, std::back_inserter(v), + [](int64_t val) -> uint32_t { return narrow(val); }); + return v; + } + + return def_val; +} + +std::vector 
+
+std::vector<const Node*> Graph__Nodes(const Graph& graph) {
+#if BUILD_QNN_EP_STATIC_LIB
+  std::vector<const Node*> nodes;
+  nodes.reserve(graph.NumberOfNodes());
+
+  for (const Node& node : graph.Nodes()) {
+    nodes.push_back(&node);
+  }
+
+  return nodes;
+#else
+  return graph.Nodes();
+#endif
+}
+
+#if BUILD_QNN_EP_STATIC_LIB
+#define NODE_ATTR_ITER_VAL(iter) (iter)->second
+#else
+#define NODE_ATTR_ITER_VAL(iter) (iter)->second()
+#endif
+
+NodeAttrHelper::NodeAttrHelper(const onnxruntime::Node& node)
+    : node_attributes_(node.GetAttributes()) {}
+
+NodeAttrHelper::NodeAttrHelper(const NodeUnit& node_unit)
+    : node_attributes_(node_unit.GetNode().GetAttributes()) {}
+
+float NodeAttrHelper::Get(const std::string& key, float def_val) const {
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    return NODE_ATTR_ITER_VAL(entry).f();
+  }
+
+  return def_val;
+}
+
+int32_t NodeAttrHelper::Get(const std::string& key, int32_t def_val) const {
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    return narrow<int32_t>(NODE_ATTR_ITER_VAL(entry).i());
+  }
+
+  return def_val;
+}
+
+uint32_t NodeAttrHelper::Get(const std::string& key, uint32_t def_val) const {
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    return narrow<uint32_t>(NODE_ATTR_ITER_VAL(entry).i());
+  }
+
+  return def_val;
+}
+
+int64_t NodeAttrHelper::Get(const std::string& key, int64_t def_val) const {
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    return NODE_ATTR_ITER_VAL(entry).i();
+  }
+
+  return def_val;
+}
+
+const std::string& NodeAttrHelper::Get(const std::string& key, const std::string& def_val) const {
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    return NODE_ATTR_ITER_VAL(entry).s();
+  }
+
+  return def_val;
+}
+
+std::vector<int32_t> NodeAttrHelper::Get(const std::string& key, const std::vector<int32_t>& def_val) const {
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    const auto& values = NODE_ATTR_ITER_VAL(entry).ints();
+    const int64_t* cbegin = values.data();
+    const int64_t* cend = values.data() + values.size();
+    std::vector<int32_t> v;
+    v.reserve(static_cast<size_t>(values.size()));
+    std::transform(cbegin, cend, std::back_inserter(v),
+                   [](int64_t val) -> int32_t { return narrow<int32_t>(val); });
+    return v;
+  }
+
+  return def_val;
+}
+
+std::vector<uint32_t> NodeAttrHelper::Get(const std::string& key, const std::vector<uint32_t>& def_val) const {
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    const auto& values = NODE_ATTR_ITER_VAL(entry).ints();
+    const int64_t* cbegin = values.data();
+    const int64_t* cend = values.data() + values.size();
+    std::vector<uint32_t> v;
+    v.reserve(static_cast<size_t>(values.size()));
+    std::transform(cbegin, cend, std::back_inserter(v),
+                   [](int64_t val) -> uint32_t { return narrow<uint32_t>(val); });
+    return v;
+  }
+
+  return def_val;
+}
+
+std::vector<int64_t> NodeAttrHelper::Get(const std::string& key, const std::vector<int64_t>& def_val) const {
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    const auto& values = NODE_ATTR_ITER_VAL(entry).ints();
+    const int64_t* cbegin = values.data();
+    const int64_t* cend = values.data() + values.size();
+    return std::vector<int64_t>{cbegin, cend};
+  }
+
+  return def_val;
+}
+
+std::vector<float> NodeAttrHelper::Get(const std::string& key, const std::vector<float>& def_val) const {
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    const auto& values = NODE_ATTR_ITER_VAL(entry).floats();
+    const float* cbegin = values.data();
+    const float* cend = values.data() + values.size();
+    return std::vector<float>{cbegin, cend};
+  }
+
+  return def_val;
+}
+
+std::optional<float> NodeAttrHelper::GetFloat(const std::string& key) const {
+  std::optional<float> result;
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    result = NODE_ATTR_ITER_VAL(entry).f();
+  }
+
+  return result;
+}
+
+std::optional<int64_t> NodeAttrHelper::GetInt64(const std::string& key) const {
+  std::optional<int64_t> result;
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    result = NODE_ATTR_ITER_VAL(entry).i();
+  }
+
+  return result;
+}
+
+std::optional<std::vector<float>> NodeAttrHelper::GetFloats(const std::string& key) const {
+  std::optional<std::vector<float>> result;
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    const auto& values = NODE_ATTR_ITER_VAL(entry).floats();
+    const float* cbegin = values.data();
+    const float* cend = values.data() + values.size();
+    result = std::vector<float>(cbegin, cend);
+  }
+
+  return result;
+}
+
+std::optional<std::vector<int64_t>> NodeAttrHelper::GetInt64s(const std::string& key) const {
+  std::optional<std::vector<int64_t>> result;
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    const auto& values = NODE_ATTR_ITER_VAL(entry).ints();
+    const int64_t* cbegin = values.data();
+    const int64_t* cend = values.data() + values.size();
+    result = std::vector<int64_t>(cbegin, cend);
+  }
+
+  return result;
+}
+
+std::optional<std::string> NodeAttrHelper::GetString(const std::string& key) const {
+  std::optional<std::string> result;
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    result = NODE_ATTR_ITER_VAL(entry).s();
+  }
+
+  return result;
+}
+
+bool NodeAttrHelper::HasAttr(const std::string& key) const {
+  return node_attributes_.find(key) != node_attributes_.end();
+}
+}  // namespace onnxruntime
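A caller-side sketch for the accessors above; the attribute names are illustrative:

// Sketch only: reading node attributes without caring which build mode
// resolved the underlying NodeAttributes iterator.
NodeAttrHelper helper(node_unit);
const int64_t group = helper.Get("group", static_cast<int64_t>(1));
const std::vector<int32_t> strides = helper.Get("strides", std::vector<int32_t>{1, 1});
const std::optional<float> alpha = helper.GetFloat("alpha");  // std::nullopt if absent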
diff --git a/onnxruntime/core/providers/qnn/ort_api.h b/onnxruntime/core/providers/qnn/ort_api.h
new file mode 100644
index 0000000000000..030ebbb54c615
--- /dev/null
+++ b/onnxruntime/core/providers/qnn/ort_api.h
@@ -0,0 +1,178 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License
+
+#pragma once
+
+// This compilation unit (ort_api.h/.cc) encapsulates the interface between the EP and ORT in a manner
+// that allows QNN EP to be built either as a static library or a dynamic shared library.
+// The preprocessor macro `BUILD_QNN_EP_STATIC_LIB` is defined and set to 1 if QNN EP
+// is built as a static library.
+
+#if BUILD_QNN_EP_STATIC_LIB
+// Includes when building QNN EP statically
+#ifdef _WIN32
+#include
+#include
+#include "core/platform/tracing.h"
+#include "core/platform/windows/logging/etw_sink.h"
+#endif
+
+#include "onnx/defs/data_type_utils.h"
+#include "core/common/common.h"
+#include "core/common/status.h"
+#include "core/common/safeint.h"
+#include "core/common/logging/logging.h"
+#include "core/common/logging/capture.h"
+#include "core/common/path_string.h"
+#include "core/platform/env.h"
+#include "core/framework/data_types.h"
+#include "core/framework/float16.h"
+#include "core/framework/run_options.h"
+#include "core/framework/execution_provider.h"
+#include "core/framework/model_metadef_id_generator.h"
+#include "core/framework/compute_capability.h"
+#include "core/framework/tensor_shape.h"
+#include "core/framework/node_unit.h"
+#include "core/framework/tensorprotoutils.h"
+#include "core/framework/utils.h"
+#include "core/graph/constants.h"
+#include "core/graph/basic_types.h"
+#include "core/graph/model.h"
+#include "core/graph/graph_viewer.h"
+#include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h"
+#include "core/optimizer/qdq_transformer/selectors_actions/shared/utils.h"
+#include "core/providers/common.h"
+#include "core/providers/partitioning_utils.h"
+#include "core/session/onnxruntime_cxx_api.h"
+#else
+// Includes when building QNN EP as a shared library
+#include "core/providers/shared_library/provider_api.h"
+#define ORT_API_MANUAL_INIT
+#include "core/session/onnxruntime_cxx_api.h"
+#endif
+
+#include "core/common/inlined_containers.h"
+#include "core/session/onnxruntime_session_options_config_keys.h"
+#include "core/session/onnxruntime_run_options_config_keys.h"
+
+#include
+#include
+
+namespace onnxruntime {
+#if BUILD_QNN_EP_STATIC_LIB
+using Node_EdgeEnd = Node::EdgeEnd;
+#endif
+
+#if BUILD_QNN_EP_STATIC_LIB
+void RunOnUnload(std::function<void()> function);
+inline const Env& GetDefaultEnv() { return Env::Default(); }
+#endif
+
+inline void InitOrtCppApi() {
+#if BUILD_QNN_EP_STATIC_LIB
+  // Do nothing. Including "onnxruntime_cxx_api.h" normally initializes the global api_ object.
+#else
+  // Call util function in provider bridge that initializes the global api_ object.
+  InitProviderOrtApi();
+#endif
+}
+
+/// <summary>
+/// Creates an onnxruntime or onnx object. Works for both static and shared library builds of QNN EP.
+///
+/// Example: auto model = Factory<Model>::Create(/* args ... */);
+/// </summary>
+/// <typeparam name="T">Type of the object to create</typeparam>
+template <typename T>
+struct Factory {
+  template <typename... Params>
+  static inline std::unique_ptr<T> Create(Params&&... params) {
+#if BUILD_QNN_EP_STATIC_LIB
+    return std::make_unique<T>(std::forward<Params>(params)...);
+#else
+    return T::Create(std::forward<Params>(params)...);
+#endif
+  }
+};
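Everything that follows in this header repeats the same shape as Factory; a condensed sketch of the idiom, with hypothetical type and member names:

// Sketch only: each wrapper is a direct member access in the static build
// and a provider-bridge call in the shared build (names illustrative).
inline const Thing& Owner__Thing(const Owner& owner) {
#if BUILD_QNN_EP_STATIC_LIB
  return owner.thing;    // internal ORT type is visible; touch it directly
#else
  return owner.Thing();  // virtual call routed through provider_interfaces.h
#endif
}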
+
+inline const ConfigOptions& RunOptions__GetConfigOptions(const RunOptions& run_options) {
+#if BUILD_QNN_EP_STATIC_LIB
+  return run_options.config_options;
+#else
+  return run_options.GetConfigOptions();
+#endif
+}
+
+inline std::unique_ptr<IndexedSubGraph>& ComputeCapability__SubGraph(ComputeCapability& compute_capability) {
+#if BUILD_QNN_EP_STATIC_LIB
+  return compute_capability.sub_graph;
+#else
+  return compute_capability.SubGraph();
+#endif
+}
+
+inline std::vector<onnxruntime::NodeIndex>& IndexedSubGraph__Nodes(IndexedSubGraph& indexed_sub_graph) {
+#if BUILD_QNN_EP_STATIC_LIB
+  return indexed_sub_graph.nodes;
+#else
+  return indexed_sub_graph.Nodes();
+#endif
+}
+
+std::vector<const Node*> Graph__Nodes(const Graph& graph);
+
+inline std::pair<std::vector<std::unique_ptr<NodeUnit>>, std::unordered_map<const Node*, const NodeUnit*>>
+GetQDQNodeUnits(const GraphViewer& graph_viewer, const logging::Logger& logger) {
+#if BUILD_QNN_EP_STATIC_LIB
+  return QDQ::GetAllNodeUnits(graph_viewer, logger);
+#else
+  return QDQ::GetAllNodeUnits(&graph_viewer, logger);
+#endif
+}
+
+/**
+ * Wrapping onnxruntime::Node for retrieving attribute values
+ */
+class NodeAttrHelper {
+ public:
+  explicit NodeAttrHelper(const Node& node);
+
+  // Get the attributes from the target node of the node_unit
+  explicit NodeAttrHelper(const NodeUnit& node_unit);
+
+  /*
+   * Get with default
+   */
+  float Get(const std::string& key, float def_val) const;
+  std::vector<float> Get(const std::string& key, const std::vector<float>& def_val) const;
+
+  int64_t Get(const std::string& key, int64_t def_val) const;
+  std::vector<int64_t> Get(const std::string& key, const std::vector<int64_t>& def_val) const;
+
+  const std::string& Get(const std::string& key, const std::string& def_val) const;
+
+  // Convert the i() or ints() of the attribute from int64_t to int32_t
+  int32_t Get(const std::string& key, int32_t def_val) const;
+  std::vector<int32_t> Get(const std::string& key, const std::vector<int32_t>& def_val) const;
+
+  // Convert the i() or ints() of the attribute from int64_t to uint32_t
+  uint32_t Get(const std::string& key, uint32_t def_val) const;
+  std::vector<uint32_t> Get(const std::string& key, const std::vector<uint32_t>& def_val) const;
+
+  /*
+   * Get without default.
+   */
+  std::optional<float> GetFloat(const std::string& key) const;
+  std::optional<std::vector<float>> GetFloats(const std::string& key) const;
+
+  std::optional<int64_t> GetInt64(const std::string& key) const;
+  std::optional<std::vector<int64_t>> GetInt64s(const std::string& key) const;
+
+  std::optional<std::string> GetString(const std::string& key) const;
+
+  bool HasAttr(const std::string& key) const;
+
+ private:
+  const NodeAttributes& node_attributes_;
+};
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/qnn/qnn_allocator.cc b/onnxruntime/core/providers/qnn/qnn_allocator.cc
index 68dac682756d5..1fb8742f724cd 100644
--- a/onnxruntime/core/providers/qnn/qnn_allocator.cc
+++ b/onnxruntime/core/providers/qnn/qnn_allocator.cc
@@ -7,9 +7,7 @@
 #include
 #include
 
-#include "core/common/common.h"
-#include "core/common/safeint.h"
-#include "core/mlas/inc/mlas.h"  // for MlasGetPreferredBufferAlignment()
+#include "core/providers/qnn/ort_api.h"
 
 namespace onnxruntime::qnn {
 
@@ -52,7 +50,8 @@ struct AllocationHeader {
 };
 
 size_t AllocationAlignment() {
-  return std::max(alignof(AllocationHeader), MlasGetPreferredBufferAlignment());
+  constexpr size_t min_allocation_alignment = 64;  // Equal to MlasGetPreferredBufferAlignment()
+  return std::max(alignof(AllocationHeader), min_allocation_alignment);
 }
 
 size_t DivRoundUp(size_t a, size_t b) {  // TODO is there already a helper function somewhere for this?
diff --git a/onnxruntime/core/providers/qnn/qnn_allocator.h b/onnxruntime/core/providers/qnn/qnn_allocator.h
index f642368697aae..e64f38f494b35 100644
--- a/onnxruntime/core/providers/qnn/qnn_allocator.h
+++ b/onnxruntime/core/providers/qnn/qnn_allocator.h
@@ -6,11 +6,7 @@
 #include
 #include
 
-#include "core/common/common.h"
-#include "core/common/inlined_containers.h"
-#include "core/common/logging/logging.h"
-#include "core/common/status.h"
-#include "core/framework/allocator.h"
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/rpcmem_library.h"
 
 namespace onnxruntime::qnn {
diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
index e9d6884b8c8ca..b1555b6050928 100644
--- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
+++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
@@ -6,60 +6,22 @@
 #include
 #include
 
-#include "core/framework/compute_capability.h"
-#include "core/framework/kernel_registry.h"
-#include "core/framework/run_options.h"
-#include "core/graph/graph_viewer.h"
-#include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h"
-#include "core/optimizer/qdq_transformer/selectors_actions/shared/utils.h"
-#include "core/platform/env.h"
-#include "core/providers/common.h"
-#include "core/providers/partitioning_utils.h"
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/builder/onnx_ctx_model_helper.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
 #include "core/providers/qnn/builder/qnn_def.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/qnn_node_group.h"
+#include "core/providers/qnn/builder/qnn_utils.h"
 #include "core/providers/qnn/qnn_allocator.h"
+#include "core/providers/qnn/qnn_telemetry.h"
 #include "core/providers/qnn/rpcmem_library.h"
 #include "core/providers/qnn/shared_context.h"
-#include "core/session/onnxruntime_cxx_api.h"
-#include "core/session/onnxruntime_run_options_config_keys.h"
-#include "core/session/onnxruntime_session_options_config_keys.h"
-
-#ifdef _WIN32
-#include
-#include "core/platform/windows/logging/etw_sink.h"
-#endif
"core/platform/windows/logging/etw_sink.h" -#endif namespace onnxruntime { constexpr const char* QNN = "QNN"; -static std::unique_ptr>> s_run_on_unload_; - -void RunOnUnload(std::function function) { - static std::mutex mutex; - std::lock_guard guard(mutex); - if (!s_run_on_unload_) { - s_run_on_unload_ = std::make_unique>>(); - } - s_run_on_unload_->push_back(std::move(function)); -} - -struct OnUnload { - ~OnUnload() { - if (!s_run_on_unload_) - return; - - for (auto& function : *s_run_on_unload_) - function(); - - s_run_on_unload_.reset(); - } - -} g_on_unload; - static void ParseProfilingLevel(std::string profiling_level_string, qnn::ProfilingLevel& profiling_level) { std::transform(profiling_level_string.begin(), @@ -196,17 +158,20 @@ qnn::ProfilingLevel QNNExecutionProvider::GetProfilingLevelFromETWLevel(unsigned } QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_options_map, - const SessionOptions* session_options) + const ConfigOptions* config_options) : IExecutionProvider{onnxruntime::kQnnExecutionProvider} { - if (session_options) { - disable_cpu_ep_fallback_ = session_options->config_options.GetConfigOrDefault( + InitOrtCppApi(); + metadef_id_generator_ = Factory::Create(); + + if (config_options) { + disable_cpu_ep_fallback_ = config_options->GetConfigOrDefault( kOrtSessionOptionsDisableCPUEPFallback, "0") == "1"; - context_cache_enabled_ = session_options->config_options.GetConfigOrDefault( + context_cache_enabled_ = config_options->GetConfigOrDefault( kOrtSessionOptionEpContextEnable, "0") == "1"; LOGS_DEFAULT(VERBOSE) << "Context cache enable: " << context_cache_enabled_; - std::string embed_mode = session_options->config_options.GetConfigOrDefault( + std::string embed_mode = config_options->GetConfigOrDefault( kOrtSessionOptionEpContextEmbedMode, "0"); if ("1" == embed_mode) { qnn_context_embed_mode_ = true; @@ -217,18 +182,18 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio } LOGS_DEFAULT(VERBOSE) << "User specified context cache embed mode: " << qnn_context_embed_mode_; - context_cache_path_cfg_ = session_options->config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, ""); + context_cache_path_cfg_ = config_options->GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, ""); LOGS_DEFAULT(VERBOSE) << "User specified context cache path: " << context_cache_path_cfg_; // For the case that workaround QNN context PD memory limit, user need split the model into pieces and // generate the QNN context model separately. // It could happen that the generated EPContext node in separate graph has same node name. // User can set this context_node_name_prefix for each split pieces to avoid that happens. 
-    context_node_name_prefix_ = session_options->config_options.GetConfigOrDefault(kOrtSessionOptionEpContextNodeNamePrefix, "");
+    context_node_name_prefix_ = config_options->GetConfigOrDefault(kOrtSessionOptionEpContextNodeNamePrefix, "");
     LOGS_DEFAULT(VERBOSE) << "User specified QNN context node name prefix: " << context_node_name_prefix_;
 
     share_ep_contexts_ =
-        session_options->config_options.GetConfigOrDefault(kOrtSessionOptionShareEpContexts, "0") == "1";
+        config_options->GetConfigOrDefault(kOrtSessionOptionShareEpContexts, "0") == "1";
     LOGS_DEFAULT(VERBOSE) << "User specified option - share EP contexts across sessions: " << share_ep_contexts_;
   }
 
@@ -249,8 +214,9 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
   // separate out the profiling level for ETW in case it gets disabled later when we extract the events
   // set to invalid to indicate that ETW is not enabled when we setup QNN
   qnn::ProfilingLevel profiling_level_etw = qnn::ProfilingLevel::INVALID;
-  const Env& env = Env::Default();
-  auto& provider = env.GetTelemetryProvider();
+
+#ifdef _WIN32
+  auto& provider = qnn::QnnTelemetry::Instance();
   if (provider.IsEnabled()) {
     auto level = provider.Level();
     auto keyword = provider.Keyword();
@@ -260,6 +226,7 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
       }
     }
   }
+#endif  // defined(_WIN32)
 
   // In case ETW gets disabled later
   auto profiling_level_pos = provider_options_map.find(PROFILING_LEVEL);
@@ -412,47 +379,53 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
                                                             soc_model,
                                                             enable_htp_weight_sharing});
 
-#ifdef _WIN32
-  auto& etwRegistrationManager = logging::EtwRegistrationManager::Instance();
-  // Register callback for ETW capture state (rundown)
-  callback_ETWSink_provider_ = onnxruntime::logging::EtwRegistrationManager::EtwInternalCallback(
-      [&etwRegistrationManager, this](
-          LPCGUID SourceId,
-          ULONG IsEnabled,
-          UCHAR Level,
-          ULONGLONG MatchAnyKeyword,
-          ULONGLONG MatchAllKeyword,
-          PEVENT_FILTER_DESCRIPTOR FilterData,
-          PVOID CallbackContext) {
-        ORT_UNUSED_PARAMETER(SourceId);
-        ORT_UNUSED_PARAMETER(MatchAnyKeyword);
-        ORT_UNUSED_PARAMETER(MatchAllKeyword);
-        ORT_UNUSED_PARAMETER(FilterData);
-        ORT_UNUSED_PARAMETER(CallbackContext);
-
-        if (IsEnabled == EVENT_CONTROL_CODE_ENABLE_PROVIDER) {
-          if ((MatchAnyKeyword & static_cast<ULONGLONG>(onnxruntime::logging::ORTTraceLoggingKeyword::Logs)) != 0) {
-            auto ortETWSeverity = etwRegistrationManager.MapLevelToSeverity();
-            (void)qnn_backend_manager_->ResetQnnLogLevel(ortETWSeverity);
-          }
-          if ((MatchAnyKeyword & static_cast<ULONGLONG>(onnxruntime::logging::ORTTraceLoggingKeyword::Profiling)) != 0) {
-            if (Level != 0) {
-              // Commenting out Dynamic QNN Profiling for now
-              // There seems to be a crash in 3rd party QC QnnHtp.dll with this.
-              // Repro Scenario - start ETW tracing prior to session creation.
-              // Then disable/enable ETW Tracing with the code below uncommented a few times
-              // auto profiling_level_etw = GetProfilingLevelFromETWLevel(Level);
-              // (void)qnn_backend_manager_->SetProfilingLevelETW(profiling_level_etw);
+#if defined(_WIN32)
+  if (onnxruntime::logging::EtwRegistrationManager::SupportsETW()) {
+    auto& etwRegistrationManager = logging::EtwRegistrationManager::Instance();
+    // Register callback for ETW capture state (rundown)
+    callback_ETWSink_provider_ = onnxruntime::logging::EtwRegistrationManager::EtwInternalCallback(
+        [&etwRegistrationManager, this](
+            LPCGUID SourceId,
+            ULONG IsEnabled,
+            UCHAR Level,
+            ULONGLONG MatchAnyKeyword,
+            ULONGLONG MatchAllKeyword,
+            PEVENT_FILTER_DESCRIPTOR FilterData,
+            PVOID CallbackContext) {
+          ORT_UNUSED_PARAMETER(SourceId);
+          ORT_UNUSED_PARAMETER(MatchAnyKeyword);
+          ORT_UNUSED_PARAMETER(MatchAllKeyword);
+          ORT_UNUSED_PARAMETER(FilterData);
+          ORT_UNUSED_PARAMETER(CallbackContext);
+
+          if (IsEnabled == EVENT_CONTROL_CODE_ENABLE_PROVIDER) {
+            if ((MatchAnyKeyword & static_cast<ULONGLONG>(onnxruntime::logging::ORTTraceLoggingKeyword::Logs)) != 0) {
+              auto ortETWSeverity = etwRegistrationManager.MapLevelToSeverity();
+              (void)qnn_backend_manager_->ResetQnnLogLevel(ortETWSeverity);
+            }
+            if ((MatchAnyKeyword & static_cast<ULONGLONG>(onnxruntime::logging::ORTTraceLoggingKeyword::Profiling)) != 0) {
+              if (Level != 0) {
+                // Commenting out Dynamic QNN Profiling for now
+                // There seems to be a crash in 3rd party QC QnnHtp.dll with this.
+                // Repro Scenario - start ETW tracing prior to session creation.
+                // Then disable/enable ETW Tracing with the code below uncommented a few times
+                // auto profiling_level_etw = GetProfilingLevelFromETWLevel(Level);
+                // (void)qnn_backend_manager_->SetProfilingLevelETW(profiling_level_etw);
+                //
+                // NOTE(1/2/2025): It is possible that the above was not working in part because it is using the
+                // *logging ETW* subsystem to modify profiling, which should use an entirely different
+                // ETW provider (see QnnTelemetry). Should add callbacks for profiling to the QnnTelemetry ETW provider.
+              }
             }
           }
-        }
-        if (IsEnabled == EVENT_CONTROL_CODE_DISABLE_PROVIDER) {
-          // (void)qnn_backend_manager_->SetProfilingLevelETW(qnn::ProfilingLevel::INVALID);
-          (void)qnn_backend_manager_->ResetQnnLogLevel(std::nullopt);
-        }
-      });
-  etwRegistrationManager.RegisterInternalCallback(callback_ETWSink_provider_);
+          if (IsEnabled == EVENT_CONTROL_CODE_DISABLE_PROVIDER) {
+            // (void)qnn_backend_manager_->SetProfilingLevelETW(qnn::ProfilingLevel::INVALID);
+            (void)qnn_backend_manager_->ResetQnnLogLevel(std::nullopt);
+          }
+        });
+    etwRegistrationManager.RegisterInternalCallback(callback_ETWSink_provider_);
+  }
 #endif
 }
 
@@ -466,7 +439,7 @@ QNNExecutionProvider::~QNNExecutionProvider() {
   }
 
   // Unregister the ETW callback
-#ifdef _WIN32
+#if defined(_WIN32)
   if (callback_ETWSink_provider_ != nullptr) {
     logging::EtwRegistrationManager::Instance().UnregisterInternalCallback(callback_ETWSink_provider_);
   }
@@ -498,9 +471,10 @@ static void LogNodeSupport(const logging::Logger& logger,
     oss << "\tREASON : " << support_status.ErrorMessage() << std::endl;
  }
 
-  logging::Capture(logger, log_severity, logging::Category::onnxruntime,
-                   log_data_type, call_site)
-      .Stream()
+  auto log_capture = Factory<logging::Capture>::Create(logger, log_severity,
+                                                       logging::Category::onnxruntime,
+                                                       log_data_type, call_site);
+  log_capture->Stream()
       << (support_status.IsOK() ? "Validation PASSED " : "Validation FAILED ") << "for " << num_nodes
       << " nodes in " << qnn_node_group.Type() << " (" << qnn_node_group.GetTargetNodeUnit()->OpType() << ") :"
       << std::endl
"Validation PASSED " : "Validation FAILED ") << "for " << num_nodes << " nodes in " << qnn_node_group.Type() << " (" << qnn_node_group.GetTargetNodeUnit()->OpType() << ") :" << std::endl @@ -604,11 +578,11 @@ static bool EpSharedContextsHasAllGraphs(const std::vectorName(); + const std::string& graph_name = ep_context_node.Name(); bool has_shared_qnn_model = SharedContext::GetInstance().HasQnnModel(graph_name); if (!has_shared_qnn_model) { LOGS(logger, VERBOSE) << "Graph: " << graph_name << " from EpContext node not found from shared EP contexts."; @@ -623,7 +597,7 @@ static bool EpSharedContextsHasAllGraphs(const std::vector>& result, - const utils::GenerateMetadefNameFn& gen_metadef_name, + const std::function& gen_metadef_name, const logging::Logger& logger) { std::unordered_set supported_nodes{}; std::vector> supported_groups{}; @@ -683,7 +657,7 @@ QNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer const auto gen_metadef_name = [&]() { uint64_t model_hash; - int metadef_id = metadef_id_generator_.GenerateId(graph_viewer, model_hash); + int metadef_id = metadef_id_generator_->GenerateId(graph_viewer, model_hash); return MakeString(QNN, context_node_name_prefix_, "_", model_hash, "_", metadef_id); }; @@ -734,7 +708,7 @@ QNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer std::vector> node_unit_holder; std::unordered_map node_unit_map; - std::tie(node_unit_holder, node_unit_map) = QDQ::GetAllNodeUnits(graph_viewer, logger); + std::tie(node_unit_holder, node_unit_map) = GetQDQNodeUnits(graph_viewer, logger); // remove is_qnn_ctx_model related code const auto supported_nodes = GetSupportedNodes(graph_viewer, node_unit_map, @@ -777,11 +751,14 @@ QNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer bool is_valid_partition = true; size_t nodes_in_partition = 0; - if (partition && partition->sub_graph) { - nodes_in_partition = partition->sub_graph->nodes.size(); + if (partition && ComputeCapability__SubGraph(*partition)) { + const auto& subgraph = ComputeCapability__SubGraph(*partition); + const auto& subgraph_nodes = IndexedSubGraph__Nodes(*subgraph); + + nodes_in_partition = subgraph_nodes.size(); if (nodes_in_partition == 1 && !is_qnn_ctx_model) { - const Node* node = graph_viewer.GetNode(partition->sub_graph->nodes[0]); + const Node* node = graph_viewer.GetNode(subgraph_nodes[0]); if (!node) { LOGS(logger, ERROR) << "QNN EP: Invalid node in partition of one node."; @@ -850,34 +827,34 @@ Status QNNExecutionProvider::CreateComputeFunc(std::vector& nod void QNNExecutionProvider::InitQnnGraphConfigs(qnn::QnnConfigsBuilder& configs_builder) const { if (qnn_backend_manager_->GetQnnBackendType() == qnn::QnnBackendType::HTP) { if (htp_graph_finalization_opt_mode_ != qnn::HtpGraphFinalizationOptimizationMode::kDefault) { - QnnHtpGraph_CustomConfig_t& htp_graph_opt_config = configs_builder.PushCustomConfig(); - htp_graph_opt_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; - htp_graph_opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - htp_graph_opt_config.optimizationOption.floatValue = static_cast(htp_graph_finalization_opt_mode_); - - QnnGraph_Config_t& graph_opt_config = configs_builder.PushConfig(); - graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_opt_config.customConfig = &htp_graph_opt_config; + gsl::not_null htp_graph_opt_config = configs_builder.PushCustomConfig(); + htp_graph_opt_config->option = 
+      htp_graph_opt_config->optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
+      htp_graph_opt_config->optimizationOption.floatValue = static_cast<float>(htp_graph_finalization_opt_mode_);
+
+      gsl::not_null<QnnGraph_Config_t*> graph_opt_config = configs_builder.PushConfig();
+      graph_opt_config->option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+      graph_opt_config->customConfig = htp_graph_opt_config;
     }
 
     if (vtcm_size_in_mb_ > 0) {
-      QnnHtpGraph_CustomConfig_t& htp_graph_opt_config_vtcm = configs_builder.PushCustomConfig();
-      htp_graph_opt_config_vtcm.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
-      htp_graph_opt_config_vtcm.vtcmSizeInMB = static_cast<uint32_t>(vtcm_size_in_mb_);
+      gsl::not_null<QnnHtpGraph_CustomConfig_t*> htp_graph_opt_config_vtcm = configs_builder.PushCustomConfig();
+      htp_graph_opt_config_vtcm->option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
+      htp_graph_opt_config_vtcm->vtcmSizeInMB = static_cast<uint32_t>(vtcm_size_in_mb_);
 
-      QnnGraph_Config_t& graph_opt_config_vtcm = configs_builder.PushConfig();
-      graph_opt_config_vtcm.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
-      graph_opt_config_vtcm.customConfig = &htp_graph_opt_config_vtcm;
+      gsl::not_null<QnnGraph_Config_t*> graph_opt_config_vtcm = configs_builder.PushConfig();
+      graph_opt_config_vtcm->option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+      graph_opt_config_vtcm->customConfig = htp_graph_opt_config_vtcm;
     }
 
     if (enable_HTP_FP16_precision_) {
-      QnnHtpGraph_CustomConfig_t& htp_graph_precision_config = configs_builder.PushCustomConfig();
-      htp_graph_precision_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION;
-      htp_graph_precision_config.precision = QNN_PRECISION_FLOAT16;
+      gsl::not_null<QnnHtpGraph_CustomConfig_t*> htp_graph_precision_config = configs_builder.PushCustomConfig();
+      htp_graph_precision_config->option = QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION;
+      htp_graph_precision_config->precision = QNN_PRECISION_FLOAT16;
 
-      QnnGraph_Config_t& graph_precision_config = configs_builder.PushConfig();
-      graph_precision_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
-      graph_precision_config.customConfig = &htp_graph_precision_config;
+      gsl::not_null<QnnGraph_Config_t*> graph_precision_config = configs_builder.PushConfig();
+      graph_precision_config->option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+      graph_precision_config->customConfig = htp_graph_precision_config;
     }
   }
 }
@@ -933,10 +910,10 @@ Status QNNExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fused_nodes_and_graphs,
   if (EpSharedContextsHasAllGraphs(fused_nodes_and_graphs, logger)) {
     for (auto fused_node_and_graph : fused_nodes_and_graphs) {
       const onnxruntime::GraphViewer& graph_viewer(fused_node_and_graph.filtered_graph);
-      const auto& ep_context_node = graph_viewer.Nodes().begin();
+      const Node& ep_context_node = *graph_viewer.Nodes().begin();
       const Node& fused_node = fused_node_and_graph.fused_node;
       const std::string& graph_meta_id = fused_node.Name();
-      std::string key = ep_context_node->Name();
+      std::string key = ep_context_node.Name();
       auto qnn_model_shared = SharedContext::GetInstance().GetSharedQnnModel(key);
       ORT_RETURN_IF(nullptr == qnn_model_shared, "Graph: " + key + " not found from shared EP contexts.");
       ORT_RETURN_IF_ERROR(qnn_model_shared->SetGraphInputOutputInfo(graph_viewer, fused_node, logger));
@@ -978,10 +955,10 @@ Status QNNExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fused_nodes_and_graphs,
 
     for (auto fused_node_and_graph : fused_nodes_and_graphs) {
       const onnxruntime::GraphViewer& graph_viewer(fused_node_and_graph.filtered_graph);
-      const auto& ep_context_node = graph_viewer.Nodes().begin();
+      const Node& ep_context_node = *graph_viewer.Nodes().begin();
       const Node& fused_node = fused_node_and_graph.fused_node;
       const std::string& graph_meta_id = fused_node.Name();
-      std::string key = ep_context_node->Name();
+      std::string key = ep_context_node.Name();
       ORT_RETURN_IF(qnn_models.find(key) == qnn_models.end(), key + " key name not exist in table qnn_models.");
       auto qnn_model = std::move(qnn_models[key]);
       ORT_RETURN_IF_ERROR(qnn_model->SetGraphInputOutputInfo(graph_viewer, fused_node, logger));
@@ -1022,7 +999,7 @@ Status QNNExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fused_nodes_and_graphs,
                                                          buffer_size,
                                                          max_spill_fill_buffer_size));
   }
-  qnn_ep_context_model_ = std::make_unique<Model>("qnn_ep_context_model", false, logger);
+  qnn_ep_context_model_ = Factory<Model>::Create(std::string{"qnn_ep_context_model"}, false, logger);
   ORT_RETURN_IF_ERROR(qnn::CreateEPContextNodes(qnn_ep_context_model_.get(),
                                                 context_buffer.get(),
                                                 buffer_size,
@@ -1041,8 +1018,8 @@ const InlinedVector<const Node*> QNNExecutionProvider::GetEpContextNodes() const
   InlinedVector<const Node*> ep_context_nodes;
   if (qnn_ep_context_model_) {
     const auto& graph = qnn_ep_context_model_->MainGraph();
-    for (const auto& node : graph.Nodes()) {
-      ep_context_nodes.push_back(graph.GetNode(node.Index()));
+    for (gsl::not_null<const Node*> node : Graph__Nodes(graph)) {
+      ep_context_nodes.push_back(graph.GetNode(node->Index()));
     }
   }
 
@@ -1133,22 +1110,34 @@ void QNNExecutionProvider::ReleasePerThreadContext() const {
 
   per_thread_context_cache->erase(cached_context_it);
 }
 
+static bool TryGetConfigEntry(const ConfigOptions& config_options, const std::string& key, std::string& value) {
+  std::optional<std::string> new_value = config_options.GetConfigEntry(key);
+  if (!new_value.has_value()) {
+    return false;
+  }
+
+  value = *new_value;
+  return true;
+}
+
 Status QNNExecutionProvider::OnRunStart(const onnxruntime::RunOptions& run_options) {
   auto backend_type = qnn_backend_manager_->GetQnnBackendType();
   if (qnn::QnnBackendType::HTP != backend_type && qnn::QnnBackendType::DSP != backend_type) {
     return Status::OK();
   }
 
+  const ConfigOptions& config_options = RunOptions__GetConfigOptions(run_options);
+
   std::string htp_perf_mode = "";
   qnn::HtpPerformanceMode htp_performance_mode = qnn::HtpPerformanceMode::kHtpDefault;
-  if (run_options.config_options.TryGetConfigEntry(kOrtRunOptionsConfigQnnPerfMode, htp_perf_mode)) {
+  if (TryGetConfigEntry(config_options, kOrtRunOptionsConfigQnnPerfMode, htp_perf_mode)) {
     // set power mode
     ParseHtpPerformanceMode(htp_perf_mode, htp_performance_mode);
   }
 
   std::string rpc_latency = "";
   uint32_t rpc_control_latency = 0;
-  if (run_options.config_options.TryGetConfigEntry(kOrtRunOptionsConfigQnnRpcControlLatency, rpc_latency)) {
+  if (TryGetConfigEntry(config_options, kOrtRunOptionsConfigQnnRpcControlLatency, rpc_latency)) {
     rpc_control_latency = static_cast<uint32_t>(std::stoul(rpc_latency));
     LOGS_DEFAULT(VERBOSE) << "rpc_control_latency: " << rpc_control_latency;
   }
@@ -1174,9 +1163,11 @@ Status QNNExecutionProvider::OnRunEnd(bool /*sync_stream*/, const onnxruntime::RunOptions& run_options) {
     return Status::OK();
   }
 
+  const ConfigOptions& config_options = RunOptions__GetConfigOptions(run_options);
+
   std::string htp_perf_mode = "";
   qnn::HtpPerformanceMode htp_performance_mode = qnn::HtpPerformanceMode::kHtpDefault;
-  if (run_options.config_options.TryGetConfigEntry(kOrtRunOptionsConfigQnnPerfModePostRun, htp_perf_mode)) {
+  if (TryGetConfigEntry(config_options, kOrtRunOptionsConfigQnnPerfModePostRun, htp_perf_mode)) {
     // set power mode
     ParseHtpPerformanceMode(htp_perf_mode, htp_performance_mode);
   }
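OnRunStart and OnRunEnd read these keys from the per-run configuration; a caller-side sketch of driving them through the public C++ API, with illustrative values:

// Sketch only: the run-option keys consumed above are set by applications
// through Ort::RunOptions before calling Session::Run.
Ort::RunOptions run_options;
run_options.AddConfigEntry(kOrtRunOptionsConfigQnnPerfMode, "burst");
run_options.AddConfigEntry(kOrtRunOptionsConfigQnnRpcControlLatency, "100");
// session.Run(run_options, ...) then applies the HTP power mode for this run.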
diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.h b/onnxruntime/core/providers/qnn/qnn_execution_provider.h
index 317b34e66a6e4..48f41c4da384f 100644
--- a/onnxruntime/core/providers/qnn/qnn_execution_provider.h
+++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.h
@@ -3,32 +3,25 @@
 
 #pragma once
 
-#include "core/framework/execution_provider.h"
-#include "core/framework/session_options.h"
-#include "core/framework/model_metadef_id_generator.h"
-#include "core/graph/model.h"
+#include
+#include
+#include
+#include
+#include
+
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/builder/qnn_backend_manager.h"
 #include "core/providers/qnn/builder/qnn_model.h"
 #include "core/providers/qnn/builder/qnn_configs_helper.h"
 #include "core/providers/qnn/rpcmem_library.h"
 
 #include "HTP/QnnHtpGraph.h"
 
-#include
-#include
-#include
-#include
-#include
-#ifdef _WIN32
-#include "core/platform/windows/logging/etw_sink.h"
-#endif
 
 namespace onnxruntime {
 
-void RunOnUnload(std::function<void()> function);
-
 // Logical device representation.
 class QNNExecutionProvider : public IExecutionProvider {
  public:
-  explicit QNNExecutionProvider(const ProviderOptions& provider_options_map, const SessionOptions* session_options);
+  explicit QNNExecutionProvider(const ProviderOptions& provider_options_map, const ConfigOptions* config_options);
   virtual ~QNNExecutionProvider();
   ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(QNNExecutionProvider);
 
@@ -90,14 +83,14 @@ class QNNExecutionProvider : public IExecutionProvider {
   bool qnn_context_embed_mode_ = true;
   int32_t vtcm_size_in_mb_ = 0;
   std::unique_ptr<onnxruntime::Model> qnn_ep_context_model_;
-  ModelMetadefIdGenerator metadef_id_generator_;
+  std::unique_ptr<ModelMetadefIdGenerator> metadef_id_generator_;
   uint32_t device_id_ = 0;
   qnn::HtpPerformanceMode default_htp_performance_mode_ = qnn::HtpPerformanceMode::kHtpDefault;
   uint32_t default_rpc_control_latency_ = 0;
   bool enable_HTP_FP16_precision_ = true;
   bool share_ep_contexts_ = false;
   bool enable_spill_fill_buffer_ = false;
-#ifdef _WIN32
+#if defined(_WIN32)
   onnxruntime::logging::EtwRegistrationManager::EtwInternalCallback callback_ETWSink_provider_ = nullptr;
 #endif
   qnn::ModelSettings model_settings_ = {};
diff --git a/onnxruntime/core/providers/qnn/qnn_provider_factory.cc b/onnxruntime/core/providers/qnn/qnn_provider_factory.cc
index 4095d7ff02a33..d4dd446751359 100644
--- a/onnxruntime/core/providers/qnn/qnn_provider_factory.cc
+++ b/onnxruntime/core/providers/qnn/qnn_provider_factory.cc
@@ -2,32 +2,68 @@
 // Licensed under the MIT License
 
 #include "core/providers/qnn/qnn_provider_factory_creator.h"
-
-#include "core/session/abi_session_options_impl.h"
 #include "core/providers/qnn/qnn_execution_provider.h"
-#include "core/session/ort_apis.h"
 
 namespace onnxruntime {
 
 struct QNNProviderFactory : IExecutionProviderFactory {
-  QNNProviderFactory(const ProviderOptions& provider_options_map, const SessionOptions* session_options)
-      : provider_options_map_(provider_options_map), session_options_(session_options) {
+  QNNProviderFactory(const ProviderOptions& provider_options_map, const ConfigOptions* config_options)
+      : provider_options_map_(provider_options_map), config_options_(config_options) {
   }
 
   ~QNNProviderFactory() override {
   }
 
   std::unique_ptr<IExecutionProvider> CreateProvider() override {
-    return std::make_unique<QNNExecutionProvider>(provider_options_map_, session_options_);
+    return std::make_unique<QNNExecutionProvider>(provider_options_map_, config_options_);
   }
 
 private:
  ProviderOptions provider_options_map_;
-  const SessionOptions* session_options_;
+  const ConfigOptions* config_options_;
 };
 
+#if BUILD_QNN_EP_STATIC_LIB
 std::shared_ptr<IExecutionProviderFactory> QNNProviderFactoryCreator::Create(const ProviderOptions& provider_options_map,
                                                                              const SessionOptions* session_options) {
-  return std::make_shared<QNNProviderFactory>(provider_options_map, session_options);
+  const ConfigOptions* config_options = nullptr;
+  if (session_options != nullptr) {
+    config_options = &session_options->config_options;
+  }
+
+  return std::make_shared<QNNProviderFactory>(provider_options_map, config_options);
 }
+#else
+struct QNN_Provider : Provider {
+  std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory(const void* param) override {
+    if (param == nullptr) {
+      LOGS_DEFAULT(ERROR) << "[QNN EP] Passed NULL options to CreateExecutionProviderFactory()";
+      return nullptr;
+    }
+
+    std::array<const void*, 2> pointers_array = *reinterpret_cast<const std::array<const void*, 2>*>(param);
+    const ProviderOptions* provider_options = reinterpret_cast<const ProviderOptions*>(pointers_array[0]);
+    const ConfigOptions* config_options = reinterpret_cast<const ConfigOptions*>(pointers_array[1]);
+
+    if (provider_options == nullptr) {
+      LOGS_DEFAULT(ERROR) << "[QNN EP] Passed NULL ProviderOptions to CreateExecutionProviderFactory()";
+      return nullptr;
+    }
+
+    return std::make_shared<QNNProviderFactory>(*provider_options, config_options);
+  }
+
+  void Initialize() override {}
+  void Shutdown() override {}
+} g_provider;
+#endif  // BUILD_QNN_EP_STATIC_LIB
 
 }  // namespace onnxruntime
+
+#if !BUILD_QNN_EP_STATIC_LIB
+extern "C" {
+
+ORT_API(onnxruntime::Provider*, GetProvider) {
+  return &onnxruntime::g_provider;
+}
+}
+#endif  // !BUILD_QNN_EP_STATIC_LIB
diff --git a/onnxruntime/core/providers/qnn/qnn_provider_factory_creator.h b/onnxruntime/core/providers/qnn/qnn_provider_factory_creator.h
index 80f9d99b804e7..46b6c15b40553 100644
--- a/onnxruntime/core/providers/qnn/qnn_provider_factory_creator.h
+++ b/onnxruntime/core/providers/qnn/qnn_provider_factory_creator.h
@@ -11,6 +11,9 @@
 namespace onnxruntime {
 struct SessionOptions;
 
+// Defined in core/session/provider_bridge_ort.cc if built as a shared library (default build config).
+// Defined in core/providers/qnn/qnn_provider_factory.cc if built as a static library.
+// The preprocessor macro `BUILD_QNN_EP_STATIC_LIB` is defined and set to 1 if QNN is built as a static library.
 struct QNNProviderFactoryCreator {
   static std::shared_ptr<IExecutionProviderFactory> Create(const ProviderOptions& provider_options_map,
                                                            const SessionOptions* session_options);
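Whichever path reaches the factory, applications select the EP the same way; a minimal caller sketch with illustrative options:

// Sketch only: enabling QNN EP through the public C++ API works the same
// whether the EP is linked statically or loaded as a shared library.
Ort::SessionOptions session_options;
std::unordered_map<std::string, std::string> qnn_options;
qnn_options["backend_path"] = "QnnHtp.dll";  // illustrative backend choice
session_options.AppendExecutionProvider("QNN", qnn_options);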
diff --git a/onnxruntime/core/providers/qnn/qnn_telemetry.cc b/onnxruntime/core/providers/qnn/qnn_telemetry.cc
new file mode 100644
index 0000000000000..b2c8350bfe8ca
--- /dev/null
+++ b/onnxruntime/core/providers/qnn/qnn_telemetry.cc
@@ -0,0 +1,211 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/providers/qnn/qnn_telemetry.h"
+
+#ifdef _WIN32
+#if !BUILD_QNN_EP_STATIC_LIB
+// ETW includes
+// need space after Windows.h to prevent clang-format re-ordering breaking the build.
+// TraceLoggingProvider.h must follow Windows.h
+#include <Windows.h>
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 26440)  // Warning C26440 from TRACELOGGING_DEFINE_PROVIDER
+#endif
+
+#include <TraceLoggingProvider.h>
+#include <winmeta.h>
+#include <evntrace.h>
+#include "core/platform/windows/TraceLoggingConfig.h"
+
+// Seems this workaround can be dropped when we drop support for VS2017 toolchains
+// https://developercommunity.visualstudio.com/content/problem/85934/traceloggingproviderh-is-incompatible-with-utf-8.html
+#ifdef _TlgPragmaUtf8Begin
+#undef _TlgPragmaUtf8Begin
+#define _TlgPragmaUtf8Begin
+#endif
+
+#ifdef _TlgPragmaUtf8End
+#undef _TlgPragmaUtf8End
+#define _TlgPragmaUtf8End
+#endif
+
+// Different versions of TraceLoggingProvider.h contain different macro variable names for the utf8 begin and end,
+// and we need to cover the lower case version as well.
+#ifdef _tlgPragmaUtf8Begin
+#undef _tlgPragmaUtf8Begin
+#define _tlgPragmaUtf8Begin
+#endif
+
+#ifdef _tlgPragmaUtf8End
+#undef _tlgPragmaUtf8End
+#define _tlgPragmaUtf8End
+#endif
+
+TRACELOGGING_DEFINE_PROVIDER(telemetry_provider_handle, "Microsoft.ML.ONNXRuntime",
+                             // {3a26b1ff-7484-7484-7484-15261f42614d}
+                             (0x3a26b1ff, 0x7484, 0x7484, 0x74, 0x84, 0x15, 0x26, 0x1f, 0x42, 0x61, 0x4d),
+                             TraceLoggingOptionMicrosoftTelemetry());
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+#endif  // !BUILD_QNN_EP_STATIC_LIB
+
+#include "core/providers/qnn/ort_api.h"
+
+namespace onnxruntime {
+namespace qnn {
+
+#if !BUILD_QNN_EP_STATIC_LIB
+std::mutex QnnTelemetry::mutex_;
+std::mutex QnnTelemetry::provider_change_mutex_;
+uint32_t QnnTelemetry::global_register_count_ = 0;
+bool QnnTelemetry::enabled_ = true;
+UCHAR QnnTelemetry::level_ = 0;
+UINT64 QnnTelemetry::keyword_ = 0;
+std::vector<const QnnTelemetry::EtwInternalCallback*> QnnTelemetry::callbacks_;
+std::mutex QnnTelemetry::callbacks_mutex_;
+#endif  // !BUILD_QNN_EP_STATIC_LIB
+
+QnnTelemetry::QnnTelemetry() {
+#if !BUILD_QNN_EP_STATIC_LIB
+  std::lock_guard<std::mutex> lock(mutex_);
+  if (global_register_count_ == 0) {
+    // TraceLoggingRegister is fancy in that you can only register once GLOBALLY for the whole process
+    HRESULT hr = TraceLoggingRegisterEx(telemetry_provider_handle, ORT_TL_EtwEnableCallback, nullptr);
+    if (SUCCEEDED(hr)) {
+      global_register_count_ += 1;
+    }
+  }
+#endif  // !BUILD_QNN_EP_STATIC_LIB
+}
+
+QnnTelemetry::~QnnTelemetry() {
+#if !BUILD_QNN_EP_STATIC_LIB
+  std::lock_guard<std::mutex> lock(mutex_);
+  if (global_register_count_ > 0) {
+    global_register_count_ -= 1;
+    if (global_register_count_ == 0) {
+      TraceLoggingUnregister(telemetry_provider_handle);
+    }
+  }
+
+  std::lock_guard<std::mutex> lock_callbacks(callbacks_mutex_);
+  callbacks_.clear();
+#endif  // !BUILD_QNN_EP_STATIC_LIB
+}
+
+QnnTelemetry& QnnTelemetry::Instance() {
+  static QnnTelemetry instance;
+  return instance;
+}
+
+bool QnnTelemetry::IsEnabled() const {
+#if BUILD_QNN_EP_STATIC_LIB
+  const Env& env = GetDefaultEnv();
+  auto& provider = env.GetTelemetryProvider();
+  return provider.IsEnabled();
+#else
+  std::lock_guard<std::mutex> lock(provider_change_mutex_);
+  return enabled_;
+#endif
+}
+
+UCHAR QnnTelemetry::Level() const {
+#if BUILD_QNN_EP_STATIC_LIB
+  const Env& env = GetDefaultEnv();
+  auto& provider = env.GetTelemetryProvider();
+  return provider.Level();
+#else
+  std::lock_guard<std::mutex> lock(provider_change_mutex_);
+  return level_;
+#endif
+}
+
+UINT64 QnnTelemetry::Keyword() const {
+#if BUILD_QNN_EP_STATIC_LIB
+  const Env& env = GetDefaultEnv();
+  auto& provider = env.GetTelemetryProvider();
+  return provider.Keyword();
+#else
+  std::lock_guard<std::mutex> lock(provider_change_mutex_);
+  return keyword_;
+#endif
+}
+
+void QnnTelemetry::LogQnnProfileEvent(uint64_t timestamp,
+                                      const std::string& message,
+                                      const std::string& qnnScalarValue,
+                                      const std::string& unit,
+                                      const std::string& timingSource,
+                                      const std::string& eventLevel,
+                                      const char* eventIdentifier) const {
+  TraceLoggingWrite(
+      telemetry_provider_handle,
+      "QNNProfilingEvent",
+      TraceLoggingKeyword(static_cast<uint64_t>(onnxruntime::logging::ORTTraceLoggingKeyword::Profiling)),
+      TraceLoggingLevel(WINEVENT_LEVEL_VERBOSE),
+      TraceLoggingValue(timestamp, "Timestamp"),
+      TraceLoggingString(message.c_str(), "Message"),
+      TraceLoggingString(qnnScalarValue.c_str(), "Value"),
+      TraceLoggingString(unit.c_str(), "Unit of Measurement"),
+      TraceLoggingString(timingSource.c_str(), "Timing Source"),
+      TraceLoggingString(eventLevel.c_str(), "Event Level"),
+      TraceLoggingString(eventIdentifier, "Event Identifier"));
+}
+
+void QnnTelemetry::RegisterInternalCallback(const EtwInternalCallback& callback) {
+#if BUILD_QNN_EP_STATIC_LIB
+  WindowsTelemetry::RegisterInternalCallback(callback);
+#else
+  std::lock_guard<std::mutex> lock_callbacks(callbacks_mutex_);
+  callbacks_.push_back(&callback);
+#endif
+}
+
+void QnnTelemetry::UnregisterInternalCallback(const EtwInternalCallback& callback) {
+#if BUILD_QNN_EP_STATIC_LIB
+  WindowsTelemetry::UnregisterInternalCallback(callback);
+#else
+  std::lock_guard<std::mutex> lock_callbacks(callbacks_mutex_);
+  auto new_end = std::remove_if(callbacks_.begin(), callbacks_.end(),
+                                [&callback](const EtwInternalCallback* ptr) {
+                                  return ptr == &callback;
+                                });
+  callbacks_.erase(new_end, callbacks_.end());
+#endif
+}
+
+#if !BUILD_QNN_EP_STATIC_LIB
+void NTAPI QnnTelemetry::ORT_TL_EtwEnableCallback(
+    _In_ LPCGUID SourceId,
+    _In_ ULONG IsEnabled,
+    _In_ UCHAR Level,
+    _In_ ULONGLONG MatchAnyKeyword,
+    _In_ ULONGLONG MatchAllKeyword,
+    _In_opt_ PEVENT_FILTER_DESCRIPTOR FilterData,
+    _In_opt_ PVOID CallbackContext) {
+  std::lock_guard<std::mutex> lock(provider_change_mutex_);
+  enabled_ = (IsEnabled != 0);
+  level_ = Level;
+  keyword_ = MatchAnyKeyword;
+
+  InvokeCallbacks(SourceId, IsEnabled, Level, MatchAnyKeyword, MatchAllKeyword, FilterData, CallbackContext);
+}
+
+void QnnTelemetry::InvokeCallbacks(LPCGUID SourceId, ULONG IsEnabled, UCHAR Level, ULONGLONG MatchAnyKeyword,
+                                   ULONGLONG MatchAllKeyword, PEVENT_FILTER_DESCRIPTOR FilterData,
+                                   PVOID CallbackContext) {
+  std::lock_guard<std::mutex> lock_callbacks(callbacks_mutex_);
+  for (const auto& callback : callbacks_) {
+    (*callback)(SourceId, IsEnabled, Level, MatchAnyKeyword, MatchAllKeyword, FilterData, CallbackContext);
+  }
+}
+#endif  // !BUILD_QNN_EP_STATIC_LIB
+
+}  // namespace qnn
+}  // namespace onnxruntime
+#endif  // defined(_WIN32)
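A usage sketch for the singleton above; the field values are illustrative:

// Sketch only: emitting one QNN profiling event when the provider is enabled.
auto& telemetry = onnxruntime::qnn::QnnTelemetry::Instance();
if (telemetry.IsEnabled()) {
  telemetry.LogQnnProfileEvent(/*timestamp*/ 1234567890ULL, "RunBackend", "42", "us",
                               "BACKEND", "SUB_EVENT", "Conv_0");
}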
diff --git a/onnxruntime/core/providers/qnn/qnn_telemetry.h b/onnxruntime/core/providers/qnn/qnn_telemetry.h
new file mode 100644
index 0000000000000..a2d42c518c1ac
--- /dev/null
+++ b/onnxruntime/core/providers/qnn/qnn_telemetry.h
@@ -0,0 +1,98 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#ifdef _WIN32
+#include <Windows.h>
+
+#if !BUILD_QNN_EP_STATIC_LIB
+#include <TraceLoggingProvider.h>
+#endif
+
+#include <functional>
+#include <mutex>
+#include <string>
+#include <vector>
+
+#include "core/providers/qnn/ort_api.h"
+
+#if !BUILD_QNN_EP_STATIC_LIB
+TRACELOGGING_DECLARE_PROVIDER(telemetry_provider_handle);
+#endif
+
+namespace onnxruntime {
+namespace qnn {
+
+/// <summary>
+/// Singleton class used to log QNN profiling events to the ONNX Runtime telemetry tracelogging provider.
+///
+/// When QNN EP is a DLL, we must define our own tracelogging provider handle via TRACELOGGING_DEFINE_PROVIDER.
+/// TraceLogging documentation states that separate DLLs cannot share the same tracelogging provider handle. See:
+/// https://learn.microsoft.com/en-us/windows/win32/api/traceloggingprovider/nf-traceloggingprovider-tracelogging_define_provider#remarks
+///
+/// When QNN EP is a static library, we use the tracelogging provider handle already defined
+/// in core/platform/windows/telemetry.h/.cc. In this case, we forward method calls to the
+/// ORT Env's telemetry provider.
+/// </summary>
+class QnnTelemetry {
+ public:
+  static QnnTelemetry& Instance();
+  bool IsEnabled() const;
+
+  // Get the current logging level
+  unsigned char Level() const;
+
+  // Get the current keyword
+  UINT64 Keyword() const;
+
+  // Logs QNN profiling event as trace logging event.
+  void LogQnnProfileEvent(uint64_t timestamp,
+                          const std::string& message,
+                          const std::string& qnnScalarValue,
+                          const std::string& unit,
+                          const std::string& timingSource,
+                          const std::string& eventLevel,
+                          const char* eventIdentifier) const;
+
+  using EtwInternalCallback = std::function<void(LPCGUID SourceId, ULONG IsEnabled, UCHAR Level,
+                                                 ULONGLONG MatchAnyKeyword, ULONGLONG MatchAllKeyword,
+                                                 PEVENT_FILTER_DESCRIPTOR FilterData, PVOID CallbackContext)>;
+
+  static void RegisterInternalCallback(const EtwInternalCallback& callback);
+
+  static void UnregisterInternalCallback(const EtwInternalCallback& callback);
+
+ private:
+  QnnTelemetry();
+  ~QnnTelemetry();
+  ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(QnnTelemetry);
+
+#if !BUILD_QNN_EP_STATIC_LIB
+  static std::mutex mutex_;
+  static uint32_t global_register_count_;
+  static bool enabled_;
+
+  static std::vector<const EtwInternalCallback*> callbacks_;
+  static std::mutex callbacks_mutex_;
+  static std::mutex provider_change_mutex_;
+  static UCHAR level_;
+  static ULONGLONG keyword_;
+
+  static void InvokeCallbacks(LPCGUID SourceId, ULONG IsEnabled, UCHAR Level, ULONGLONG MatchAnyKeyword,
+                              ULONGLONG MatchAllKeyword, PEVENT_FILTER_DESCRIPTOR FilterData, PVOID CallbackContext);
+
+  static void NTAPI ORT_TL_EtwEnableCallback(
+      _In_ LPCGUID SourceId,
+      _In_ ULONG IsEnabled,
+      _In_ UCHAR Level,
+      _In_ ULONGLONG MatchAnyKeyword,
+      _In_ ULONGLONG MatchAllKeyword,
+      _In_opt_ PEVENT_FILTER_DESCRIPTOR FilterData,
+      _In_opt_ PVOID CallbackContext);
+#endif
+};
+
+}  // namespace qnn
+}  // namespace onnxruntime
+
+#endif  // defined(_WIN32)
diff --git a/onnxruntime/core/providers/qnn/rpcmem_library.cc b/onnxruntime/core/providers/qnn/rpcmem_library.cc
index 59e6cff925668..93c5ed54ab371 100644
--- a/onnxruntime/core/providers/qnn/rpcmem_library.cc
+++ b/onnxruntime/core/providers/qnn/rpcmem_library.cc
@@ -2,9 +2,7 @@
 // Licensed under the MIT License
 
 #include "core/providers/qnn/rpcmem_library.h"
-
-#include "core/common/logging/logging.h"
-#include "core/platform/env.h"
+#include "core/providers/qnn/ort_api.h"
 
 namespace onnxruntime::qnn {
 
@@ -25,7 +23,7 @@ DynamicLibraryHandle LoadDynamicLibrary(const PathString& path, bool global_symbols) {
       return;
     }
 
-    const auto& env = Env::Default();
+    const auto& env = GetDefaultEnv();
     const auto unload_status = env.UnloadDynamicLibrary(library_handle);
 
     if (!unload_status.IsOK()) {
@@ -33,7 +31,7 @@ DynamicLibraryHandle LoadDynamicLibrary(const PathString& path, bool global_symbols) {
     }
   };
 
-  const auto& env = Env::Default();
+  const auto& env = GetDefaultEnv();
   void* library_handle = nullptr;
 
   const auto load_status = env.LoadDynamicLibrary(path, global_symbols, &library_handle);
@@ -47,7 +45,7 @@ DynamicLibraryHandle LoadDynamicLibrary(const PathString& path, bool global_symbols) {
 
 RpcMemApi CreateApi(void* library_handle) {
   RpcMemApi api{};
 
-  const auto& env = Env::Default();
+  const auto& env = GetDefaultEnv();
   ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(library_handle, "rpcmem_alloc", (void**)&api.alloc));
 
   ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(library_handle, "rpcmem_free", (void**)&api.free));
diff --git a/onnxruntime/core/providers/qnn/rpcmem_library.h b/onnxruntime/core/providers/qnn/rpcmem_library.h
index d5697ff298e79..0642c96798188 100644
--- a/onnxruntime/core/providers/qnn/rpcmem_library.h
+++ b/onnxruntime/core/providers/qnn/rpcmem_library.h
@@ -6,7 +6,7 @@
 #include
 #include
 
-#include "core/common/common.h"
+#include "core/providers/qnn/ort_api.h"
 
 namespace onnxruntime::qnn {
diff --git a/onnxruntime/core/providers/qnn/shared_context.h b/onnxruntime/core/providers/qnn/shared_context.h
index a111e57038304..81de357dbe677 100644
--- a/onnxruntime/core/providers/qnn/shared_context.h
+++ b/onnxruntime/core/providers/qnn/shared_context.h
@@ -5,7 +5,7 @@
 #include
 #include
 
-#include "core/common/common.h"
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/builder/qnn_model.h"
 
 #pragma once
diff --git a/onnxruntime/core/providers/qnn/symbols.def b/onnxruntime/core/providers/qnn/symbols.def
new file mode 100644
index 0000000000000..4ec2f7914c208
--- /dev/null
+++ b/onnxruntime/core/providers/qnn/symbols.def
@@ -0,0 +1,2 @@
+EXPORTS
+  GetProvider
diff --git a/onnxruntime/core/providers/qnn/version_script.lds b/onnxruntime/core/providers/qnn/version_script.lds
new file mode 100644
index 0000000000000..094abb3329781
--- /dev/null
+++ b/onnxruntime/core/providers/qnn/version_script.lds
@@ -0,0 +1,9 @@
+#_init and _fini should be local
+VERS_1.0 {
+  global:
+    GetProvider;
+
+  # Hide everything else.
+  local:
+    *;
+};
diff --git a/onnxruntime/core/providers/rknpu/node_attr_helper.h b/onnxruntime/core/providers/rknpu/node_attr_helper.h
index 6ab8f8c6bb953..76a0c721f70aa 100644
--- a/onnxruntime/core/providers/rknpu/node_attr_helper.h
+++ b/onnxruntime/core/providers/rknpu/node_attr_helper.h
@@ -4,7 +4,7 @@
 
 #pragma once
 
-#include
+#include "core/graph/onnx_protobuf.h"
 
 #include
 #include
diff --git a/onnxruntime/core/providers/rknpu/onnx_converter.h b/onnxruntime/core/providers/rknpu/onnx_converter.h
index e90efd75b9c7f..10cc09a9dba92 100644
--- a/onnxruntime/core/providers/rknpu/onnx_converter.h
+++ b/onnxruntime/core/providers/rknpu/onnx_converter.h
@@ -2,7 +2,7 @@
 
 #pragma once
 
-#include
+#include "core/graph/onnx_protobuf.h"
 
 #include
 #include
diff --git a/onnxruntime/core/providers/shared_library/provider_api.h b/onnxruntime/core/providers/shared_library/provider_api.h
index 45f81ed22b7f7..6ff2572e5e668 100644
--- a/onnxruntime/core/providers/shared_library/provider_api.h
+++ b/onnxruntime/core/providers/shared_library/provider_api.h
@@ -9,6 +9,11 @@
 #pragma once
 #define SHARED_PROVIDER 1
 
+#ifdef _WIN32
+#include
+#include
+#endif  // defined(_WIN32)
+
 #include
 #include
 #include
@@ -136,6 +141,17 @@ enum class DataType {
   USER = 1  ///< Contains potentially sensitive user data.
 };
 
+enum class ORTTraceLoggingKeyword : uint64_t {
+  Session = 0x1,     // ORT Session TraceLoggingWrite
+  Logs = 0x2,        // LOGS() Macro ORT logs. Pair with an appropriate level depending on detail required
+  Reserved1 = 0x4,   // Reserved if we want to add some specific sub-categories instead of just LOGS() or other uses
+  Reserved2 = 0x8,
+  Reserved3 = 0x10,
+  Reserved4 = 0x20,
+  Reserved5 = 0x40,
+  Reserved6 = 0x80,
+  Profiling = 0x100  // Enables profiling. At higher levels >5 can impact inference performance
+};
 }  // namespace logging
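A short sketch of how a callback can test this keyword mask, mirroring the checks in qnn_execution_provider.cc; the variable name is illustrative:

// Sketch only: test whether an ETW consumer requested QNN profiling events.
const uint64_t profiling_mask = static_cast<uint64_t>(logging::ORTTraceLoggingKeyword::Profiling);
if ((match_any_keyword & profiling_mask) != 0) {
  // profiling requested via ETW
}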
 
 // OnnxRuntime Types (these are the internal types)
 struct CPUIDInfo;
 namespace logging {
 struct Logger;
 struct Capture;
+#ifdef _WIN32
+struct EtwRegistrationManager;
+using EtwRegistrationManager_EtwInternalCallback =
+    std::function<void(LPCGUID SourceId, ULONG IsEnabled, UCHAR Level, ULONGLONG MatchAnyKeyword,
+                       ULONGLONG MatchAllKeyword, PEVENT_FILTER_DESCRIPTOR FilterData, PVOID CallbackContext)>;
+#endif
 }  // namespace logging
 struct ComputeCapability;
 struct ConfigOptions;
@@ -157,10 +180,12 @@ struct KernelRegistry;
 struct Function;
 struct Graph;
 class GraphViewer;
+struct ConstGraphNodes;
 enum class DataLayout;
 struct Model;
 struct Path;
 struct Node;
+struct Node_EdgeEnd;
 struct NodeArg;
 struct NodeAttributes;
 struct NodeUnitIODef;
@@ -215,6 +240,7 @@ using DeleteFunc = void (*)(void*);
 using NodeArgInfo = ONNX_NAMESPACE::ValueInfoProto;
 
 using NameMLValMap = std::unordered_map<std::string, OrtValue>;
+
 }  // namespace onnxruntime
 
 #include "core/platform/threadpool.h"
@@ -368,6 +394,28 @@ template <>
 constexpr ONNXTensorElementDataType GetONNXTensorElementDataType<UInt4x2>() {
   return ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT4;
 }
+
+inline std::vector<std::unique_ptr<ComputeCapability>>
+CreateSupportedPartitions(const GraphViewer& graph_viewer,
+                          const std::unordered_set<const Node*>& supported_nodes,
+                          const std::unordered_set<std::string>& stop_ops,
+                          const std::function<std::string()>& generate_metadef_name,
+                          const std::string& execution_provider_name,
+                          const std::string& execution_provider_type,
+                          const std::unordered_map<const Node*, const NodeUnit*>* node_unit_map,
+                          bool drop_constant_initializers = false) {
+  return g_host->Utils__CreateSupportedPartitions(graph_viewer, supported_nodes, stop_ops, generate_metadef_name,
+                                                  execution_provider_name, execution_provider_type, node_unit_map,
+                                                  drop_constant_initializers);
+}
+
+inline std::unique_ptr<ComputeCapability> MakeComputeCapability(const GraphViewer& graph_viewer,
+                                                                const std::vector<const Node*>& group,
+                                                                const std::function<std::string()>& generate_metadef_name,
+                                                                const std::string& execution_provider_name,
+                                                                bool drop_constant_initializers) {
+  return g_host->Utils__MakeComputeCapability(graph_viewer, group, generate_metadef_name,
+                                              execution_provider_name, drop_constant_initializers);
+}
 }  // namespace utils
 
 namespace QDQ {
@@ -381,6 +429,10 @@ GetAllNodeUnits(const GraphViewer* graph_viewer, const logging::Logger& logger);
 
 // So the C API (and C++) becomes available when ORT_API_MANUAL_INIT is used.
 void InitProviderOrtApi();
+inline Env& GetDefaultEnv() { + return g_host->Env__Default(); +} } // namespace onnxruntime #define CREATE_MESSAGE(logger, severity, category, datatype) \ diff --git a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc index aa8c367d25d51..4c050534456da 100644 --- a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc +++ b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc @@ -505,6 +505,9 @@ Status UnpackInitializerData(const ONNX_NAMESPACE::TensorProto& tensor, const st /*out*/ std::vector& unpacked_tensor) { return g_host->UnpackInitializerData(tensor, model_path, unpacked_tensor); } +Status UnpackInitializerData(const ONNX_NAMESPACE::TensorProto& tensor, /*out*/ std::vector& unpacked_tensor) { + return g_host->UnpackInitializerData(tensor, std::filesystem::path(), unpacked_tensor); +} } // namespace utils @@ -788,5 +791,5 @@ std::string ToUTF8String(const std::wstring& s) { std::wstring ToWideString(const std::string& s) { return g_host->ToWideString(s); } -#endif +#endif // _WIN32 } // namespace onnxruntime diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index 5a179ec622f8c..962d10d8952d6 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -120,11 +120,20 @@ struct Node__EdgeIterator { virtual bool operator!=(const Node__EdgeIterator& p) const = 0; virtual void operator++() = 0; + virtual const Node_EdgeEnd& operator*() const = 0; virtual const Node& GetNode() const = 0; virtual int GetSrcArgIndex() const = 0; virtual int GetDstArgIndex() const = 0; }; +struct ConstGraphNodes_Iterator { + virtual ~ConstGraphNodes_Iterator() {} + + virtual bool operator!=(const ConstGraphNodes_Iterator& other) const = 0; + virtual void operator++() = 0; + virtual const Node& operator*() = 0; +}; + // There are two ways to route a function, one is a virtual method and the other is a function pointer (or pointer to // member function). 
// The function pointers are nicer in that they directly call the target function, but they cannot be used in cases @@ -273,20 +282,41 @@ struct ProviderHost { // logging::Logger virtual bool logging__Logger__OutputIsEnabled(const logging::Logger* p, logging::Severity severity, logging::DataType data_type) = 0; + virtual logging::Severity logging__Logger__GetSeverity(const logging::Logger* p) = 0; // logging::LoggingManager virtual const logging::Logger& logging__LoggingManager__DefaultLogger() = 0; + virtual bool logging__LoggingManager__HasDefaultLogger() = 0; // logging::Capture - virtual std::unique_ptr logging__Capture__construct(const logging::Logger& logger, logging::Severity severity, const char* category, logging::DataType dataType, const CodeLocation& location) = 0; + virtual std::unique_ptr logging__Capture__construct(const logging::Logger& logger, + logging::Severity severity, + const char* category, + logging::DataType data_type, + const CodeLocation& location) = 0; virtual void logging__Capture__operator_delete(logging::Capture* p) noexcept = 0; virtual std::ostream& logging__Capture__Stream(logging::Capture* p) noexcept = 0; + virtual void logging__Capture__ProcessPrintf(logging::Capture* p, const char* format, va_list args) = 0; + +#if defined(_WIN32) + // logging::EtwRegistrationManager + virtual logging::EtwRegistrationManager& logging__EtwRegistrationManager__Instance() = 0; + virtual bool logging__EtwRegistrationManager__SupportsETW() = 0; + virtual logging::Severity logging__EtwRegistrationManager__MapLevelToSeverity(logging::EtwRegistrationManager* p) = 0; + virtual void logging__EtwRegistrationManager__RegisterInternalCallback( + logging::EtwRegistrationManager* p, + const logging::EtwRegistrationManager_EtwInternalCallback& callback) = 0; + virtual void logging__EtwRegistrationManager__UnregisterInternalCallback( + logging::EtwRegistrationManager* p, + const logging::EtwRegistrationManager_EtwInternalCallback& callback) = 0; +#endif // defined(_WIN32) // Env virtual Env& Env__Default() = 0; // Utils::DataTypeUtils virtual const std::string* Utils__DataTypeUtils__ToType(const ONNX_NAMESPACE::TypeProto& type_proto) = 0; + virtual const std::string* Utils__DataTypeUtils__ToType(const std::string& type_str) = 0; // int64s virtual int int64s__size(const ONNX_NAMESPACE::int64s* p) = 0; @@ -328,6 +358,7 @@ struct ProviderHost { virtual bool TypeProto_Tensor__has_shape(const ONNX_NAMESPACE::TypeProto_Tensor* p) = 0; virtual const ONNX_NAMESPACE::TensorShapeProto& TypeProto_Tensor__shape(const ONNX_NAMESPACE::TypeProto_Tensor* p) = 0; virtual ONNX_NAMESPACE::TensorShapeProto* TypeProto_Tensor__mutable_shape(ONNX_NAMESPACE::TypeProto_Tensor* p) = 0; + virtual bool TypeProto_Tensor__has_elem_type(const ONNX_NAMESPACE::TypeProto_Tensor* p) = 0; virtual int32_t TypeProto_Tensor__elem_type(const ONNX_NAMESPACE::TypeProto_Tensor* p) = 0; virtual void TypeProto_Tensor__set_elem_type(ONNX_NAMESPACE::TypeProto_Tensor* p, int32_t value) = 0; @@ -342,6 +373,7 @@ struct ProviderHost { // TypeProto virtual std::unique_ptr TypeProto__construct() = 0; virtual void TypeProto__CopyFrom(ONNX_NAMESPACE::TypeProto* p, const ONNX_NAMESPACE::TypeProto* other) = 0; + virtual bool TypeProto__has_tensor_type(const ONNX_NAMESPACE::TypeProto* p) = 0; virtual const ONNX_NAMESPACE::TypeProto_Tensor& TypeProto__tensor_type(const ONNX_NAMESPACE::TypeProto* p) = 0; virtual ONNX_NAMESPACE::TypeProto_Tensor* TypeProto__mutable_tensor_type(ONNX_NAMESPACE::TypeProto* p) = 0; @@ -462,6 +494,7 @@ struct 
ProviderHost { virtual bool TensorProto__has_raw_data(const ONNX_NAMESPACE::TensorProto* p) = 0; virtual const std::string& TensorProto__raw_data(const ONNX_NAMESPACE::TensorProto* p) = 0; virtual std::string* TensorProto__mutable_raw_data(ONNX_NAMESPACE::TensorProto* p) = 0; + virtual bool TensorProto__has_data_type(const ONNX_NAMESPACE::TensorProto* p) = 0; virtual int32_t TensorProto__data_type(const ONNX_NAMESPACE::TensorProto* p) = 0; virtual void TensorProto__set_data_type(ONNX_NAMESPACE::TensorProto* p, int32_t type) = 0; virtual void TensorProto__CopyFrom(ONNX_NAMESPACE::TensorProto* p, const ONNX_NAMESPACE::TensorProto* other) = 0; @@ -495,6 +528,7 @@ struct ProviderHost { // TensorShapeProto_Dimensions virtual std::unique_ptr TensorShapeProto_Dimensions__begin(const ONNX_NAMESPACE::TensorShapeProto_Dimensions* p) = 0; virtual std::unique_ptr TensorShapeProto_Dimensions__end(const ONNX_NAMESPACE::TensorShapeProto_Dimensions* p) = 0; + virtual size_t TensorShapeProto_Dimensions__size(const ONNX_NAMESPACE::TensorShapeProto_Dimensions* p) = 0; // TensorShapeProto virtual int TensorShapeProto__dim_size(const ONNX_NAMESPACE::TensorShapeProto* p) = 0; @@ -823,6 +857,8 @@ struct ProviderHost { virtual const NodeAttributes& Node__GetAttributes(const Node* p) noexcept = 0; virtual void Node__AddAttribute(Node* p, const ::std::string& attr_name, const ONNX_NAMESPACE::GraphProto& value) = 0; + virtual void Node__AddAttribute(Node* p, const ::std::string& attr_name, const std::string& value) = 0; + virtual void Node__AddAttribute(Node* p, const ::std::string& attr_name, int64_t value) = 0; virtual size_t Node__GetInputEdgesCount(const Node* p) noexcept = 0; virtual size_t Node__GetOutputEdgesCount(const Node* p) noexcept = 0; @@ -842,6 +878,11 @@ struct ProviderHost { virtual const std::unordered_map>& Node__GetAttributeNameToMutableSubgraphMap(Node* p) = 0; virtual std::unordered_map> Node__GetAttributeNameToSubgraphMap(const Node* p) const = 0; + // Node_EdgeEnd + virtual const Node& Node_EdgeEnd__GetNode(const Node_EdgeEnd* p) = 0; + virtual int Node_EdgeEnd__GetSrcArgIndex(const Node_EdgeEnd* p) = 0; + virtual int Node_EdgeEnd__GetDstArgIndex(const Node_EdgeEnd* p) = 0; + // NodeArg virtual const std::string& NodeArg__Name(const NodeArg* p) noexcept = 0; virtual const ONNX_NAMESPACE::TensorShapeProto* NodeArg__Shape(const NodeArg* p) = 0; @@ -872,6 +913,8 @@ struct ProviderHost { virtual void NodeAttributes__reserve(NodeAttributes* p, size_t size) = 0; // NodeUnit + virtual void NodeUnit__operator_delete(NodeUnit* p) noexcept = 0; + virtual int NodeUnit__UnitType(const NodeUnit* p) noexcept = 0; virtual const std::vector& NodeUnit__Inputs(const NodeUnit* p) noexcept = 0; @@ -897,10 +940,29 @@ struct ProviderHost { virtual std::pair>, std::unordered_map> QDQ__GetAllNodeUnits(const GraphViewer* graph_viewer, const logging::Logger& logger) = 0; + // Partitioning utils + virtual std::vector> + Utils__CreateSupportedPartitions(const GraphViewer& graph_viewer, + const std::unordered_set& supported_nodes, + const std::unordered_set& stop_ops, + const std::function& generate_metadef_name, + const std::string& execution_provider_name, + const std::string& execution_provider_type, + const std::unordered_map* node_unit_map, + bool drop_constant_initializers) = 0; + + virtual std::unique_ptr + Utils__MakeComputeCapability(const GraphViewer& graph_viewer, + const std::vector& group, + const std::function& generate_metadef_name, + const std::string& execution_provider_name, + bool 
drop_constant_initializers) = 0; // Model virtual std::unique_ptr Model__construct(ONNX_NAMESPACE::ModelProto&& model_proto, const PathString& model_path, const IOnnxRuntimeOpSchemaRegistryList* local_registries, const logging::Logger& logger) = 0; + virtual std::unique_ptr Model__construct(const std::string& graph_name, bool is_onnx_domain_only, + const logging::Logger& logger) = 0; virtual void Model__operator_delete(Model* p) = 0; virtual Graph& Model__MainGraph(Model* p) = 0; virtual std::unique_ptr Model__ToProto(Model* p) = 0; @@ -974,6 +1036,7 @@ struct ProviderHost { virtual const std::string& GraphViewer__Name(const GraphViewer* p) noexcept = 0; virtual const std::filesystem::path& GraphViewer__ModelPath(const GraphViewer* p) noexcept = 0; + virtual const ConstGraphNodes& GraphViewer__Nodes(const GraphViewer* p) noexcept = 0; virtual const Node* GraphViewer__GetNode(const GraphViewer* p, NodeIndex node_index) = 0; virtual const NodeArg* GraphViewer__GetNodeArg(const GraphViewer* p, const std::string& name) = 0; @@ -989,6 +1052,7 @@ struct ProviderHost { virtual const std::vector& GraphViewer__GetInputs(const GraphViewer* p) noexcept = 0; virtual const std::vector& GraphViewer__GetOutputs(const GraphViewer* p) noexcept = 0; + virtual bool GraphViewer__NodeProducesGraphOutput(const GraphViewer* p, const Node& node) = 0; virtual const std::unordered_set& GraphViewer__GetValueInfo(const GraphViewer* p) noexcept = 0; virtual const InitializedTensorSet& GraphViewer__GetAllInitializedTensors(const GraphViewer* p) = 0; @@ -1007,6 +1071,13 @@ struct ProviderHost { virtual const Node* GraphViewer__GetProducerNode(const GraphViewer* p, const std::string& node_arg_name) const = 0; virtual IOnnxRuntimeOpSchemaCollectionPtr GraphViewer__GetSchemaRegistry(const GraphViewer* p) const = 0; + // ConstGraphNodes + virtual std::unique_ptr ConstGraphNodes__begin(const ConstGraphNodes* p) = 0; + virtual std::unique_ptr ConstGraphNodes__end(const ConstGraphNodes* p) = 0; + virtual std::unique_ptr ConstGraphNodes__cbegin(const ConstGraphNodes* p) = 0; + virtual std::unique_ptr ConstGraphNodes__cend(const ConstGraphNodes* p) = 0; + virtual bool ConstGraphNodes__empty(const ConstGraphNodes* p) noexcept = 0; + // OpKernel virtual const Node& OpKernel__Node(const OpKernel* p) = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index 76b6d8063fd66..e434935343663 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -23,27 +23,50 @@ namespace logging { struct Logger final { bool OutputIsEnabled(Severity severity, DataType data_type) const noexcept { return g_host->logging__Logger__OutputIsEnabled(this, severity, data_type); } + Severity GetSeverity() const noexcept { + return g_host->logging__Logger__GetSeverity(this); + } PROVIDER_DISALLOW_ALL(Logger) }; struct LoggingManager final { static const Logger& DefaultLogger() { return g_host->logging__LoggingManager__DefaultLogger(); } + static bool HasDefaultLogger() { return g_host->logging__LoggingManager__HasDefaultLogger(); } PROVIDER_DISALLOW_ALL(LoggingManager) }; struct Capture final { static std::unique_ptr Create(const Logger& logger, logging::Severity severity, const char* category, - logging::DataType dataType, const CodeLocation& location) { return g_host->logging__Capture__construct(logger, severity, category, dataType, location); } + logging::DataType 
data_type, const CodeLocation& location) { + return g_host->logging__Capture__construct(logger, severity, category, data_type, location); + } static void operator delete(void* p) { g_host->logging__Capture__operator_delete(reinterpret_cast(p)); } std::ostream& Stream() noexcept { return g_host->logging__Capture__Stream(this); } + void ProcessPrintf(const char* format, va_list args) { g_host->logging__Capture__ProcessPrintf(this, format, args); } Capture() = delete; Capture(const Capture&) = delete; void operator=(const Capture&) = delete; }; + +#if defined(_WIN32) +struct EtwRegistrationManager final { + using EtwInternalCallback = EtwRegistrationManager_EtwInternalCallback; + static EtwRegistrationManager& Instance() { return g_host->logging__EtwRegistrationManager__Instance(); } + static bool SupportsETW() { return g_host->logging__EtwRegistrationManager__SupportsETW(); } + Severity MapLevelToSeverity() { return g_host->logging__EtwRegistrationManager__MapLevelToSeverity(this); } + void RegisterInternalCallback(const EtwInternalCallback& callback) { + g_host->logging__EtwRegistrationManager__RegisterInternalCallback(this, callback); + } + void UnregisterInternalCallback(const EtwInternalCallback& callback) { + g_host->logging__EtwRegistrationManager__UnregisterInternalCallback(this, callback); + } +}; +#endif // defined(_WIN32) + } // namespace logging } // namespace onnxruntime @@ -234,6 +257,7 @@ struct TensorProto final { const std::string& raw_data() const { return g_host->TensorProto__raw_data(this); } std::string* mutable_raw_data() { return g_host->TensorProto__mutable_raw_data(this); } + bool has_data_type() const { return g_host->TensorProto__has_data_type(this); } int32_t data_type() const { return g_host->TensorProto__data_type(this); } void set_data_type(int32_t type) { return g_host->TensorProto__set_data_type(this, type); } @@ -286,6 +310,7 @@ struct TensorShapeProto_Dimension final { struct TensorShapeProto_Dimensions final { IteratorHolder begin() const { return g_host->TensorShapeProto_Dimensions__begin(this); } IteratorHolder end() const { return g_host->TensorShapeProto_Dimensions__end(this); } + size_t size() const { return g_host->TensorShapeProto_Dimensions__size(this); } PROVIDER_DISALLOW_ALL(TensorShapeProto_Dimensions) }; @@ -305,6 +330,7 @@ struct TypeProto_Tensor final { bool has_shape() const { return g_host->TypeProto_Tensor__has_shape(this); } const TensorShapeProto& shape() const { return g_host->TypeProto_Tensor__shape(this); } TensorShapeProto* mutable_shape() { return g_host->TypeProto_Tensor__mutable_shape(this); } + bool has_elem_type() const { return g_host->TypeProto_Tensor__has_elem_type(this); } int32_t elem_type() const { return g_host->TypeProto_Tensor__elem_type(this); } void set_elem_type(int32_t value) { g_host->TypeProto_Tensor__set_elem_type(this, value); } @@ -339,6 +365,7 @@ struct TypeProto_Sequence final { struct TypeProto final { static std::unique_ptr Create() { return g_host->TypeProto__construct(); } + bool has_tensor_type() const { return g_host->TypeProto__has_tensor_type(this); } const TypeProto_Tensor& tensor_type() const { return g_host->TypeProto__tensor_type(this); } TypeProto_Tensor* mutable_tensor_type() { return g_host->TypeProto__mutable_tensor_type(this); } @@ -475,6 +502,7 @@ namespace Utils { struct DataTypeUtils final { static const std::string* ToType(const ONNX_NAMESPACE::TypeProto& type_proto) { return g_host->Utils__DataTypeUtils__ToType(type_proto); } + static const std::string* ToType(const std::string& type_str) 
{ return g_host->Utils__DataTypeUtils__ToType(type_str); } PROVIDER_DISALLOW_ALL(DataTypeUtils) }; @@ -770,6 +798,14 @@ struct Function final { PROVIDER_DISALLOW_ALL(Function) }; +struct Node_EdgeEnd final { + const Node& GetNode() const { return g_host->Node_EdgeEnd__GetNode(this); } + int GetSrcArgIndex() const { return g_host->Node_EdgeEnd__GetSrcArgIndex(this); } + int GetDstArgIndex() const { return g_host->Node_EdgeEnd__GetDstArgIndex(this); } + + PROVIDER_DISALLOW_ALL(Node_EdgeEnd) +}; + struct Node final { enum class Type { Primitive = 0, @@ -801,6 +837,12 @@ struct Node final { void AddAttribute(const ::std::string& attr_name, const ONNX_NAMESPACE::GraphProto& value) { g_host->Node__AddAttribute(this, attr_name, value); } + void AddAttribute(const std::string& attr_name, const std::string& value) { + g_host->Node__AddAttribute(this, attr_name, value); + } + void AddAttribute(const std::string& attr_name, int64_t value) { + g_host->Node__AddAttribute(this, attr_name, value); + } size_t GetInputEdgesCount() const noexcept { return g_host->Node__GetInputEdgesCount(this); } size_t GetOutputEdgesCount() const noexcept { return g_host->Node__GetOutputEdgesCount(this); } @@ -832,6 +874,7 @@ struct Node final { } void operator++() { impl_->operator++(); } + const Node_EdgeEnd& operator*() { return impl_->operator*(); } const Node__EdgeIterator* operator->() const { return impl_.get(); } std::unique_ptr impl_; @@ -906,6 +949,13 @@ struct NodeUnit final { QDQGroup, // The NodeUnit contain a QDQ group of nodes, such as "DQ->Sigmoid->Q" }; + NodeUnit() = delete; + NodeUnit(const NodeUnit&) = delete; + void operator=(const NodeUnit& v) = delete; + + // Need delete because of APIs that return unique_ptr + static void operator delete(void* p) { g_host->NodeUnit__operator_delete(reinterpret_cast(p)); } + Type UnitType() const noexcept { return static_cast(g_host->NodeUnit__UnitType(this)); } const std::vector& Inputs() const noexcept { return g_host->NodeUnit__Inputs(this); } @@ -941,6 +991,9 @@ struct Model final { const IOnnxRuntimeOpSchemaRegistryList* local_registries, const logging::Logger& logger) { return g_host->Model__construct(std::move(model_proto), model_path, local_registries, logger); } + static std::unique_ptr Create(const std::string& graph_name, bool is_onnx_domain_only, const logging::Logger& logger) { + return g_host->Model__construct(graph_name, is_onnx_domain_only, logger); + } static void operator delete(void* p) { g_host->Model__operator_delete(reinterpret_cast(p)); } static Status Load(const PathString& file_path, /*out*/ ONNX_NAMESPACE::ModelProto& model_proto) { return g_host->Model__Load(file_path, model_proto); } @@ -1041,6 +1094,7 @@ class GraphViewer final { const std::string& Name() const noexcept { return g_host->GraphViewer__Name(this); } const std::filesystem::path& ModelPath() const noexcept { return g_host->GraphViewer__ModelPath(this); } + const ConstGraphNodes& Nodes() const noexcept { return g_host->GraphViewer__Nodes(this); } const Node* GetNode(NodeIndex node_index) const { return g_host->GraphViewer__GetNode(this, node_index); } const NodeArg* GetNodeArg(const std::string& name) const { return g_host->GraphViewer__GetNodeArg(this, name); } @@ -1058,6 +1112,9 @@ class GraphViewer final { const std::vector& GetInputs() const noexcept { return g_host->GraphViewer__GetInputs(this); } const std::vector& GetOutputs() const noexcept { return g_host->GraphViewer__GetOutputs(this); } + bool NodeProducesGraphOutput(const Node& node) const { + return 
g_host->GraphViewer__NodeProducesGraphOutput(this, node); + } const std::unordered_set& GetValueInfo() const noexcept { return g_host->GraphViewer__GetValueInfo(this); } const InitializedTensorSet& GetAllInitializedTensors() const noexcept { return g_host->GraphViewer__GetAllInitializedTensors(this); } @@ -1085,6 +1142,25 @@ class GraphViewer final { void operator=(const GraphViewer&) = delete; }; +struct ConstGraphNodes final { + IteratorHolder begin() const { + return g_host->ConstGraphNodes__begin(this); + } + IteratorHolder end() const { + return g_host->ConstGraphNodes__end(this); + } + IteratorHolder cbegin() const { + return g_host->ConstGraphNodes__cbegin(this); + } + IteratorHolder cend() const { + return g_host->ConstGraphNodes__cend(this); + } + + bool empty() const noexcept { return g_host->ConstGraphNodes__empty(this); } + + PROVIDER_DISALLOW_ALL(ConstGraphNodes) +}; + struct OpKernelContext final { template const T& RequiredInput(int index) const; diff --git a/onnxruntime/core/providers/webgpu/generator/range.cc b/onnxruntime/core/providers/webgpu/generator/range.cc index ee7c67ec24185..a0b65f08a5b4e 100644 --- a/onnxruntime/core/providers/webgpu/generator/range.cc +++ b/onnxruntime/core/providers/webgpu/generator/range.cc @@ -25,6 +25,11 @@ Status Range::ComputeInternal(ComputeContext& context) const { uint32_t output_size = gsl::narrow(n); RangeProgram program{}; +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif + program.AddOutput({output_tensor, ProgramTensorMetadataDependency::Type}) .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) .AddUniformVariables({ @@ -33,6 +38,10 @@ Status Range::ComputeInternal(ComputeContext& context) const { *reinterpret_cast(&delta), }); +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + return context.RunProgram(program); } diff --git a/onnxruntime/core/providers/webgpu/math/unary_elementwise_ops.cc b/onnxruntime/core/providers/webgpu/math/unary_elementwise_ops.cc index 8dcf63671092b..eaaad206ebaf5 100644 --- a/onnxruntime/core/providers/webgpu/math/unary_elementwise_ops.cc +++ b/onnxruntime/core/providers/webgpu/math/unary_elementwise_ops.cc @@ -194,6 +194,10 @@ class Clip final : public UnaryElementwise { "Clip", std::is_same_v ? ClipF16Impl : ClipImpl, "", ShaderUsage::UseElementTypeAlias} {} +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif Status ConfigureProgram(const ComputeContext& context, UnaryElementwiseProgram& program) const override { const auto* clip_min_tensor = context.Input(1); @@ -214,6 +218,9 @@ class Clip final : public UnaryElementwise { } return Status::OK(); } +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif // uniforms.attr is a f32 value. It is encoded as a float for 2 f16 values. 
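// For example (an illustrative sketch with hypothetical names, not code from this patch):
// the host side can produce that single f32 uniform by packing the two fp16 bit patterns
// into the low and high halves of one 32-bit value, then bit-copying it into a float:
//   uint32_t packed = uint32_t(clip_min_fp16.val) | (uint32_t(clip_max_fp16.val) << 16);
//   float attr;
//   std::memcpy(&attr, &packed, sizeof(attr));  // bit-preserving fp16x2 -> f32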
// bitcast>(uniforms.attr)[0] is clip_min, bitcast>(uniforms.attr)[1] is clip_max diff --git a/onnxruntime/core/providers/webgpu/program.h b/onnxruntime/core/providers/webgpu/program.h index 1562ec158b40a..7bfd9e8800099 100644 --- a/onnxruntime/core/providers/webgpu/program.h +++ b/onnxruntime/core/providers/webgpu/program.h @@ -150,6 +150,11 @@ enum class ProgramTensorMetadataDependency : int { }; std::ostream& operator<<(std::ostream& os, ProgramTensorMetadataDependency); +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif + inline ProgramTensorMetadataDependency operator|(ProgramTensorMetadataDependency a, ProgramTensorMetadataDependency b) { return (ProgramTensorMetadataDependency)((int&)a | (int&)b); } @@ -163,6 +168,10 @@ inline ProgramTensorMetadataDependency& operator&=(ProgramTensorMetadataDependen return (ProgramTensorMetadataDependency&)((int&)a &= (int&)b); } +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + constexpr SafeInt WORKGROUP_SIZE = 64; // data type of variable diff --git a/onnxruntime/core/providers/webgpu/program_manager.cc b/onnxruntime/core/providers/webgpu/program_manager.cc index 109bac34d6503..1fdd312d4f0d8 100644 --- a/onnxruntime/core/providers/webgpu/program_manager.cc +++ b/onnxruntime/core/providers/webgpu/program_manager.cc @@ -147,16 +147,16 @@ Status ProgramManager::Build(const ProgramBase& program, } } - wgpu::ProgrammableStageDescriptor compute_stage{}; - compute_stage.module = shader_module; - compute_stage.entryPoint = "main"; + wgpu::ComputeState compute_state{}; + compute_state.module = shader_module; + compute_state.entryPoint = "main"; if (!constant_entries.empty()) { - compute_stage.constants = constant_entries.data(); - compute_stage.constantCount = constant_entries.size(); + compute_state.constants = constant_entries.data(); + compute_state.constantCount = constant_entries.size(); } wgpu::ComputePipelineDescriptor pipeline_descriptor{}; - pipeline_descriptor.compute = compute_stage; + pipeline_descriptor.compute = compute_state; #ifndef NDEBUG // if debug build pipeline_descriptor.label = program.Name().c_str(); #endif diff --git a/onnxruntime/core/providers/webgpu/shader_variable.h b/onnxruntime/core/providers/webgpu/shader_variable.h index 4c87bc9158890..2aba2a59d157f 100644 --- a/onnxruntime/core/providers/webgpu/shader_variable.h +++ b/onnxruntime/core/providers/webgpu/shader_variable.h @@ -189,6 +189,10 @@ class ShaderVariableHelper : public ShaderIndicesHelper { friend class ShaderHelper; }; +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif inline ShaderUsage operator|(ShaderUsage a, ShaderUsage b) { return (uint32_t)a.usage | (uint32_t)b.usage; @@ -205,6 +209,10 @@ inline ShaderUsage& operator&=(ShaderUsage& a, ShaderUsage b) { return a; } +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + namespace detail { template >> std::string pass_as_string(T&& v) { diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc index 1c9a16bf36e8e..99a645878cd7e 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_context.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc @@ -4,12 +4,20 @@ #include #include +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif + #if !defined(__wasm__) #include "dawn/dawn_proc.h" #if !defined(USE_EXTERNAL_DAWN) #include "dawn/native/DawnNative.h" 
#endif #endif +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif #include "core/common/common.h" #include "core/common/path_string.h" @@ -107,12 +115,12 @@ void WebGpuContext::Initialize(const WebGpuBufferCacheConfig& buffer_cache_confi device_desc.requiredLimits = &required_limits; // TODO: revise temporary error handling - device_desc.SetUncapturedErrorCallback([](const wgpu::Device& /*device*/, wgpu::ErrorType type, const char* message) { - LOGS_DEFAULT(ERROR) << "WebGPU device error(" << int(type) << "): " << message; + device_desc.SetUncapturedErrorCallback([](const wgpu::Device& /*device*/, wgpu::ErrorType type, wgpu::StringView message) { + LOGS_DEFAULT(ERROR) << "WebGPU device error(" << int(type) << "): " << std::string_view{message}; }); // TODO: revise temporary device lost handling - device_desc.SetDeviceLostCallback(wgpu::CallbackMode::AllowSpontaneous, [](const wgpu::Device& /*device*/, wgpu::DeviceLostReason reason, const char* message) { - LOGS_DEFAULT(INFO) << "WebGPU device lost (" << int(reason) << "): " << message; + device_desc.SetDeviceLostCallback(wgpu::CallbackMode::AllowSpontaneous, [](const wgpu::Device& /*device*/, wgpu::DeviceLostReason reason, wgpu::StringView message) { + LOGS_DEFAULT(INFO) << "WebGPU device lost (" << int(reason) << "): " << std::string_view{message}; }); ORT_ENFORCE(wgpu::WaitStatus::Success == instance_.WaitAny(adapter_.RequestDevice( diff --git a/onnxruntime/core/providers/webnn/builders/impl/activation_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/activation_op_builder.cc index 781ddcb896155..585fddfd1ff2c 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/activation_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/activation_op_builder.cc @@ -17,10 +17,6 @@ class ActivationOpBuilder : public BaseOpBuilder { private: Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const override ORT_MUST_USE_RESULT; - - // Operator support related. - bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, - WebnnDeviceType device_type, const logging::Logger& logger) const override; }; // Add operator related. @@ -68,30 +64,6 @@ Status ActivationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, return Status::OK(); } -// Operator support related. 
-bool ActivationOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, - const Node& node, - WebnnDeviceType device_type, - const logging::Logger& logger) const { - const auto& input_defs = node.InputDefs(); - const auto& op_type = node.OpType(); - - std::vector input_shape; - if (!GetShape(*input_defs[0], input_shape, logger)) - return false; - - if (op_type == "Elu" && device_type == WebnnDeviceType::CPU) { - NodeAttrHelper helper(node); - float alpha = helper.Get("alpha", 1.0f); - if (alpha != 1.0f) { - LOGS(logger, VERBOSE) << "WebNN CPU backend only supports Elu's alpha == 1.0"; - return false; - } - } - - return true; -} - void CreateActivationOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) { if (op_registrations.op_builder_map.count(op_type) > 0) return; diff --git a/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc index e14507e8f5aea..c5493f97fdb21 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc @@ -20,8 +20,6 @@ class BinaryOpBuilder : public BaseOpBuilder { const logging::Logger& logger) const override ORT_MUST_USE_RESULT; // Operator support related. - bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, - const WebnnDeviceType device_type, const logging::Logger& logger) const override; bool HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const override; }; @@ -59,33 +57,6 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const return Status::OK(); } -bool BinaryOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, - const Node& node, - const WebnnDeviceType device_type, - const logging::Logger& logger) const { - const auto& input_defs = node.InputDefs(); - const auto& op_type = node.OpType(); - - std::vector input0_shape; - std::vector input1_shape; - if (!GetShape(*input_defs[0], input0_shape, logger) || - !GetShape(*input_defs[1], input1_shape, logger)) { - return false; - } - - // 'prelu' op in WebNN CPU backend restricts the last dimension of input and slope to be same. - // TODO: Remove this workaround once the associated issue is resolved in Chromium: - // https://issues.chromium.org/issues/335517470. - if (op_type == "PRelu" && device_type == WebnnDeviceType::CPU) { - if (input0_shape.back() != input1_shape.back()) { - LOGS(logger, VERBOSE) << "The last dimension of input and slope for PRelu must be same for WebNN CPU backend."; - return false; - } - } - - return true; -} - bool BinaryOpBuilder::HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); diff --git a/onnxruntime/core/providers/webnn/builders/impl/clip_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/clip_op_builder.cc index 374143c886849..a244efdd9b2eb 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/clip_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/clip_op_builder.cc @@ -69,27 +69,7 @@ bool ClipOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, // can ensure initializers are constant. See #19401 for details of how this update was made to the NNAPI EP. 
// GetClipMinMax(graph_viewer, node, minValue, maxValue, logger) float min, max; - if (GetClipMinMax(initializers, node, min, max, logger)) { - // WebNN CPU backend only supports 3 specific ranges: [0.0, infinity], [-1.0, 1.0], [0.0, 6.0]. - // TODO: Remove this workaround once the associated issue is resolved in Chromium: - // https://issues.chromium.org/issues/326156496. - if (device_type == WebnnDeviceType::CPU) { - if ((min == 0.0f && max == std::numeric_limits::infinity()) || - (min == -1.0f && max == 1.0f) || - (min == 0.0f && max == 6.0f)) { - return true; - } else { - LOGS(logger, VERBOSE) << "Clip min and max values (" - << min << ", " - << max << ") are not supported for WebNN CPU backend"; - return false; - } - } - - return true; - } else { - return false; - }; + return GetClipMinMax(initializers, node, min, max, logger); } void CreateClipOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) { diff --git a/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc index 548e718b8774e..e623590e3bc1a 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc @@ -378,22 +378,6 @@ bool ConvOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, return false; } - // WebNN CPU backend (TFLite) only supports default dilations and group. - // https://source.chromium.org/chromium/chromium/src/+/main:services/webnn/tflite/graph_builder_tflite.cc;l=1040 - if (device_type == WebnnDeviceType::CPU && op_type == "ConvTranspose") { - NodeAttrHelper helper(node); - const auto dilations = helper.Get("dilations", std::vector{1, 1}); - const auto group = helper.Get("group", 1); - if (dilations[0] != 1 || (dilations.size() > 1 && dilations[1] != 1)) { - LOGS(logger, VERBOSE) << op_type << " for WebNN CPU backend only supports default dilation 1."; - return false; - } - if (group != 1) { - LOGS(logger, VERBOSE) << op_type << " for WebNN CPU backend only supports default group 1."; - return false; - } - } - return true; } diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index 223eed248800e..26ffeb93ab3b6 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -921,7 +921,7 @@ common::Status InferenceSession::SaveToOrtFormat(const std::filesystem::path& fi ORT_RETURN_IF_ERROR(kernel_type_str_resolver.RegisterGraphNodeOpSchemas(model_->MainGraph())); ORT_RETURN_IF_ERROR(standalone::RegisterCustomOpNodeSchemas(kernel_type_str_resolver, model_->MainGraph())); - for (const auto op_schema : saved_runtime_optimization_produced_node_op_schemas_) { + for (const auto& op_schema : saved_runtime_optimization_produced_node_op_schemas_) { ORT_RETURN_IF_ERROR(kernel_type_str_resolver.RegisterOpSchema(*op_schema)); } diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index af39edae2074d..d7c6dab72fde8 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -37,7 +37,6 @@ #include "core/framework/model_metadef_id_generator.h" #include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h" #include "core/optimizer/qdq_transformer/selectors_actions/shared/utils.h" -#include "core/session/onnxruntime_session_options_config_keys.h" #include 
"core/session/onnxruntime_c_api.h" #include "core/common/string_helper.h" @@ -62,6 +61,10 @@ #include "orttraining/core/framework/distributed_run_context.h" #endif +#ifdef _WIN32 +#include "core/platform/windows/logging/etw_sink.h" +#endif + namespace ONNX_NAMESPACE { // We use these names in the provider API because we don't have the protobuf definitions of the RepeatedField* types using int64s = google::protobuf::RepeatedField; @@ -76,11 +79,18 @@ using FunctionProtos = google::protobuf::RepeatedPtrField; namespace onnxruntime { using IndexedSubGraph_MetaDef = IndexedSubGraph::MetaDef; using IndexedSubGraph_SourceOfSchema = IndexedSubGraph::SourceOfSchema; +using Node_EdgeEnd = Node::EdgeEnd; +#ifdef _WIN32 +namespace logging { +using EtwRegistrationManager_EtwInternalCallback = EtwRegistrationManager::EtwInternalCallback; +} +#endif } // namespace onnxruntime #include "core/common/cpuid_info.h" #include "core/common/logging/logging.h" #include "core/providers/shared_library/provider_interfaces.h" +#include "core/providers/partitioning_utils.h" #include "core/providers/cuda/cuda_provider_factory_creator.h" #include "core/providers/cann/cann_provider_factory_creator.h" @@ -90,6 +100,7 @@ using IndexedSubGraph_SourceOfSchema = IndexedSubGraph::SourceOfSchema; #include "core/providers/openvino/openvino_provider_factory_creator.h" #include "core/providers/tensorrt/tensorrt_provider_factory_creator.h" #include "core/providers/vitisai/vitisai_provider_factory_creator.h" +#include "core/providers/qnn/qnn_provider_factory_creator.h" #include "core/providers/cuda/cuda_provider_factory.h" #include "core/providers/cann/cann_provider_factory.h" @@ -181,6 +192,7 @@ struct Node__EdgeIterator_Impl : Node__EdgeIterator { bool operator!=(const Node__EdgeIterator& p) const override { return v_ != static_cast(&p)->v_; } void operator++() override { v_.operator++(); } + const Node_EdgeEnd& operator*() const override { return v_.operator*(); } const Node& GetNode() const override { return v_->GetNode(); } int GetSrcArgIndex() const override { return v_->GetSrcArgIndex(); } int GetDstArgIndex() const override { return v_->GetDstArgIndex(); } @@ -188,6 +200,18 @@ struct Node__EdgeIterator_Impl : Node__EdgeIterator { Node::EdgeConstIterator v_; }; +struct ConstGraphNodes_Iterator_Impl : ConstGraphNodes_Iterator { + ConstGraphNodes_Iterator_Impl(ConstGraphNodes::ConstNodeIterator&& v) : v_{std::move(v)} {} + + bool operator!=(const ConstGraphNodes_Iterator& other) const override { + return v_ != static_cast(&other)->v_; + } + void operator++() override { v_.operator++(); } + const Node& operator*() override { return *v_; } + + ConstGraphNodes::ConstNodeIterator v_; +}; + #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_MINIMAL_BUILD_CUSTOM_OPS) common::Status LoadDynamicLibraryFromProvider(onnxruntime::PathString library_name) { const auto& platform_env = onnxruntime::Env::Default(); @@ -367,22 +391,58 @@ struct ProviderHostImpl : ProviderHost { // logging::Logger (wrapped) bool logging__Logger__OutputIsEnabled(const logging::Logger* p, logging::Severity severity, logging::DataType data_type) override { return p->OutputIsEnabled(severity, data_type); } + logging::Severity logging__Logger__GetSeverity(const logging::Logger* p) override { + return p->GetSeverity(); + } // logging::LoggingManager (wrapped) const logging::Logger& logging__LoggingManager__DefaultLogger() override { return logging::LoggingManager::DefaultLogger(); } + bool logging__LoggingManager__HasDefaultLogger() override { return 
logging::LoggingManager::HasDefaultLogger(); } // logging::Capture (wrapped) - std::unique_ptr logging__Capture__construct(const logging::Logger& logger, logging::Severity severity, const char* category, logging::DataType dataType, const CodeLocation& location) override { - return std::make_unique(logger, severity, category, dataType, location); + std::unique_ptr logging__Capture__construct(const logging::Logger& logger, + logging::Severity severity, const char* category, + logging::DataType data_type, + const CodeLocation& location) override { + return std::make_unique(logger, severity, category, data_type, location); } void logging__Capture__operator_delete(logging::Capture* p) noexcept override { delete p; } std::ostream& logging__Capture__Stream(logging::Capture* p) noexcept override { return p->Stream(); } + void logging__Capture__ProcessPrintf(logging::Capture* p, const char* format, va_list args) override { + p->ProcessPrintf(format, args); + } + +#if defined(_WIN32) + // logging::EtwRegistrationManager + logging::EtwRegistrationManager& logging__EtwRegistrationManager__Instance() override { + return logging::EtwRegistrationManager::Instance(); + } + bool logging__EtwRegistrationManager__SupportsETW() override { + return logging::EtwRegistrationManager::SupportsETW(); + } + logging::Severity logging__EtwRegistrationManager__MapLevelToSeverity(logging::EtwRegistrationManager* p) override { + return p->MapLevelToSeverity(); + } + void logging__EtwRegistrationManager__RegisterInternalCallback( + logging::EtwRegistrationManager* p, + const logging::EtwRegistrationManager_EtwInternalCallback& callback) override { + p->RegisterInternalCallback(callback); + } + void logging__EtwRegistrationManager__UnregisterInternalCallback( + logging::EtwRegistrationManager* p, + const logging::EtwRegistrationManager_EtwInternalCallback& callback) override { + p->UnregisterInternalCallback(callback); + } +#endif // defined(_WIN32) // Env Env& Env__Default() override { return Env::Default(); } // Utils::DataTypeUtils (wrapped) const std::string* Utils__DataTypeUtils__ToType(const ONNX_NAMESPACE::TypeProto& type_proto) override { return ONNX_NAMESPACE::Utils::DataTypeUtils::ToType(type_proto); } + const std::string* Utils__DataTypeUtils__ToType(const std::string& type_str) override { + return ONNX_NAMESPACE::Utils::DataTypeUtils::ToType(type_str); + } // int64s (wrapped) int int64s__size(const ONNX_NAMESPACE::int64s* p) override { return p->size(); } @@ -424,6 +484,7 @@ struct ProviderHostImpl : ProviderHost { bool TypeProto_Tensor__has_shape(const ONNX_NAMESPACE::TypeProto_Tensor* p) override { return p->has_shape(); } const ONNX_NAMESPACE::TensorShapeProto& TypeProto_Tensor__shape(const ONNX_NAMESPACE::TypeProto_Tensor* p) override { return p->shape(); } ONNX_NAMESPACE::TensorShapeProto* TypeProto_Tensor__mutable_shape(ONNX_NAMESPACE::TypeProto_Tensor* p) override { return p->mutable_shape(); } + bool TypeProto_Tensor__has_elem_type(const ONNX_NAMESPACE::TypeProto_Tensor* p) override { return p->has_elem_type(); } int32_t TypeProto_Tensor__elem_type(const ONNX_NAMESPACE::TypeProto_Tensor* p) override { return p->elem_type(); } void TypeProto_Tensor__set_elem_type(ONNX_NAMESPACE::TypeProto_Tensor* p, int32_t value) override { p->set_elem_type(value); }; @@ -444,6 +505,7 @@ struct ProviderHostImpl : ProviderHost { // TypeProto (wrapped) std::unique_ptr TypeProto__construct() override { return std::make_unique(); } void TypeProto__CopyFrom(ONNX_NAMESPACE::TypeProto* p, const ONNX_NAMESPACE::TypeProto* 
other) override { p->CopyFrom(*other); } + bool TypeProto__has_tensor_type(const ONNX_NAMESPACE::TypeProto* p) override { return p->has_tensor_type(); } const ONNX_NAMESPACE::TypeProto_Tensor& TypeProto__tensor_type(const ONNX_NAMESPACE::TypeProto* p) override { return p->tensor_type(); } ONNX_NAMESPACE::TypeProto_Tensor* TypeProto__mutable_tensor_type(ONNX_NAMESPACE::TypeProto* p) override { return p->mutable_tensor_type(); } int TypeProto__value_case(const ONNX_NAMESPACE::TypeProto* p) override { return p->value_case(); } @@ -572,6 +634,7 @@ struct ProviderHostImpl : ProviderHost { const std::string& TensorProto__raw_data(const ONNX_NAMESPACE::TensorProto* p) override { return p->raw_data(); } std::string* TensorProto__mutable_raw_data(ONNX_NAMESPACE::TensorProto* p) override { return p->mutable_raw_data(); } + bool TensorProto__has_data_type(const ONNX_NAMESPACE::TensorProto* p) override { return p->has_data_type(); } int32_t TensorProto__data_type(const ONNX_NAMESPACE::TensorProto* p) override { return p->data_type(); } void TensorProto__set_data_type(ONNX_NAMESPACE::TensorProto* p, int32_t type) override { p->set_data_type(type); } @@ -610,6 +673,10 @@ struct ProviderHostImpl : ProviderHost { return std::make_unique(p->end()); } + size_t TensorShapeProto_Dimensions__size(const ONNX_NAMESPACE::TensorShapeProto_Dimensions* p) override { + return p->size(); + } + // TensorShapeProto (wrapped) int TensorShapeProto__dim_size(const ONNX_NAMESPACE::TensorShapeProto* p) override { return p->dim_size(); } const ONNX_NAMESPACE::TensorShapeProto_Dimensions& TensorShapeProto__dim(const ONNX_NAMESPACE::TensorShapeProto* p) override { return p->dim(); } @@ -960,6 +1027,12 @@ struct ProviderHostImpl : ProviderHost { void Node__AddAttribute(Node* p, const ::std::string& attr_name, const ONNX_NAMESPACE::GraphProto& value) override { p->AddAttribute(attr_name, value); } + void Node__AddAttribute(Node* p, const ::std::string& attr_name, const std::string& value) override { + p->AddAttribute(attr_name, value); + } + void Node__AddAttribute(Node* p, const ::std::string& attr_name, int64_t value) override { + p->AddAttribute(attr_name, value); + } size_t Node__GetInputEdgesCount(const Node* p) noexcept override { return p->GetInputEdgesCount(); } size_t Node__GetOutputEdgesCount(const Node* p) noexcept override { return p->GetOutputEdgesCount(); } @@ -982,6 +1055,11 @@ struct ProviderHostImpl : ProviderHost { std::unordered_map> Node__GetAttributeNameToSubgraphMap(const Node* p) const override { return p->GetAttributeNameToSubgraphMap(); } int Node__NodeType(const Node* p) const noexcept override { return int(p->NodeType()); } + // Node_EdgeEnd (wrapped). Maps to Node::EdgeEnd struct. 
+ const Node& Node_EdgeEnd__GetNode(const Node_EdgeEnd* p) override { return p->GetNode(); } + int Node_EdgeEnd__GetSrcArgIndex(const Node_EdgeEnd* p) override { return p->GetSrcArgIndex(); } + int Node_EdgeEnd__GetDstArgIndex(const Node_EdgeEnd* p) override { return p->GetDstArgIndex(); } + // NodeArg (wrapped) const std::string& NodeArg__Name(const NodeArg* p) noexcept override { return p->Name(); } const ONNX_NAMESPACE::TensorShapeProto* NodeArg__Shape(const NodeArg* p) override { return p->Shape(); } @@ -1016,7 +1094,8 @@ struct ProviderHostImpl : ProviderHost { void NodeAttributes__insert_or_assign(NodeAttributes* p, const std::string& k, const ONNX_NAMESPACE::AttributeProto& v) override { p->insert_or_assign(k, v); } void NodeAttributes__reserve(NodeAttributes* p, size_t size) override { p->reserve(size); } - // NodeUnit (wrapped) + void NodeUnit__operator_delete(NodeUnit* p) noexcept override { delete p; } + int NodeUnit__UnitType(const NodeUnit* p) noexcept override { return static_cast(p->UnitType()); } const std::vector& NodeUnit__Inputs(const NodeUnit* p) noexcept override { @@ -1064,12 +1143,46 @@ struct ProviderHostImpl : ProviderHost { return QDQ::GetAllNodeUnits(*graph_viewer, logger); } + // Partitioning utils + std::vector> + Utils__CreateSupportedPartitions(const GraphViewer& graph_viewer, + const std::unordered_set& supported_nodes, + const std::unordered_set& stop_ops, + const utils::GenerateMetadefNameFn& generate_metadef_name, + const std::string& execution_provider_name, + const std::string& execution_provider_type, + const std::unordered_map* node_unit_map, + bool drop_constant_initializers) override { + return onnxruntime::utils::CreateSupportedPartitions(graph_viewer, + supported_nodes, + stop_ops, + generate_metadef_name, + execution_provider_name, + execution_provider_type, + node_unit_map, + drop_constant_initializers); + } + + std::unique_ptr + Utils__MakeComputeCapability(const GraphViewer& graph_viewer, + const std::vector& group, + const std::function& generate_metadef_name, + const std::string& execution_provider_name, + bool drop_constant_initializers) override { + return onnxruntime::utils::MakeComputeCapability(graph_viewer, group, generate_metadef_name, + execution_provider_name, drop_constant_initializers); + } + // Model (wrapped) std::unique_ptr Model__construct(ONNX_NAMESPACE::ModelProto&& model_proto, const PathString& model_path, const IOnnxRuntimeOpSchemaRegistryList* local_registries, const logging::Logger& logger) override { return std::make_unique(model_proto, model_path, local_registries, logger); } + std::unique_ptr Model__construct(const std::string& graph_name, bool is_onnx_domain_only, + const logging::Logger& logger) override { + return std::make_unique(graph_name, is_onnx_domain_only, logger); + } void Model__operator_delete(Model* p) override { delete p; } Graph& Model__MainGraph(Model* p) override { return p->MainGraph(); } std::unique_ptr Model__ToProto(Model* p) override { return std::make_unique(p->ToProto()); } @@ -1179,6 +1292,7 @@ struct ProviderHostImpl : ProviderHost { const std::string& GraphViewer__Name(const GraphViewer* p) noexcept override { return p->Name(); } const std::filesystem::path& GraphViewer__ModelPath(const GraphViewer* p) noexcept override { return p->ModelPath(); } + const ConstGraphNodes& GraphViewer__Nodes(const GraphViewer* p) noexcept override { return p->Nodes(); } const Node* GraphViewer__GetNode(const GraphViewer* p, NodeIndex node_index) override { return p->GetNode(node_index); } const NodeArg* 
GraphViewer__GetNodeArg(const GraphViewer* p, const std::string& name) override { return p->GetNodeArg(name); } @@ -1196,6 +1310,9 @@ struct ProviderHostImpl : ProviderHost { const std::vector& GraphViewer__GetInputs(const GraphViewer* p) noexcept override { return p->GetInputs(); } const std::vector& GraphViewer__GetOutputs(const GraphViewer* p) noexcept override { return p->GetOutputs(); } + bool GraphViewer__NodeProducesGraphOutput(const GraphViewer* p, const Node& node) override { + return p->NodeProducesGraphOutput(node); + } const std::unordered_set& GraphViewer__GetValueInfo(const GraphViewer* p) noexcept override { return p->GetValueInfo(); } const InitializedTensorSet& GraphViewer__GetAllInitializedTensors(const GraphViewer* p) override { return p->GetAllInitializedTensors(); } @@ -1224,6 +1341,21 @@ struct ProviderHostImpl : ProviderHost { const Node* GraphViewer__GetProducerNode(const GraphViewer* p, const std::string& node_arg_name) const override { return p->GetProducerNode(node_arg_name); } IOnnxRuntimeOpSchemaCollectionPtr GraphViewer__GetSchemaRegistry(const GraphViewer* p) const override { return p->GetSchemaRegistry(); } + // ConstGraphNodes + std::unique_ptr ConstGraphNodes__begin(const ConstGraphNodes* p) override { + return std::make_unique(p->begin()); + } + std::unique_ptr ConstGraphNodes__end(const ConstGraphNodes* p) override { + return std::make_unique(p->end()); + } + std::unique_ptr ConstGraphNodes__cbegin(const ConstGraphNodes* p) override { + return std::make_unique(p->cbegin()); + } + std::unique_ptr ConstGraphNodes__cend(const ConstGraphNodes* p) override { + return std::make_unique(p->cend()); + } + bool ConstGraphNodes__empty(const ConstGraphNodes* p) noexcept override { return p->empty(); } + // OpKernel (direct) const Node& OpKernel__Node(const OpKernel* p) override { return p->OpKernel::Node(); } @@ -1651,6 +1783,9 @@ static ProviderLibrary s_library_tensorrt(LIBRARY_PREFIX ORT_TSTR("onnxruntime_p ); static ProviderLibrary s_library_migraphx(LIBRARY_PREFIX ORT_TSTR("onnxruntime_providers_migraphx") LIBRARY_EXTENSION); +// QNN EP can be built either as a static library or a shared library. Can safely define s_library_qnn even if static. +static ProviderLibrary s_library_qnn(LIBRARY_PREFIX ORT_TSTR("onnxruntime_providers_qnn") LIBRARY_EXTENSION); + void UnloadSharedProviders() { s_library_dnnl.Unload(); s_library_vitisai.Unload(); @@ -1662,6 +1797,7 @@ void UnloadSharedProviders() { s_library_rocm.Unload(); s_library_shared.Unload(); s_library_migraphx.Unload(); + s_library_qnn.Unload(); } // Used by test code @@ -1832,6 +1968,20 @@ ProviderOptions OrtOpenVINOProviderOptionsToOrtOpenVINOProviderOptionsV2(const O return ov_options_converted_map; } +#if !BUILD_QNN_EP_STATIC_LIB +std::shared_ptr QNNProviderFactoryCreator::Create(const ProviderOptions& provider_options_map, + const SessionOptions* session_options) { + const ConfigOptions* config_options = nullptr; + if (session_options != nullptr) { + config_options = &session_options->config_options; + } + + std::array configs_array = {&provider_options_map, config_options}; + const void* arg = reinterpret_cast(&configs_array); + return s_library_qnn.Get().CreateExecutionProviderFactory(arg); +} +#endif // !BUILD_QNN_EP_STATIC_LIB + std::shared_ptr OpenVINOProviderFactoryCreator::Create( const ProviderOptions* provider_options_map, const SessionOptions* session_options) { // Append session options applicable for EP to EP Provider options. 
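For context on how the `configs_array` packed by `QNNProviderFactoryCreator::Create` above crosses the shared-library boundary: the factory receives a single opaque pointer, so the QNN EP side must unpack it with the same layout, `{&provider_options_map, config_options}`. The following is a minimal sketch of that receiving side, assuming the argument is a two-element `std::array<const void*, 2>` as packed above; the entry-point name and the placeholder types are illustrative only and not part of this patch.

#include <array>

namespace onnxruntime {

struct ProviderOptions;  // placeholder; the real type is a string-to-string options map
struct ConfigOptions;    // placeholder; session configuration options

// Hypothetical unpacking step inside the dynamically loaded QNN EP.
void CreateQnnFactoryFromPackedArg(const void* arg) {
  const auto& configs = *reinterpret_cast<const std::array<const void*, 2>*>(arg);
  const auto* provider_options = static_cast<const ProviderOptions*>(configs[0]);
  const auto* config_options = static_cast<const ConfigOptions*>(configs[1]);  // null when no session options were supplied
  // ... construct the execution provider factory from the two option sets ...
  (void)provider_options;
  (void)config_options;
}

}  // namespace onnxruntime

Note that `config_options` may legitimately be null, mirroring the null check in the packing code above.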
diff --git a/onnxruntime/python/tools/symbolic_shape_infer.py b/onnxruntime/python/tools/symbolic_shape_infer.py index 7a6028dfbe153..b9675d4280e59 100755 --- a/onnxruntime/python/tools/symbolic_shape_infer.py +++ b/onnxruntime/python/tools/symbolic_shape_infer.py @@ -205,6 +205,7 @@ def __init__(self, int_max, auto_merge, guess_output_rank, verbose, prefix=""): "GemmFastGelu": self._infer_GemmFastGelu, "GemmFloat8": self._infer_GemmFloat8, "GroupNorm": self._infer_GroupNorm, + "GroupNormalization": self._infer_GroupNorm, "GroupQueryAttention": self._infer_GroupQueryAttention, "LayerNormalization": self._infer_LayerNormalization, "LongformerAttention": self._infer_LongformerAttention, @@ -474,6 +475,7 @@ def _onnx_infer_single_node(self, node): "PythonOp", "MultiHeadAttention", "GroupNorm", + "GroupNormalization", "GroupQueryAttention", "SparseAttention", "SkipGroupNorm", diff --git a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py index 0384300b99445..7f418af06a4ec 100644 --- a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py +++ b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py @@ -6,6 +6,8 @@ Builds an Ubuntu-based Docker image with TensorRT. """ +from __future__ import annotations + import argparse import os import pty diff --git a/onnxruntime/test/fuzzing/include/OnnxPrediction.h b/onnxruntime/test/fuzzing/include/OnnxPrediction.h index c169aaa16fd6e..c99120dc45479 100644 --- a/onnxruntime/test/fuzzing/include/OnnxPrediction.h +++ b/onnxruntime/test/fuzzing/include/OnnxPrediction.h @@ -20,7 +20,7 @@ #include #include "BetaDistribution.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "onnxruntime_cxx_api.h" #include "testlog.h" diff --git a/onnxruntime/test/fuzzing/ort_libfuzzer/OrtProtoLibfuzzer.cpp b/onnxruntime/test/fuzzing/ort_libfuzzer/OrtProtoLibfuzzer.cpp index 607d9cfd9c755..472122be58e89 100644 --- a/onnxruntime/test/fuzzing/ort_libfuzzer/OrtProtoLibfuzzer.cpp +++ b/onnxruntime/test/fuzzing/ort_libfuzzer/OrtProtoLibfuzzer.cpp @@ -5,7 +5,7 @@ #include "OnnxPrediction.h" #include "onnxruntime_session_options_config_keys.h" #include "src/libfuzzer/libfuzzer_macro.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include diff --git a/onnxruntime/test/mlas/bench/bench_hgemm.cpp b/onnxruntime/test/mlas/bench/bench_hgemm.cpp new file mode 100644 index 0000000000000..1e8b0eb7c34d6 --- /dev/null +++ b/onnxruntime/test/mlas/bench/bench_hgemm.cpp @@ -0,0 +1,86 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "mlas.h" +#include "bench_util.h" +#include "core/util/thread_utils.h" + +#include +#include + +static const std::vector hgemm_bench_arg_names = {"M", "N", "K"}; + +void HGEMM(benchmark::State& state, bool transA, bool transB) { + if (state.range(0) <= 0) throw std::invalid_argument("M must greater than 0!"); + if (state.range(1) <= 0) throw std::invalid_argument("N must greater than 0!"); + if (state.range(2) <= 0) throw std::invalid_argument("K must greater than 0!"); + const size_t M = static_cast(state.range(0)); + const size_t N = static_cast(state.range(1)); + const size_t K = static_cast(state.range(2)); + + auto A = RandomVectorUniform(static_cast(M * K), MLAS_FP16(-1.0f), MLAS_FP16(1.0f)); + auto B = RandomVectorUniform(static_cast(N * K), MLAS_FP16(-1.0f), MLAS_FP16(1.0f)); + std::vector C(static_cast(M * N)); + + MLAS_FP16 alpha = MLAS_FP16(1.0f); + MLAS_FP16 beta = MLAS_FP16(0.0f); + OrtThreadPoolParams tpo; + tpo.thread_pool_size = 8; + tpo.auto_set_affinity = true; + std::unique_ptr tp( + onnxruntime::concurrency::CreateThreadPool(&onnxruntime::Env::Default(), + tpo, onnxruntime::concurrency::ThreadPoolType::INTRA_OP)); + MlasGemm( + transA ? CblasTrans : CblasNoTrans, + transB ? CblasTrans : CblasNoTrans, + static_cast(M), + static_cast(N), + static_cast(K), + A.data(), + transA ? M : K, + B.data(), + transB ? K : N, + C.data(), + N, + alpha.val, + beta.val, + tp.get()); + + for (auto _ : state) { + MlasGemm( + transA ? CblasTrans : CblasNoTrans, + transB ? CblasTrans : CblasNoTrans, + static_cast(M), + static_cast(N), + static_cast(K), + A.data(), + transA ? M : K, + B.data(), + transB ? K : N, + C.data(), + N, + alpha.val, + beta.val, + tp.get()); + } +} + +static void GemmSizeWithOne(benchmark::internal::Benchmark* b) { + b->ArgNames(hgemm_bench_arg_names); + b->ArgsProduct({{1}, {63, 255, 1023}, {63, 255, 1023}}); + b->ArgsProduct({{63, 255, 1023}, {1}, {63, 255, 1023}}); + b->ArgsProduct({{63, 255, 1023}, {63, 255, 1023}, {1}}); +} +BENCHMARK_CAPTURE(HGEMM, GEMV_TransB, false, true)->Apply(GemmSizeWithOne)->UseRealTime(); + +static void GemmSizeProducts(benchmark::internal::Benchmark* b) { + b->ArgNames(hgemm_bench_arg_names); + b->ArgsProduct({{63, 255, 1023}, {63, 255, 1023}, {63, 255, 1023}}); +} +BENCHMARK_CAPTURE(HGEMM, NORMAL_TransB, false, true)->Apply(GemmSizeProducts)->UseRealTime(); + +static void GemmLLMSizeProducts(benchmark::internal::Benchmark* b) { + b->ArgNames(hgemm_bench_arg_names); + b->ArgsProduct({{1, 1024, 2048}, {4096, 11008}, {4096, 11008}}); +} +BENCHMARK_CAPTURE(HGEMM, LLM, false, true)->Apply(GemmLLMSizeProducts)->UseRealTime(); diff --git a/onnxruntime/test/mlas/unittest/test_hgemm_neon.cpp b/onnxruntime/test/mlas/unittest/test_hgemm_neon.cpp new file mode 100644 index 0000000000000..4f3d690b432bf --- /dev/null +++ b/onnxruntime/test/mlas/unittest/test_hgemm_neon.cpp @@ -0,0 +1,393 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + test_hgemm_neon.cpp + +Abstract: + + Tests for MLAS fp16 GEMM on ARM CPU. 
diff --git a/onnxruntime/test/mlas/unittest/test_hgemm_neon.cpp b/onnxruntime/test/mlas/unittest/test_hgemm_neon.cpp new file mode 100644 index 0000000000000..4f3d690b432bf --- /dev/null +++ b/onnxruntime/test/mlas/unittest/test_hgemm_neon.cpp @@ -0,0 +1,393 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + test_hgemm_neon.cpp + +Abstract: + + Tests for MLAS fp16 GEMM on ARM CPU. + +--*/ + +#include <cmath> +#include <random> + +#include "test/mlas/unittest/test_util.h" +#include "core/mlas/lib/mlasi.h" +#include "core/mlas/lib/halfgemm.h" + +#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64) + +class MlasNeonHGemmPackBTest : public MlasTestBase { + private: + std::random_device rd_; + unsigned int seed_; + std::mt19937 gen_; // mersenne_twister_engine seeded with rd() + std::uniform_real_distribution<float> distrib_; + MatrixGuardBuffer<MLAS_FP16> input_, ref_, packed_; + + template <size_t N, size_t K> + MLAS_FORCEINLINE void PackB(const MLAS_FP16* src, MLAS_FP16* dst) { + size_t i = 0; + for (; i + 16 <= N; i += 16) { + for (size_t j = 0; j < K; ++j) { + for (size_t k = 0; k < 16; ++k) { + *dst = src[(i + k) * K + j]; + ++dst; + } + } + } + if (i + 8 <= N) { + for (size_t j = 0; j < K; ++j) { + for (size_t k = 0; k < 8; ++k) { + *dst = src[(i + k) * K + j]; + ++dst; + } + } + i += 8; + } + if (i < N) { + for (size_t j = 0; j < K; ++j) { + for (size_t k = 0; k < N - i; ++k) { + *dst = src[(i + k) * K + j]; + ++dst; + } + dst += 8 - (N - i); + } + } + } + + template <size_t N, size_t K> + MLAS_FORCEINLINE void Check(const MLAS_FP16* packed, const MLAS_FP16* ref) { + size_t n = ((N + 7) & ~7) * K; + for (size_t i = 0; i < n; ++i) { + ASSERT_EQ(packed[i].val, ref[i].val) << " seed " << seed_ << " i " << i; + } + } + + template <size_t N, size_t K> + void TestPackB() { + auto InitializeBuffer = [this](MLAS_FP16* buffer, size_t count) { + for (size_t i = 0; i < count; i++) { + buffer[i] = MLAS_FP16(distrib_(gen_)); + } + }; + + const auto* input = input_.GetFilledBuffer(N * K, InitializeBuffer); + auto* packed = packed_.GetBuffer(K * ((N + 7) & ~7), true); + auto* ref = ref_.GetBuffer(K * ((N + 7) & ~7), true); + hgemm_neon::HPackB_TransposedB_Kernel(input, packed, N, K, K); + PackB<N, K>(input, ref); + Check<N, K>(packed, ref); + } + + public: + MlasNeonHGemmPackBTest() + : seed_(rd_()), gen_(seed_), distrib_(-100.f, 100.f) { + } + + static const char* GetTestSuiteName() { + return "NeonHGemmPackB"; + } + + void ExecuteShort(void) override { + TestPackB<1, 1>(); + TestPackB<1, 15>(); + TestPackB<1, 31>(); + TestPackB<8, 1>(); + TestPackB<8, 16>(); + TestPackB<9, 31>(); + TestPackB<9, 33>(); + TestPackB<15, 33>(); + TestPackB<17, 67>(); + TestPackB<17, 96>(); + TestPackB<265, 263>(); + } +};
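The reference PackB above pins down the packed-B layout the NEON kernel is tested against: B is stored transposed (N x K), and columns are emitted in panels 16 wide, then one panel 8 wide, then a tail panel zero-padded to width 8. An equivalent NumPy sketch of that layout (assumes numpy; it mirrors the reference loops and is not ORT code):

    import numpy as np

    def pack_b_transposed(B):  # B has shape (N, K), as in the C++ reference
        N, K = B.shape
        panels, n = [], 0
        while n + 16 <= N:                        # full 16-wide panels
            panels.append(B[n:n + 16].T.ravel())  # K rows of 16 contiguous values
            n += 16
        if n + 8 <= N:                            # one 8-wide panel
            panels.append(B[n:n + 8].T.ravel())
            n += 8
        if n < N:                                 # tail, zero-padded to width 8
            tail = np.zeros((8, K), dtype=B.dtype)
            tail[:N - n] = B[n:]
            panels.append(tail.T.ravel())
        return np.concatenate(panels)

    B = np.arange(9 * 3, dtype=np.float16).reshape(9, 3)  # N=9, K=3
    print(pack_b_transposed(B).size)  # 48 == K * ((N + 7) & ~7)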
+ +class MlasNeonHGemmTransposedBTest : public MlasTestBase { + private: + std::random_device rd_; + unsigned int seed_; + std::mt19937 gen_; // mersenne_twister_engine seeded with rd() + std::uniform_real_distribution<float> distrib_; + MatrixGuardBuffer<MLAS_FP16> A_, B_, ref_, C_; + + template <size_t M, size_t N, size_t K> + MLAS_FORCEINLINE void HGemm(const MLAS_FP16* A, const MLAS_FP16* B, MLAS_FP16* C, MLAS_FP16 alpha, MLAS_FP16 beta) { + float alphaf = alpha.ToFloat(); + float betaf = beta.ToFloat(); + for (size_t m = 0; m < M; ++m) { + for (size_t n = 0; n < N; ++n) { + float accu = 0.0f; + for (size_t k = 0; k < K; ++k) { + accu += (A[m * K + k].ToFloat()) * (B[n * K + k].ToFloat()); + } + C[m * N + n] = MLAS_FP16(accu * alphaf + C[m * N + n].ToFloat() * betaf); + } + } + } + + MLAS_FORCEINLINE + bool FloatEqual(MLAS_FP16 v0, MLAS_FP16 v1, float rtol, float atol) { + float f0 = v0.ToFloat(), f1 = v1.ToFloat(); + return std::abs(f0 - f1) <= std::abs(f1 * rtol) + atol; + } + + template <size_t M, size_t N, size_t K> + MLAS_FORCEINLINE void Check(const MLAS_FP16* C, const MLAS_FP16* ref) { + size_t n = M * N; + for (size_t i = 0; i < n; ++i) { + ASSERT_TRUE(FloatEqual(C[i], ref[i], 0.02f, 0.055f)) + << " seed " << seed_ << " i " << i + << " M " << M << " N " << N << " K " << K + << " v0 " << C[i] << " v1 " << ref[i]; + } + } + + template <size_t M, size_t N, size_t K> + void TestHGemm(MLAS_FP16 alpha, MLAS_FP16 beta) { + auto InitializeBuffer = [this](MLAS_FP16* buffer, size_t count) { + for (size_t i = 0; i < count; i++) { + buffer[i] = MLAS_FP16(distrib_(gen_)); + } + }; + + const auto* A = A_.GetFilledBuffer(M * K, InitializeBuffer); + const auto* B = B_.GetFilledBuffer(K * N, InitializeBuffer); + auto* C = C_.GetBuffer(M * N, true); + auto* ref = ref_.GetBuffer(M * N, true); + hgemm_neon::HGemm_TransposedB_Kernel(A, B, C, M, N, K, K, K, N, alpha.val, beta.val); + HGemm<M, N, K>(A, B, ref, alpha, beta); + Check<M, N, K>(C, ref); + } + + public: + MlasNeonHGemmTransposedBTest() + : seed_(1928375), gen_(seed_), distrib_(-1.f, 1.f) { + } + + static const char* GetTestSuiteName() { + return "NeonHGemmTransposedB"; + } + + void ExecuteShort(void) override { + TestHGemm<2, 1, 1>(MLAS_FP16(1.0f), MLAS_FP16(0.0f)); + TestHGemm<1, 1, 1>(MLAS_FP16(0.5f), MLAS_FP16(1.0f)); + TestHGemm<2, 1, 1>(MLAS_FP16(1.5f), MLAS_FP16(0.5f)); + TestHGemm<1, 15, 17>(MLAS_FP16(1.0f), MLAS_FP16(0.0f)); + TestHGemm<2, 17, 15>(MLAS_FP16(0.5f), MLAS_FP16(1.0f)); + TestHGemm<1, 17, 15>(MLAS_FP16(1.5f), MLAS_FP16(0.5f)); + TestHGemm<1, 33, 31>(MLAS_FP16(1.0f), MLAS_FP16(0.0f)); + TestHGemm<2, 31, 32>(MLAS_FP16(0.5f), MLAS_FP16(1.0f)); + TestHGemm<1, 32, 33>(MLAS_FP16(1.5f), MLAS_FP16(0.5f)); + TestHGemm<1, 78, 263>(MLAS_FP16(0.5f), MLAS_FP16(0.0f)); + TestHGemm<2, 267, 79>(MLAS_FP16(1.5f), MLAS_FP16(1.0f)); + } +};
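FloatEqual above mixes a relative and an absolute tolerance, |v0 - v1| <= |v1| * rtol + atol, with rtol=0.02 and atol=0.055 sized for fp16 rounding error: the relative term governs large magnitudes and the absolute term takes over near zero. The same predicate checked in Python:

    def float_equal(v0, v1, rtol=0.02, atol=0.055):
        # Relative term scaled by the reference value, plus a flat
        # absolute term so values near zero still compare sanely.
        return abs(v0 - v1) <= abs(v1) * rtol + atol

    print(float_equal(10.0, 10.19))  # True: 0.19 <= 10.19*0.02 + 0.055
    print(float_equal(0.0, 0.05))    # True: the absolute term dominates
    print(float_equal(10.0, 10.30))  # False: 0.30 > 10.30*0.02 + 0.055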
+ +class MlasNeonHGemmTransposedPackedBTest : public MlasTestBase { + private: + std::random_device rd_; + unsigned int seed_; + std::mt19937 gen_; // mersenne_twister_engine seeded with rd() + std::uniform_real_distribution<float> distrib_; + MatrixGuardBuffer<MLAS_FP16> A_, B_, ref_, C_; + + template <size_t M, size_t N, size_t K> + MLAS_FORCEINLINE void HGemm(const MLAS_FP16* A, const MLAS_FP16* B, MLAS_FP16* C, MLAS_FP16 alpha, MLAS_FP16 beta) { + float alphaf = alpha.ToFloat(); + float betaf = beta.ToFloat(); + size_t n = 0; + for (; n + 16 <= N; n += 16) { + for (size_t i = 0; i < 16; ++i) { + for (size_t m = 0; m < M; ++m) { + float accu = 0.0f; + for (size_t k = 0; k < K; ++k) { + accu += (A[m * K + k].ToFloat()) * (B[n * K + k * 16 + i].ToFloat()); + } + C[m * N + n + i] = MLAS_FP16(accu * alphaf + C[m * N + n + i].ToFloat() * betaf); + } + } + } + if (n + 8 <= N) { + for (size_t i = 0; i < 8; ++i) { + for (size_t m = 0; m < M; ++m) { + float accu = 0.0f; + for (size_t k = 0; k < K; ++k) { + accu += (A[m * K + k].ToFloat()) * (B[n * K + k * 8 + i].ToFloat()); + } + C[m * N + n + i] = MLAS_FP16(accu * alphaf + C[m * N + n + i].ToFloat() * betaf); + } + } + n += 8; + } + if (n < N) { + for (size_t i = 0; i < N - n; ++i) { + for (size_t m = 0; m < M; ++m) { + float accu = 0.0f; + for (size_t k = 0; k < K; ++k) { + accu += (A[m * K + k].ToFloat()) * (B[n * K + k * 8 + i].ToFloat()); + } + C[m * N + n + i] = MLAS_FP16(accu * alphaf + C[m * N + n + i].ToFloat() * betaf); + } + } + } + } + + MLAS_FORCEINLINE + bool FloatEqual(MLAS_FP16 v0, MLAS_FP16 v1, float rtol, float atol) { + float f0 = v0.ToFloat(), f1 = v1.ToFloat(); + return std::abs(f0 - f1) <= std::abs(f1 * rtol) + atol; + } + + template <size_t M, size_t N, size_t K> + MLAS_FORCEINLINE void Check(const MLAS_FP16* C, const MLAS_FP16* ref) { + size_t n = M * N; + for (size_t i = 0; i < n; ++i) { + ASSERT_TRUE(FloatEqual(C[i], ref[i], 0.02f, 0.055f)) + << " seed " << seed_ << " i " << i + << " M " << M << " K " << K << " N " << N + << " v0 " << C[i] << " v1 " << ref[i]; + } + } + + template <size_t M, size_t N, size_t K> + void TestHGemm(MLAS_FP16 alpha, MLAS_FP16 beta) { + auto InitializeBuffer = [this](MLAS_FP16* buffer, size_t count) { + for (size_t i = 0; i < count; i++) { + buffer[i] = MLAS_FP16(distrib_(gen_)); + } + }; + + const auto* A = A_.GetFilledBuffer(M * K, InitializeBuffer); + const auto* B = B_.GetFilledBuffer(K * ((N + 7) & ~7), InitializeBuffer); + auto* C = C_.GetBuffer(M * N, true); + auto* ref = ref_.GetBuffer(M * N, true); + hgemm_neon::HGemm_TransposedPackedB_Kernel(A, B, C, M, N, K, K, N, alpha.val, beta.val); + HGemm<M, N, K>(A, B, ref, alpha, beta); + Check<M, N, K>(C, ref); + } + + public: + MlasNeonHGemmTransposedPackedBTest() + : seed_(1928372), gen_(seed_), distrib_(-1.f, 1.f) { + } + + static const char* GetTestSuiteName() { + return "NeonHGemmTransposedPackedB"; + } + + void ExecuteShort(void) override { + TestHGemm<2, 1, 1>(MLAS_FP16(1.0f), MLAS_FP16(0.0f)); + TestHGemm<1, 1, 1>(MLAS_FP16(0.5f), MLAS_FP16(1.0f)); + TestHGemm<2, 1, 1>(MLAS_FP16(1.5f), MLAS_FP16(0.5f)); + TestHGemm<1, 15, 17>(MLAS_FP16(1.0f), MLAS_FP16(0.0f)); + TestHGemm<2, 17, 15>(MLAS_FP16(0.5f), MLAS_FP16(1.0f)); + TestHGemm<1, 17, 15>(MLAS_FP16(1.5f), MLAS_FP16(0.5f)); + TestHGemm<1, 33, 31>(MLAS_FP16(1.0f), MLAS_FP16(0.0f)); + TestHGemm<2, 31, 32>(MLAS_FP16(0.5f), MLAS_FP16(1.0f)); + TestHGemm<1, 32, 33>(MLAS_FP16(1.5f), MLAS_FP16(0.5f)); + TestHGemm<1, 78, 263>(MLAS_FP16(0.5f), MLAS_FP16(0.0f)); + TestHGemm<2, 267, 79>(MLAS_FP16(1.5f), MLAS_FP16(1.0f)); + } +};
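The packed-B reference above reads elements through the panel layout directly: inside a 16-wide panel that starts at column n, element (row k, column n+i) sits at offset n*K + k*16 + i. A small self-contained NumPy check of that indexing for one full panel (illustrative only):

    import numpy as np

    N, K = 16, 5
    B = np.arange(N * K, dtype=np.float16).reshape(N, K)  # transposed storage
    packed = B.T.ravel()  # one 16-wide panel: K rows of 16 contiguous columns

    n, k, i = 0, 3, 7
    # Same arithmetic as the reference's B[n * K + k * 16 + i] access.
    assert packed[n * K + k * 16 + i] == B[n + i, k]
    print("packed panel indexing recovers the transposed source")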
+ +class MlasNeonHGemmTest : public MlasTestBase { + private: + std::random_device rd_; + unsigned int seed_; + std::mt19937 gen_; // mersenne_twister_engine seeded with rd() + std::uniform_real_distribution<float> distrib_; + MatrixGuardBuffer<MLAS_FP16> A_, B_, ref_, C_; + + template <size_t M, size_t N, size_t K> + MLAS_FORCEINLINE void HGemm(const MLAS_FP16* A, const MLAS_FP16* B, MLAS_FP16* C, MLAS_FP16 alpha, MLAS_FP16 beta) { + float alphaf = alpha.ToFloat(); + float betaf = beta.ToFloat(); + for (size_t i = 0; i < M; ++i) { + for (size_t j = 0; j < N; ++j) { + float accu = 0.0f; + for (size_t k = 0; k < K; ++k) { + accu += (A[i * K + k].ToFloat()) * (B[j * K + k].ToFloat()); + } + C[i * N + j] = MLAS_FP16(accu * alphaf + C[i * N + j].ToFloat() * betaf); + } + } + } + + MLAS_FORCEINLINE + bool FloatEqual(MLAS_FP16 v0, MLAS_FP16 v1, float rtol, float atol) { + float f0 = v0.ToFloat(), f1 = v1.ToFloat(); + return std::abs(f0 - f1) <= std::abs(f1 * rtol) + atol; + } + + template <size_t M, size_t N, size_t K> + MLAS_FORCEINLINE void Check(const MLAS_FP16* C, const MLAS_FP16* ref) { + for (size_t i = 0; i < M; ++i) { + for (size_t j = 0; j < N; ++j) { + ASSERT_TRUE(FloatEqual(C[i * N + j], ref[i * N + j], 0.02f, 0.055f)) + << " seed " << seed_ << " i " << i << " j " << j + << " M " << M << " K " << K << " N " << N + << " v0 " << C[i * N + j] << " v1 " << ref[i * N + j]; + } + } + } + + template <size_t M, size_t N, size_t K> + void TestHGemm(MLAS_FP16 alpha, MLAS_FP16 beta) { + auto InitializeBuffer = [this](MLAS_FP16* buffer, size_t count) { + for (size_t i = 0; i < count; i++) { + buffer[i] = MLAS_FP16(distrib_(gen_)); + } + }; + + const auto* A = A_.GetFilledBuffer(M * K, InitializeBuffer); + const auto* B = B_.GetFilledBuffer(K * N, InitializeBuffer); + auto* C = C_.GetBuffer(M * N, true); + auto* ref = ref_.GetBuffer(M * N, true); + MlasGemm(CblasNoTrans, CblasTrans, M, N, K, A, K, B, K, C, N, alpha.val, beta.val, nullptr); + HGemm<M, N, K>(A, B, ref, alpha, beta); + Check<M, N, K>(C, ref); + } + + public: + MlasNeonHGemmTest() + : seed_(192837), gen_(seed_), distrib_(-0.25f, 0.25f) { + } + + static const char* GetTestSuiteName() { + return "NeonHGemm"; + } + + void ExecuteShort(void) override { + TestHGemm<2, 1, 1>(MLAS_FP16(1.0f), MLAS_FP16(0.0f)); + TestHGemm<1, 128, 512>(MLAS_FP16(0.5f), MLAS_FP16(1.0f)); + TestHGemm<2, 128, 513>(MLAS_FP16(1.5f), MLAS_FP16(0.5f)); + TestHGemm<1, 128, 511>(MLAS_FP16(1.0f), MLAS_FP16(0.0f)); + TestHGemm<2, 129, 512>(MLAS_FP16(0.5f), MLAS_FP16(1.0f)); + TestHGemm<1, 127, 512>(MLAS_FP16(1.5f), MLAS_FP16(0.5f)); + TestHGemm<1, 513, 1023>(MLAS_FP16(0.5f), MLAS_FP16(1.0f)); + TestHGemm<2, 511, 1025>(MLAS_FP16(1.5f), MLAS_FP16(0.5f)); + TestHGemm<127, 513, 1023>(MLAS_FP16(1.0f), MLAS_FP16(0.0f)); + TestHGemm<129, 511, 1025>(MLAS_FP16(0.5f), MLAS_FP16(1.0f)); + } +}; + +static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_execute) { + size_t count = 0; + if (is_short_execute) { + count += MlasDirectShortExecuteTests<MlasNeonHGemmPackBTest>::RegisterShortExecute(); + count += MlasDirectShortExecuteTests<MlasNeonHGemmTransposedBTest>::RegisterShortExecute(); + count += MlasDirectShortExecuteTests<MlasNeonHGemmTransposedPackedBTest>::RegisterShortExecute(); + count += MlasDirectShortExecuteTests<MlasNeonHGemmTest>::RegisterShortExecute(); + } + return count; +}); + +#endif // defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64) diff --git a/onnxruntime/test/onnx/TestCase.cc b/onnxruntime/test/onnx/TestCase.cc index b9b69fdc74b4d..d44f098db6c4c 100644 --- a/onnxruntime/test/onnx/TestCase.cc +++ b/onnxruntime/test/onnx/TestCase.cc @@ -961,6 +961,7 @@ std::unique_ptr<std::set<BrokenTest>> GetBrokenTests(const std::string& provider {"reduce_prod_empty_set", "unknown version", {}}, {"reduce_sum_empty_set", "unknown version", {}}, {"reduce_sum_square_empty_set_expanded", "unknown version", {}}, + {"averagepool_3d_dilations_large_count_include_pad_is_1_ceil_mode_is_True", "TODO(titaiwang): enable this in the next ONNX release."}, #ifdef ENABLE_TRAINING_CORE {"adagrad", "not a registered function/op", {}}, // Op not registered. {"adagrad_multiple", "not a registered function/op", {}}, // Op not registered.
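All four test classes above share the same oracle recipe: accumulate A times B-transposed in fp32, apply alpha and beta, and round to fp16 once at the end, which is what makes the 2% / 0.055 tolerance workable. The equivalent computation in NumPy (assumes numpy; a sketch of the test oracle, not the MLAS kernel):

    import numpy as np

    def ref_hgemm(A, B, C, alpha, beta):
        # A: (M, K), B: (N, K) transposed storage, C: (M, N), all fp16.
        acc = A.astype(np.float32) @ B.astype(np.float32).T
        return (acc * alpha + C.astype(np.float32) * beta).astype(np.float16)

    rng = np.random.default_rng(0)
    A = rng.uniform(-0.25, 0.25, (4, 8)).astype(np.float16)
    B = rng.uniform(-0.25, 0.25, (3, 8)).astype(np.float16)
    C = np.zeros((4, 3), dtype=np.float16)
    print(ref_hgemm(A, B, C, alpha=1.0, beta=0.0))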
diff --git a/onnxruntime/test/providers/coreml/coreml_basic_test.cc b/onnxruntime/test/providers/coreml/coreml_basic_test.cc index 302ad57fb88c5..a9aa78b7a3229 100644 --- a/onnxruntime/test/providers/coreml/coreml_basic_test.cc +++ b/onnxruntime/test/providers/coreml/coreml_basic_test.cc @@ -15,7 +15,7 @@ #include "test/util/include/inference_session_wrapper.h" #include "test/util/include/test_environment.h" #include "test/util/include/test_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #if !defined(ORT_MINIMAL_BUILD) // if this is a full build we need the provider test utils diff --git a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc index a340f975ec91a..24a8c8491b632 100644 --- a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc @@ -1030,6 +1030,31 @@ TEST(PoolTest, AveragePool_19_dilation_2d) { kTensorrtExecutionProvider, kAclExecutionProvider, kOpenVINOExecutionProvider}); } +TEST(PoolTest, AveragePool_19_ceil_count_include_pad_1d) { + // TODO: Unskip when fixed #41968513 + if (DefaultDmlExecutionProvider().get() != nullptr) { + GTEST_SKIP() << "Skipping because of the following error: MLOperatorAuthorImpl.cpp(2100): The parameter is incorrect."; + } + + OpTester test("AveragePool", 19); + + test.AddAttribute("auto_pad", ""); + test.AddAttribute("strides", std::vector<int64_t>{3}); + test.AddAttribute("pads", vector<int64_t>{3, 3}); + test.AddAttribute("kernel_shape", vector<int64_t>{7}); + test.AddAttribute("ceil_mode", (int64_t)1); + test.AddAttribute("count_include_pad", (int64_t)1); + + std::vector<float> x_vals = {2.0903f, 4.6493f, 1.6320f, -3.2051f, 4.6975f, 4.7296f, 3.3653f, -1.5815f, -2.3832f, 0.9628f, -1.5899f, -2.6820f, 5.7529f, 7.7346f, -0.8910f, -2.0151f, 0.1313f, -0.5374f}; + std::vector<int64_t> x_dims = {1, 2, 9}; + std::vector<int64_t> expected_dims = {1, 2, 4}; + std::vector<float> expected_vals = {0.73807144f, 2.5655572f, 0.8032287f, -0.09990001f, 0.34911433f, 1.0389f, 1.4536142f, -0.40353334f}; + + test.AddInput<float>("X", x_dims, x_vals); + test.AddOutput<float>("Y", expected_dims, expected_vals); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kAclExecutionProvider, kOpenVINOExecutionProvider}); +} + TEST(PoolTest, GlobalAveragePool) { OpTester test("GlobalAveragePool"); diff --git a/onnxruntime/test/providers/qnn/argmaxmin_op_test.cc b/onnxruntime/test/providers/qnn/argmaxmin_op_test.cc index c514cf16b2f3c..da6eda1317778 100644 --- a/onnxruntime/test/providers/qnn/argmaxmin_op_test.cc +++ b/onnxruntime/test/providers/qnn/argmaxmin_op_test.cc @@ -9,7 +9,7 @@ #include "core/graph/node_attr_utils.h" #include "test/providers/qnn/qnn_test_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" namespace onnxruntime { diff --git a/onnxruntime/test/providers/qnn/average_pool_test.cc b/onnxruntime/test/providers/qnn/average_pool_test.cc index 1a0f9bfcbae97..f897a08da6b2e 100644 --- a/onnxruntime/test/providers/qnn/average_pool_test.cc +++ b/onnxruntime/test/providers/qnn/average_pool_test.cc @@ -11,7 +11,7 @@ #include "test/optimizer/qdq_test_utils.h" #include "test/providers/qnn/qnn_test_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" diff --git a/onnxruntime/test/providers/qnn/cast_test.cc b/onnxruntime/test/providers/qnn/cast_test.cc index 9b83dd281a56d..e2e4b0d714e54 100644 --- a/onnxruntime/test/providers/qnn/cast_test.cc +++ b/onnxruntime/test/providers/qnn/cast_test.cc
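The expected values in the new AveragePool test can be reproduced by hand. With kernel 7, stride 3, pads (3, 3), ceil_mode=1 and count_include_pad=1 on a length-9 input, the output length is ceil((9 + 6 - 7) / 3) + 1 = 4; padding positions count toward the divisor, but positions a ceil-mode window slides past the padded extent do not, which is what the last output of each channel exercises (a 7-tap window divided by 6). A NumPy sketch that reproduces the first channel (illustrative, not the ORT kernel):

    import numpy as np

    x = np.array([2.0903, 4.6493, 1.6320, -3.2051, 4.6975,
                  4.7296, 3.3653, -1.5815, -2.3832])
    kernel, stride, pad = 7, 3, 3

    L = x.size
    out_len = -(-(L + 2 * pad - kernel) // stride) + 1  # ceil division -> 4
    for m in range(out_len):
        start = m * stride - pad
        # Drop positions beyond the padded extent [-pad, L + pad - 1];
        # in-range padding contributes 0 but still counts in the divisor.
        pos = [p for p in range(start, start + kernel) if p < L + pad]
        total = sum(x[p] for p in pos if 0 <= p < L)
        print(round(total / len(pos), 6))
    # prints 0.738071, 2.565557, 0.803229, -0.0999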
@@ -9,7 +9,7 @@ #include "test/optimizer/qdq_test_utils.h" #include "test/providers/qnn/qnn_test_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" diff --git a/onnxruntime/test/providers/qnn/clip_op_test.cc b/onnxruntime/test/providers/qnn/clip_op_test.cc index cfa77a46210b3..21bd6fcc98d74 100644 --- a/onnxruntime/test/providers/qnn/clip_op_test.cc +++ b/onnxruntime/test/providers/qnn/clip_op_test.cc @@ -8,7 +8,7 @@ #include "test/providers/qnn/qnn_test_utils.h" #include "core/graph/node_attr_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" namespace onnxruntime { diff --git a/onnxruntime/test/providers/qnn/flatten_op_test.cc b/onnxruntime/test/providers/qnn/flatten_op_test.cc index 637d3257ddea7..b33f8f9c00fc4 100644 --- a/onnxruntime/test/providers/qnn/flatten_op_test.cc +++ b/onnxruntime/test/providers/qnn/flatten_op_test.cc @@ -8,7 +8,7 @@ #include "test/providers/qnn/qnn_test_utils.h" #include "core/graph/node_attr_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" namespace onnxruntime { diff --git a/onnxruntime/test/providers/qnn/gather_elems_op_test.cc b/onnxruntime/test/providers/qnn/gather_elems_op_test.cc index 81c08873064c8..85dc792666827 100644 --- a/onnxruntime/test/providers/qnn/gather_elems_op_test.cc +++ b/onnxruntime/test/providers/qnn/gather_elems_op_test.cc @@ -11,7 +11,7 @@ #include "test/optimizer/qdq_test_utils.h" #include "test/providers/qnn/qnn_test_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" diff --git a/onnxruntime/test/providers/qnn/gemm_op_test.cc b/onnxruntime/test/providers/qnn/gemm_op_test.cc index da0c7f2c36854..0c1146ba22360 100644 --- a/onnxruntime/test/providers/qnn/gemm_op_test.cc +++ b/onnxruntime/test/providers/qnn/gemm_op_test.cc @@ -9,7 +9,7 @@ #include "test/providers/qnn/qnn_test_utils.h" #include "core/graph/node_attr_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" namespace onnxruntime { diff --git a/onnxruntime/test/providers/qnn/logical_comp_ops_test.cc b/onnxruntime/test/providers/qnn/logical_comp_ops_test.cc index 59105136781f4..522b781379119 100644 --- a/onnxruntime/test/providers/qnn/logical_comp_ops_test.cc +++ b/onnxruntime/test/providers/qnn/logical_comp_ops_test.cc @@ -9,7 +9,7 @@ #include "test/optimizer/qdq_test_utils.h" #include "test/providers/qnn/qnn_test_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" diff --git a/onnxruntime/test/providers/qnn/lrn_op_test.cc b/onnxruntime/test/providers/qnn/lrn_op_test.cc index a99cba66bf167..4b26ed0da93c7 100644 --- a/onnxruntime/test/providers/qnn/lrn_op_test.cc +++ b/onnxruntime/test/providers/qnn/lrn_op_test.cc @@ -9,7 +9,7 @@ #include "test/optimizer/qdq_test_utils.h" #include "test/providers/qnn/qnn_test_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" diff --git a/onnxruntime/test/providers/qnn/matmul_test.cpp b/onnxruntime/test/providers/qnn/matmul_test.cpp index f3f584f24a102..dec9369b81748 100644 --- a/onnxruntime/test/providers/qnn/matmul_test.cpp +++ b/onnxruntime/test/providers/qnn/matmul_test.cpp @@ -8,7 +8,7 @@ #include "test/providers/qnn/qnn_test_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" diff --git a/onnxruntime/test/providers/qnn/max_min_op_test.cc 
b/onnxruntime/test/providers/qnn/max_min_op_test.cc index 3deff121f3c72..9a45d11b7e34f 100644 --- a/onnxruntime/test/providers/qnn/max_min_op_test.cc +++ b/onnxruntime/test/providers/qnn/max_min_op_test.cc @@ -7,7 +7,7 @@ #include "test/providers/qnn/qnn_test_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" namespace onnxruntime { diff --git a/onnxruntime/test/providers/qnn/pad_op_test.cpp b/onnxruntime/test/providers/qnn/pad_op_test.cpp index a6b8664c6c0c9..4ce6db7facc69 100644 --- a/onnxruntime/test/providers/qnn/pad_op_test.cpp +++ b/onnxruntime/test/providers/qnn/pad_op_test.cpp @@ -10,7 +10,7 @@ #include "test/optimizer/qdq_test_utils.h" #include "test/providers/qnn/qnn_test_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" diff --git a/onnxruntime/test/providers/qnn/pool_op_test.cpp b/onnxruntime/test/providers/qnn/pool_op_test.cpp index 5dd3a6aaa3620..f0ca3557191c7 100644 --- a/onnxruntime/test/providers/qnn/pool_op_test.cpp +++ b/onnxruntime/test/providers/qnn/pool_op_test.cpp @@ -10,7 +10,7 @@ #include "test/optimizer/qdq_test_utils.h" #include "test/providers/qnn/qnn_test_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" diff --git a/onnxruntime/test/providers/qnn/qnn_basic_test.cc b/onnxruntime/test/providers/qnn/qnn_basic_test.cc index 92ec4ba3b0d28..a6fb66472844a 100644 --- a/onnxruntime/test/providers/qnn/qnn_basic_test.cc +++ b/onnxruntime/test/providers/qnn/qnn_basic_test.cc @@ -6,7 +6,9 @@ #include #include "core/providers/cpu/cpu_provider_factory.h" // For OrtSessionOptionsAppendExecutionProvider_CPU -#include "core/providers/qnn/qnn_allocator.h" +#if BUILD_QNN_EP_STATIC_LIB +#include "core/providers/qnn/qnn_allocator.h" // Used by QnnHTPBackendTests.UseHtpSharedMemoryAllocatorForInputs +#endif #include "core/session/inference_session.h" #include "core/session/onnxruntime_cxx_api.h" #include "core/session/onnxruntime_session_options_config_keys.h" @@ -1099,6 +1101,9 @@ TEST_F(QnnHTPBackendTests, EPOffloadsGraphIOQuantDequant) { } } +// Only compile this test when QNN EP is built as a static library. When QNN EP is a shared library, +// we cannot include internal QNN EP headers that use the provider-bridge API. 
+#if BUILD_QNN_EP_STATIC_LIB TEST_F(QnnHTPBackendTests, UseHtpSharedMemoryAllocatorForInputs) { ProviderOptions provider_options; #if defined(_WIN32) @@ -1145,6 +1150,7 @@ TEST_F(QnnHTPBackendTests, UseHtpSharedMemoryAllocatorForInputs) { ExpectedEPNodeAssignment::All, 0.008f); } +#endif // BUILD_QNN_EP_STATIC_LIB #endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) #endif // !defined(ORT_MINIMAL_BUILD) diff --git a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc index a3f0ed55b83f2..38fde332ca992 100644 --- a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc +++ b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc @@ -7,7 +7,6 @@ #include "core/session/onnxruntime_cxx_api.h" #include "core/session/onnxruntime_session_options_config_keys.h" #include "core/session/inference_session.h" -#include "core/providers/shared/utils/utils.h" #include "test/providers/qnn/qnn_test_utils.h" @@ -25,6 +24,24 @@ namespace test { #if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) +static int64_t GetNodeAttr(const Node& node, const std::string& attr_name, int64_t default_val) { + const auto& attributes = node.GetAttributes(); + if (auto entry = attributes.find(attr_name); entry != attributes.end()) { + return entry->second.i(); + } + + return default_val; +} + +static const std::string& GetNodeAttr(const Node& node, const std::string& attr_name, const std::string& default_val) { + const auto& attributes = node.GetAttributes(); + if (auto entry = attributes.find(attr_name); entry != attributes.end()) { + return entry->second.s(); + } + + return default_val; +} + // Create a model with FusedMatMul + Add (quantized) // input1 -> Add -> Q -> DQ \ // FusedMatMul -> Q -> DQ -> output @@ -873,10 +890,9 @@ static void GetLastContextBinaryFileName(const std::string last_onnx_ctx_file, auto& ctx_graph = ctx_model->MainGraph(); for (auto& node : ctx_graph.Nodes()) { if (node.OpType() == "EPContext") { - NodeAttrHelper node_helper(node); - int64_t is_main_context = node_helper.Get("main_context", static_cast<int64_t>(0)); + int64_t is_main_context = GetNodeAttr(node, "main_context", static_cast<int64_t>(0)); if (1 == is_main_context) { - last_ctx_bin_file = node_helper.Get("ep_cache_context", ""); + last_ctx_bin_file = GetNodeAttr(node, "ep_cache_context", ""); return; } } @@ -899,10 +915,9 @@ for (auto& node : ctx_graph.Nodes()) { if (node.OpType() == "EPContext") { - NodeAttrHelper node_helper(node); - int64_t is_main_context = node_helper.Get("main_context", static_cast<int64_t>(0)); + int64_t is_main_context = GetNodeAttr(node, "main_context", static_cast<int64_t>(0)); if (1 == is_main_context) { - std::string old_qnn_ctx_binary_file_name = node_helper.Get("ep_cache_context", ""); + std::string old_qnn_ctx_binary_file_name = GetNodeAttr(node, "ep_cache_context", ""); auto file_path = path.replace_filename(old_qnn_ctx_binary_file_name); std::remove(file_path.string().c_str()); node.ClearAttribute("ep_cache_context"); diff --git a/onnxruntime/test/providers/qnn/reshape_expand_op_test.cc b/onnxruntime/test/providers/qnn/reshape_expand_op_test.cc index 3964edc11461b..b66547a939983 100644 --- a/onnxruntime/test/providers/qnn/reshape_expand_op_test.cc +++ b/onnxruntime/test/providers/qnn/reshape_expand_op_test.cc @@ -8,7 +8,7 @@ #include "test/providers/qnn/qnn_test_utils.h" #include "core/graph/node_attr_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include
"gtest/gtest.h" namespace onnxruntime { diff --git a/onnxruntime/test/providers/qnn/resize_test.cc b/onnxruntime/test/providers/qnn/resize_test.cc index 15612e3267a75..651f55bc05d3f 100644 --- a/onnxruntime/test/providers/qnn/resize_test.cc +++ b/onnxruntime/test/providers/qnn/resize_test.cc @@ -9,7 +9,7 @@ #include "test/optimizer/qdq_test_utils.h" #include "test/providers/qnn/qnn_test_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" diff --git a/onnxruntime/test/providers/qnn/split_op_test.cc b/onnxruntime/test/providers/qnn/split_op_test.cc index 6dc721edb421e..23682f7e934c3 100644 --- a/onnxruntime/test/providers/qnn/split_op_test.cc +++ b/onnxruntime/test/providers/qnn/split_op_test.cc @@ -7,7 +7,7 @@ #include "test/providers/qnn/qnn_test_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" namespace onnxruntime { diff --git a/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc b/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc index 33d2f64c0315e..abc1b3a89d85c 100644 --- a/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc +++ b/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc @@ -7,7 +7,7 @@ #include "test/providers/qnn/qnn_test_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" namespace onnxruntime { diff --git a/onnxruntime/test/providers/qnn/tile_op_test.cc b/onnxruntime/test/providers/qnn/tile_op_test.cc index 2b35c730ee5fe..85541efe5646c 100644 --- a/onnxruntime/test/providers/qnn/tile_op_test.cc +++ b/onnxruntime/test/providers/qnn/tile_op_test.cc @@ -8,7 +8,7 @@ #include "test/providers/qnn/qnn_test_utils.h" #include "core/graph/node_attr_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" namespace onnxruntime { diff --git a/onnxruntime/test/providers/qnn/topk_op_test.cc b/onnxruntime/test/providers/qnn/topk_op_test.cc index 5a9351b9366ec..354a5d1e3b49a 100644 --- a/onnxruntime/test/providers/qnn/topk_op_test.cc +++ b/onnxruntime/test/providers/qnn/topk_op_test.cc @@ -8,7 +8,7 @@ #include "test/providers/qnn/qnn_test_utils.h" #include "core/graph/node_attr_utils.h" -#include "onnx/onnx_pb.h" +#include "core/graph/onnx_protobuf.h" #include "gtest/gtest.h" namespace onnxruntime { diff --git a/onnxruntime/test/qnn_ctx_gen/main.cc b/onnxruntime/test/qnn_ctx_gen/main.cc index 3be0bd253c8a4..bb5007b40b072 100644 --- a/onnxruntime/test/qnn_ctx_gen/main.cc +++ b/onnxruntime/test/qnn_ctx_gen/main.cc @@ -16,7 +16,6 @@ #include "core/common/logging/sinks/clog_sink.h" #include "core/graph/model.h" -#include "core/providers/shared/utils/utils.h" #include "core/session/environment.h" #include "core/common/logging/logging.h" @@ -31,6 +30,24 @@ static void CheckStatus(const Status& status) { } } +static int64_t GetNodeAttr(const Node& node, const std::string& attr_name, int64_t default_val) { + const auto& attributes = node.GetAttributes(); + if (auto entry = attributes.find(attr_name); entry != attributes.end()) { + return entry->second.i(); + } + + return default_val; +} + +static const std::string& GetNodeAttr(const Node& node, const std::string& attr_name, const std::string& default_val) { + const auto& attributes = node.GetAttributes(); + if (auto entry = attributes.find(attr_name); entry != attributes.end()) { + return entry->second.s(); + } + + return default_val; +} + // from the last context cache Onnx model, find the EPContext node with 
main_context=1, // and get the QNN context binary file name, this context binary contains all graphs from all Onnx models // get the max spill fill buffer size @@ -44,11 +61,10 @@ static void GetLastContextBinaryFileName(const std::basic_string<ORTCHAR_T> last auto& ctx_graph = ctx_model->MainGraph(); for (auto& node : ctx_graph.Nodes()) { if (node.OpType() == "EPContext") { - NodeAttrHelper node_helper(node); - int64_t is_main_context = node_helper.Get("main_context", static_cast<int64_t>(0)); - max_size = node_helper.Get("max_size", static_cast<int64_t>(0)); + int64_t is_main_context = GetNodeAttr(node, "main_context", static_cast<int64_t>(0)); + max_size = GetNodeAttr(node, "max_size", static_cast<int64_t>(0)); if (1 == is_main_context) { - last_ctx_bin_file = node_helper.Get("ep_cache_context", ""); + last_ctx_bin_file = GetNodeAttr(node, "ep_cache_context", ""); return; } } @@ -72,10 +88,9 @@ static void UpdateEpContextModel(const std::vector<std::basic_string<ORTCHAR_T>> for (auto& node : ctx_graph.Nodes()) { if (node.OpType() == "EPContext") { - NodeAttrHelper node_helper(node); - int64_t is_main_context = node_helper.Get("main_context", static_cast<int64_t>(0)); + int64_t is_main_context = GetNodeAttr(node, "main_context", static_cast<int64_t>(0)); if (1 == is_main_context) { - std::string old_qnn_ctx_binary_file_name = node_helper.Get("ep_cache_context", ""); + std::string old_qnn_ctx_binary_file_name = GetNodeAttr(node, "ep_cache_context", ""); auto file_path = path.replace_filename(old_qnn_ctx_binary_file_name); std::remove(file_path.string().c_str()); node.ClearAttribute("ep_cache_context"); diff --git a/requirements-lintrunner.txt b/requirements-lintrunner.txt index 406d0b7f19818..2ca562e5f5c2c 100644 --- a/requirements-lintrunner.txt +++ b/requirements-lintrunner.txt @@ -3,6 +3,6 @@ lintrunner==0.12.5 lintrunner-adapters==0.12.4 # RUFF -ruff==0.9.1 +ruff==0.9.3 # CLANGFORMAT clang-format==19.1.7 diff --git a/setup.py b/setup.py index a2d50284b03ff..6481f58f69070 100644 --- a/setup.py +++ b/setup.py @@ -315,17 +315,20 @@ def finalize_options(self): providers_tensorrt_or_migraphx = "onnxruntime_providers_" + ("migraphx" if is_migraphx else "tensorrt") providers_openvino = "onnxruntime_providers_openvino" providers_cann = "onnxruntime_providers_cann" +providers_qnn = "onnxruntime_providers_qnn" if platform.system() == "Linux": providers_cuda_or_rocm = "lib" + providers_cuda_or_rocm + ".so" providers_tensorrt_or_migraphx = "lib" + providers_tensorrt_or_migraphx + ".so" providers_openvino = "lib" + providers_openvino + ".so" providers_cann = "lib" + providers_cann + ".so" + providers_qnn = "lib" + providers_qnn + ".so" elif platform.system() == "Windows": providers_cuda_or_rocm = providers_cuda_or_rocm + ".dll" providers_tensorrt_or_migraphx = providers_tensorrt_or_migraphx + ".dll" providers_openvino = providers_openvino + ".dll" providers_cann = providers_cann + ".dll" + providers_qnn = providers_qnn + ".dll" # Additional binaries dl_libs = [] @@ -345,8 +348,9 @@ def finalize_options(self): dl_libs.append(providers_cuda_or_rocm) dl_libs.append(providers_tensorrt_or_migraphx) dl_libs.append(providers_cann) + dl_libs.append(providers_qnn) dl_libs.append("libonnxruntime.so*") - # DNNL, TensorRT & OpenVINO EPs are built as shared libs + # DNNL, TensorRT, OpenVINO, and QNN EPs are built as shared libs libs.extend(["libonnxruntime_providers_shared.so"]) libs.extend(["libonnxruntime_providers_dnnl.so"]) libs.extend(["libonnxruntime_providers_openvino.so"]) @@ -354,6 +358,7 @@ def finalize_options(self): libs.append(providers_cuda_or_rocm)
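The GetNodeAttr helpers added in qnn_ep_context_test.cc and qnn_ctx_gen/main.cc replace NodeAttrHelper, an EP-internal utility these tools can no longer reach once QNN EP may be built as a shared library; the replacement is a plain lookup-with-default over the node's attribute map. The same pattern sketched in Python (the attribute names come from the EPContext nodes above; the dict and its values stand in for node.GetAttributes() and are made up for illustration):

    def get_node_attr(attributes: dict, name: str, default):
        # Return the stored attribute if present, else the caller's default,
        # mirroring the attributes.find(...) pattern in the C++ helpers.
        return attributes.get(name, default)

    ep_context_attrs = {"main_context": 1, "ep_cache_context": "ctx.bin"}
    if get_node_attr(ep_context_attrs, "main_context", 0) == 1:
        print(get_node_attr(ep_context_attrs, "ep_cache_context", ""))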
libs.append(providers_tensorrt_or_migraphx) libs.append(providers_cann) + libs.append(providers_qnn) # QNN qnn_deps = [ "libQnnCpu.so", @@ -392,13 +397,14 @@ def finalize_options(self): providers_cann, "onnxruntime.dll", ] - # DNNL, TensorRT & OpenVINO EPs are built as shared libs + # DNNL, TensorRT, OpenVINO, and QNN EPs are built as shared libs libs.extend(["onnxruntime_providers_shared.dll"]) libs.extend(["onnxruntime_providers_dnnl.dll"]) libs.extend(["onnxruntime_providers_tensorrt.dll"]) libs.extend(["onnxruntime_providers_openvino.dll"]) libs.extend(["onnxruntime_providers_cuda.dll"]) libs.extend(["onnxruntime_providers_vitisai.dll"]) + libs.extend(["onnxruntime_providers_qnn.dll"]) # DirectML Libs libs.extend(["DirectML.dll"]) # QNN V68/V73 dependencies diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index e7d93aeabe113..bce7552854a4c 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -129,6 +129,17 @@ def invalid_hetero_build(): return device_read +def _qnn_verify_library_kind(library_kind): + choices = ["shared_lib", "static_lib"] + if library_kind not in choices: + print("\nYou have specified an invalid library kind for QNN EP.") + print(f"The invalid library kind was: {library_kind}") + print("Provide a library kind from the following options: ", choices) + print(f"Example: --use_qnn {choices[0]}") + sys.exit("Incorrect build configuration") + return library_kind + + def parse_arguments(): class Parser(argparse.ArgumentParser): # override argument file line parsing behavior - allow multiple arguments per line and handle quotes @@ -578,7 +589,14 @@ def convert_arg_line_to_args(self, arg_line): parser.add_argument("--use_jsep", action="store_true", help="Build with JavaScript kernels.") parser.add_argument("--use_webgpu", action="store_true", help="Build with WebGPU support.") parser.add_argument("--use_external_dawn", action="store_true", help="Treat Dawn as an external dependency.") - parser.add_argument("--use_qnn", action="store_true", help="Build with QNN support.") + parser.add_argument( + "--use_qnn", + nargs="?", + const="shared_lib", # If provide --use_qnn without an arg, defaults to a shared library. + type=_qnn_verify_library_kind, + help="Build with QNN support. Specify 'shared_lib' or 'static_lib' to build QNN EP " + "as a shared or static library, respectively.", + ) parser.add_argument("--qnn_home", help="Path to QNN SDK dir.") parser.add_argument("--use_rknpu", action="store_true", help="Build with RKNPU.") parser.add_argument("--use_preinstalled_eigen", action="store_true", help="Use pre-installed Eigen.") @@ -1350,6 +1368,11 @@ def generate_build_tree( raise BuildError("qnn_home=" + qnn_home + " not valid." 
+ " qnn_home paths must be specified and valid.") cmake_args += ["-Donnxruntime_USE_QNN=ON"] + if args.use_qnn == "static_lib": + cmake_args += ["-Donnxruntime_BUILD_QNN_EP_STATIC_LIB=ON"] + if args.android and args.use_qnn != "static_lib": + raise BuildError("Only support Android + QNN builds with QNN EP built as a static library.") + if args.use_coreml: cmake_args += ["-Donnxruntime_USE_COREML=ON"] @@ -2401,6 +2424,8 @@ def build_nuget_package( elif use_rocm: package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.ROCm" elif use_qnn: + if use_qnn != "shared_lib": + raise BuildError("Currently NuGet packages with QNN require QNN EP to be built as a shared library.") execution_provider = "/p:ExecutionProvider=qnn" package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.QNN" elif any("OrtPackageId=" in x for x in msbuild_extra_options): diff --git a/tools/ci_build/github/android/build_aar_package.py b/tools/ci_build/github/android/build_aar_package.py index e9f8fea951661..c2bc5cba82a23 100644 --- a/tools/ci_build/github/android/build_aar_package.py +++ b/tools/ci_build/github/android/build_aar_package.py @@ -72,11 +72,15 @@ def _parse_build_settings(args): return build_settings +def _is_qnn_android_build(build_settings): + return any(build_arg.startswith("--use_qnn") for build_arg in build_settings["build_params"]) + + def _build_aar(args): build_settings = _parse_build_settings(args) build_dir = os.path.abspath(args.build_dir) ops_config_path = os.path.abspath(args.include_ops_by_config) if args.include_ops_by_config else None - qnn_android_build = "--use_qnn" in build_settings["build_params"] + qnn_android_build = _is_qnn_android_build(build_settings) # Setup temp environment for building temp_env = os.environ.copy() diff --git a/tools/ci_build/github/android/default_qnn_aar_build_settings.json b/tools/ci_build/github/android/default_qnn_aar_build_settings.json index 8c362440358c4..5ac49f582d23e 100644 --- a/tools/ci_build/github/android/default_qnn_aar_build_settings.json +++ b/tools/ci_build/github/android/default_qnn_aar_build_settings.json @@ -11,7 +11,7 @@ "--cmake_generator=Ninja", "--build_java", "--build_shared_lib", - "--use_qnn", + "--use_qnn=static_lib", "--cmake_extra_defines=onnxruntime_BUILD_UNIT_TESTS=OFF", "--skip_tests" diff --git a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml index b89aa50171b4c..f237ef37fe82c 100644 --- a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml @@ -72,7 +72,8 @@ jobs: --android_abi=x86_64 \ --android_api=31 \ --parallel \ - --use_qnn \ + --build_shared_lib \ + --use_qnn static_lib \ --qnn_home $(QnnSDKRootDir) \ --cmake_generator=Ninja \ --skip_tests diff --git a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml index 59deb0d4975fe..0eaaea562ca36 100644 --- a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml @@ -41,7 +41,7 @@ parameters: variables: - name: docker_base_image - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250109.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250124.1 - name: 
linux_trt_version value: 10.3.0.26-1.cuda11.8 - name: Repository diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml index 518aec8c2f92a..71f7ab6e49b70 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml @@ -49,9 +49,9 @@ parameters: variables: - name: docker_base_image ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250109.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250124.1 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250109.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250124.1 - name: Repository ${{ if eq(parameters.CudaVersion, '11.8') }}: diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml index 9025f084d5982..c08eaaaa1308d 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml @@ -40,9 +40,9 @@ variables: - template: templates/common-variables.yml - name: docker_base_image ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250109.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250124.1 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250109.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250124.1 - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml index 8d42e7201411b..4a86da167ff1f 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml @@ -40,9 +40,9 @@ variables: - template: templates/common-variables.yml - name: docker_base_image ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250109.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250124.1 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250109.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250124.1 - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} diff --git a/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml index fb235bda24fbf..093db011e44f9 100644 --- 
a/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml @@ -41,7 +41,12 @@ jobs: timeoutInMinutes: 60 workspace: clean: all - + strategy: + matrix: + SHARED_LIB: + QnnLibKind: 'shared_lib' + STATIC_LIB: + QnnLibKind: 'static_lib' steps: - script: | ls -R /data/qnn_test_data @@ -65,7 +70,8 @@ jobs: --config Release \ --use_binskim_compliant_compile_flags \ --build_java \ - --use_qnn \ + --build_shared_lib \ + --use_qnn $(QnnLibKind) \ --qnn_home $(QnnSDKRootDir) \ --cmake_generator=Ninja \ --update --build --parallel @@ -77,7 +83,8 @@ jobs: --config Release \ --use_binskim_compliant_compile_flags \ --build_java \ - --use_qnn \ + --build_shared_lib \ + --use_qnn $(QnnLibKind) \ --qnn_home $(QnnSDKRootDir) \ --cmake_generator=Ninja \ --test diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml index 4b94ffc7e302e..960b59f93bee0 100644 --- a/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml @@ -18,7 +18,7 @@ stages: machine_pool: 'Onnxruntime-Linux-GPU' python_wheel_suffix: '_gpu' timeout: 480 - docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250109.1 + docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250124.1 cuda_version: '11.8' - stage: Republish_Wheels diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml index 48d1e6b1ac7a7..021f7c5ece140 100644 --- a/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml @@ -18,7 +18,7 @@ stages: machine_pool: 'Onnxruntime-Linux-GPU' python_wheel_suffix: '_gpu' timeout: 480 - docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250109.1 + docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250124.1 cuda_version: '12.2' - stage: Republish_Wheels diff --git a/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml index 0517fec3bad04..b081b39ad9bcc 100644 --- a/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml @@ -142,9 +142,9 @@ stages: value: false - name: docker_base_image ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250109.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250124.1 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250109.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250124.1 timeoutInMinutes: 60 steps: diff --git a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml 
index 4adf41d3db4e5..85366ffc28b3a 100644 --- a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml +++ b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml @@ -45,9 +45,9 @@ jobs: - template: ../../templates/common-variables.yml - name: docker_base_image ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250109.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250124.1 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250109.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250124.1 - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} diff --git a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml index a3c804055d8fb..f48573abd3dba 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml @@ -68,9 +68,9 @@ stages: cmake_build_type: ${{ parameters.cmake_build_type }} cuda_version: ${{ parameters.cuda_version }} ${{ if eq(parameters.cuda_version, '11.8') }}: - docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250109.1 + docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250124.1 ${{ if eq(parameters.cuda_version, '12.2') }}: - docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250109.1 + docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250124.1 - ${{ if eq(parameters.enable_windows_dml, true) }}: - ${{ each python_version in parameters.PythonVersions }}: diff --git a/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml b/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml index d44952690f651..ede9ec1a086ca 100644 --- a/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml +++ b/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml @@ -4,11 +4,6 @@ parameters: type: string default: '' -- name: job_name_suffix - displayName: job name - type: string - default: '' - - name: packageName displayName: Package Name type: string @@ -25,17 +20,13 @@ parameters: default: '2.30.0.250109' jobs: -- job: Final_AAR_Testing_Android_${{ parameters.job_name_suffix }} +- job: Final_AAR_Testing_Android + pool: 'onnxruntime-Ubuntu2204-AMD-CPU' workspace: clean: all - pool: - vmImage: 'macOS-13' variables: - - name: runCodesignValidationInjection - value: false + runCodesignValidationInjection: false timeoutInMinutes: 90 - dependsOn: - - Android_Java_API_AAR_Packaging_${{ parameters.job_name_suffix }} steps: - template: set-version-number-variables-step.yml diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml index 98206bcb690c0..1ab4fd2a8e9e7 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml +++ 
b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml @@ -82,10 +82,12 @@ stages: packageName: 'onnxruntime-android' ReleaseVersionSuffix: $(ReleaseVersionSuffix) +- stage: Android_Java_API_AAR_Testing_Full + dependsOn: Android_Java_API_AAR_Packaging_Full + jobs: - template: android-java-api-aar-test.yml parameters: artifactName: 'onnxruntime-android-full-aar' - job_name_suffix: 'Full' ReleaseVersionSuffix: $(ReleaseVersionSuffix) - stage: Android_Java_API_AAR_Packaging_QNN @@ -105,10 +107,12 @@ stages: ReleaseVersionSuffix: $(ReleaseVersionSuffix) QnnSDKVersion: ${{ parameters.QnnSDKVersion }} +- stage: Final_AAR_Testing_Android_QNN + dependsOn: Android_Java_API_AAR_Packaging_QNN + jobs: - template: android-java-api-aar-test.yml parameters: artifactName: 'onnxruntime-android-qnn-aar' - job_name_suffix: 'QNN' packageName: 'onnxruntime-android-qnn' QnnSDKVersion: ${{ parameters.QnnSDKVersion }} diff --git a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml index 1a53ce6a423b6..fe3bc60c83dea 100644 --- a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml @@ -11,7 +11,7 @@ steps: packageType: upack feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' definition: '517c4f6f-5437-4392-a70d-4f15ec5be2f0' - version: 1.0.208 + version: 1.0.213 downloadPath: $(Build.BinariesDirectory)/deps # The private ADO project @@ -22,7 +22,7 @@ steps: packageType: upack feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325' definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a' - version: 1.0.208 + version: 1.0.213 downloadPath: $(Build.BinariesDirectory)/deps # You can add more ADO accounts at here. diff --git a/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml index 8bbe8f82530ea..523f3ab58b982 100644 --- a/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml @@ -102,10 +102,12 @@ stages: packageName: onnxruntime-training-android enable_code_sign: true +- stage: Final_AAR_Testing_Android_Training_Full + dependsOn: Android_Java_API_AAR_Packaging_Training_Full + jobs: - template: android-java-api-aar-test.yml parameters: artifactName: 'onnxruntime-training-android-full-aar' - job_name_suffix: 'Training_Full' packageName: onnxruntime-training-android - stage: NuGet_Packaging_Training_CPU @@ -115,7 +117,7 @@ stages: - Windows_Packaging_Training_CPU_x86_${{ parameters.BuildVariant }} - Windows_Packaging_Training_CPU_x64_${{ parameters.BuildVariant }} - Windows_Packaging_Training_CPU_arm64_${{ parameters.BuildVariant }} - - Android_Java_API_AAR_Packaging_Training_Full + - Final_AAR_Testing_Android_Training_Full condition: succeeded() jobs: - job: NuGet_Packaging_Training_CPU diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml index d1b85e64fa48a..ff2ecb0d3c28f 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml @@ -94,6 +94,7 @@ jobs: --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_generator "$(VSGenerator)" + --build_shared_lib --use_qnn --qnn_home 
$(QnnSDKRootDir) --enable_pybind diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml index 8595a52cdef2b..f382156c03944 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml @@ -92,6 +92,7 @@ jobs: --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_generator "$(VSGenerator)" + --build_shared_lib --use_qnn --qnn_home $(QnnSDKRootDir) --enable_pybind diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml index 979961d06654f..a5f2a481e6ba8 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml @@ -92,6 +92,7 @@ jobs: --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_generator "$(VSGenerator)" + --build_shared_lib --use_qnn --qnn_home $(QnnSDKRootDir) --enable_pybind diff --git a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml index 205bd0b5c3c71..5a74998ca4bc8 100644 --- a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml +++ b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml @@ -93,12 +93,18 @@ stages: workingFolder: '$(Build.BinariesDirectory)\${{ parameters.build_config }}' createLogFile: true + - task: CmdLine@2 + displayName: 'Print contents of binaries directory' + inputs: + script: | + dir $(Build.BinariesDirectory)\${{ parameters.build_config }}\${{ parameters.build_config }} + - template: win-esrp-dll.yml parameters: FolderPath: '$(Build.BinariesDirectory)\${{ parameters.build_config }}\${{ parameters.build_config }}' DisplayName: 'ESRP - Sign dlls' DoEsrp: ${{ parameters.DoEsrp }} - Pattern: 'onnxruntime.dll' + Pattern: 'onnxruntime*.dll' - task: MSBuild@1 displayName: 'Restore NuGet Packages and create project.assets.json' diff --git a/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml index 84b71b37d992a..787c3ffe23bd9 100644 --- a/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml @@ -36,7 +36,7 @@ parameters: default: 2.30.0.250109 jobs: -- job: 'build' +- job: 'BUILD_QNN_EP' pool: 'onnxruntime-qnn-windows-vs-2022-arm64' variables: DOTNET_SKIP_FIRST_TIME_EXPERIENCE: true @@ -46,6 +46,12 @@ jobs: timeoutInMinutes: 240 workspace: clean: all + strategy: + matrix: + SHARED_LIB: + QnnLibKind: 'shared_lib' + STATIC_LIB: + QnnLibKind: 'static_lib' steps: - script: | @@ -79,7 +85,8 @@ jobs: --config $(BuildConfig) --build_dir $(Build.BinariesDirectory) --cmake_generator "Visual Studio 17 2022" - --use_qnn + --build_shared_lib + --use_qnn $(QnnLibKind) --qnn_home $(QnnSDKRootDir) --update --build --parallel @@ -88,7 +95,8 @@ jobs: --config $(BuildConfig) ^ --build_dir $(Build.BinariesDirectory) ^ --cmake_generator "Visual Studio 17 2022" ^ - --use_qnn ^ + --build_shared_lib ^ + --use_qnn $(QnnLibKind) ^ --qnn_home $(QnnSDKRootDir) ^ --test --enable_onnx_tests displayName: 'Run unit tests' @@ -121,7 +129,7 @@ jobs: TargetFolder: '$(Build.ArtifactStagingDirectory)' CleanTargetFolder: true OverWrite: true - condition: and(succeeded(), ne(variables['Build.Reason'], 
'PullRequest')) + condition: and(succeeded(), and(ne(variables['Build.Reason'], 'PullRequest'), eq(variables['QnnLibKind'], 'shared_lib'))) - task: PublishBuildArtifacts@1 displayName: 'Publish Artifact' @@ -129,4 +137,4 @@ jobs: PathtoPublish: '$(Build.ArtifactStagingDirectory)' ArtifactName: 'internal_release' publishLocation: 'Container' - condition: and(succeeded(), ne(variables['Build.Reason'], 'PullRequest')) + condition: and(succeeded(), and(ne(variables['Build.Reason'], 'PullRequest'), eq(variables['QnnLibKind'], 'shared_lib'))) diff --git a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml index e6792bc34aad0..28fbe4a1096b2 100644 --- a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml @@ -36,7 +36,7 @@ parameters: default: 2.30.0.250109 jobs: -- job: 'build' +- job: 'BUILD_QNN_EP' pool: 'Onnxruntime-QNNEP-Windows-2022-CPU' variables: MsbuildArguments: '-detailedsummary -maxcpucount -consoleloggerparameters:PerformanceSummary' @@ -50,6 +50,12 @@ jobs: timeoutInMinutes: 120 workspace: clean: all + strategy: + matrix: + SHARED_LIB: + QnnLibKind: 'shared_lib' + STATIC_LIB: + QnnLibKind: 'static_lib' steps: - task: UsePythonVersion@0 @@ -72,7 +78,8 @@ jobs: --build_dir $(Build.BinariesDirectory) --cmake_generator "Visual Studio 17 2022" --build_java - --use_qnn + --build_shared_lib + --use_qnn $(QnnLibKind) --qnn_home $(QnnSDKRootDir) --use_binskim_compliant_compile_flags --update --parallel @@ -87,7 +94,8 @@ jobs: --build_dir $(Build.BinariesDirectory) ^ --cmake_generator "Visual Studio 17 2022" ^ --build_java ^ - --use_qnn ^ + --build_shared_lib ^ + --use_qnn $(QnnLibKind) ^ --qnn_home $(QnnSDKRootDir) ^ --use_binskim_compliant_compile_flags ^ --test --enable_onnx_tests diff --git a/tools/ci_build/github/linux/build_linux_python_package.sh b/tools/ci_build/github/linux/build_linux_python_package.sh index 6d86a57bd7986..b5999da997589 100755 --- a/tools/ci_build/github/linux/build_linux_python_package.sh +++ b/tools/ci_build/github/linux/build_linux_python_package.sh @@ -75,7 +75,7 @@ fi if [ "$BUILD_DEVICE" == "NPU" ]; then #Enable QNN EP - BUILD_ARGS+=("--use_qnn" "--qnn_home=/qnn_sdk") + BUILD_ARGS+=("--build_shared_lib" "--use_qnn" "--qnn_home=/qnn_sdk") fi export ONNX_ML=1 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu index 72912acce885e..02938f015ec57 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu @@ -1,4 +1,4 @@ -FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc14:20250109.1 +FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc14:20250124.1 ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-17 diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile index 9569aa2fcda63..f9d84e3b0e130 100644 --- a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile @@ -2,7 +2,7 @@ # Licensed under the MIT License. 
 # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline
-FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_ubi8_gcc14_dotnet:20250109.1
+FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_ubi8_gcc14_dotnet:20250124.1

 ENV LANG=en_US.UTF-8
 ENV LC_ALL=en_US.UTF-8
diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile
index 589bd869ba89f..20b9a6c224120 100644
--- a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile
@@ -1,4 +1,4 @@
-FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_ubi8_gcc14:20250109.1
+FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_ubi8_gcc14:20250124.1

 ADD scripts /tmp/scripts
 RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile
index 1c1f716d81e95..d94e7562f19d4 100644
--- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile
@@ -2,7 +2,7 @@
 # Licensed under the MIT License.
 # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline
-FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc14_dotnet:20250109.1
+FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc14_dotnet:20250124.1

 ENV LANG=en_US.UTF-8
 ENV LC_ALL=en_US.UTF-8
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/Dockerfile
index 6caf21c475545..24287fd34d3ea 100644
--- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/Dockerfile
@@ -2,7 +2,7 @@
 # Licensed under the MIT License.
 # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline
-FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11_dotnet:20250109.1
+FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11_dotnet:20250124.1

 ARG TRT_VERSION
 #Install TensorRT only if TRT_VERSION is not empty
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile
index a5dda5904de49..764a79135d7a3 100644
--- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile
@@ -2,7 +2,7 @@
 # Licensed under the MIT License.
 # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline
-FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12_dotnet:20250109.1
+FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12_dotnet:20250124.1

 ARG TRT_VERSION
 #Install TensorRT only if TRT_VERSION is not empty
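The Dockerfile edits above bump every pinned build-image tag from 20250109.1 to 20250124.1 in lockstep. A hypothetical consistency check (not a script that exists in the repo) that scans for stragglers:

    import re
    from pathlib import Path

    # Matches the tag of a pinned onnxruntimebuildcache image on a FROM line.
    TAG_RE = re.compile(r"^FROM onnxruntimebuildcache\.azurecr\.io/\S+:(\S+)$", re.MULTILINE)

    def collect_tags(root: Path) -> dict[Path, str]:
        tags = {}
        for dockerfile in root.rglob("Dockerfile*"):
            match = TAG_RE.search(dockerfile.read_text())
            if match:
                tags[dockerfile] = match.group(1)
        return tags

    if __name__ == "__main__":
        tags = collect_tags(Path("tools/ci_build/github/linux/docker"))
        if len(set(tags.values())) > 1:
            # Tags disagree; print them all so the odd one out is obvious.
            for path, tag in sorted(tags.items()):
                print(f"{tag}\t{path}")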
 # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline
-FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12_dotnet:20250109.1
+FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12_dotnet:20250124.1

 ARG TRT_VERSION
 #Install TensorRT only if TRT_VERSION is not empty
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile
index 04c6398e061b7..7590d5dd18347 100644
--- a/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile
@@ -1,4 +1,4 @@
-FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc14:20250109.1
+FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc14:20250124.1

 ADD scripts /tmp/scripts
 RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && rm -rf /tmp/scripts
diff --git a/tools/nuget/generate_nuspec_for_native_nuget.py b/tools/nuget/generate_nuspec_for_native_nuget.py
index 0568ae864dbfc..8ccb2c054900e 100644
--- a/tools/nuget/generate_nuspec_for_native_nuget.py
+++ b/tools/nuget/generate_nuspec_for_native_nuget.py
@@ -382,6 +382,7 @@ def generate_files(line_list, args):
         "tensorrt_ep_shared_lib": "onnxruntime_providers_tensorrt.dll",
         "openvino_ep_shared_lib": "onnxruntime_providers_openvino.dll",
         "cuda_ep_shared_lib": "onnxruntime_providers_cuda.dll",
+        "qnn_ep_shared_lib": "onnxruntime_providers_qnn.dll",
         "onnxruntime_perf_test": "onnxruntime_perf_test.exe",
         "onnx_test_runner": "onnx_test_runner.exe",
     }
@@ -777,6 +778,24 @@ def generate_files(line_list, args):
             + '\\native" />'
         )

+    if args.execution_provider == "qnn" or (is_qnn_package and not is_ado_packaging_build):
+        files_list.append(
+            "<file src="
+            + '"'
+            + os.path.join(args.native_build_path, nuget_dependencies["providers_shared_lib"])
+            + '" target="runtimes\\win-'
+            + args.target_architecture
+            + '\\native" />'
+        )
+        files_list.append(
+            "<file src="
+            + '"'
+            + os.path.join(args.native_build_path, nuget_dependencies["qnn_ep_shared_lib"])
+            + '" target="runtimes\\win-'
+            + args.target_architecture
+            + '\\native" />'
+        )
+
     # process all other library dependencies
     if is_cpu_package or is_cuda_gpu_package or is_dml_package or is_mklml_package:
         # Process dnnl dependency
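Each files_list entry above is a literal <file .../> element written into the generated .nuspec, mapping a build output into the package's runtimes folder. A self-contained sketch of the pattern, with the path and architecture hard-coded where the real script reads them from args:

    import os

    # Invented stand-ins for values generate_nuspec_for_native_nuget.py takes from args.
    native_build_path = r"C:\b\RelWithDebInfo\RelWithDebInfo"
    target_architecture = "arm64"

    def nuspec_file_entry(filename: str) -> str:
        # Same string-concatenation style as the script: a raw XML element
        # is assembled by hand, no XML library involved.
        return (
            "<file src="
            + '"'
            + os.path.join(native_build_path, filename)
            + '" target="runtimes\\win-'
            + target_architecture
            + '\\native" />'
        )

    print(nuspec_file_entry("onnxruntime_providers_qnn.dll"))
    # On Windows: <file src="C:\b\RelWithDebInfo\RelWithDebInfo\onnxruntime_providers_qnn.dll" target="runtimes\win-arm64\native" />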