diff --git a/.github/ISSUE_TEMPLATE/bug-performance-issue.md b/.github/ISSUE_TEMPLATE/bug-performance-issue.md index f1d29a83573f3..0f168c4e71094 100644 --- a/.github/ISSUE_TEMPLATE/bug-performance-issue.md +++ b/.github/ISSUE_TEMPLATE/bug-performance-issue.md @@ -7,6 +7,9 @@ about: Use this template for reporting a bug or a performance issue. **Describe the bug** A clear and concise description of what the bug is. +**Urgency** +If there are particular important use cases blocked by this or strict project-related timelines, please share more information and dates. If there are no hard deadlines, please specify none. + **System information** - OS Platform and Distribution (e.g., Linux Ubuntu 16.04): - ONNX Runtime installed from (source or binary): @@ -27,4 +30,4 @@ A clear and concise description of what you expected to happen. If applicable, add screenshots to help explain your problem. **Additional context** -Add any other context about the problem here. +Add any other context about the problem here. If the issue is about a particular model, please share the model details as well to facilitate debugging. diff --git a/.gitmodules b/.gitmodules index bf0b0903613cf..47dc9124d74dd 100644 --- a/.gitmodules +++ b/.gitmodules @@ -9,7 +9,7 @@ url = https://github.com/onnx/onnx [submodule "cmake/external/tvm"] path = cmake/external/tvm - url = https://github.com/dmlc/tvm.git + url = https://github.com/microsoft/onnxruntime-tvm.git [submodule "cmake/external/date"] path = cmake/external/date url = https://github.com/HowardHinnant/date.git @@ -32,3 +32,12 @@ [submodule "cmake/external/eigen"] path = cmake/external/eigen url = https://github.com/eigenteam/eigen-git-mirror.git +[submodule "cmake/external/grpc"] + path = cmake/external/grpc + url = https://github.com/grpc/grpc +[submodule "cmake/external/DNNLibrary"] + path = cmake/external/DNNLibrary + url = https://github.com/JDAI-CV/DNNLibrary +[submodule "cmake/external/spdlog"] + path = cmake/external/spdlog + url = https://github.com/gabime/spdlog.git diff --git a/BUILD.md b/BUILD.md index b8fb80983f719..f4d1650ee03ad 100644 --- a/BUILD.md +++ b/BUILD.md @@ -1,4 +1,5 @@ # Build ONNX Runtime +Dockerfiles are available [here](https://github.com/microsoft/onnxruntime/tree/master/tools/ci_build/github/linux/docker) to help you get started. ## Supported architectures @@ -12,7 +13,7 @@ | OS | Supports CPU | Supports GPU| Notes | |-------------|:------------:|:------------:|------------------------------------| -|Windows 10 | YES | YES |Must use VS 2017 or the latest VS2015| +|Windows 10 | YES | YES | VS2019 through the latest VS2015 are supported | |Windows 10
Subsystem for Linux | YES | NO | | |Ubuntu 16.x | YES | YES | Also supported on ARM32v7 (experimental) | @@ -29,36 +30,51 @@ OS/Compiler Matrix: ONNX Runtime python binding only supports Python 3.5, 3.6 and 3.7. -## Build +## Getting Started +You may either get a prebuilt onnxruntime from nuget.org, or do it yourself using the following steps: 1. Checkout the source tree: ``` git clone --recursive https://github.com/Microsoft/onnxruntime cd onnxruntime ``` 2. Install cmake-3.13 or better from https://cmake.org/download/. -3. (optional) Install protobuf 3.6.1 from source code (cmake/external/protobuf). CMake flag protobuf\_BUILD\_SHARED\_LIBS must be turned OFF on Windows and turned ON on Linux. After the installation, you should have the 'protoc' executable in your PATH. On Linux it is recommended to run `ldconfig` to make sure protobuf libraries are found. -4. If you installed your protobuf in a non standard location it would be helpful on Linux build to set the following env var: -`export CMAKE_ARGS="-DONNX_CUSTOM_PROTOC_EXECUTABLE=full path to protoc"` so ONNX build can find it. -On Linux also run `ldconfig ` so the linker can find protobuf libraries. + +On Windows: +3. (optional) Install protobuf 3.6.1 from source code (cmake/external/protobuf). CMake flag protobuf\_BUILD\_SHARED\_LIBS must be turned OFF. After the installation, you should have the 'protoc' executable in your PATH. +4. (optional) Install onnx from source code (cmake/external/onnx) + ``` + export ONNX_ML=1 + python3 setup.py bdist_wheel + pip3 install --upgrade dist/*.whl + ``` +5. Run `build.bat --config RelWithDebInfo --build_shared_lib --parallel`. + +On Linux: +3. (optional) Install protobuf 3.6.1 from source code (cmake/external/protobuf). CMake flag protobuf\_BUILD\_SHARED\_LIBS must be turned ON. After the installation, you should have the 'protoc' executable in your PATH. It is recommended to run `ldconfig` to make sure protobuf libraries are found. +4. If you installed your protobuf in a non standard location it would be helpful to set the following env var:`export CMAKE_ARGS="-DONNX_CUSTOM_PROTOC_EXECUTABLE=full path to protoc"` so ONNX build can find it. Also run `ldconfig ` so the linker can find protobuf libraries. 5. (optional) Install onnx from source code (cmake/external/onnx) ``` export ONNX_ML=1 python3 setup.py bdist_wheel pip3 install --upgrade dist/*.whl ``` -6. Run `./build.sh --config RelWithDebInfo --build_wheel` for Linux (or `build.bat --config RelWithDebInfo --build_wheel` for Windows). Upon successful build you should be able to find the wheel under `dist` folder. +6. Run `./build.sh --config RelWithDebInfo --build_shared_lib --parallel`. The build script runs all unit tests by default (for native builds and skips tests by default for cross-compiled builds). The complete list of build options can be found by running `./build.sh (or ./build.bat) --help` ## Build x86 -1. For Windows, just add --x86 argument when launching build.bat -2. For Linux, it must be built out of a x86 os, --x86 argument also needs be specified to build.sh + - For Windows, just add --x86 argument when launching build.bat + - For Linux, it must be built out of a x86 os, --x86 argument also needs be specified to build.sh ## Build ONNX Runtime Server on Linux -1. In the ONNX Runtime root folder, run `./build.sh --config RelWithDebInfo --build_server --use_openmp --parallel` +1. ONNX Runtime server (and only the server) requires you to have Go installed to build, due to building BoringSSL. 
+ See https://golang.org/doc/install for installation instructions. +2. In the ONNX Runtime root folder, run `./build.sh --config RelWithDebInfo --build_server --use_openmp --parallel` +3. ONNX Runtime Server supports sending log to [rsyslog](https://www.rsyslog.com/) daemon. To enable it, please build with an additional parameter: `--cmake_extra_defines onnxruntime_USE_SYSLOG=1`. The build command will look like this: `./build.sh --config RelWithDebInfo --build_server --use_openmp --parallel --cmake_extra_defines onnxruntime_USE_SYSLOG=1` + ## Build/Test Flavors for CI @@ -74,6 +90,9 @@ The complete list of build options can be found by running `./build.sh (or ./bui ## Additional Build Flavors The complete list of build flavors can be seen by running `./build.sh --help` or `./build.bat --help`. Here are some common flavors. +### Windows CMake Generator +The default generator on Windows is Visual Studio 2017, but you can also use the newer Visual Studio 2019 by passing `--cmake_generator "Visual Studio 16 2019"` to build.bat. + ### Windows CUDA Build ONNX Runtime supports CUDA builds. You will need to download and install [CUDA](https://developer.nvidia.com/cuda-toolkit) and [CUDNN](https://developer.nvidia.com/cudnn). @@ -142,7 +161,7 @@ ONNX Runtime supports OpenVINO Execution Provider to enable deep learning infere The OpenVINO Execution Provider can be built using the following commands: -- Install the OpenVINO 2018 R5.0.1 release along with its dependencies from ([https://software.intel.com/en-us/openvino-toolkit](https://software.intel.com/en-us/openvino-toolkit)). +- Currently supports and validated on two versions of OpenVINO: OpenVINO 2018 R5.0.1 and OpenVINO 2019 R1.1(Recommended). Install the OpenVINO release along with its dependencies from ([https://software.intel.com/en-us/openvino-toolkit](https://software.intel.com/en-us/openvino-toolkit)). - Install the model optimizer prerequisites for ONNX by running /deployment_tools/model_optimizer/install_prerequisites/install_prerequisites_onnx.sh @@ -151,11 +170,11 @@ The OpenVINO Execution Provider can be built using the following commands: source setupvars.sh -- To configure Intel® Processor Graphics(GPU), please follow the installation steps from (https://docs.openvinotoolkit.org/2018_R5/_docs_install_guides_installing_openvino_linux.html#GPU-steps) +- To configure Intel® Processor Graphics(GPU), please follow the installation steps from (https://docs.openvinotoolkit.org/2019_R1.1/_docs_install_guides_installing_openvino_linux.html#additional-GPU-steps) -- To configure Intel® MovidiusTM USB, please follow the getting started guide from (https://docs.openvinotoolkit.org/2018_R5/_docs_install_guides_installing_openvino_linux.html#Movidius-steps) +- To configure Intel® MovidiusTM USB, please follow the getting started guide from (https://docs.openvinotoolkit.org/2019_R1.1/_docs_install_guides_installing_openvino_linux.html#additional-NCS-steps) -- To configure Intel® Vision Accelerator Design based on 8 MovidiusTM MyriadX VPUs, please follow the configuration guide from (https://docs.openvinotoolkit.org/2018_R5/_docs_install_guides_installing_openvino_linux.html#Vision-Accelerator-Design-steps) +- To configure Intel® Vision Accelerator Design based on 8 MovidiusTM MyriadX VPUs, please follow the configuration guide from (https://docs.openvinotoolkit.org/2019_R1.1/_docs_install_guides_installing_openvino_linux.html#install-VPU) - Build ONNX Runtime using the below command. 
@@ -172,7 +191,7 @@ The OpenVINO Execution Provider can be built using the following commands: | GPU_FP32 | Intel® Integrated Graphics | | GPU_FP16 | Intel® Integrated Graphics with FP16 quantization of models | | MYRIAD_FP16 | Intel® MovidiusTM USB sticks |  -| VAD-R_FP16 | Intel® Vision Accelerator Design based on 8 MovidiusTM MyriadX VPUs | +| VAD-M_FP16 | Intel® Vision Accelerator Design based on 8 MovidiusTM MyriadX VPUs | For more information on OpenVINO Execution Provider's ONNX Layer support, Topology support, and Intel hardware enabled, please refer to the document OpenVINO-ExecutionProvider.md in $onnxruntime_root/docs/execution_providers diff --git a/README.md b/README.md index 876cab44f49a8..b1a715dbfe95d 100644 --- a/README.md +++ b/README.md @@ -6,91 +6,147 @@ [![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20GPU%20CI%20Pipeline?label=Linux+GPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=12) [![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/MacOS%20CI%20Pipeline?label=MacOS+CPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=13) -**ONNX Runtime** is an open-source scoring engine for Open Neural Network Exchange (ONNX) models. +**ONNX Runtime** is a performance-focused complete scoring engine for Open Neural Network Exchange (ONNX) models, with an open extensible architecture to continually address the latest developments in AI and Deep Learning. ONNX Runtime stays up to date with the ONNX standard with complete implementation of **all** ONNX operators, and supports all ONNX releases (1.2+) with both future and backwards compatibility. Please refer to [this page](docs/Versioning.md) for ONNX opset compatibility details. -ONNX is an open format for machine learning (ML) models that is supported by various ML and DNN frameworks and tools. This format makes it easier to interoperate between frameworks and to maximize the reach of your hardware optimization investments. Learn more about ONNX on [https://onnx.ai](https://onnx.ai) or view the [Github Repo](https://github.com/onnx/onnx). +[ONNX](https://onnx.ai) is an interoperable format for machine learning models supported by various ML and DNN frameworks and tools. The universal format makes it easier to interoperate between frameworks and maximize the reach of hardware optimization investments. -# Why use ONNX Runtime -ONNX Runtime has an open architecture that is continually evolving to address the newest developments and challenges in AI and Deep Learning. ONNX Runtime stays up to date with the ONNX standard, supporting all ONNX releases with future compatibility and maintaining backwards compatibility with prior releases. +*** +**Setup** +* [Installation](#installation) +* [APIs and Official Binaries](#apis-and-official-builds) +* [Building from Source](#building-from-source) -ONNX Runtime continuously strives to provide top performance for a broad and growing number of usage scenarios in Machine Learning. Our investments focus on: -1. Run any ONNX model -2. High performance -3. 
Cross platform +**Getting Started** +* [Getting ONNX Models](#getting-onnx-models) +* [Deploying ONNX Runtime](#deploying-onnx-runtime) +* [Examples and Tutorials](#examples-and-tutorials) -## Run any ONNX model +**More Info** +* [Technical Design Details](#technical-design-details) +* [Extensibility Options](#extensibility-options) -### Alignment with ONNX Releases -ONNX Runtime provides comprehensive support of the ONNX spec and can be used to run all models based on ONNX v1.2.1 and higher. See ONNX version release details [here](https://github.com/onnx/onnx/releases). +**[Contributions and Feedback](#contribute)** -As of May 2019, ONNX Runtime supports ONNX 1.5 (opset10). See [this table](https://github.com/Microsoft/onnxruntime/blob/master/docs/Versioning.md#version-matrix) for details on ONNX Runtime and ONNX versioning compatibility, +**[License](#license)** +*** +## Key Features +### Run any ONNX model +ONNX Runtime provides comprehensive support of the ONNX spec and can be used to run all models based on ONNX v1.2.1 and higher. See version compatibility details [here](https://github.com/microsoft/onnxruntime/blob/master/docs/Versioning.md). -### Traditional ML support -ONNX Runtime fully supports the ONNX-ML profile of the ONNX spec for traditional ML scenarios. +*Note: Some operators not supported in the current ONNX version may be available as a [Contrib Operator](https://github.com/microsoft/onnxruntime/blob/master/docs/ContribOperators.md)* -## High Performance -ONNX Runtime supports both CPU and GPU hardware through a variety of execution providers. With a variety of graph optimizations and accelerators, ONNX Runtime often provides lower latency and higher efficiency compared to other runtimes. This provides faster end-to-end customer experiences and lower costs from improved machine utilization. +**Traditional ML support** -Currently ONNX Runtime supports CUDA, TensorRT, MLAS (Microsoft Linear Algebra Subprograms), MKL-DNN, MKL-ML, and nGraph for computation acceleration. See more details on available build options [here](https://github.com/Microsoft/onnxruntime/blob/master/BUILD.md). +In addition to DNN models, ONNX Runtime fully supports the [ONNX-ML profile](https://github.com/onnx/onnx/blob/master/docs/Operators-ml.md) of the ONNX spec for traditional ML scenarios. -We are continuously working to integrate new execution providers to provide improvements in latency and efficiency. If you are interested in contributing a new execution provider, please see [this page](docs/AddingExecutionProvider.md). +### High Performance +ONNX Runtime supports both CPU and GPU. Using various graph optimizations and accelerators, ONNX Runtime can provide lower latency compared to other runtimes for faster end-to-end customer experiences and minimized machine utilization costs. -## Cross Platform -ONNX Runtime offers: -* APIs for Python, C#, and C -* Available for Linux, Windows, and Mac  +Currently ONNX Runtime supports the following accelerators: +* CPU + * MLAS (Microsoft Linear Algebra Subprograms) + * MKL-DNN + * MKL-ML + * [Intel nGraph](https://github.com/microsoft/onnxruntime/blob/master/docs/execution_providers/nGraph-ExecutionProvider.md) +* GPU + * CUDA + * [TensorRT](https://github.com/microsoft/onnxruntime/blob/master/docs/execution_providers/TensorRT-ExecutionProvider.md) -See API documentation and package installation instructions [below](#Installation). 
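To make the accelerator discussion above concrete, here is a minimal sketch of scoring a model with the Python API; the model file name, input shape, and dtype are illustrative placeholders, and the session simply runs on whichever execution providers the installed package was built with.

```python
# Minimal sketch: run an ONNX model with the onnxruntime Python API.
# "model.onnx" and the (1, 3, 224, 224) float32 input are placeholders.
import numpy as np
import onnxruntime

session = onnxruntime.InferenceSession("model.onnx")    # loads the model with the providers available in this build
input_name = session.get_inputs()[0].name               # assumes the model has a single input tensor
dummy_input = np.random.rand(1, 3, 224, 224).astype(np.float32)
outputs = session.run(None, {input_name: dummy_input})  # None returns all model outputs
print(outputs[0].shape)
```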
+Not all variations are supported in the [official release builds](#apis-and-official-builds), but can be built from source following [these instructions](https://github.com/Microsoft/onnxruntime/blob/master/BUILD.md). -We have ongoing investments to make ONNX Runtime compatible with more platforms and architectures. If you have specific scenarios that are not currently supported, please share your suggestions via [Github Issues](https://github.com/microsoft/onnxruntime/issues). +We are continuously working to integrate new execution providers for further improvements in latency and efficiency. If you are interested in contributing a new execution provider, please see [this page](docs/AddingExecutionProvider.md). -# Getting Started -ONNX models: -* Check out the [ONNX Model Zoo](https://github.com/onnx/models) for ready-to-use pre-trained models. -* To get an ONNX model by exporting from various frameworks, see [ONNX Tutorials](https://github.com/onnx/tutorials). +### Cross Platform +[API documentation and package installation](https://github.com/microsoft/onnxruntime#installation) -Once you have an ONNX model, you can [install the runtime](#Installation) for your machine to try it out. There is also an [ONNX-Ecosystem Docker container](https://github.com/onnx/onnx-docker/tree/master/onnx-ecosystem) available and ready for use with the Python API. +ONNX Runtime is available for Linux, Windows, Mac with Python, C#, and C APIs, with more to come! +If you have specific scenarios that are not currently supported, please share your suggestions and scenario details via [Github Issues](https://github.com/microsoft/onnxruntime/issues). -One easy way to deploy the model on the cloud is by using [Azure Machine Learning](https://azure.microsoft.com/en-us/services/machine-learning-service). See [detailed instructions](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-build-deploy-onnx) and [sample notebooks](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/deployment/onnx). +# Installation +**Quick Start:** The [ONNX-Ecosystem Docker container image](https://github.com/onnx/onnx-docker/tree/master/onnx-ecosystem) is available on Dockerhub and includes ONNX Runtime (CPU, Python), dependencies, tools to convert from various frameworks, and Jupyter notebooks to help get started. +Additional dockerfiles for some features can be found [here](https://github.com/microsoft/onnxruntime/tree/master/dockerfiles). -# Installation -## System Requirements -* ONNX Runtime binaries in CPU packages use OpenMP and depends on the library being available at runtime in the +## APIs and Official Builds + +### API Documentation +* [Python](https://aka.ms/onnxruntime-python) +* [C](docs/C_API.md) +* [C#](docs/CSharp_API.md) +* [C++](https://github.com/microsoft/onnxruntime/blob/master/include/onnxruntime/core/session/onnxruntime_cxx_api.h) + +### Official Builds +| | CPU (MLAS+Eigen) | CPU (MKL-ML) | GPU (CUDA) +|---|:---|:---|:---| +| **Python** | **[pypi: onnxruntime](https://pypi.org/project/onnxruntime)**

Windows (x64)<br>Linux (x64)<br>Mac OS X (x64) | -- | **[pypi: onnxruntime-gpu](https://pypi.org/project/onnxruntime-gpu)**<br><br>Windows (x64)<br>Linux (x64) |
+| **C#** | **[Nuget: Microsoft.ML.OnnxRuntime](https://www.nuget.org/packages/Microsoft.ML.OnnxRuntime/)**<br><br>Windows (x64, x86)<br>Linux (x64, x86)<br>Mac OS X (x64) | **[Nuget: Microsoft.ML.OnnxRuntime.MKLML](https://www.nuget.org/packages/Microsoft.ML.OnnxRuntime.MKLML/)**<br><br>Windows (x64)<br>Linux (x64)<br>Mac OS X (x64) | **[Nuget: Microsoft.ML.OnnxRuntime.Gpu](https://www.nuget.org/packages/Microsoft.ML.OnnxRuntime.Gpu/)**<br><br>Windows (x64)<br>Linux (x64) |
+| **C** | **[Nuget: Microsoft.ML.OnnxRuntime](https://www.nuget.org/packages/Microsoft.ML.OnnxRuntime)**<br><br>**[.zip, .tgz](https://aka.ms/onnxruntime-release)**<br><br>Windows (x64, x86)<br>Linux (x64, x86)<br>Mac OS X (x64) | **[Nuget: Microsoft.ML.OnnxRuntime.MKLML](https://www.nuget.org/packages/Microsoft.ML.OnnxRuntime.MKLML/)**<br><br>Windows (x64)<br>Linux (x64)<br>Mac OS X (x64) | **[Nuget: Microsoft.ML.OnnxRuntime.Gpu](https://www.nuget.org/packages/Microsoft.ML.OnnxRuntime.Gpu/)**<br><br>**[.zip, .tgz](https://aka.ms/onnxruntime-release)**<br><br>Windows (x64)<br>Linux (x64) |
+ +#### System Requirements (pre-requisite dependencies) +* ONNX Runtime binaries in the CPU packages use OpenMP and depend on the library being available at runtime in the system. - * For Windows, OpenMP support comes as part of VC runtime. It is also available as redist packages: + * For Windows, **OpenMP** support comes as part of VC runtime. It is also available as redist packages: [vc_redist.x64.exe](https://aka.ms/vs/15/release/vc_redist.x64.exe) and [vc_redist.x86.exe](https://aka.ms/vs/15/release/vc_redist.x86.exe) - * For Linux, the system must have the libgomp.so.1 which can be installed using ```apt-get install libgomp1```. -* The official GPU builds require the CUDA 9.1 and cuDNN 7.1 runtime libraries being installed in the system. -* Python binaries are compatible with Python 3.5-3.7. -* Certain operators makes use of system locales. At the very least you will need to install English language package and configure en_US.UTF-8 locale. - * For Ubuntu install language-pack-en package + * For Linux, the system must have **libgomp.so.1** which can be installed using `apt-get install libgomp1`. +* GPU builds require the **CUDA 10.0 and cuDNN 7.3** runtime libraries to be installed on the system. Older releases used 9.1/7.1 - please refer to [release notes](https://github.com/microsoft/onnxruntime/releases) for more details. +* Python binaries are compatible with **Python 3.5-3.7**. See [Python Dev Notes](https://github.com/microsoft/onnxruntime/blob/master/docs/Python_Dev_Notes.md) +* Certain operators make use of system locales. Installation of the **English language package** and configuring the `en_US.UTF-8` locale is required. + * For Ubuntu, install the [language-pack-en package](https://packages.ubuntu.com/search?keywords=language-pack-en) * Run the following commands: - * locale-gen en_US.UTF-8 - * update-locale LANG=en_US.UTF-8 + `locale-gen en_US.UTF-8` + `update-locale LANG=en_US.UTF-8` * Follow similar procedure to configure other locales on other platforms. + +## Building from Source +If additional build flavors are needed, please find instructions on building from source at [Build ONNX Runtime](BUILD.md). For production scenarios, it's strongly recommended to build from an [official release branch](https://github.com/microsoft/onnxruntime/releases). + +Dockerfiles are available [here](https://github.com/microsoft/onnxruntime/tree/faxu-doc-updates/tools/ci_build/github/linux/docker) to help you get started. + +## Getting ONNX Models +* The [ONNX Model Zoo](https://github.com/onnx/models) has popular ready-to-use pre-trained models. +* To export or convert a model trained in various frameworks to ONNX, see [ONNX Tutorials](https://github.com/onnx/tutorials). Versioning compatibility information can be found under [Versioning](docs/Versioning.md#tool-compatibility) +* Other services that can be used to create ONNX models include: + * [AutoML from AzureML SDK](https://aka.ms/automatedmldocs) + * [Custom Vision](https://www.customvision.ai/) + * [E2E training on Azure Machine Learning Services](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-onnx) + +## Deploying ONNX Runtime +ONNX Runtime can be deployed to the cloud for model inferencing using [Azure Machine Learning Services](https://azure.microsoft.com/en-us/services/machine-learning-service).
See [detailed instructions](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-build-deploy-onnx) and [sample notebooks](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/deployment/onnx). + +**ONNX Runtime Server (beta)** is a hosted application for serving ONNX models using ONNX Runtime, providing a REST API for prediction. Usage details can be found [here](https://github.com/microsoft/onnxruntime/blob/master/docs/ONNX_Runtime_Server_Usage.md), and image installation instructions are [here](https://github.com/microsoft/onnxruntime/tree/master/dockerfiles#onnx-runtime-server-preview). + +## Examples and Tutorials +### Python +* [Basic Inferencing Sample](https://github.com/onnx/onnx-docker/blob/master/onnx-ecosystem/inference_demos/simple_onnxruntime_inference.ipynb) +* [Inferencing (Resnet50)](https://github.com/onnx/onnx-docker/blob/master/onnx-ecosystem/inference_demos/resnet50_modelzoo_onnxruntime_inference.ipynb) +* [Inferencing samples](https://github.com/onnx/onnx-docker/tree/master/onnx-ecosystem/inference_demos) using [ONNX-Ecosystem Docker image](https://github.com/onnx/onnx-docker/tree/master/onnx-ecosystem) +* [Train, Convert, and Inference a SKL pipeline](https://microsoft.github.io/onnxruntime/auto_examples/plot_train_convert_predict.html#sphx-glr-auto-examples-plot-train-convert-predict-py) +* [Convert and Inference a Keras model](https://microsoft.github.io/onnxruntime/auto_examples/plot_dl_keras.html#sphx-glr-auto-examples-plot-dl-keras-py) +* [ONNX Runtime Server: SSD Single Shot MultiBox Detector](https://github.com/onnx/tutorials/blob/master/tutorials/OnnxRuntimeServerSSDModel.ipynb) +* [Running ONNX model tests](https://github.com/microsoft/onnxruntime/blob/master/docs/Model_Test.md) + + +**Deployment with AzureML** +* Inferencing: [Inferencing Facial Expression Recognition](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/deployment/onnx/onnx-inference-facial-expression-recognition-deploy.ipynb), [Inferencing MNIST Handwritten Digits](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/deployment/onnx/onnx-inference-mnist-deploy.ipynb), [ Resnet50 Image Classification](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/deployment/onnx/onnx-modelzoo-aml-deploy-resnet50.ipynb), [TinyYolo](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/deployment/onnx/onnx-convert-aml-deploy-tinyyolo.ipynb) +* [Train and Inference MNIST from Pytorch](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/deployment/onnx/onnx-train-pytorch-aml-deploy-mnist.ipynb) +* [FER+ on Azure Kubernetes Service with TensorRT](https://github.com/microsoft/onnxruntime/blob/master/docs/python/notebooks/onnx-inference-byoc-gpu-cpu-aks.ipynb) -## APIs and Official Builds -| API Documentation | CPU package | GPU package | -|-----|-------------|-------------| -| [Python](https://aka.ms/onnxruntime-python) | [Available on Pypi](https://pypi.org/project/onnxruntime)

| [Available on Pypi](https://pypi.org/project/onnxruntime-gpu)<br><br><br> |
-| [C#](docs/CSharp_API.md) | **Available on Nuget :**<br>[MLAS+Eigen](https://www.nuget.org/packages/Microsoft.ML.OnnxRuntime/)<br><br>[MKL-ML](https://www.nuget.org/packages/Microsoft.ML.OnnxRuntime.MKLML/)| [Available on Nuget](https://www.nuget.org/packages/Microsoft.ML.OnnxRuntime.Gpu/)<br><br> |
-| [C](docs/C_API.md) | **Available on Nuget :**<br>[MLAS+Eigen](https://www.nuget.org/packages/Microsoft.ML.OnnxRuntime/)<br><br>[MKL-ML](https://www.nuget.org/packages/Microsoft.ML.OnnxRuntime.MKLML/)<br><br>[Binaries (.zip, .tgz)](https://aka.ms/onnxruntime-release)<br>| [Available on Nuget](https://www.nuget.org/packages/Microsoft.ML.OnnxRuntime.Gpu/)<br><br><br>[Binaries (.zip, .tgz)](https://aka.ms/onnxruntime-release)<br><br>
| -| [C++](onnxruntime/core/session/inference_session.h) | [Build from source](https://github.com/Microsoft/onnxruntime/blob/master/BUILD.md) | [Build from source](https://github.com/Microsoft/onnxruntime/blob/master/BUILD.md) | -For builds using other execution providers, see Build Details below. +### C# +* [Inferencing Tutorial](https://github.com/microsoft/onnxruntime/blob/master/docs/CSharp_API.md#getting-started) -## Build Details -For details on the build configurations and information on how to create a build, see [Build ONNX Runtime](BUILD.md). -## Versioning -See more details on API and ABI Versioning and ONNX Compatibility in [Versioning](docs/Versioning.md). +### C/C++ +* [Basic Inferencing (SqueezeNet) - C](https://github.com/microsoft/onnxruntime/blob/master/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Capi/C_Api_Sample.cpp) +* [Basic Inferencing (SqueezeNet) - C++](https://github.com/microsoft/onnxruntime/blob/master/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Capi/CXX_Api_Sample.cpp) +* [Inferencing (MNIST) - C++](https://github.com/microsoft/onnxruntime/tree/master/samples/c_cxx/MNIST) -# Design and Key Features -For an overview of the high level architecture and key decisions in the technical design of ONNX Runtime, see [Engineering Design](docs/HighLevelDesign.md). -ONNX Runtime is built with an extensible design that makes it versatile to support a wide array of models with high performance. +# Technical Design Details +* [High level architectural design](docs/HighLevelDesign.md) +* [Versioning](docs/Versioning.md) +## Extensibility Options * [Add a custom operator/kernel](docs/AddingCustomOp.md) * [Add an execution provider](docs/AddingExecutionProvider.md) * [Add a new graph @@ -98,7 +154,7 @@ transform](include/onnxruntime/core/optimizer/graph_transformer.h) * [Add a new rewrite rule](include/onnxruntime/core/optimizer/rewrite_rule.h) # Contribute -We welcome your contributions! Please see the [contribution guidelines](CONTRIBUTING.md). +We welcome contributions! Please see the [contribution guidelines](CONTRIBUTING.md). ## Feedback For any feedback or to report a bug, please file a [GitHub Issue](https://github.com/Microsoft/onnxruntime/issues). diff --git a/ThirdPartyNotices.txt b/ThirdPartyNotices.txt index 01fc672d75a0c..4c0ddbb79b05c 100644 --- a/ThirdPartyNotices.txt +++ b/ThirdPartyNotices.txt @@ -3046,3 +3046,489 @@ SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +----- + +JDAI-CV/DNNLibrary + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [2019] [JD.com Inc. JD AI] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ + +----- + +google/flatbuffers + + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2014 Google Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +----- + +google/glog + +Copyright (c) 2008, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +A function gettimeofday in utilities.cc is based on + +http://www.google.com/codesearch/p?hl=en#dR3YEbitojA/COPYING&q=GetSystemTimeAsFileTime%20license:bsd + +The license of this code is: + +Copyright (c) 2003-2008, Jouni Malinen and contributors +All Rights Reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +3. Neither the name(s) of the above-listed copyright holder(s) nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/VERSION_NUMBER b/VERSION_NUMBER index 1d0ba9ea182b0..8f0916f768f04 100644 --- a/VERSION_NUMBER +++ b/VERSION_NUMBER @@ -1 +1 @@ -0.4.0 +0.5.0 diff --git a/cgmanifest.json b/cgmanifest.json index e8bb73be2e02a..2fd8a43254e26 100644 --- a/cgmanifest.json +++ b/cgmanifest.json @@ -49,7 +49,7 @@ "component":{ "type":"git", "git": { - "commitHash": "d94f99d21a9a0820d58966410ceaf525132f85f1", + "commitHash": "65b8e0f9979fbade16e3becbdfa69c0764946f72", "repositoryUrl": "https://github.com/onnx/onnx.git" } } @@ -103,8 +103,8 @@ "component":{ "type":"git", "git":{ - "commitHash":"c2b36154778503a509a70a3b5309b201969eccab", - "repositoryUrl":"https://github.com/dmlc/tvm.git" + "commitHash":"fd4801612817f96e890058656834deb925fc064a", + "repositoryUrl":"https://github.com/microsoft/onnxruntime-tvm.git" } } }, @@ -299,6 +299,33 @@ } } }, + { + "component":{ + "type":"git", + "git":{ + "commitHash":"90cb0f8d60b07e96ca7f0ba92fa50884010599ad", + "repositoryUrl":"https://github.com/JDAI-CV/DNNLibrary.git" + } + } + }, + { + "component":{ + "type":"git", + "git":{ + "commitHash":"9e7e8cbe9f675123dd41b7c62868acad39188cae", + "repositoryUrl":"https://github.com/google/flatbuffers.git" + } + } + }, + { + "component":{ + "type":"git", + "git":{ + "commitHash":"8d7a107d68c127f3f494bb7807b796c8c5a97a82", + "repositoryUrl":"https://github.com/google/glog.git" + } + } + }, { "component":{ "Type":"other", @@ -309,6 +336,79 @@ } } }, + { + "component": { + "git": { + "commitHash": "02a2a458ac15912d7d87cc1171e811b0c5219ece", + "repositoryUrl": "https://github.com/grpc/grpc" + }, + "type": "git" + } + }, + { + "component": { + "git": { + "commitHash": "cc4bed2d74f7c8717e31f9579214ab52a9c9c610", + "repositoryUrl": "https://github.com/abseil/abseil-cpp" + }, + "type": "git" + } + }, + { + "component": { + "git": { + "commitHash": "b29b21a81b32ec273f118f589f46d56ad3332420", + "repositoryUrl": "https://github.com/google/boringssl.git" + }, + "type": "git" + } + }, + { + "component": { + "git": { + "commitHash": "3be1924221e1326df520f8498d704a5c4c8d0cce", + "repositoryUrl": "https://github.com/c-ares/c-ares.git" + }, + "type": "git" + } + }, + { + "component": { + "git": { + "commitHash": "6599cac0965be8e5a835ab7a5684bbef033d5ad0", + "repositoryUrl": "https://github.com/llvm-mirror/libcxx.git" + }, + "type": "git" + } + }, + { + "component": { + "git": { + "commitHash": "9245d481eb3e890f708ff2d7dadf2a10c04748ba", + "repositoryUrl": "https://github.com/llvm-mirror/libcxxabi.git" + }, + "type": "git" + } + }, + { + "component": { + "git": { + "commitHash": "9ce4a77f61c134bbed28bfd5be5cd7dc0e80f5e3", + "repositoryUrl": "https://github.com/google/upb.git" + }, + "type": "git" + } + }, + { + "component":{ + "type":"other", + "Other":{ + "Name":"Go", + "Version":"1.12.6", + "DownloadUrl":"https://dl.google.com/go/go1.12.6.linux-amd64.tar.gz" + } + } +} ], "Version":1 } diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 9762200a58557..9ee7470b0b2b3 100644 --- a/cmake/CMakeLists.txt +++ 
b/cmake/CMakeLists.txt @@ -49,6 +49,7 @@ option(onnxruntime_USE_CUDA "Build with CUDA support" OFF) option(onnxruntime_USE_OPENVINO "Build with OpenVINO support" OFF) option(onnxruntime_USE_NSYNC "Build with NSYNC support. This option only takes effect on Linux" OFF) option(onnxruntime_USE_EIGEN_FOR_BLAS "Use eign for blas" ON) +option(onnxruntime_USE_NNAPI "Build with DNNLibrary for Android NNAPI support" OFF) option(onnxruntime_USE_MLAS "Use optimized blas library for GEMM and 2D Convolution" ON) option(onnxruntime_USE_MKLDNN "Build with MKL-DNN support" OFF) option(onnxruntime_USE_MKLML "Build MKL-DNN with MKL-ML binary dependency" OFF) @@ -66,7 +67,7 @@ option(onnxruntime_USE_LLVM "Build tvm with LLVM" OFF) option(onnxruntime_USE_OPENMP "Build with OpenMP support" OFF) option(onnxruntime_BUILD_SHARED_LIB "Build a shared library" OFF) option(onnxruntime_ENABLE_MICROSOFT_INTERNAL "Use this option to enable/disable microsoft internal only code" OFF) -option(onnxruntime_USE_NUPHAR "Build with Nupha" OFF) +option(onnxruntime_USE_NUPHAR "Build with Nuphar" OFF) option(onnxruntime_USE_BRAINSLICE "Build with BrainSlice" OFF) option(onnxruntime_USE_TENSORRT "Build with TensorRT support" OFF) option(onnxruntime_ENABLE_LTO "Enable link time optimization" ON) @@ -84,12 +85,9 @@ set(protobuf_BUILD_TESTS OFF CACHE BOOL "Build protobuf tests" FORCE) set(NSYNC_ENABLE_TESTS OFF CACHE BOOL "Build protobuf tests" FORCE) set(ONNX_ML 1) -if(onnxruntime_USE_OPENMP AND UNIX) - #if you enabled both of them, the code can still be built and run, but you may see - # 10x performance degradation, because one process should only have one openmp implementation - # mkl(or mklml) depends on Intel OpenMP - # GCC does not support linking against the Intel OpenMP runtime library - # Clang should be ok, but it's not in our consideration right now. +if(onnxruntime_USE_OPENMP) + # MKLML and NGraph depend on their own OpenMP library that may be different with the compiler's. + # Disable the options to build mklml/NGraph and OpenMP together. 
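# Note that the "AND UNIX" qualifier is gone, so this mutual-exclusion check now applies on
# Windows as well as Linux. For illustration only (the source directory path is an assumption,
# not taken from this change), a valid configuration picks at most one OpenMP provider:
#   cmake ../cmake -Donnxruntime_USE_OPENMP=ON -Donnxruntime_USE_MKLML=OFF
# Enabling onnxruntime_USE_OPENMP together with onnxruntime_USE_MKLML is expected to stop the
# configure step with the FATAL_ERROR below.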
if(onnxruntime_USE_MKLML) message(FATAL_ERROR "Please use only one of onnxruntime_USE_MKLML, onnxruntime_USE_OPENMP") endif() @@ -132,8 +130,11 @@ add_definitions(-DEIGEN_MPL2_ONLY) if(onnxruntime_CROSS_COMPILING) set(CMAKE_CROSSCOMPILING ON) - string(APPEND CMAKE_CXX_FLAGS " -Wno-error=attributes") - string(APPEND CMAKE_C_FLAGS " -Wno-error=attributes") + check_cxx_compiler_flag(-Wno-error HAS_NOERROR) + if(HAS_NOERROR) + string(APPEND CMAKE_CXX_FLAGS " -Wno-error=attributes") + string(APPEND CMAKE_C_FLAGS " -Wno-error=attributes") + endif() endif() #must after OpenMP settings @@ -426,10 +427,8 @@ else() string(APPEND CMAKE_C_FLAGS " -Wall -Wextra -ffunction-sections -fdata-sections") if(onnxruntime_DEV_MODE) - if(NOT onnxruntime_USE_TVM) - string(APPEND CMAKE_CXX_FLAGS " -Werror") - string(APPEND CMAKE_C_FLAGS " -Werror") - endif() + string(APPEND CMAKE_CXX_FLAGS " -Werror") + string(APPEND CMAKE_C_FLAGS " -Werror") endif() check_cxx_compiler_flag(-Wunused-but-set-variable HAS_UNUSED_BUT_SET_VARIABLE) check_cxx_compiler_flag(-Wunused-parameter HAS_UNUSED_PARAMETER) @@ -468,7 +467,10 @@ if (onnxruntime_USE_JEMALLOC) list(APPEND onnxruntime_EXTERNAL_DEPENDENCIES jemalloc) endif() -include_directories(${ONNXRUNTIME_INCLUDE_DIR}) +include_directories( + ${ONNXRUNTIME_INCLUDE_DIR} + ${REPO_ROOT}/include/onnxruntime/core/session +) if (onnxruntime_USE_MKLDNN OR onnxruntime_USE_MKLML) include(mkldnn) @@ -540,8 +542,8 @@ if(onnxruntime_USE_OPENVINO) add_definitions(-DOPENVINO_CONFIG_CPU_FP32=1) endif() - if(onnxruntime_USE_OPENVINO_VAD_R) - add_definitions(-DOPENVINO_CONFIG_VAD_R=1) + if(onnxruntime_USE_OPENVINO_VAD_M) + add_definitions(-DOPENVINO_CONFIG_VAD_M=1) endif() endif() @@ -575,11 +577,13 @@ if (onnxruntime_USE_CUDA) list(APPEND ONNXRUNTIME_CUDA_LIBRARIES cublas cudnn) if (WIN32) link_directories(${onnxruntime_CUDNN_HOME}/lib/x64) - string(REGEX REPLACE "([0-9]+)\\.([0-9]+).*" "\\1" onnxruntime_CUDA_VERSION_MAJOR ${CMAKE_CUDA_COMPILER_VERSION}) - string(REGEX REPLACE "([0-9]+)\\.([0-9]+).*" "\\2" onnxruntime_CUDA_VERSION_MINOR ${CMAKE_CUDA_COMPILER_VERSION}) - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /DELAYLOAD:cublas64_${onnxruntime_CUDA_VERSION_MAJOR}${onnxruntime_CUDA_VERSION_MINOR}.dll") - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /DELAYLOAD:cudart64_${onnxruntime_CUDA_VERSION_MAJOR}${onnxruntime_CUDA_VERSION_MINOR}.dll") + file(GLOB cuda_dll_paths "${onnxruntime_CUDA_HOME}/bin/cublas64_*" "${onnxruntime_CUDA_HOME}/bin/cudart64_*") + foreach(cuda_dll_path ${cuda_dll_paths}) + get_filename_component(cuda_dll_file_name ${cuda_dll_path} NAME) + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /DELAYLOAD:${cuda_dll_file_name}") + endforeach(cuda_dll_path) + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /DELAYLOAD:cudnn64_7.dll") else() link_directories(${onnxruntime_CUDNN_HOME}/lib64) diff --git a/cmake/external/DNNLibrary b/cmake/external/DNNLibrary new file mode 160000 index 0000000000000..ab22710a3f016 --- /dev/null +++ b/cmake/external/DNNLibrary @@ -0,0 +1 @@ +Subproject commit ab22710a3f0166f31c9c14feab98c04bfb86b71b diff --git a/cmake/external/grpc b/cmake/external/grpc new file mode 160000 index 0000000000000..02a2a458ac159 --- /dev/null +++ b/cmake/external/grpc @@ -0,0 +1 @@ +Subproject commit 02a2a458ac15912d7d87cc1171e811b0c5219ece diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index ed05d885a65a3..364ba88a891c8 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ 
-3,7 +3,7 @@ include (ExternalProject) set(MKLDNN_URL https://github.com/intel/mkl-dnn.git) # If MKLDNN_TAG is updated, check if MKLML_VERSION and platform.cmake.patch need to be updated. set(MKLDNN_TAG v0.18.1) -set(MKLML_VERSION 2019.0.3.20190220) +set(MKLML_VERSION 2019.0.5.20190502) if(WIN32) set(MKLML_OS_VERSION_STR "win") @@ -32,7 +32,7 @@ else() endif() if (onnxruntime_USE_MKLML) - set(MKLDNN_VERSION_SHORT v0.18) + set(MKLDNN_VERSION_SHORT v0.20) set(MKLML_URL https://github.com/intel/mkl-dnn/releases/download/${MKLDNN_VERSION_SHORT}/mklml_${MKLML_OS_VERSION_STR}_${MKLML_VERSION}.${MKLML_FILE_EXTENSION}) ExternalProject_Add(project_mklml @@ -59,8 +59,13 @@ if (onnxruntime_USE_MKLDNN) set(MKLDNN_DLL_PATH ${MKLDNN_LIB_DIR}/${MKLDNN_SHARED_LIB}) endif() set(MKLDNN_INCLUDE_DIR ${MKLDNN_INSTALL}/include) + set (MKLDNN_CMAKE_EXTRA_ARGS) if(NOT onnxruntime_BUILD_FOR_NATIVE_MACHINE) - set(MKLDNN_PATCH_COMMAND1 git apply ${CMAKE_SOURCE_DIR}/patches/mkldnn/platform.cmake.patch) + # pre-v1.0 + list(APPEND MKLDNN_CMAKE_EXTRA_ARGS "-DARCH_OPT_FLAGS=") + # v1.0 + list(APPEND MKLDNN_CMAKE_EXTRA_ARGS "-DMKLDNN_ARCH_OPT_FLAGS=") + set(MKLDNN_PATCH_COMMAND1 git apply ${CMAKE_SOURCE_DIR}/patches/mkldnn/mem-patch.cmake.patch) # discard prior changes due to patching in mkldnn source to unblock incremental builds. set(MKLDNN_PATCH_DISCARD_COMMAND cd ${MKLDNN_SOURCE} && git checkout -- .) endif() @@ -70,7 +75,7 @@ if (onnxruntime_USE_MKLDNN) GIT_TAG ${MKLDNN_TAG} PATCH_COMMAND ${MKLDNN_PATCH_DISCARD_COMMAND} COMMAND ${MKLDNN_PATCH_COMMAND1} SOURCE_DIR ${MKLDNN_SOURCE} - CMAKE_ARGS -DMKLDNN_PRODUCT_BUILD_MODE=OFF -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL} -DMKLROOT=${MKML_DIR} + CMAKE_ARGS -DMKLDNN_PRODUCT_BUILD_MODE=OFF -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL} -DMKLROOT=${MKML_DIR} ${MKLDNN_CMAKE_EXTRA_ARGS} ) link_directories(${MKLDNN_LIB_DIR}) if (onnxruntime_USE_MKLML) diff --git a/cmake/external/ngraph.cmake b/cmake/external/ngraph.cmake index 65b7159e34bee..12d0b6e1431db 100644 --- a/cmake/external/ngraph.cmake +++ b/cmake/external/ngraph.cmake @@ -59,6 +59,7 @@ if (MSVC) COMMAND git apply --ignore-space-change --ignore-whitespace ${PROJECT_SOURCE_DIR}/patches/ngraph/ngraph_protobuf.patch COMMAND git apply --ignore-space-change --ignore-whitespace ${PROJECT_SOURCE_DIR}/patches/ngraph/ngraph_fix_install_error.patch COMMAND git apply --ignore-space-change --ignore-whitespace ${PROJECT_SOURCE_DIR}/patches/ngraph/ngraph_fix_library_path.patch + COMMAND git apply --ignore-space-change --ignore-whitespace ${PROJECT_SOURCE_DIR}/patches/ngraph/ngraph_fix_memory.patch CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DNGRAPH_DEX_ONLY=ON diff --git a/cmake/external/onnx b/cmake/external/onnx index d94f99d21a9a0..65b8e0f9979fb 160000 --- a/cmake/external/onnx +++ b/cmake/external/onnx @@ -1 +1 @@ -Subproject commit d94f99d21a9a0820d58966410ceaf525132f85f1 +Subproject commit 65b8e0f9979fbade16e3becbdfa69c0764946f72 diff --git a/cmake/external/openvino.cmake b/cmake/external/openvino.cmake index 0b24b5683cd7c..929011867ad98 100644 --- a/cmake/external/openvino.cmake +++ b/cmake/external/openvino.cmake @@ -4,7 +4,7 @@ include (ExternalProject) set(OPENVINO_URL https://github.com/opencv/dldt.git) -set(OPENVINO_TAG 2018_R5) +set(OPENVINO_TAG 2019_R1.1) set(OPENVINO_SHARED_LIB libinference_engine.so) diff --git a/cmake/external/spdlog b/cmake/external/spdlog new file mode 160000 index 0000000000000..352281313fe1c --- /dev/null +++ 
b/cmake/external/spdlog @@ -0,0 +1 @@ +Subproject commit 352281313fe1c4313bc222cb9de222afd50c822f diff --git a/cmake/external/tvm b/cmake/external/tvm index c2b3615477850..fd4801612817f 160000 --- a/cmake/external/tvm +++ b/cmake/external/tvm @@ -1 +1 @@ -Subproject commit c2b36154778503a509a70a3b5309b201969eccab +Subproject commit fd4801612817f96e890058656834deb925fc064a diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake index e00679bf46071..91508a8aa8f57 100644 --- a/cmake/onnxruntime.cmake +++ b/cmake/onnxruntime.cmake @@ -31,6 +31,10 @@ add_dependencies(onnxruntime onnxruntime_generate_def ${onnxruntime_EXTERNAL_DEP target_include_directories(onnxruntime PRIVATE ${ONNXRUNTIME_ROOT}) onnxruntime_add_include_to_target(onnxruntime gsl) +if (onnxruntime_USE_CUDA) + target_include_directories(onnxruntime PRIVATE ${onnxruntime_CUDNN_HOME}/include ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) +endif() + if(UNIX) if (APPLE) set(BEGIN_WHOLE_ARCHIVE -Xlinker -all_load) @@ -60,6 +64,7 @@ target_link_libraries(onnxruntime PRIVATE ${PROVIDERS_CUDA} ${PROVIDERS_MKLDNN} ${PROVIDERS_NGRAPH} + ${PROVIDERS_NNAPI} ${PROVIDERS_TENSORRT} ${PROVIDERS_OPENVINO} onnxruntime_optimizer diff --git a/cmake/onnxruntime_codegen.cmake b/cmake/onnxruntime_codegen.cmake index d63d367f2bb9a..df90e36cbce6a 100644 --- a/cmake/onnxruntime_codegen.cmake +++ b/cmake/onnxruntime_codegen.cmake @@ -1,15 +1,24 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. +file(GLOB_RECURSE onnxruntime_codegen_common_srcs + "${ONNXRUNTIME_ROOT}/core/codegen/common/*.h" + "${ONNXRUNTIME_ROOT}/core/codegen/common/*.cc" +) + file(GLOB_RECURSE onnxruntime_codegen_tvm_srcs CONFIGURE_DEPENDS - "${ONNXRUNTIME_ROOT}/core/codegen/tvm/*.h" - "${ONNXRUNTIME_ROOT}/core/codegen/tvm/*.cc" + "${ONNXRUNTIME_ROOT}/core/codegen/mti/*.h" + "${ONNXRUNTIME_ROOT}/core/codegen/mti/*.cc" + "${ONNXRUNTIME_ROOT}/core/codegen/passes/*.h" + "${ONNXRUNTIME_ROOT}/core/codegen/passes/*.cc" ) +source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_codegen_common_srcs} ${onnxruntime_codegen_tvm_srcs}) + #onnxruntime_codegen_tvm depends on onnxruntime framework -add_library(onnxruntime_codegen_tvm ${onnxruntime_codegen_tvm_srcs}) +add_library(onnxruntime_codegen_tvm ${onnxruntime_codegen_common_srcs} ${onnxruntime_codegen_tvm_srcs}) set_target_properties(onnxruntime_codegen_tvm PROPERTIES FOLDER "ONNXRuntime") -target_include_directories(onnxruntime_codegen_tvm PRIVATE ${ONNXRUNTIME_ROOT} ${TVM_INCLUDES}) +target_include_directories(onnxruntime_codegen_tvm PRIVATE ${ONNXRUNTIME_ROOT} ${TVM_INCLUDES} ${MKLML_INCLUDE_DIR} ${eigen_INCLUDE_DIRS}) onnxruntime_add_include_to_target(onnxruntime_codegen_tvm onnxruntime_common onnxruntime_framework gsl onnx onnx_proto protobuf::libprotobuf) target_compile_options(onnxruntime_codegen_tvm PRIVATE ${DISABLED_WARNINGS_FOR_TVM}) # need onnx to build to create headers that this project includes diff --git a/cmake/onnxruntime_common.cmake b/cmake/onnxruntime_common.cmake index 71a397176f3a3..0799ab9a6c79e 100644 --- a/cmake/onnxruntime_common.cmake +++ b/cmake/onnxruntime_common.cmake @@ -67,3 +67,18 @@ if(WIN32) # Add Code Analysis properties to enable C++ Core checks. Have to do it via a props file include. 
set_target_properties(onnxruntime_common PROPERTIES VS_USER_PROPS ${PROJECT_SOURCE_DIR}/EnableVisualStudioCodeAnalysis.props) endif() + +# check if we need to link against librt on Linux +include(CheckLibraryExists) +include(CheckFunctionExists) +if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Linux") + check_library_exists(rt clock_gettime "time.h" HAVE_CLOCK_GETTIME) + + if (NOT HAVE_CLOCK_GETTIME) + set(CMAKE_EXTRA_INCLUDE_FILES time.h) + check_function_exists(clock_gettime HAVE_CLOCK_GETTIME) + set(CMAKE_EXTRA_INCLUDE_FILES) + else() + target_link_libraries(onnxruntime_common rt) + endif() +endif() diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake index 0b3c566072319..619a4c3d08dc9 100644 --- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake @@ -17,13 +17,7 @@ set(mlas_common_srcs if(MSVC) - if(CMAKE_GENERATOR_PLATFORM STREQUAL "ARM") - - set(mlas_platform_srcs - ${ONNXRUNTIME_ROOT}/core/mlas/lib/arm/sgemmc.cpp - ) - - elseif(CMAKE_GENERATOR_PLATFORM STREQUAL "ARM64") + if(CMAKE_GENERATOR_PLATFORM STREQUAL "ARM64") set(asm_filename ${ONNXRUNTIME_ROOT}/core/mlas/lib/arm64/sgemma.asm) set(pre_filename ${CMAKE_CURRENT_BINARY_DIR}/sgemma.i) @@ -45,17 +39,13 @@ if(MSVC) set(mlas_platform_srcs ${obj_filename}) - elseif(CMAKE_GENERATOR_PLATFORM STREQUAL "Win32") - - enable_language(ASM_MASM) - - set(CMAKE_ASM_MASM_FLAGS "${CMAKE_ASM_MASM_FLAGS} /safeseh") + elseif(CMAKE_GENERATOR_PLATFORM STREQUAL "ARM" OR CMAKE_GENERATOR MATCHES "ARM") set(mlas_platform_srcs - ${ONNXRUNTIME_ROOT}/core/mlas/lib/i386/sgemma.asm + ${ONNXRUNTIME_ROOT}/core/mlas/lib/arm/sgemmc.cpp ) - elseif(CMAKE_GENERATOR_PLATFORM STREQUAL "x64") + elseif(CMAKE_GENERATOR_PLATFORM STREQUAL "x64" OR CMAKE_GENERATOR MATCHES "Win64") enable_language(ASM_MASM) @@ -78,71 +68,76 @@ if(MSVC) ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/ErfKernelFma3.asm ) - endif() - -elseif(CMAKE_SYSTEM_NAME STREQUAL "Android") + else() - if(CMAKE_ANDROID_ARCH_ABI MATCHES "^arm.*") + enable_language(ASM_MASM) - if(CMAKE_ANDROID_ARCH_ABI STREQUAL "armeabi-v7a") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon") - endif() + set(CMAKE_ASM_MASM_FLAGS "${CMAKE_ASM_MASM_FLAGS} /safeseh") set(mlas_platform_srcs - ${ONNXRUNTIME_ROOT}/core/mlas/lib/arm/sgemmc.cpp + ${ONNXRUNTIME_ROOT}/core/mlas/lib/i386/sgemma.asm ) - else() - - message(FATAL_ERROR "Android build is not supported on non-ARM platform now") - endif() - else() + if (CMAKE_SYSTEM_NAME STREQUAL "Android") + if (CMAKE_ANDROID_ARCH_ABI STREQUAL "armeabi-v7a") + set(ARM TRUE) + elseif (CMAKE_ANDROID_ARCH_ABI STREQUAL "arm64-v8a") + set(ARM TRUE) # Android NDK fails to compile sgemma.s + elseif (CMAKE_ANDROID_ARCH_ABI STREQUAL "x86_64") + set(X86_64 TRUE) + elseif (CMAKE_ANDROID_ARCH_ABI STREQUAL "x86") + set(X86 TRUE) + endif() + else() + execute_process( + COMMAND ${CMAKE_C_COMPILER} -dumpmachine + OUTPUT_VARIABLE dumpmachine_output + ERROR_QUIET + ) + + if(dumpmachine_output MATCHES "^arm.*") + set(ARM TRUE) + elseif(dumpmachine_output MATCHES "^aarch64.*") + set(ARM64 TRUE) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(i.86|x86?)$") + set(X86 TRUE) + elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + set(X86_64 TRUE) + endif() + endif() - execute_process( - COMMAND ${CMAKE_C_COMPILER} -dumpmachine - OUTPUT_VARIABLE dumpmachine_output - ERROR_QUIET - ) - - if(dumpmachine_output MATCHES "^arm.*") - + if (ARM) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon") set(mlas_platform_srcs ${ONNXRUNTIME_ROOT}/core/mlas/lib/arm/sgemmc.cpp - ) - - elseif(dumpmachine_output MATCHES 
"^aarch64.*") - + ) + elseif (ARM64) enable_language(ASM) set(mlas_platform_srcs ${ONNXRUNTIME_ROOT}/core/mlas/lib/aarch64/sgemma.s - ) - - elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(i.86|x86?)$") - + ) + elseif (X86) enable_language(ASM) set(mlas_platform_srcs_sse2 ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86/SgemmKernelSse2.S - ) + ) set_source_files_properties(${mlas_platform_srcs_sse2} PROPERTIES COMPILE_FLAGS "-msse2") set(mlas_platform_srcs_avx ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86/SgemmKernelAvx.S - ) + ) set_source_files_properties(${mlas_platform_srcs_avx} PROPERTIES COMPILE_FLAGS "-mavx") set(mlas_platform_srcs ${mlas_platform_srcs_sse2} ${mlas_platform_srcs_avx} - ) - - elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") - + ) + elseif (X86_64) enable_language(ASM) # The LLVM assmebler does not support the .arch directive to enable instruction diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake index d73691b4fb0a8..0447e4814d37d 100644 --- a/cmake/onnxruntime_providers.cmake +++ b/cmake/onnxruntime_providers.cmake @@ -50,6 +50,10 @@ if(onnxruntime_USE_OPENVINO) set(PROVIDERS_OPENVINO onnxruntime_providers_openvino) list(APPEND ONNXRUNTIME_PROVIDER_NAMES openvino) endif() +if(onnxruntime_USE_NNAPI) + set(PROVIDERS_NNAPI onnxruntime_providers_nnapi) + list(APPEND ONNXRUNTIME_PROVIDER_NAMES nnapi) +endif() source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_common_srcs} ${onnxruntime_providers_srcs}) # add using ONNXRUNTIME_ROOT so they show up under the 'contrib_ops' folder in Visual Studio source_group(TREE ${ONNXRUNTIME_ROOT} FILES ${onnxruntime_cpu_contrib_ops_srcs}) @@ -73,6 +77,17 @@ if(HAS_DEPRECATED_COPY) set_source_files_properties("${ONNXRUNTIME_ROOT}/core/providers/cpu/tensor/onehot.cc" PROPERTIES COMPILE_FLAGS -Wno-deprecated-copy) set_source_files_properties("${ONNXRUNTIME_ROOT}/core/providers/cpu/tensor/where_op.cc" PROPERTIES COMPILE_FLAGS -Wno-deprecated-copy) endif() + +if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT MSVC) + # For x86 platforms it is important to pass this flag to compiler. Without this gemmlowp will use slow reference code. + # These optimizations are not enabled on MSVC so excluding it. 
+ message("enabling optimizations for gemmlowp") + set_source_files_properties("${ONNXRUNTIME_ROOT}/core/providers/cpu/math/matmul_integer.cc" PROPERTIES COMPILE_FLAGS "-msse4.1") + set_source_files_properties("${ONNXRUNTIME_ROOT}/core/providers/cpu/math/quantize_linear_matmul.cc" PROPERTIES COMPILE_FLAGS "-msse4.1") + set_source_files_properties("${ONNXRUNTIME_ROOT}/core/providers/cpu/nn/qlinearconv.cc" PROPERTIES COMPILE_FLAGS "-msse4.1") + set_source_files_properties("${ONNXRUNTIME_ROOT}/core/providers/cpu/nn/conv_integer.cc" PROPERTIES COMPILE_FLAGS "-msse4.1") +endif() + set(gemmlowp_src ${PROJECT_SOURCE_DIR}/external/gemmlowp) set(re2_src ${ONNXRUNTIME_ROOT}/../cmake/external/re2) target_include_directories(onnxruntime_providers PRIVATE ${ONNXRUNTIME_ROOT} ${eigen_INCLUDE_DIRS} ${gemmlowp_src} ${re2_src}) @@ -92,7 +107,7 @@ if (onnxruntime_USE_CUDA) ) source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_cuda_cc_srcs} ${onnxruntime_providers_cuda_cu_srcs}) source_group(TREE ${ONNXRUNTIME_ROOT} FILES ${onnxruntime_cuda_contrib_ops_cc_srcs} ${onnxruntime_cuda_contrib_ops_cu_srcs}) - + # disable contrib ops conditionally if(onnxruntime_DISABLE_CONTRIB_OPS) add_library(onnxruntime_providers_cuda ${onnxruntime_providers_cuda_cc_srcs} ${onnxruntime_providers_cuda_cu_srcs}) @@ -256,10 +271,20 @@ if (onnxruntime_USE_OPENVINO) # Below variables point to directories within the OpenVINO installation directory # whose value is set in INTEL_CVSDK_DIR variable by running the setupvars.sh script - if (onnxruntime_USE_OPENVINO_BINARY) - set(OPENVINO_INCLUDE_DIR $ENV{INTEL_CVSDK_DIR}/deployment_tools/inference_engine/include) - set(OPENVINO_LIB_DIR $ENV{INTEL_CVSDK_DIR}/deployment_tools/inference_engine/lib/ubuntu_16.04/intel64/) +if (onnxruntime_USE_OPENVINO_BINARY) + if ($ENV{INTEL_CVSDK_DIR} MATCHES "2019.1") + message($ENV{INTEL_CVSDK_DIR}) + set(OPENVINO_INCLUDE_DIR $ENV{INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/include) + set(OPENVINO_TBB_INCLUDE_DIR $ENV{INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/external/tbb/include) + set(OPENVINO_LIB_DIR $ENV{INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/lib/intel64/) + set(OPENVINO_TBB_DIR $ENV{INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/external/tbb/lib) + set(OPENVINO_MKL_TINY_DIR $ENV{INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/external/mkltiny_lnx/lib) + endif() + if ($ENV{INTEL_CVSDK_DIR} MATCHES "2018.5") + set(OPENVINO_INCLUDE_DIR $ENV{INTEL_CVSDK_DIR}/deployment_tools/inference_engine/include) + set(OPENVINO_LIB_DIR $ENV{INTEL_CVSDK_DIR}/deployment_tools/inference_engine/lib/ubuntu_16.04/intel64/) endif() +endif() find_package(PythonLibs REQUIRED) source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_openvino_cc_srcs}) @@ -267,14 +292,47 @@ if (onnxruntime_USE_OPENVINO) onnxruntime_add_include_to_target(onnxruntime_providers_openvino gsl onnxruntime_common onnxruntime_framework gsl onnx onnx_proto protobuf::libprotobuf) add_dependencies(onnxruntime_providers_openvino ${onnxruntime_EXTERNAL_DEPENDENCIES}) set_target_properties(onnxruntime_providers_openvino PROPERTIES FOLDER "ONNXRuntime") - target_include_directories(onnxruntime_providers_openvino PRIVATE ${ONNXRUNTIME_ROOT} ${eigen_INCLUDE_DIRS} ${OPENVINO_INCLUDE_DIR} ${PYTHON_INCLUDE_DIRS}) + target_include_directories(onnxruntime_providers_openvino SYSTEM PUBLIC ${ONNXRUNTIME_ROOT} ${eigen_INCLUDE_DIRS} ${OPENVINO_INCLUDE_DIR} ${OPENVINO_TBB_INCLUDE_DIR} ${PYTHON_INCLUDE_DIRS}) install(DIRECTORY 
${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/openvino DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core/providers) set_target_properties(onnxruntime_providers_openvino PROPERTIES LINKER_LANGUAGE CXX) - link_directories(onnxruntime_providers_openvino ${OPENVINO_LIB_DIR}) - target_link_libraries(onnxruntime_providers_openvino -linference_engine ${PYTHON_LIBRARIES}) + if ($ENV{INTEL_CVSDK_DIR} MATCHES "2019.1") + link_directories(onnxruntime_providers_openvino ${OPENVINO_LIB_DIR} ${OPENVINO_TBB_DIR} ${OPENVINO_MKL_TINY_DIR}) + target_link_libraries(onnxruntime_providers_openvino -linference_engine -ltbb ${PYTHON_LIBRARIES}) + endif() + if ($ENV{INTEL_CVSDK_DIR} MATCHES "2018.5") + link_directories(onnxruntime_providers_openvino ${OPENVINO_LIB_DIR}) + target_link_libraries(onnxruntime_providers_openvino -linference_engine ${PYTHON_LIBRARIES}) + endif() file(COPY ${onnxruntime_providers_openvino_py_srcs} DESTINATION ${onnxruntime_BINARY_DIR}) endif() +if (onnxruntime_USE_NNAPI) + add_definitions(-DUSE_NNAPI=1) + option(DNN_READ_ONNX "" ON) + set(DNN_CUSTOM_PROTOC_EXECUTABLE ${ONNX_CUSTOM_PROTOC_EXECUTABLE}) + option(DNN_CMAKE_INSTALL "" OFF) + option(DNN_BUILD_BIN "" OFF) + add_subdirectory(${REPO_ROOT}/cmake/external/DNNLibrary) + file(GLOB_RECURSE + onnxruntime_providers_nnapi_cc_srcs CONFIGURE_DEPENDS + "${ONNXRUNTIME_ROOT}/core/providers/nnapi/*.h" + "${ONNXRUNTIME_ROOT}/core/providers/nnapi/*.cc" + ) + source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_nnapi_cc_srcs}) + add_library(onnxruntime_providers_nnapi ${onnxruntime_providers_nnapi_cc_srcs}) + onnxruntime_add_include_to_target(onnxruntime_providers_nnapi onnxruntime_common onnxruntime_framework gsl onnx onnx_proto protobuf::libprotobuf-lite dnnlibrary::dnnlibrary) + target_link_libraries(onnxruntime_providers_nnapi dnnlibrary::dnnlibrary) + add_dependencies(onnxruntime_providers_nnapi + dnnlibrary::dnnlibrary + onnx ${onnxruntime_EXTERNAL_DEPENDENCIES}) + # Header files of DNNLibrary requires C++17, fortunately, all modern Android NDKs support C++17 + set_target_properties(onnxruntime_providers_nnapi PROPERTIES CXX_STANDARD 17) + set_target_properties(onnxruntime_providers_nnapi PROPERTIES CXX_STANDARD_REQUIRED ON) + set_target_properties(onnxruntime_providers_nnapi PROPERTIES FOLDER "ONNXRuntime") + target_include_directories(onnxruntime_providers_nnapi PRIVATE ${ONNXRUNTIME_ROOT} ${nnapi_INCLUDE_DIRS}) + set_target_properties(onnxruntime_providers_nnapi PROPERTIES LINKER_LANGUAGE CXX) +endif() + if (onnxruntime_ENABLE_MICROSOFT_INTERNAL) include(onnxruntime_providers_internal.cmake) endif() diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake index 5a5ecc461d9ee..c9fcb91ff359d 100644 --- a/cmake/onnxruntime_python.cmake +++ b/cmake/onnxruntime_python.cmake @@ -60,7 +60,7 @@ onnxruntime_add_include_to_target(onnxruntime_pybind11_state gsl) if(APPLE) set(ONNXRUNTIME_SO_LINK_FLAG "-Xlinker -exported_symbols_list ${ONNXRUNTIME_ROOT}/python/exported_symbols.lst") elseif(UNIX) - set(ONNXRUNTIME_SO_LINK_FLAG "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/python/version_script.lds -Xlinker --no-undefined -Xlinker --gc-sections") + set(ONNXRUNTIME_SO_LINK_FLAG "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/python/version_script.lds -Xlinker --gc-sections") else() set(ONNXRUNTIME_SO_LINK_FLAG "-DEF:${ONNXRUNTIME_ROOT}/python/pybind.def") endif() @@ -73,6 +73,7 @@ set(onnxruntime_pybind11_state_libs ${PROVIDERS_TENSORRT} ${PROVIDERS_NGRAPH} ${PROVIDERS_OPENVINO} + 
${PROVIDERS_NNAPI} onnxruntime_optimizer onnxruntime_providers onnxruntime_util diff --git a/cmake/onnxruntime_server.cmake b/cmake/onnxruntime_server.cmake index 072c9ac26f0c6..eff812bcad13e 100644 --- a/cmake/onnxruntime_server.cmake +++ b/cmake/onnxruntime_server.cmake @@ -1,28 +1,111 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. - set(SERVER_APP_NAME "onnxruntime_server") +set(gRPC_BUILD_TESTS OFF CACHE INTERNAL "Don't build tests") +set(gRPC_GFLAGS_PROVIDER "" CACHE INTERNAL "Don't use gflags") +set(gRPC_BENCHMARK_PROVIDER "" CACHE INTERNAL "Don't use benchmark") +set(gRPC_ZLIB_PROVIDER "package" CACHE INTERNAL "Use preinstalled zlib library") +set(gRPC_PROTOBUF_PROVIDER "" CACHE INTERNAL "Don't use grpc protobuf, set it manually.") + + +# protobuf targets have already been included as submodules - adapted from https://github.com/grpc/grpc/blob/master/cmake/protobuf.cmake +set(_gRPC_PROTOBUF_LIBRARY_NAME "libprotobuf") +set(_gRPC_PROTOBUF_LIBRARIES protobuf::${_gRPC_PROTOBUF_LIBRARY_NAME}) + +set(_gRPC_PROTOBUF_PROTOC_LIBRARIES protobuf::libprotoc) +# extract the include dir from target's properties + +set(_gRPC_PROTOBUF_WELLKNOWN_INCLUDE_DIR ${REPO_ROOT}/cmake/external/protobuf/src) +set(_gRPC_PROTOBUF_PROTOC protobuf::protoc) +set(_gRPC_PROTOBUF_PROTOC_EXECUTABLE $) + +set(_gRPC_PROTOBUF_INCLUDE_DIR ${PROTOBUF_INCLUDE_DIRS}) + +if(NOT WIN32) + string(REPLACE "-Werror" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") # Disable werror for included subdirectories - c-ares<1.15 breaks with -Wall + string(REPLACE "-Werror" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") + if(HAS_UNUSED_PARAMETER) # disable warning for unused parameters because (BoringSSL specifically) have unused parameters. + string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-parameter") + string(APPEND CMAKE_C_FLAGS " -Wno-unused-parameter") + endif() +endif() + +add_subdirectory(${PROJECT_SOURCE_DIR}/external/grpc EXCLUDE_FROM_ALL) +if(NOT WIN32) + if(onnxruntime_DEV_MODE) # Reenable Werror for our code subdirectories. + if(NOT onnxruntime_USE_TVM) + string(APPEND CMAKE_CXX_FLAGS " -Werror") + string(APPEND CMAKE_C_FLAGS " -Werror") + endif() + endif() + if(HAS_UNUSED_PARAMETER) # reenable warning for unused parameters for our code. + string(APPEND CMAKE_CXX_FLAGS " -Wunused-parameter") + string(APPEND CMAKE_C_FLAGS " -Wunused-parameter") + endif() +endif() + +set(_GRPC_CPP_PLUGIN_EXECUTABLE $) +set(_GRPC_PY_PLUGIN_EXECUTABLE $) + + # Generate .h and .cc files from protobuf file -add_library(server_proto ${ONNXRUNTIME_ROOT}/server/protobuf/predict.proto) +add_library(server_proto ${ONNXRUNTIME_ROOT}/server/protobuf/predict.proto ${ONNXRUNTIME_ROOT}/server/protobuf/onnx-ml.proto) if(WIN32) target_compile_options(server_proto PRIVATE "/wd4125" "/wd4456") endif() target_include_directories(server_proto PUBLIC $ "${CMAKE_CURRENT_BINARY_DIR}/.." 
${CMAKE_CURRENT_BINARY_DIR}/onnx) target_compile_definitions(server_proto PUBLIC $) onnxruntime_protobuf_generate(APPEND_PATH IMPORT_DIRS ${REPO_ROOT}/cmake/external/protobuf/src ${ONNXRUNTIME_ROOT}/server/protobuf ${ONNXRUNTIME_ROOT}/core/protobuf TARGET server_proto) -add_dependencies(server_proto onnx_proto ${onnxruntime_EXTERNAL_DEPENDENCIES}) +add_dependencies(server_proto ${onnxruntime_EXTERNAL_DEPENDENCIES}) if(NOT WIN32) if(HAS_UNUSED_PARAMETER) - set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/model_metadata.pb.cc PROPERTIES COMPILE_FLAGS -Wno-unused-parameter) - set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/model_status.pb.cc PROPERTIES COMPILE_FLAGS -Wno-unused-parameter) - set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/predict.pb.cc PROPERTIES COMPILE_FLAGS -Wno-unused-parameter) + set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/predict.pb.cc PROPERTIES COMPILE_FLAGS -Wno-unused-parameter) + set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/onnx-ml.pb.cc PROPERTIES COMPILE_FLAGS -Wno-unused-parameter) endif() endif() # Setup dependencies include(get_boost.cmake) set(re2_src ${REPO_ROOT}/cmake/external/re2) +set(SPDLOG_BUILD_EXAMPLES OFF) +add_subdirectory(${REPO_ROOT}/cmake/external/spdlog) + +# Generate GRPC service source and headers. +get_filename_component(grpc_proto "${ONNXRUNTIME_ROOT}/server/protobuf/prediction_service.proto" ABSOLUTE) +get_filename_component(grpc_proto_path "${grpc_proto}" PATH) + +set(grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/prediction_service.grpc.pb.cc") +set(grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/prediction_service.grpc.pb.h") +add_custom_command( + OUTPUT "${grpc_srcs}" "${grpc_hdrs}" + COMMAND $ + ARGS + --cpp_out "${CMAKE_CURRENT_BINARY_DIR}" + --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" + --plugin=protoc-gen-grpc="${_GRPC_CPP_PLUGIN_EXECUTABLE}" + -I ${grpc_proto_path} + "${grpc_proto}" + DEPENDS "${grpc_proto}" ${_GRPC_CPP_PLUGIN_EXECUTABLE} + COMMENT "Running ${_GRPC_CPP_PLUGIN_EXECUTABLE} on ${grpc_proto}" + ) + +add_library(server_grpc_proto ${grpc_srcs}) +target_include_directories(server_grpc_proto PUBLIC $ "${CMAKE_CURRENT_BINARY_DIR}" ${CMAKE_CURRENT_BINARY_DIR}/onnx PRIVATE) +set(grpc_reflection -Wl,--whole-archive grpc++_reflection -Wl,--no-whole-archive) +set(grpc_static_libs grpc++ grpcpp_channelz) +target_link_libraries(server_grpc_proto ${grpc_static_libs}) +add_dependencies(server_grpc_proto server_proto) +# Include generated *.pb.h files +include_directories("${CMAKE_CURRENT_BINARY_DIR}") + +if(NOT WIN32) + if(HAS_UNUSED_PARAMETER) + set_source_files_properties(${grpc_srcs} PROPERTIES COMPILE_FLAGS -Wno-unused-parameter) + set_source_files_properties(${onnxruntime_server_grpc_srcs} PROPERTIES COMPILE_FLAGS -Wno-unused-parameter) + endif() +endif() + # Setup source code set(onnxruntime_server_lib_srcs @@ -33,14 +116,14 @@ set(onnxruntime_server_lib_srcs "${ONNXRUNTIME_ROOT}/server/executor.cc" "${ONNXRUNTIME_ROOT}/server/converter.cc" "${ONNXRUNTIME_ROOT}/server/util.cc" + "${ONNXRUNTIME_ROOT}/server/core/request_id.cc" + "${ONNXRUNTIME_ROOT}/server/grpc/prediction_service_impl.cc" + "${ONNXRUNTIME_ROOT}/server/grpc/grpc_app.cc" + "${ONNXRUNTIME_ROOT}/server/serializing/tensorprotoutils.cc" ) if(NOT WIN32) if(HAS_UNUSED_PARAMETER) - set_source_files_properties(${ONNXRUNTIME_ROOT}/server/http/json_handling.cc PROPERTIES COMPILE_FLAGS -Wno-unused-parameter) - set_source_files_properties(${ONNXRUNTIME_ROOT}/server/http/predict_request_handler.cc PROPERTIES COMPILE_FLAGS -Wno-unused-parameter) - 
set_source_files_properties(${ONNXRUNTIME_ROOT}/server/executor.cc PROPERTIES COMPILE_FLAGS -Wno-unused-parameter) - set_source_files_properties(${ONNXRUNTIME_ROOT}/server/converter.cc PROPERTIES COMPILE_FLAGS -Wno-unused-parameter) - set_source_files_properties(${ONNXRUNTIME_ROOT}/server/util.cc PROPERTIES COMPILE_FLAGS -Wno-unused-parameter) + set_source_files_properties(${onnxruntime_server_lib_srcs} PROPERTIES COMPILE_FLAGS -Wno-unused-parameter) endif() endif() @@ -58,6 +141,7 @@ add_library(onnxruntime_server_http_core_lib STATIC target_include_directories(onnxruntime_server_http_core_lib PUBLIC ${ONNXRUNTIME_ROOT}/server/http/core + ${ONNXRUNTIME_ROOT}/server/core ${Boost_INCLUDE_DIR} ${re2_src} ) @@ -70,30 +154,28 @@ target_link_libraries(onnxruntime_server_http_core_lib PRIVATE add_library(onnxruntime_server_lib ${onnxruntime_server_lib_srcs}) onnxruntime_add_include_to_target(onnxruntime_server_lib gsl onnx_proto server_proto) target_include_directories(onnxruntime_server_lib PRIVATE - ${ONNXRUNTIME_ROOT} - ${CMAKE_CURRENT_BINARY_DIR}/onnx + ${ONNXRUNTIME_INCLUDE_DIR} ${ONNXRUNTIME_ROOT}/server ${ONNXRUNTIME_ROOT}/server/http ${ONNXRUNTIME_ROOT}/server/logging + ${ONNXRUNTIME_ROOT}/server/core PUBLIC + ${ONNXRUNTIME_ROOT}/server ${Boost_INCLUDE_DIR} ${re2_src} ) + target_link_libraries(onnxruntime_server_lib PRIVATE server_proto + server_grpc_proto ${Boost_LIBRARIES} onnxruntime_server_http_core_lib - onnxruntime_session - onnxruntime_optimizer - onnxruntime_providers - onnxruntime_util - onnxruntime_framework - onnxruntime_util - onnxruntime_graph - onnxruntime_common - onnxruntime_mlas - ${onnxruntime_EXTERNAL_LIBRARIES} + PUBLIC + protobuf::libprotobuf + ${onnxruntime_EXTERNAL_DEPENDENCIES} + spdlog::spdlog + onnxruntime ) if (onnxruntime_USE_SYSLOG) @@ -124,12 +206,14 @@ message(STATUS "ONNX Runtime Server latest commit id is: ${onnxruntime_LATEST_CO onnxruntime_add_include_to_target(${SERVER_APP_NAME} onnxruntime_session onnxruntime_server_lib gsl onnx onnx_proto server_proto) target_include_directories(${SERVER_APP_NAME} PRIVATE - ${ONNXRUNTIME_ROOT} + ${ONNXRUNTIME_INCLUDE_DIR} ${ONNXRUNTIME_ROOT}/server/http ) + target_link_libraries(${SERVER_APP_NAME} PRIVATE onnxruntime_server_http_core_lib onnxruntime_server_lib + ${grpc_reflection} #Note that this will break the tests if we try to link it to the lib so just link to the executable. 
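# grpc++_reflection registers the reflection service through static initializers, which is why
# it is linked with --whole-archive; otherwise the linker could drop the unreferenced objects.
# Once the server is running, the registration can be sanity-checked with any reflection-aware
# client, for example (host and port are assumptions, they are not defined in this file):
#   grpcurl -plaintext localhost:50051 list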
) diff --git a/cmake/onnxruntime_session.cmake b/cmake/onnxruntime_session.cmake index ba0d8129ace90..96e9fa55bc53a 100644 --- a/cmake/onnxruntime_session.cmake +++ b/cmake/onnxruntime_session.cmake @@ -15,6 +15,9 @@ onnxruntime_add_include_to_target(onnxruntime_session onnxruntime_common onnxrun target_include_directories(onnxruntime_session PRIVATE ${ONNXRUNTIME_ROOT} ${eigen_INCLUDE_DIRS}) add_dependencies(onnxruntime_session ${onnxruntime_EXTERNAL_DEPENDENCIES}) set_target_properties(onnxruntime_session PROPERTIES FOLDER "ONNXRuntime") +if (onnxruntime_USE_CUDA) + target_include_directories(onnxruntime_session PRIVATE ${onnxruntime_CUDNN_HOME}/include ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) +endif() if (onnxruntime_ENABLE_LANGUAGE_INTEROP_OPS) add_definitions(-DENABLE_LANGUAGE_INTEROP_OPS) diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index ce3c2c905ba27..3223e263a21e1 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -117,6 +117,8 @@ set(onnxruntime_test_providers_src_patterns "${TEST_SRC_DIR}/providers/*.cc" "${TEST_SRC_DIR}/framework/TestAllocatorManager.cc" "${TEST_SRC_DIR}/framework/TestAllocatorManager.h" + "${TEST_SRC_DIR}/framework/test_utils.cc" + "${TEST_SRC_DIR}/framework/test_utils.h" ) if(NOT onnxruntime_DISABLE_CONTRIB_OPS) list(APPEND onnxruntime_test_providers_src_patterns @@ -138,6 +140,13 @@ if (onnxruntime_USE_NGRAPH) list(APPEND onnxruntime_test_providers_src ${onnxruntime_test_providers_ngraph_src}) endif() +if (onnxruntime_USE_NNAPI) + file(GLOB_RECURSE onnxruntime_test_providers_nnapi_src CONFIGURE_DEPENDS + "${TEST_SRC_DIR}/providers/nnapi/*" + ) + list(APPEND onnxruntime_test_providers_src ${onnxruntime_test_providers_nnapi_src}) +endif() + # tests from lowest level library up. 
# the order of libraries should be maintained, with higher libraries being added first in the list @@ -196,6 +205,10 @@ if(onnxruntime_USE_OPENVINO) list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_openvino) endif() +if(onnxruntime_USE_NNAPI) + list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_nnapi) +endif() + file(GLOB_RECURSE onnxruntime_test_tvm_src CONFIGURE_DEPENDS "${ONNXRUNTIME_ROOT}/test/tvm/*.h" "${ONNXRUNTIME_ROOT}/test/tvm/*.cc" @@ -218,6 +231,7 @@ set(ONNXRUNTIME_TEST_LIBS ${PROVIDERS_TENSORRT} ${PROVIDERS_NGRAPH} ${PROVIDERS_OPENVINO} + ${PROVIDERS_NNAPI} onnxruntime_optimizer onnxruntime_providers onnxruntime_util @@ -241,6 +255,13 @@ if(onnxruntime_USE_TENSORRT) list(APPEND onnxruntime_test_providers_libs onnxruntime_providers_tensorrt) endif() +if(onnxruntime_USE_NNAPI) + list(APPEND onnxruntime_test_framework_src_patterns ${TEST_SRC_DIR}/providers/nnapi/*) + list(APPEND onnxruntime_test_framework_libs onnxruntime_providers_nnapi) + list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_nnapi) + list(APPEND onnxruntime_test_providers_libs onnxruntime_providers_nnapi) +endif() + if(WIN32) if (onnxruntime_USE_TVM) list(APPEND disabled_warnings ${DISABLED_WARNINGS_FOR_TVM}) @@ -408,6 +429,12 @@ if(WIN32) $ ) endif() + if (onnxruntime_USE_TVM) + add_custom_command( + TARGET ${test_data_target} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy $ $ + ) + endif() endif() add_library(onnx_test_data_proto ${TEST_SRC_DIR}/proto/tml.proto) @@ -596,15 +623,6 @@ if (onnxruntime_BUILD_SHARED_LIB) protobuf::libprotobuf DEPENDS ${all_dependencies} ) - #demo - message("PNG Lib Dir = ${PNG_LIBRARIES}") - message("PNG Include Dir = ${PNG_INCLUDE_DIRS}") - if(PNG_FOUND AND NOT WIN32) # for some reason some symbols are not found in Win32 PNG module - add_executable(fns_candy_style_transfer "${ONNXRUNTIME_ROOT}/test/shared_lib/fns_candy_style_transfer.c") - target_include_directories(fns_candy_style_transfer PRIVATE "${TEST_SRC_DIR}/util/include" ${PNG_INCLUDE_DIRS}) - target_link_libraries(fns_candy_style_transfer PRIVATE onnxruntime ${PNG_LIBRARIES}) - set_target_properties(fns_candy_style_transfer PROPERTIES FOLDER "ONNXRuntimeTest") - endif() endif() if (onnxruntime_BUILD_SERVER) @@ -621,25 +639,26 @@ if (onnxruntime_BUILD_SERVER) set_source_files_properties("${TEST_SRC_DIR}/server/unit_tests/json_handling_tests.cc" PROPERTIES COMPILE_FLAGS -Wno-unused-parameter) set_source_files_properties("${TEST_SRC_DIR}/server/unit_tests/converter_tests.cc" PROPERTIES COMPILE_FLAGS -Wno-unused-parameter) set_source_files_properties("${TEST_SRC_DIR}/server/unit_tests/util_tests.cc" PROPERTIES COMPILE_FLAGS -Wno-unused-parameter) + set_source_files_properties("${TEST_SRC_DIR}/server/unit_tests/prediction_service_impl_test.cc" PROPERTIES COMPILE_FLAGS -Wno-unused-parameter) set_source_files_properties("${TEST_SRC_DIR}/server/unit_tests/executor_test.cc" PROPERTIES COMPILE_FLAGS -Wno-unused-parameter) endif() endif() add_library(onnxruntime_test_utils_for_server ${onnxruntime_test_server_src}) - onnxruntime_add_include_to_target(onnxruntime_test_utils_for_server onnxruntime_test_utils_for_framework gtest gmock gsl onnx onnx_proto server_proto) + onnxruntime_add_include_to_target(onnxruntime_test_utils_for_server onnxruntime_test_utils_for_framework gtest gmock gsl onnx onnx_proto server_proto server_grpc_proto) add_dependencies(onnxruntime_test_utils_for_server onnxruntime_server_lib onnxruntime_server_http_core_lib Boost 
${onnxruntime_EXTERNAL_DEPENDENCIES}) - target_include_directories(onnxruntime_test_utils_for_server PUBLIC ${Boost_INCLUDE_DIR} ${REPO_ROOT}/cmake/external/re2 ${CMAKE_CURRENT_BINARY_DIR}/onnx ${ONNXRUNTIME_ROOT}/server/http ${ONNXRUNTIME_ROOT}/server/http/core PRIVATE ${ONNXRUNTIME_ROOT} ) + target_include_directories(onnxruntime_test_utils_for_server PUBLIC ${Boost_INCLUDE_DIR} ${REPO_ROOT}/cmake/external/re2 ${CMAKE_CURRENT_BINARY_DIR}/onnx ${ONNXRUNTIME_ROOT}/server ${ONNXRUNTIME_ROOT}/server/http ${ONNXRUNTIME_ROOT}/server/http/core ${ONNXRUNTIME_ROOT}/server/grpc ${ONNXRUNTIME_ROOT}/server ${ONNXRUNTIME_ROOT}/server/core PRIVATE ${ONNXRUNTIME_ROOT} ) if(UNIX) target_compile_options(onnxruntime_test_utils_for_server PRIVATE "$<$:SHELL:-Xcompiler -Wno-error=sign-compare>" "$<$>:-Wno-error=sign-compare>") endif() - target_link_libraries(onnxruntime_test_utils_for_server ${Boost_LIBRARIES}) + target_link_libraries(onnxruntime_test_utils_for_server ${Boost_LIBRARIES} spdlog::spdlog server_grpc_proto) AddTest( TARGET onnxruntime_server_tests SOURCES ${onnxruntime_test_server_src} - LIBS ${onnxruntime_test_server_libs} server_proto onnxruntime_server_lib ${onnxruntime_test_providers_libs} + LIBS ${onnxruntime_test_server_libs} server_proto server_grpc_proto onnxruntime_server_lib ${onnxruntime_test_providers_libs} DEPENDS ${onnxruntime_EXTERNAL_DEPENDENCIES} ) @@ -649,6 +668,20 @@ if (onnxruntime_BUILD_SERVER) LANGUAGE python TARGET onnxruntime_server_tests OUT_VAR server_test_py) + + set(grpc_py "${CMAKE_CURRENT_BINARY_DIR}/prediction_service_pb2_grpc.py") + + add_custom_command( + TARGET onnxruntime_server_tests + COMMAND $ + ARGS + --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" + --plugin=protoc-gen-grpc="${_GRPC_PY_PLUGIN_EXECUTABLE}" + -I ${grpc_proto_path} + "${grpc_proto}" + DEPENDS "${grpc_proto}" + COMMENT "Running ${_GRPC_PY_PLUGIN_EXECUTABLE} on ${grpc_proto}" + ) add_custom_command( TARGET onnxruntime_server_tests POST_BUILD @@ -662,6 +695,9 @@ if (onnxruntime_BUILD_SERVER) COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/predict_pb2.py ${CMAKE_CURRENT_BINARY_DIR}/server_test/ + COMMAND ${CMAKE_COMMAND} -E copy + ${grpc_py} + ${CMAKE_CURRENT_BINARY_DIR}/server_test/ ) endif() diff --git a/cmake/patches/mkldnn/mem-patch.cmake.patch b/cmake/patches/mkldnn/mem-patch.cmake.patch new file mode 100644 index 0000000000000..9d0a2543e6128 --- /dev/null +++ b/cmake/patches/mkldnn/mem-patch.cmake.patch @@ -0,0 +1,107 @@ + +--- + src/cpu/jit_avx2_1x1_convolution.cpp | 6 +++--- + src/cpu/jit_avx512_common_1x1_convolution.cpp | 9 ++++----- + src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp | 6 ++++-- + src/cpu/jit_uni_1x1_conv_utils.hpp | 3 ++- + 4 files changed, 13 insertions(+), 11 deletions(-) + +diff --git a/src/cpu/jit_avx2_1x1_convolution.cpp b/src/cpu/jit_avx2_1x1_convolution.cpp +index 46362886..edb2b6fb 100644 +--- a/src/cpu/jit_avx2_1x1_convolution.cpp ++++ b/src/cpu/jit_avx2_1x1_convolution.cpp +@@ -50,7 +50,7 @@ void jit_avx2_1x1_convolution_fwd_t::execute_forward() const { + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); + + const auto &jcp = kernel_->jcp; +- auto rtus_space = scratchpad().get(key_conv_rtus_space); ++ auto rtus_space = pd()->rtus_.reduce_src_?scratchpad().get(key_conv_rtus_space):NULL; + + const int work_amount = jcp.mb * jcp.ngroups * jcp.nb_bcast; + const int ndims = dst_d.ndims(); +@@ -180,7 +180,7 @@ void jit_avx2_1x1_convolution_bwd_data_t::execute_backward_data() const { + const memory_desc_wrapper diff_src_d(pd()->diff_src_pd()); + + 
const auto &jcp = kernel_->jcp; +- auto rtus_space = scratchpad().get(key_conv_rtus_space); ++ auto rtus_space = pd()->rtus_.reduce_src_?scratchpad().get(key_conv_rtus_space):NULL; + + // TODO (Roma): remove this restriction + assert(jcp.stride_w == 1 && jcp.stride_h == 1); +@@ -306,7 +306,7 @@ void jit_avx2_1x1_convolution_bwd_weights_t::execute_backward_weights() const { + const memory_desc_wrapper diff_bias_d(pd()->diff_weights_pd(1)); + + const auto &jcp = kernel_->jcp; +- auto rtus_space = scratchpad.get(key_conv_rtus_space); ++ auto rtus_space = pd()->rtus_.reduce_src_?scratchpad.get(key_conv_rtus_space):NULL; + + data_t *diff_bias = pd()->wants_padded_bias() + ? scratchpad.get(key_conv_padded_bias) : diff_bias_in; +diff --git a/src/cpu/jit_avx512_common_1x1_convolution.cpp b/src/cpu/jit_avx512_common_1x1_convolution.cpp +index 6879cd91..6a32aa49 100644 +--- a/src/cpu/jit_avx512_common_1x1_convolution.cpp ++++ b/src/cpu/jit_avx512_common_1x1_convolution.cpp +@@ -106,7 +106,7 @@ execute_forward_thr(const int ithr, const int nthr, const src_data_t *src, + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); + + const auto &jcp = kernel_->jcp; +- auto rtus_space = scratchpad.get(key_conv_rtus_space); ++ auto rtus_space = pd()->rtus_.reduce_src_?scratchpad.get(key_conv_rtus_space):NULL; + + const int ndims = src_d.ndims(); + const int stride_h = (ndims == 3) ? 1 : pd()->desc()->strides[0]; +@@ -301,9 +301,8 @@ void jit_avx512_common_1x1_convolution_bwd_data_tdiff_src_pd()); + + const auto &jcp = kernel_->jcp; +- auto rtus_space = scratchpad().template get( +- key_conv_rtus_space); +- ++ auto rtus_space = pd()->rtus_.reduce_src_? scratchpad().template get(key_conv_rtus_space): NULL; ++ + const int ndims = diff_src_d.ndims(); + + // TODO (Roma): remove this restriction +@@ -470,7 +469,7 @@ void jit_avx512_common_1x1_convolution_bwd_weights_t::execute_backward_weights() + + const auto scratchpad = this->scratchpad(); + +- auto rtus_space = scratchpad.get(key_conv_rtus_space); ++ auto rtus_space = pd()->rtus_.reduce_src_?scratchpad.get(key_conv_rtus_space):NULL; + data_t *diff_bias = pd()->wants_padded_bias() + ? scratchpad.get(key_conv_padded_bias) : diff_bias_in; + auto wei_reduction = scratchpad.get(key_conv_wei_reduction); +diff --git a/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp b/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp +index de303cd2..ec0c54e7 100644 +--- a/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp ++++ b/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp +@@ -100,8 +100,10 @@ void jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t + ? 
types::data_type_size(pd()->desc()->bias_desc.data_type) : 0; + + const auto &jcp = kernel_->jcp; +- auto rtus_space = scratchpad.get(key_conv_rtus_space); +- auto local_scales = scratchpad.get(key_conv_adjusted_scales); ++ ++ auto rtus_space = pd()->rtus_.reduce_src_?scratchpad.get(key_conv_rtus_space):NULL; ++ ++ auto local_scales = scratchpad.get(key_conv_adjusted_scales); + + const int work_amount = jcp.mb * jcp.ngroups * jcp.nb_bcast; + +diff --git a/src/cpu/jit_uni_1x1_conv_utils.hpp b/src/cpu/jit_uni_1x1_conv_utils.hpp +index a3ed769a..5a0e0635 100644 +--- a/src/cpu/jit_uni_1x1_conv_utils.hpp ++++ b/src/cpu/jit_uni_1x1_conv_utils.hpp +@@ -94,7 +94,8 @@ inline void rtus_prepare(conv_pd_t *self, const convolution_desc_t *&conv_d, + template + inline void rtus_prepare_space_info(conv_pd_t *self, + memory_tracking::registrar_t &scratchpad) { +- const auto &jcp = self->jcp_; ++ if (!self->rtus_.reduce_src_) return; ++ const auto &jcp = self->jcp_; + + const int max_threads = mkldnn_get_max_threads(); + const size_t factor = utils::pick_by_prop_kind(self->desc()->prop_kind, +-- +2.17.0.windows.1 + diff --git a/cmake/patches/mkldnn/platform.cmake.patch b/cmake/patches/mkldnn/platform.cmake.patch deleted file mode 100644 index 7fe7e836ea777..0000000000000 --- a/cmake/patches/mkldnn/platform.cmake.patch +++ /dev/null @@ -1,14 +0,0 @@ -diff --git a/cmake/platform.cmake b/cmake/platform.cmake -index 3597970a..805ce63e 100644 ---- a/cmake/platform.cmake -+++ b/cmake/platform.cmake -@@ -107,9 +107,6 @@ elseif(UNIX OR MINGW) - append(CMAKE_CCXX_SANITIZER_FLAGS "-g -fno-omit-frame-pointer") - endif() - elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") -- if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) -- set(DEF_ARCH_OPT_FLAGS "-march=native -mtune=native") -- endif() - # suppress warning on assumptions made regarding overflow (#146) - append(CMAKE_CCXX_NOWARN_FLAGS "-Wno-strict-overflow") - elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") diff --git a/cmake/patches/ngraph/ngraph_fix_memory.patch b/cmake/patches/ngraph/ngraph_fix_memory.patch new file mode 100644 index 0000000000000..3a38b1f287d10 --- /dev/null +++ b/cmake/patches/ngraph/ngraph_fix_memory.patch @@ -0,0 +1,125 @@ + cmake/external_mkldnn.cmake | 1 + + cmake/mkldnn_fix_memory.patch | 99 +++++++++++++++++++++++++++++++++++ + 2 files changed, 100 insertions(+) + create mode 100644 cmake/mkldnn_fix_memory.patch + +diff --git a/cmake/external_mkldnn.cmake b/cmake/external_mkldnn.cmake +index 7874aca76..bbae6d1a4 100644 +--- a/cmake/external_mkldnn.cmake ++++ b/cmake/external_mkldnn.cmake +@@ -194,6 +194,7 @@ if (WIN32) + CONFIGURE_COMMAND + PATCH_COMMAND ${MKLDNN_PATCH_REVERT_COMMAND} + COMMAND git apply --ignore-space-change --ignore-whitespace ${CMAKE_SOURCE_DIR}/cmake/${MKLDNN_PATCH_FILE} ++ COMMAND git apply --ignore-space-change --ignore-whitespace ${CMAKE_SOURCE_DIR}/cmake/mkldnn_fix_memory.patch + CMAKE_GENERATOR ${CMAKE_GENERATOR} + CMAKE_GENERATOR_PLATFORM ${CMAKE_GENERATOR_PLATFORM} + CMAKE_GENERATOR_TOOLSET ${CMAKE_GENERATOR_TOOLSET} +diff --git a/cmake/mkldnn_fix_memory.patch b/cmake/mkldnn_fix_memory.patch +new file mode 100644 +index 000000000..ea1a3bd61 +--- /dev/null ++++ b/cmake/mkldnn_fix_memory.patch +@@ -0,0 +1,99 @@ ++ src/cpu/jit_avx2_1x1_convolution.cpp | 6 +++--- ++ src/cpu/jit_avx512_common_1x1_convolution.cpp | 9 +++++---- ++ src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp | 2 +- ++ src/cpu/jit_uni_1x1_conv_utils.hpp | 1 + ++ 4 files changed, 10 insertions(+), 8 deletions(-) ++ ++diff --git 
a/src/cpu/jit_avx2_1x1_convolution.cpp b/src/cpu/jit_avx2_1x1_convolution.cpp ++index 46362886..edb2b6fb 100644 ++--- a/src/cpu/jit_avx2_1x1_convolution.cpp +++++ b/src/cpu/jit_avx2_1x1_convolution.cpp ++@@ -50,7 +50,7 @@ void jit_avx2_1x1_convolution_fwd_t::execute_forward() const { ++ const memory_desc_wrapper weights_d(pd()->weights_pd(0)); ++ ++ const auto &jcp = kernel_->jcp; ++- auto rtus_space = scratchpad().get(key_conv_rtus_space); +++ auto rtus_space = pd()->rtus_.reduce_src_?scratchpad().get(key_conv_rtus_space):NULL; ++ ++ const int work_amount = jcp.mb * jcp.ngroups * jcp.nb_bcast; ++ const int ndims = dst_d.ndims(); ++@@ -180,7 +180,7 @@ void jit_avx2_1x1_convolution_bwd_data_t::execute_backward_data() const { ++ const memory_desc_wrapper diff_src_d(pd()->diff_src_pd()); ++ ++ const auto &jcp = kernel_->jcp; ++- auto rtus_space = scratchpad().get(key_conv_rtus_space); +++ auto rtus_space = pd()->rtus_.reduce_src_?scratchpad().get(key_conv_rtus_space):NULL; ++ ++ // TODO (Roma): remove this restriction ++ assert(jcp.stride_w == 1 && jcp.stride_h == 1); ++@@ -306,7 +306,7 @@ void jit_avx2_1x1_convolution_bwd_weights_t::execute_backward_weights() const { ++ const memory_desc_wrapper diff_bias_d(pd()->diff_weights_pd(1)); ++ ++ const auto &jcp = kernel_->jcp; ++- auto rtus_space = scratchpad.get(key_conv_rtus_space); +++ auto rtus_space = pd()->rtus_.reduce_src_?scratchpad.get(key_conv_rtus_space):NULL; ++ ++ data_t *diff_bias = pd()->wants_padded_bias() ++ ? scratchpad.get(key_conv_padded_bias) : diff_bias_in; ++diff --git a/src/cpu/jit_avx512_common_1x1_convolution.cpp b/src/cpu/jit_avx512_common_1x1_convolution.cpp ++index 6879cd91..47cea4f4 100644 ++--- a/src/cpu/jit_avx512_common_1x1_convolution.cpp +++++ b/src/cpu/jit_avx512_common_1x1_convolution.cpp ++@@ -106,7 +106,7 @@ execute_forward_thr(const int ithr, const int nthr, const src_data_t *src, ++ const memory_desc_wrapper weights_d(pd()->weights_pd(0)); ++ ++ const auto &jcp = kernel_->jcp; ++- auto rtus_space = scratchpad.get(key_conv_rtus_space); +++ auto rtus_space = pd()->rtus_.reduce_src_?scratchpad.get(key_conv_rtus_space):NULL; ++ ++ const int ndims = src_d.ndims(); ++ const int stride_h = (ndims == 3) ? 1 : pd()->desc()->strides[0]; ++@@ -301,8 +301,9 @@ void jit_avx512_common_1x1_convolution_bwd_data_tdiff_src_pd()); ++ ++ const auto &jcp = kernel_->jcp; ++- auto rtus_space = scratchpad().template get( ++- key_conv_rtus_space); +++ auto rtus_space = pd()->rtus_.reduce_src_ +++ ? scratchpad().template get(key_conv_rtus_space) +++ : NULL; ++ ++ const int ndims = diff_src_d.ndims(); ++ ++@@ -470,7 +471,7 @@ void jit_avx512_common_1x1_convolution_bwd_weights_t::execute_backward_weights() ++ ++ const auto scratchpad = this->scratchpad(); ++ ++- auto rtus_space = scratchpad.get(key_conv_rtus_space); +++ auto rtus_space = pd()->rtus_.reduce_src_?scratchpad.get(key_conv_rtus_space):NULL; ++ data_t *diff_bias = pd()->wants_padded_bias() ++ ? scratchpad.get(key_conv_padded_bias) : diff_bias_in; ++ auto wei_reduction = scratchpad.get(key_conv_wei_reduction); ++diff --git a/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp b/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp ++index de303cd2..8129f2b2 100644 ++--- a/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp +++++ b/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp ++@@ -100,7 +100,7 @@ void jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t ++ ? 
types::data_type_size(pd()->desc()->bias_desc.data_type) : 0; ++ ++ const auto &jcp = kernel_->jcp; ++- auto rtus_space = scratchpad.get(key_conv_rtus_space); +++ auto rtus_space = pd()->rtus_.reduce_src_?scratchpad.get(key_conv_rtus_space):NULL; ++ auto local_scales = scratchpad.get(key_conv_adjusted_scales); ++ ++ const int work_amount = jcp.mb * jcp.ngroups * jcp.nb_bcast; ++diff --git a/src/cpu/jit_uni_1x1_conv_utils.hpp b/src/cpu/jit_uni_1x1_conv_utils.hpp ++index a3ed769a..6d76ba56 100644 ++--- a/src/cpu/jit_uni_1x1_conv_utils.hpp +++++ b/src/cpu/jit_uni_1x1_conv_utils.hpp ++@@ -94,6 +94,7 @@ inline void rtus_prepare(conv_pd_t *self, const convolution_desc_t *&conv_d, ++ template ++ inline void rtus_prepare_space_info(conv_pd_t *self, ++ memory_tracking::registrar_t &scratchpad) { +++ if (!self->rtus_.reduce_src_) return; ++ const auto &jcp = self->jcp_; ++ ++ const int max_threads = mkldnn_get_max_threads(); ++-- ++2.20.1.windows.1 ++ +-- +2.20.1.windows.1 + diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/InferenceSession.cs b/csharp/src/Microsoft.ML.OnnxRuntime/InferenceSession.cs index 1738fc4ba2067..5f89bad8bbe9b 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/InferenceSession.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/InferenceSession.cs @@ -303,7 +303,7 @@ internal static NodeMetadata GetMetadataFromTypeInfo(IntPtr typeInfo) OnnxValueType valueType; unsafe { - NativeApiStatus.VerifySuccess(NativeMethods.OrtOnnxTypeFromTypeInfo(typeInfo, new IntPtr(&valueType))); + NativeApiStatus.VerifySuccess(NativeMethods.OrtGetOnnxTypeFromTypeInfo(typeInfo, new IntPtr(&valueType))); } if (valueType != OnnxValueType.ONNX_TYPE_TENSOR && valueType != OnnxValueType.ONNX_TYPE_SPARSETENSOR) { diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj index bffa46e623c56..f7fbdcda281be 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj +++ b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj @@ -46,6 +46,18 @@ CopyToOutputDirectory="Never" Visible="false" /> + + $(MSBuildThisFileDirectory)..\.. Microsoft.ML.OnnxRuntime + C_Api_Sample.cpp @@ -108,7 +109,7 @@ - + diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest-docker-gpu.sh b/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest-docker-gpu.sh index fc0ea19ea57f4..ab537c300f17f 100755 --- a/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest-docker-gpu.sh +++ b/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest-docker-gpu.sh @@ -9,7 +9,9 @@ set -x SOURCE_ROOT=$1 BUILD_DIR=$2 NUGET_REPO_DIRNAME=$3 # path relative to BUILD_DIR -PackageName=${PackageName:-Microsoft.ML.OnnxRuntime} +PackageName=${PACKAGENAME:-Microsoft.ML.OnnxRuntime.Gpu} +RunTestCsharp=${RunTestCsharp:-true} +RunTestNative=${RunTestNative:-true} #CUDA_VER=cuda10.0-cudnn7.3, cuda9.1-cudnn7.1, cuda10.0-cudnn7.3 CUDA_VER=${4:-cuda10.0-cudnn7.3} @@ -21,7 +23,7 @@ cd $SOURCE_ROOT/tools/ci_build/github/linux/docker DOCKER_FILE=Dockerfile.ubuntu_gpu_cuda9 if [ $CUDA_VER = "cuda10.0-cudnn7.3" ]; then -DOCKER_FILE=Dockerfile.ubuntu_gpu_cuda +DOCKER_FILE=Dockerfile.ubuntu_gpu fi docker build -t "onnxruntime-$IMAGE" --build-arg OS_VERSION=16.04 --build-arg PYTHON_VERSION=${PYTHON_VER} -f $DOCKER_FILE . 
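# For reference, this script takes the positional arguments documented above (source root,
# build directory, nuget directory relative to the build directory, optional CUDA/cuDNN tag);
# a rough invocation, with placeholder paths and folder name, looks like:
#   ./runtest-docker-gpu.sh /path/to/onnxruntime /path/to/build nuget-artifacts cuda10.0-cudnn7.3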
@@ -37,10 +39,12 @@ docker run -h $HOSTNAME \ --volume "$BUILD_DIR:/home/onnxruntimedev" \ --volume "$HOME/.cache/onnxruntime:/home/onnxruntimedev/.cache/onnxruntime" \ -e "OnnxRuntimeBuildDirectory=/home/onnxruntimedev" \ - -e "IsReleaseBuild=$IsReleaseBuild" \ + -e "IsReleaseBuild=$ISRELEASEBUILD" \ -e "PackageName=$PackageName" \ + -e "RunTestCsharp=$RunTestCsharp" \ + -e "RunTestNative=$RunTestNative" \ "onnxruntime-$IMAGE" \ - /bin/bash /onnxruntime_src/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest-gpu.sh \ + /bin/bash /onnxruntime_src/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest.sh \ /home/onnxruntimedev/$NUGET_REPO_DIRNAME /onnxruntime_src /home/onnxruntimedev & wait -n diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest-docker.sh b/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest-docker.sh index dd6bd136cdac9..a17f5b5db5f5b 100755 --- a/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest-docker.sh +++ b/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest-docker.sh @@ -10,7 +10,7 @@ SOURCE_ROOT=$1 BUILD_DIR=$2 NUGET_REPO_DIRNAME=$3 # path relative to BUILD_DIR Arch=${4:-x64} # x32, x64 -PackageName=${PackageName:-Microsoft.ML.OnnxRuntime} +PackageName=${PACKAGENAME:-Microsoft.ML.OnnxRuntime} RunTestCsharp=${RunTestCsharp:-true} RunTestNative=${RunTestNative:-true} PYTHON_VER=3.5 @@ -36,7 +36,7 @@ docker run -h $HOSTNAME \ --volume "$BUILD_DIR:/home/onnxruntimedev" \ --volume "$HOME/.cache/onnxruntime:/home/onnxruntimedev/.cache/onnxruntime" \ -e "OnnxRuntimeBuildDirectory=/home/onnxruntimedev" \ - -e "IsReleaseBuild=$IsReleaseBuild" \ + -e "IsReleaseBuild=$ISRELEASEBUILD" \ -e "PackageName=$PackageName" \ -e "DisableContribOps=$DISABLECONTRIBOPS" \ -e "RunTestCsharp=$RunTestCsharp" \ diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests/InferenceTest.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests/InferenceTest.cs index 31c0ee67433fa..88bf5f83d4c8f 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests/InferenceTest.cs +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests/InferenceTest.cs @@ -62,7 +62,7 @@ private void CanRunInferenceOnAModel(uint graphOptimizationLevel, bool disableSe // Set the graph optimization level for this session. SessionOptions options = new SessionOptions(); options.SetSessionGraphOptimizationLevel(graphOptimizationLevel); - if(disableSequentialExecution) options.DisableSequentialExecution(); + if (disableSequentialExecution) options.DisableSequentialExecution(); using (var session = new InferenceSession(modelPath, options)) { @@ -141,25 +141,6 @@ private void ThrowWrongInputType() session.Dispose(); } - [Fact] - private void ThrowWrongDimensions() - { - var tuple = OpenSessionSqueezeNet(); - var session = tuple.Item1; - var inputMeta = session.InputMetadata; - var container = new List(); - var inputData = new float[] { 0.1f, 0.2f, 0.3f }; - var tensor = new DenseTensor(inputData, new int[] { 1, 3 }); - container.Add(NamedOnnxValue.CreateFromTensor("data_0", tensor)); - var ex = Assert.Throws(() => session.Run(container)); - Assert.True( - !string.IsNullOrEmpty(ex.Message) && - ex.Message.StartsWith("[ErrorCode:Fail]") && - ex.Message.Contains("X num_dims does not match W num_dims. 
X: {1,3} W: {64,3,3,3}") - ); - session.Dispose(); - } - [Fact] private void ThrowExtraInputs() { @@ -220,7 +201,8 @@ private void TestPreTrainedModelsOpset7And8() var disableContribOpsEnvVar = Environment.GetEnvironmentVariable("DisableContribOps"); var isContribOpsDisabled = (disableContribOpsEnvVar != null) ? disableContribOpsEnvVar.Equals("ON") : false; - if (isContribOpsDisabled) { + if (isContribOpsDisabled) + { skipModels.Add("test_tiny_yolov2"); } @@ -661,7 +643,7 @@ private void TestGpu() { var gpu = Environment.GetEnvironmentVariable("TESTONGPU"); var tuple = OpenSessionSqueezeNet(0); // run on deviceID 0 - float[] expectedOutput = LoadTensorFromFile(@"bench.expected_out"); + float[] expectedOutput = LoadTensorFromFile(@"bench.expected_out"); using (var session = tuple.Item1) { @@ -671,7 +653,7 @@ private void TestGpu() var container = new List(); container.Add(NamedOnnxValue.CreateFromTensor("data_0", tensor)); var res = session.Run(container); - var resultArray = res.First().AsTensor().ToArray(); + var resultArray = res.First().AsTensor().ToArray(); Assert.Equal(expectedOutput, resultArray, new floatComparer()); } } @@ -782,8 +764,8 @@ private class GpuFact : FactAttribute { public GpuFact() { - var testOnGpu = System.Environment.GetEnvironmentVariable("TESTONGPU"); - if (testOnGpu == null || !testOnGpu.Equals("ON") ) + var testOnGpu = System.Environment.GetEnvironmentVariable("TESTONGPU"); + if (testOnGpu == null || !testOnGpu.Equals("ON")) { Skip = "GPU testing not enabled"; } diff --git a/csharp/testdata/test_types_DOUBLE.pb b/csharp/testdata/test_types_DOUBLE.pb index 8a98310868091..65ebf0f848a35 100644 Binary files a/csharp/testdata/test_types_DOUBLE.pb and b/csharp/testdata/test_types_DOUBLE.pb differ diff --git a/csharp/testdata/test_types_FLOAT.pb b/csharp/testdata/test_types_FLOAT.pb index ce213fb7ec818..b4ad9807834ed 100644 Binary files a/csharp/testdata/test_types_FLOAT.pb and b/csharp/testdata/test_types_FLOAT.pb differ diff --git a/csharp/testdata/test_types_INT16.pb b/csharp/testdata/test_types_INT16.pb index 911edf549863d..f297ec9c54940 100644 Binary files a/csharp/testdata/test_types_INT16.pb and b/csharp/testdata/test_types_INT16.pb differ diff --git a/csharp/testdata/test_types_INT32.pb b/csharp/testdata/test_types_INT32.pb index 5f59b553379ea..73bb539cf44c6 100644 Binary files a/csharp/testdata/test_types_INT32.pb and b/csharp/testdata/test_types_INT32.pb differ diff --git a/csharp/testdata/test_types_INT64.pb b/csharp/testdata/test_types_INT64.pb index 984cc51001445..ccf8df0033278 100644 Binary files a/csharp/testdata/test_types_INT64.pb and b/csharp/testdata/test_types_INT64.pb differ diff --git a/csharp/testdata/test_types_UINT16.pb b/csharp/testdata/test_types_UINT16.pb index f6fcc00abb362..0a9c6fe3770ce 100644 Binary files a/csharp/testdata/test_types_UINT16.pb and b/csharp/testdata/test_types_UINT16.pb differ diff --git a/csharp/testdata/test_types_UINT32.pb b/csharp/testdata/test_types_UINT32.pb index ca0acc80aab0b..90efef3e7f171 100644 Binary files a/csharp/testdata/test_types_UINT32.pb and b/csharp/testdata/test_types_UINT32.pb differ diff --git a/csharp/testdata/test_types_UINT64.pb b/csharp/testdata/test_types_UINT64.pb index a08e3fd689a26..53214a1a2e0e6 100644 Binary files a/csharp/testdata/test_types_UINT64.pb and b/csharp/testdata/test_types_UINT64.pb differ diff --git a/csharp/testdata/test_types_UINT8.pb b/csharp/testdata/test_types_UINT8.pb index 78c7746e58479..8b6a9c42197ef 100644 Binary files a/csharp/testdata/test_types_UINT8.pb and 
b/csharp/testdata/test_types_UINT8.pb differ diff --git a/dockerfiles/Dockerfile.cuda b/dockerfiles/Dockerfile.cuda new file mode 100644 index 0000000000000..0a537b774873a --- /dev/null +++ b/dockerfiles/Dockerfile.cuda @@ -0,0 +1,28 @@ +# -------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------- +# Dockerfile to run ONNXRuntime with CUDA, CUDNN integration + +# nVidia cuda 10.0 Base Image +FROM nvidia/cuda:10.0-cudnn7-devel +MAINTAINER Vinitra Swamy "viswamy@microsoft.com" + +ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime +ARG ONNXRUNTIME_SERVER_BRANCH=master + +RUN apt-get update &&\ + apt-get install -y sudo git bash + +WORKDIR /code +ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/code/cmake-3.14.3-Linux-x86_64/bin:/opt/miniconda/bin:${PATH} + +# Prepare onnxruntime repository & build onnxruntime with TensorRT +RUN git clone --single-branch --branch ${ONNXRUNTIME_SERVER_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime &&\ + /bin/sh onnxruntime/dockerfiles/install_common_deps.sh &&\ + cp onnxruntime/dockerfiles/LICENSE-IMAGE.txt /code/LICENSE-IMAGE.txt &&\ + cd onnxruntime &&\ + /bin/sh ./build.sh --cuda_home /usr/local/cuda --cudnn_home /usr/lib/x86_64-linux-gnu/ --use_cuda --config Release --build_wheel --update --build --cmake_extra_defines ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) &&\ + pip install /code/onnxruntime/build/Linux/Release/dist/*.whl &&\ + cd .. &&\ + rm -rf onnxruntime cmake-3.14.3-Linux-x86_64.tar.gz cmake-3.14.3-Linux-x86_64 diff --git a/dockerfiles/Dockerfile.openvino b/dockerfiles/Dockerfile.openvino index 026879852d880..a829e1651de60 100644 --- a/dockerfiles/Dockerfile.openvino +++ b/dockerfiles/Dockerfile.openvino @@ -3,35 +3,64 @@ # Licensed under the MIT License. #-------------------------------------------------------------------------- -ARG OS_VERSION=16.04 -FROM ubuntu:${OS_VERSION} +FROM ubuntu:16.04 -ARG PYTHON_VERSION=3.5 -ARG OPENVINO_VERSION=2018_R5 -ARG TARGET_DEVICE=CPU_FP32 +RUN apt update && \ + apt -y install python3.5 python3-pip zip x11-apps lsb-core wget cpio sudo libboost-python-dev libpng-dev zlib1g-dev git libnuma1 ocl-icd-libopencl1 clinfo libboost-filesystem1.58.0 libboost-thread1.58.0 protobuf-compiler libprotoc-dev libusb-1.0-0-dev && pip3 install numpy networkx opencv-python pytest && locale-gen en_US.UTF-8 && update-locale LANG=en_US.UTF-8 -ENV DEBIAN_FRONTEND noninteractive +ARG DEVICE=CPU_FP32 +ARG ONNXRUNTIME_REPO=https://github.com/microsoft/onnxruntime +ARG ONNXRUNTIME_BRANCH=master -RUN apt-get update && \ - apt-get install -y sudo git bash -ENV PATH="/opt/cmake/bin:${PATH}" -RUN git clone --branch preview-v0.7 --recursive https://github.com/intel/onnxruntime onnxruntime -RUN /onnxruntime/tools/ci_build/github/linux/docker/scripts/install_ubuntu.sh -p ${PYTHON_VERSION} && \ - /onnxruntime/tools/ci_build/github/linux/docker/scripts/install_deps.sh +ENV pattern="COMPONENTS=DEFAULTS" +ENV replacement="COMPONENTS=intel-openvino-ie-sdk-ubuntu-xenial__x86_64;intel-openvino-ie-rt-cpu-ubuntu-xenial__x86_64;intel-openvino-ie-rt-gpu-ubuntu-xenial__x86_64;intel-openvino-ie-rt-vpu-ubuntu-xenial__x86_64;intel-openvino-ie-rt-hddl-ubuntu-xenial__x86_64;intel-openvino-model-optimizer__x86_64;intel-openvino-opencv-lib-ubuntu-xenial__x86_64" +COPY l_openvino_*.tgz . 
+RUN tar -xzf l_openvino_toolkit*.tgz && \ + rm -rf l_openvino_toolkit*.tgz && \ + cd l_openvino_toolkit* && \ + sed -i "s/$pattern/$replacement/" silent.cfg && \ + sed -i 's/decline/accept/g' silent.cfg && \ + ./install.sh -s silent.cfg && \ + ./install_openvino_dependencies.sh && \ + cd - && \ + rm -rf l_openvino_toolkit* && \ + cd /opt/intel/openvino/deployment_tools/model_optimizer/install_prerequisites && ./install_prerequisites_onnx.sh -RUN /onnxruntime/tools/ci_build/github/linux/docker/scripts/install_openvino.sh -o ${OPENVINO_VERSION} +ENV LD_LIBRARY_PATH=/usr/lib:/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH +ENV INTEL_OPENVINO_DIR=/opt/intel/openvino_2019.1.144 +ENV INTEL_CVSDK_DIR=/opt/intel/openvino_2019.1.144 +ENV LD_LIBRARY_PATH=${INTEL_OPENVINO_DIR}/deployment_tools/model_optimizer/model_optimizer_caffe/bin:${LD_LIBRARY_PATH} +ENV ModelOptimizer_ROOT_DIR=${INTEL_OPENVINO_DIR}/deployment_tools/model_optimizer/model_optimizer_caffe +ENV InferenceEngine_DIR=${INTEL_CVSDK_DIR}/deployment_tools/inference_engine/share +ENV IE_PLUGINS_PATH=${INTEL_CVSDK_DIR}/deployment_tools/inference_engine/lib/intel64 +ENV LD_LIBRARY_PATH=/opt/intel/opencl:${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/external/cldnn/lib:${INTEL_OPENVINO_DIR}/inference_engine/external/gna/lib:${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/external/mkltiny_lnx/lib:${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/external/omp/lib:${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/external/tbb/lib:${IE_PLUGINS_PATH}:${LD_LIBRARY_PATH} +ENV OpenCV_DIR=${INTEL_OPENVINO_DIR}/opencv/share/OpenCV +ENV LD_LIBRARY_PATH=${INTEL_OPENVINO_DIR}/opencv/lib:${INTEL_OPENVINO_DIR}/opencv/share/OpenCV/3rdparty/lib:${LD_LIBRARY_PATH} +ENV PATH=${INTEL_CVSDK_DIR}/deployment_tools/model_optimizer:$PATH +ENV PYTHONPATH=${INTEL_CVSDK_DIR}/deployment_tools/model_optimizer:$PYTHONPATH +ENV PYTHONPATH=$INTEL_CVSDK_DIR/python/python3.5:${INTEL_CVSDK_DIR}/python/python3.5/ubuntu16:${PYTHONPATH} +ENV HDDL_INSTALL_DIR=${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/external/hddl +ENV LD_LIBRARY_PATH=${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/external/hddl/lib:$LD_LIBRARY_PATH -WORKDIR / +RUN wget https://github.com/intel/compute-runtime/releases/download/19.15.12831/intel-gmmlib_19.1.1_amd64.deb +RUN wget https://github.com/intel/compute-runtime/releases/download/19.15.12831/intel-igc-core_1.0.2-1787_amd64.deb +RUN wget https://github.com/intel/compute-runtime/releases/download/19.15.12831/intel-igc-opencl_1.0.2-1787_amd64.deb +RUN wget https://github.com/intel/compute-runtime/releases/download/19.15.12831/intel-opencl_19.15.12831_amd64.deb +RUN wget https://github.com/intel/compute-runtime/releases/download/19.15.12831/intel-ocloc_19.15.12831_amd64.deb -ENV INTEL_CVSDK_DIR /data/dldt +RUN sudo dpkg -i *.deb && rm -rf *.deb -ENV LD_LIBRARY_PATH $INTEL_CVSDK_DIR/deployment_tools/inference_engine/lib/ubuntu_16.04/intel64:$INTEL_CVSDK_DIR/deployment_tools/inference_engine/temp/omp/lib:/usr/local/openblas/lib:$LD_LIBRARY_PATH -ENV PATH $INTEL_CVSDK_DIR/deployment_tools/model_optimizer:$PATH -ENV PYTHONPATH $INTEL_CVSDK_DIR/deployment_tools/model_optimizer:$INTEL_CVSDK_DIR/tools:$PYTHONPATH +RUN mkdir -p /opt/cmake/bin + +ENV PATH /opt/cmake/bin:$PATH +ENV LANG en_US.UTF-8 +RUN wget https://github.com/Kitware/CMake/releases/download/v3.13.2/cmake-3.13.2-Linux-x86_64.tar.gz && \ + tar -xf cmake-3.13.2-Linux-x86_64.tar.gz --strip 1 -C /opt/cmake && rm -rf /cmake-3.13.2-Linux-x86_64.tar.gz + +RUN git 
clone --recursive -b $ONNXRUNTIME_BRANCH $ONNXRUNTIME_REPO /onnxruntime && \ + cd /onnxruntime/cmake/external/onnx && python3 setup.py install && \ + cd /onnxruntime && ./build.sh --config RelWithDebInfo --update --build --parallel --use_openvino $DEVICE --build_wheel && pip3 install /onnxruntime/build/Linux/RelWithDebInfo/dist/*-linux_x86_64.whl && rm -rf /onnxruntime + -RUN mkdir -p /onnxruntime/build && \ - python3 /onnxruntime/tools/ci_build/build.py --build_dir /onnxruntime/build --config Release --build_shared_lib --skip_submodule_sync --build_wheel --parallel --use_openvino ${TARGET_DEVICE} && \ - pip3 install /onnxruntime/build/Release/dist/onnxruntime-*linux_x86_64.whl && \ - rm -rf /onnxruntime diff --git a/dockerfiles/Dockerfile.server b/dockerfiles/Dockerfile.server index bc354ad12b15e..3eebdf6db036b 100644 --- a/dockerfiles/Dockerfile.server +++ b/dockerfiles/Dockerfile.server @@ -21,7 +21,10 @@ RUN apt-get update && \ ENV PATH="/opt/cmake/bin:${PATH}" RUN git clone --single-branch --branch ${ONNXRUNTIME_SERVER_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime RUN /onnxruntime/tools/ci_build/github/linux/docker/scripts/install_ubuntu.sh -p ${PYTHON_VERSION} && \ - /onnxruntime/tools/ci_build/github/linux/docker/scripts/install_deps.sh + /onnxruntime/tools/ci_build/github/linux/docker/scripts/install_deps.sh && \ + /onnxruntime/tools/ci_build/github/linux/docker/scripts/install_server_deps.sh + +ENV PATH="/usr/local/go/bin:${PATH}" WORKDIR / RUN mkdir -p /onnxruntime/build && \ @@ -31,6 +34,7 @@ FROM minimal AS final WORKDIR /onnxruntime/server/ ENV MODEL_ABSOLUTE_PATH /onnxruntime/model/model.onnx COPY --from=build /onnxruntime/build/Release/onnxruntime_server /onnxruntime/server/ +COPY --from=build /onnxruntime/build/Release/libonnxruntime.so.* /lib/ RUN apt-get update \ && apt-get install -y libgomp1 ENTRYPOINT /onnxruntime/server/onnxruntime_server --model_path $MODEL_ABSOLUTE_PATH diff --git a/dockerfiles/Dockerfile.source b/dockerfiles/Dockerfile.source new file mode 100644 index 0000000000000..1a0f0921136fb --- /dev/null +++ b/dockerfiles/Dockerfile.source @@ -0,0 +1,27 @@ +# -------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------- +# Dockerfile to run ONNXRuntime with source build for CPU + +# Ubuntu 16.04 Base Image +FROM ubuntu:16.04 +MAINTAINER Vinitra Swamy "viswamy@microsoft.com" + +ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime +ARG ONNXRUNTIME_SERVER_BRANCH=master + +RUN apt-get update &&\ + apt-get install -y sudo git bash + +WORKDIR /code +ENV PATH /opt/miniconda/bin:/code/cmake-3.14.3-Linux-x86_64/bin:${PATH} + +# Prepare onnxruntime repository & build onnxruntime +RUN git clone --single-branch --branch ${ONNXRUNTIME_SERVER_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime &&\ + /bin/sh onnxruntime/dockerfiles/install_common_deps.sh &&\ + cd onnxruntime &&\ + /bin/sh ./build.sh --config Release --build_wheel --update --build --cmake_extra_defines ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) &&\ + pip install /code/onnxruntime/build/Linux/Release/dist/*.whl &&\ + cd .. 
&&\ + rm -rf onnxruntime cmake-3.14.3-Linux-x86_64.tar.gz cmake-3.14.3-Linux-x86_64 diff --git a/dockerfiles/Dockerfile.tensorrt b/dockerfiles/Dockerfile.tensorrt new file mode 100644 index 0000000000000..6f3df1fbbba81 --- /dev/null +++ b/dockerfiles/Dockerfile.tensorrt @@ -0,0 +1,28 @@ +# -------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------- +# Dockerfile to run ONNXRuntime with TensorRT integration + +# nVidia TensorRT Base Image +FROM nvcr.io/nvidia/tensorrt:19.02-py3 +MAINTAINER Vinitra Swamy "viswamy@microsoft.com" + +ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime +ARG ONNXRUNTIME_SERVER_BRANCH=master + +RUN apt-get update &&\ + apt-get install -y sudo git bash + +WORKDIR /code +ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/code/cmake-3.14.3-Linux-x86_64/bin:/opt/miniconda/bin:${PATH} + +# Prepare onnxruntime repository & build onnxruntime with TensorRT +RUN git clone --single-branch --branch ${ONNXRUNTIME_SERVER_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime &&\ + /bin/sh onnxruntime/dockerfiles/install_common_deps.sh &&\ + cp onnxruntime/dockerfiles/LICENSE-IMAGE.txt /code/LICENSE-IMAGE.txt &&\ + cd onnxruntime &&\ + /bin/sh ./build.sh --cuda_home /usr/local/cuda --cudnn_home /usr/lib/x86_64-linux-gnu/ --use_tensorrt --tensorrt_home /workspace/tensorrt --config Release --build_wheel --update --build --cmake_extra_defines ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) &&\ + pip install /code/onnxruntime/build/Linux/Release/dist/*.whl &&\ + cd .. &&\ + rm -rf onnxruntime cmake-3.14.3-Linux-x86_64.tar.gz cmake-3.14.3-Linux-x86_64 diff --git a/dockerfiles/LICENSE-IMAGE.txt b/dockerfiles/LICENSE-IMAGE.txt new file mode 100644 index 0000000000000..b26cc039868e5 --- /dev/null +++ b/dockerfiles/LICENSE-IMAGE.txt @@ -0,0 +1,15 @@ +This image is made available to you on the condition that you agree to +[your agreement][1] governing your use of Azure. +If you do not have an existing agreement governing your use of Azure, you agree that +your agreement governing use of Azure is the [Microsoft Online Subscription Agreement][2] +(which incorporates the [Online Services Terms][3]). +By using the software you agree to these terms. This software may collect data +that is transmitted to Microsoft. Please see the [Microsoft Privacy Statement][4] +to learn more about how Microsoft processes personal data. + +This image must be used on Microsoft Azure Services only. + +[1]: https://azure.microsoft.com/en-us/support/legal/ +[2]: https://azure.microsoft.com/en-us/support/legal/subscription-agreement/ +[3]: http://www.microsoftvolumelicensing.com/DocumentSearch.aspx?Mode=3&DocumentTypeId=46 +[4]: http://go.microsoft.com/fwlink/?LinkId=248681 diff --git a/dockerfiles/README.md b/dockerfiles/README.md index 96aeb0f0c280a..f395acc2ef6b5 100644 --- a/dockerfiles/README.md +++ b/dockerfiles/README.md @@ -1,13 +1,21 @@ -# Quick-start Docker containers for ONNX Runtime +# Docker containers for ONNX Runtime -## nGraph Version (Preview) -#### Linux 16.04, Python Bindings +- [Arm 32v7](Dockerfile.arm32v7) +- [Build from source (CPU)](Dockerfile.source) +- [CUDA + CUDNN](Dockerfile.cuda) +- [nGraph](Dockerfile.ngraph) +- [TensorRT](Dockerfile.tensorrt) +- [OpenVINO](Dockerfile.openvino) +- [ONNX Runtime Server](Dockerfile.server) + +## Build from Source Version (Preview) +#### Linux 16.04, CPU, Python Bindings 1. 
Build the docker image from the Dockerfile in this repository. ``` # If you have a Linux machine, preface this command with "sudo" - docker build -t onnxruntime-ngraph -f Dockerfile.ngraph . + docker build -t onnxruntime-source -f Dockerfile.source . ``` 2. Run the Docker image @@ -15,41 +23,53 @@ ``` # If you have a Linux machine, preface this command with "sudo" - docker run -it onnxruntime-ngraph + docker run -it onnxruntime-source ``` -## ONNX Runtime Server (Preview) -#### Linux 16.04 +## CUDA Version (Preview) +#### Linux 16.04, CUDA 10.0, CuDNN 7 -1. Build the docker image from the Dockerfile in this repository - ``` - docker build -t {docker_image_name} -f Dockerfile.server . +1. Build the docker image from the Dockerfile in this repository. ``` - -2. Run the ONNXRuntime server with the image created in step 1 + # If you have a Linux machine, preface this command with "sudo" + docker build -t onnxruntime-cuda -f Dockerfile.cuda . ``` - docker run -v {localModelAbsoluteFolder}:{dockerModelAbsoluteFolder} -e MODEL_ABSOLUTE_PATH={dockerModelAbsolutePath} -p {your_local_port}:8001 {imageName} - ``` -3. Send HTTP requests to the container running ONNX Runtime Server - Send HTTP requests to the docker container through the binding local port. Here is the full [usage document](https://github.com/Microsoft/onnxruntime/blob/master/docs/ONNX_Runtime_Server_Usage.md). +2. Run the Docker image + ``` - curl -X POST -d "@request.json" -H "Content-Type: application/json" http://0.0.0.0:{your_local_port}/v1/models/mymodel/versions/3:predict + # If you have a Linux machine, preface this command with "sudo" + + docker run -it onnxruntime-cuda ``` -## OpenVINO Version (Preview) +## nGraph Version (Preview) #### Linux 16.04, Python Bindings 1. Build the docker image from the Dockerfile in this repository. ``` # If you have a Linux machine, preface this command with "sudo" - docker build -t onnxruntime-openvino -f Dockerfile.openvino . + docker build -t onnxruntime-ngraph -f Dockerfile.ngraph . ``` - To use GPU_FP32: + +2. Run the Docker image + ``` - docker build -t onnxruntime-openvino --build-arg TARGET_DEVICE=GPU_FP32 -f Dockerfile.openvino . + # If you have a Linux machine, preface this command with "sudo" + + docker run -it onnxruntime-ngraph + ``` + +## TensorRT Version (Preview) +#### Linux 16.04, TensorRT 5.0.2 + +1. Build the docker image from the Dockerfile in this repository. + ``` + # If you have a Linux machine, preface this command with "sudo" + + docker build -t onnxruntime-trt -f Dockerfile.tensorrt . ``` 2. Run the Docker image @@ -57,5 +77,118 @@ ``` # If you have a Linux machine, preface this command with "sudo" - docker run -it onnxruntime-openvino - ``` \ No newline at end of file + docker run -it onnxruntime-trt + ``` + +## OpenVINO Version (Preview) +#### Linux 16.04, Python Bindings + +1. Build the onnxruntime image for all the accelerators supported as below + + Retrieve your docker image in one of the following ways. + + - For building the docker image, download OpenVINO online installer version 2019 R1.1 from [here](https://software.intel.com/en-us/openvino-toolkit/choose-download) and copy the openvino tar file in the same directory and build the image. The online installer size is only 16MB and the components needed for the accelerators are mentioned in the dockerfile. Providing the argument device enables onnxruntime for that particular device. You can also provide arguments ONNXRUNTIME_REPO and ONNXRUNTIME_BRANCH to test that particular repo and branch. 
Default values are http://github.com/microsoft/onnxruntime and repo is master + ``` + docker build -t onnxruntime --build-arg DEVICE=$DEVICE . + ``` + - Pull the official image from DockerHub. + + +2. DEVICE: Specifies the hardware target for building OpenVINO Execution Provider. Below are the options for different Intel target devices. + + | Device Option | Target Device | + | --------- | -------- | + | CPU_FP32 | Intel CPUs | + | GPU_FP32 |Intel Integrated Graphics | + | GPU_FP16 | Intel Integrated Graphics | + | MYRIAD_FP16 | Intel MovidiusTM USB sticks | + | VAD-M_FP16 | Intel Vision Accelerator Design based on MovidiusTM MyriadX VPUs | + +## CPU Version + +1. Retrieve your docker image in one of the following ways. + + - Build the docker image from the DockerFile in this repository. + + ``` + docker build -t onnxruntime-cpu --build-arg DEVICE=CPU_FP32 --network host . + ``` + - Pull the official image from DockerHub. + ``` + # Will be available with next release + ``` +2. Run the docker image + ``` + docker run -it onnxruntime-cpu + ``` + +## GPU Version + +1. Retrieve your docker image in one of the following ways. + - Build the docker image from the DockerFile in this repository. + ``` + docker build -t onnxruntime-gpu --build-arg DEVICE=GPU_FP32 --network host . + ``` + - Pull the official image from DockerHub. + ``` + # Will be available with next release + ``` + +2. Run the docker image + ``` + docker run -it --device /dev/dri:/dev/dri onnxruntime-gpu:latest + ``` +## Myriad VPU Accelerator Version + +1. Retrieve your docker image in one of the following ways. + - Build the docker image from the DockerFile in this repository. + ``` + docker build -t onnxruntime-myriad --build-arg DEVICE=MYRIAD_FP16 --network host . + ``` + - Pull the official image from DockerHub. + ``` + # Will be available with next release + ``` +2. Install the Myriad rules drivers on the host machine according to the reference in [here](https://docs.openvinotoolkit.org/latest/_docs_install_guides_installing_openvino_linux.html#additional-NCS-steps) +3. Run the docker image by mounting the device drivers + ``` + docker run -it --network host --privileged -v /dev:/dev onnxruntime-myriad:latest + + ``` +## VAD-M Accelerator Version + +1. Retrieve your docker image in one of the following ways. + - Build the docker image from the DockerFile in this repository. + ``` + docker build -t onnxruntime-vadr --build-arg DEVICE=VAD-M_FP16 --network host . + ``` + - Pull the official image from DockerHub. + ``` + # Will be available with next release + ``` +2. Install the HDDL drivers on the host machine according to the reference in [here](https://docs.openvinotoolkit.org/latest/_docs_install_guides_installing_openvino_linux_ivad_vpu.html) +3. Run the docker image by mounting the device drivers + ``` + docker run -it --device --mount type=bind,source=/var/tmp,destination=/var/tmp --device /dev/ion:/dev/ion onnxruntime-hddl:latest + + ``` +## ONNX Runtime Server (Preview) +#### Linux 16.04 + +1. Build the docker image from the Dockerfile in this repository + ``` + docker build -t {docker_image_name} -f Dockerfile.server . + ``` + +2. Run the ONNXRuntime server with the image created in step 1 + + ``` + docker run -v {localModelAbsoluteFolder}:{dockerModelAbsoluteFolder} -e MODEL_ABSOLUTE_PATH={dockerModelAbsolutePath} -p {your_local_port}:8001 {imageName} + ``` +3. Send HTTP requests to the container running ONNX Runtime Server + + Send HTTP requests to the docker container through the binding local port. 
Here is the full [usage document](https://github.com/Microsoft/onnxruntime/blob/master/docs/ONNX_Runtime_Server_Usage.md). + ``` + curl -X POST -d "@request.json" -H "Content-Type: application/json" http://0.0.0.0:{your_local_port}/v1/models/mymodel/versions/3:predict + ``` + diff --git a/dockerfiles/install_common_deps.sh b/dockerfiles/install_common_deps.sh new file mode 100644 index 0000000000000..dab394cb33fe7 --- /dev/null +++ b/dockerfiles/install_common_deps.sh @@ -0,0 +1,25 @@ +#!/bin/bash +DEBIAN_FRONTEND=noninteractive +apt-get install -y --no-install-recommends \ + wget \ + zip \ + ca-certificates \ + build-essential \ + curl \ + libcurl4-openssl-dev \ + libssl-dev \ + python3-dev + +# Dependencies: conda +wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-4.5.11-Linux-x86_64.sh -O ~/miniconda.sh --no-check-certificate && /bin/bash ~/miniconda.sh -b -p /opt/miniconda +rm ~/miniconda.sh +/opt/miniconda/bin/conda clean -tipsy +find / -type d -name __pycache__ -prune -exec rm -rf {}; + +conda install -y python=3.6 numpy +conda clean -aqy +rm -rf /opt/miniconda/pkgs + +# Dependencies: cmake +sudo wget --quiet https://github.com/Kitware/CMake/releases/download/v3.14.3/cmake-3.14.3-Linux-x86_64.tar.gz +tar zxf cmake-3.14.3-Linux-x86_64.tar.gz diff --git a/docs/C_API.md b/docs/C_API.md index e13ddecfb095e..ea99c5875fe45 100644 --- a/docs/C_API.md +++ b/docs/C_API.md @@ -11,6 +11,7 @@ * Setting the thread pool size for each session. * Setting graph optimization level for each session. * Dynamically loading custom ops. [Instructions](/docs/AddingCustomOp.md) +* Ability to load a model from a byte array. See ```OrtCreateSessionFromArray``` in [onnxruntime_c_api.h](/include/onnxruntime/core/session/onnxruntime_c_api.h). ## Usage Overview diff --git a/docs/HighLevelDesign.md b/docs/HighLevelDesign.md index 922dfcc2de677..a67b869034f50 100644 --- a/docs/HighLevelDesign.md +++ b/docs/HighLevelDesign.md @@ -35,8 +35,6 @@ provider using the GetCapability() API. ![ONNXRuntime high level system architecture](https://azurecomcdn.azureedge.net/mediahandler/acomblog/media/Default/blog/228d22d3-6e3e-48b1-811c-1d48353f031c.png) -*Note: TensorRT and nGraph support are in progress* - ### More about partitioning ONNXRuntime partitions a model graph into subgraphs based on the available execution providers, one for each distinct provider. ONNXRuntime provides a default execution provider that is used as the fallback execution for the @@ -77,7 +75,7 @@ different representation if they choose to, but it is their responsibility to convert the values from/to the standard representation at the boundaries of their subgraph. -## Extensibility points +## Extensibility Options * [Add a custom operator/kernel](AddingCustomOp.md) * [Add an execution provider](AddingExecutionProvider.md) * [Add a new graph diff --git a/docs/ONNX_Runtime_Perf_Tuning.md b/docs/ONNX_Runtime_Perf_Tuning.md index 32e7be449be9e..b0ac2e0b2c039 100644 --- a/docs/ONNX_Runtime_Perf_Tuning.md +++ b/docs/ONNX_Runtime_Perf_Tuning.md @@ -97,7 +97,7 @@ You can enable ONNX Runtime latency profiling in code: import onnxruntime as rt sess_options = rt.SessionOptions() -enable_profiling.enable_profiling = True +sess_options.enable_profiling = True ``` Or, if you are using the onnxruntime_perf_test.exe tool, you can add -p [profile_file] to enable performance profiling. 
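To make the corrected profiling option above concrete, here is a minimal Python sketch of how it is typically used end to end. The model path, input shape, and the use of `end_profiling()` to retrieve the trace file are illustrative assumptions, not taken from this patch.

```python
import numpy as np
import onnxruntime as rt

# Hypothetical model path and input shape -- substitute your own model.
sess_options = rt.SessionOptions()
sess_options.enable_profiling = True

sess = rt.InferenceSession("model.onnx", sess_options)
input_name = sess.get_inputs()[0].name
dummy_input = np.random.rand(1, 3, 224, 224).astype(np.float32)  # adjust to the model's input
sess.run(None, {input_name: dummy_input})

# Stop profiling and get the path of the generated JSON trace file.
profile_file = sess.end_profiling()
print(profile_file)
```

The resulting JSON trace can be inspected in a trace viewer such as chrome://tracing.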
diff --git a/docs/ONNX_Runtime_Server_Usage.md b/docs/ONNX_Runtime_Server_Usage.md index ac23bbb83726a..aa6a7c1b840ad 100644 --- a/docs/ONNX_Runtime_Server_Usage.md +++ b/docs/ONNX_Runtime_Server_Usage.md @@ -1,11 +1,14 @@

Note: ONNX Runtime Server is still in beta state. It's currently not ready for production environments.

-# How to Use ONNX Runtime Server REST API for Prediction +# How to Use ONNX Runtime Server for Prediction -ONNX Runtime Server provides a REST API for prediction. The goal of the project is to make it easy to "host" any ONNX model as a RESTful service. The CLI command to start the service is shown below: +ONNX Runtime Server provides an easy way to start an inferencing server for prediction with both HTTP and GRPC endpoints. The CLI command to start the server is shown below: ``` $ ./onnxruntime_server +Version: +Commit ID: + the option '--model_path' is required but missing Allowed options: -h [ --help ] Shows a help message and exits @@ -15,21 +18,22 @@ Allowed options: --address arg (=0.0.0.0) The base HTTP address --http_port arg (=8001) HTTP port to listen to requests --num_http_threads arg (=<# of your cpu cores>) Number of http threads - - + --grpc_port arg (=50051) GRPC port to listen to requests ``` -Note: The only mandatory argument for the program here is `model_path` +**Note**: The only mandatory argument for the program here is `model_path` ## Start the Server -To host an ONNX model as a REST API server, run: +To host an ONNX model as an inferencing server, simply run: ``` ./onnxruntime_server --model_path /// ``` -The prediction URL is in this format: +## HTTP Endpoint + +The prediction URL for HTTP endpoint is in this format: ``` http://:/v1/models//versions/:predict @@ -37,16 +41,20 @@ http://:/v1/models//versions/ 0. In the future, model_names and versions will be verified. -## Request and Response Payload +### Request and Response Payload -An HTTP request can be a Protobuf message in two formats: binary or JSON. The HTTP request header field `Content-Type` tells the server how to handle the request and thus it is mandatory for all requests. Requests missing `Content-Type` will be rejected as `400 Bad Request`. +The request and response need to be a protobuf message. The Protobuf definition can be found [here](https://github.com/Microsoft/onnxruntime/blob/master/onnxruntime/server/protobuf/predict.proto). + +A protobuf message could have two formats: binary and JSON. Usually the binary payload has better latency, in the meanwhile the JSON format is easy for human readability. + +The HTTP request header field `Content-Type` tells the server how to handle the request and thus it is mandatory for all requests. Requests missing `Content-Type` will be rejected as `400 Bad Request`. * For `"Content-Type: application/json"`, the payload will be deserialized as JSON string in UTF-8 format * For `"Content-Type: application/vnd.google.protobuf"`, `"Content-Type: application/x-protobuf"` or `"Content-Type: application/octet-stream"`, the payload will be consumed as protobuf message directly. -The Protobuf definition can be found [here](https://github.com/Microsoft/onnxruntime/blob/master/onnxruntime/server/protobuf/predict.proto). +Clients can control the response type by setting the request with an `Accept` header field and the server will serialize in your desired format. The choices currently available are the same as the `Content-Type` header field. If this field is not set in the request, the server will use the same type as your request. -## Inferencing +### Inferencing To send a request to the server, you can use any tool which supports making HTTP requests. 
Here is an example using `curl`: @@ -60,11 +68,17 @@ or curl -X POST --data-binary "@predict_request_0.pb" -H "Content-Type: application/octet-stream" -H "Foo: 1234" http://127.0.0.1:8001/v1/models/mymodel/versions/3:predict ``` -Clients can control the response type by setting the request with an `Accept` header field and the server will serialize in your desired format. The choices currently available are the same as the `Content-Type` header field. +### Interactive tutorial notebook + +A simple Jupyter notebook demonstrating the usage of ONNX Runtime server to host an ONNX model and perform inferencing can be found [here](https://github.com/onnx/tutorials/blob/master/tutorials/OnnxRuntimeServerSSDModel.ipynb). + +## GRPC Endpoint + +If you prefer using the GRPC endpoint, the protobuf could be found [here](https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/server/protobuf/prediction_service.proto). You could generate your client and make a GRPC call to it. To learn more about how to generate the client code and call to the server, please refer to [the tutorials of GRPC](https://grpc.io/docs/tutorials/). ## Advanced Topics -### Number of HTTP Threads +### Number of Worker Threads You can change this to optimize server utilization. The default is the number of CPU cores on the host machine. @@ -75,66 +89,11 @@ For easy tracking of requests, we provide the following header fields: * `x-ms-request-id`: will be in the response header, no matter the request result. It will be a GUID/uuid with dash, e.g. `72b68108-18a4-493c-ac75-d0abd82f0a11`. If the request headers contain this field, the value will be ignored. * `x-ms-client-request-id`: a field for clients to tracking their requests. The content will persist in the response headers. -Here is an example of a client sending a request: - -#### Client Side +### rsyslog Support -``` -$ curl -v -X POST --data-binary "@predict_request_0.pb" -H "Content-Type: application/octet-stream" -H "Foo: 1234" -H "x-ms-client-request-id: my-request-001" -H "Accept: application/json" http://127.0.0.1:8001/v1/models/mymodel/versions/3:predict -Note: Unnecessary use of -X or --request, POST is already inferred. -* Trying 127.0.0.1... -* Connected to 127.0.0.1 (127.0.0.1) port 8001 (#0) -> POST /v1/models/mymodel/versions/3:predict HTTP/1.1 -> Host: 127.0.0.1:8001 -> User-Agent: curl/7.47.0 -> Content-Type: application/octet-stream -> x-ms-client-request-id: my-request-001 -> Accept: application/json -> Content-Length: 3179 -> Expect: 100-continue -> -* Done waiting for 100-continue -* We are completely uploaded and fine -< HTTP/1.1 200 OK -< Content-Type: application/json -< x-ms-request-id: 72b68108-18a4-493c-ac75-d0abd82f0a11 -< x-ms-client-request-id: my-request-001 -< Content-Length: 159 -< -* Connection #0 to host 127.0.0.1 left intact -{"outputs":{"Sample_Output_Name":{"dims":["1","10"],"dataType":1,"rawData":"6OpzRFquGsSFdM1FyAEnRFtRZcRa9NDEUBj0xI4ydsJIS0LE//CzxA==","dataLocation":"DEFAULT"}}}% -``` +If you prefer using an ONNX Runtime Server with [rsyslog](https://www.rsyslog.com/) support([build instruction](https://github.com/microsoft/onnxruntime/blob/master/BUILD.md#build-onnx-runtime-server-on-linux)), you should be able to see the log in `/var/log/syslog` after the ONNX Runtime Server runs. For detail about how to use rsyslog, please reference [here](https://www.rsyslog.com/category/guides-for-rsyslog/). 
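As a companion to the curl examples above, the following sketch performs the same HTTP call from Python with the `requests` library; the payload file, model name, version, port, and headers simply mirror the curl example and should be adapted to your own deployment.

```python
import requests

# Reuses the payload and endpoint from the curl example above.
with open("predict_request_0.pb", "rb") as f:
    payload = f.read()

response = requests.post(
    "http://127.0.0.1:8001/v1/models/mymodel/versions/3:predict",
    data=payload,
    headers={
        "Content-Type": "application/octet-stream",   # binary protobuf request body
        "Accept": "application/json",                 # ask the server to reply in JSON
        "x-ms-client-request-id": "my-request-001",   # optional client-side tracking id
    },
)
print(response.status_code)
print(response.headers.get("x-ms-request-id"))
print(response.text)
```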
-#### Server Side +## Report Issues -And here is what the output on the server side looks like with logging level of verbose: +If you see any issues or want to ask questions about the server, please feel free to do so in this repo with the version and commit id from the command line. -``` -2019-04-04 23:48:26.395200744 [V:onnxruntime:72b68108-18a4-493c-ac75-d0abd82f0a11, predict_request_handler.cc:40 Predict] Name: mymodel Version: 3 Action: predict -2019-04-04 23:48:26.395289437 [V:onnxruntime:72b68108-18a4-493c-ac75-d0abd82f0a11, predict_request_handler.cc:46 Predict] x-ms-client-request-id: [my-request-001] -2019-04-04 23:48:26.395540707 [I:onnxruntime:InferenceSession, inference_session.cc:736 Run] Running with tag: 72b68108-18a4-493c-ac75-d0abd82f0a11 -2019-04-04 23:48:26.395596858 [V:VLOG1:72b68108-18a4-493c-ac75-d0abd82f0a11, inference_session.cc:976 CreateLoggerForRun] Created logger for run with id of 72b68108-18a4-493c-ac75-d0abd82f0a11 -2019-04-04 23:48:26.395731391 [I:onnxruntime:72b68108-18a4-493c-ac75-d0abd82f0a11, sequential_executor.cc:42 Execute] Begin execution -2019-04-04 23:48:26.395763319 [V:VLOG1:72b68108-18a4-493c-ac75-d0abd82f0a11, sequential_executor.cc:45 Execute] Size of execution plan vector: 12 -2019-04-04 23:48:26.396228981 [V:VLOG1:72b68108-18a4-493c-ac75-d0abd82f0a11, sequential_executor.cc:156 Execute] Releasing node ML values after computing kernel: Convolution28 -2019-04-04 23:48:26.396580161 [V:VLOG1:72b68108-18a4-493c-ac75-d0abd82f0a11, sequential_executor.cc:156 Execute] Releasing node ML values after computing kernel: Plus30 -2019-04-04 23:48:26.396623732 [V:VLOG1:72b68108-18a4-493c-ac75-d0abd82f0a11, sequential_executor.cc:197 ReleaseNodeMLValues] Releasing mlvalue with index: 10 -2019-04-04 23:48:26.396878822 [V:VLOG1:72b68108-18a4-493c-ac75-d0abd82f0a11, sequential_executor.cc:156 Execute] Releasing node ML values after computing kernel: ReLU32 -2019-04-04 23:48:26.397091882 [V:VLOG1:72b68108-18a4-493c-ac75-d0abd82f0a11, sequential_executor.cc:156 Execute] Releasing node ML values after computing kernel: Pooling66 -2019-04-04 23:48:26.397126243 [V:VLOG1:72b68108-18a4-493c-ac75-d0abd82f0a11, sequential_executor.cc:197 ReleaseNodeMLValues] Releasing mlvalue with index: 11 -2019-04-04 23:48:26.397772701 [V:VLOG1:72b68108-18a4-493c-ac75-d0abd82f0a11, sequential_executor.cc:156 Execute] Releasing node ML values after computing kernel: Convolution110 -2019-04-04 23:48:26.397818174 [V:VLOG1:72b68108-18a4-493c-ac75-d0abd82f0a11, sequential_executor.cc:197 ReleaseNodeMLValues] Releasing mlvalue with index: 13 -2019-04-04 23:48:26.398060592 [V:VLOG1:72b68108-18a4-493c-ac75-d0abd82f0a11, sequential_executor.cc:156 Execute] Releasing node ML values after computing kernel: Plus112 -2019-04-04 23:48:26.398095300 [V:VLOG1:72b68108-18a4-493c-ac75-d0abd82f0a11, sequential_executor.cc:197 ReleaseNodeMLValues] Releasing mlvalue with index: 14 -2019-04-04 23:48:26.398257563 [V:VLOG1:72b68108-18a4-493c-ac75-d0abd82f0a11, sequential_executor.cc:156 Execute] Releasing node ML values after computing kernel: ReLU114 -2019-04-04 23:48:26.398426740 [V:VLOG1:72b68108-18a4-493c-ac75-d0abd82f0a11, sequential_executor.cc:156 Execute] Releasing node ML values after computing kernel: Pooling160 -2019-04-04 23:48:26.398466031 [V:VLOG1:72b68108-18a4-493c-ac75-d0abd82f0a11, sequential_executor.cc:197 ReleaseNodeMLValues] Releasing mlvalue with index: 15 -2019-04-04 23:48:26.398542823 [V:VLOG1:72b68108-18a4-493c-ac75-d0abd82f0a11, sequential_executor.cc:156 Execute] Releasing node ML 
values after computing kernel: Times212_reshape0 -2019-04-04 23:48:26.398599687 [V:VLOG1:72b68108-18a4-493c-ac75-d0abd82f0a11, sequential_executor.cc:156 Execute] Releasing node ML values after computing kernel: Times212_reshape1 -2019-04-04 23:48:26.398692631 [V:VLOG1:72b68108-18a4-493c-ac75-d0abd82f0a11, sequential_executor.cc:156 Execute] Releasing node ML values after computing kernel: Times212 -2019-04-04 23:48:26.398731471 [V:VLOG1:72b68108-18a4-493c-ac75-d0abd82f0a11, sequential_executor.cc:197 ReleaseNodeMLValues] Releasing mlvalue with index: 17 -2019-04-04 23:48:26.398832735 [V:VLOG1:72b68108-18a4-493c-ac75-d0abd82f0a11, sequential_executor.cc:156 Execute] Releasing node ML values after computing kernel: Plus214 -2019-04-04 23:48:26.398873229 [V:VLOG1:72b68108-18a4-493c-ac75-d0abd82f0a11, sequential_executor.cc:197 ReleaseNodeMLValues] Releasing mlvalue with index: 19 -2019-04-04 23:48:26.398922929 [V:VLOG1:72b68108-18a4-493c-ac75-d0abd82f0a11, sequential_executor.cc:160 Execute] Fetching output. -2019-04-04 23:48:26.398956560 [V:VLOG1:72b68108-18a4-493c-ac75-d0abd82f0a11, sequential_executor.cc:163 Execute] Done with execution. -``` \ No newline at end of file diff --git a/docs/PyOp.md b/docs/PyOp.md index 1d0e5e27bcd42..e82d7b6995b25 100644 --- a/docs/PyOp.md +++ b/docs/PyOp.md @@ -1,10 +1,10 @@ # Python Operator -To facilitate Python coders on model developing, onnxruntime provides a way to invoke operators implemented in Python. +The Python Operator provides the capability to easily invoke any custom Python code within a single node of an ONNX graph using ONNX Runtime. This can be useful for quicker experimentation when a model requires operators that are not officially supported in ONNX and ONNX Runtime, particularly if there is already a Python implementation for the required functionality. This should be used with discretion in production scenarios, and all security or other risks should be considered. -## Implemenation -The feature is implemented under onnxruntime/core/language_interop_ops. +## Design Overview +The feature can be found under [onnxruntime/core/language_interop_ops](../onnxruntime/core/language_interop_ops). All Python C API dependent code are compiled into a dynamic linked library named pywrapper. -Before calling into Python script, pywrapper will convert onnxruntime tensor(s) to numpy(s), which get converted back when done. +Before calling into Python script, pywrapper will convert onnxruntime tensor(s) to numpy(s), which is converted back when completed.

Here is a chart illustrating the calling sequence:

 onnxruntime                          pywrapper                          script
@@ -13,18 +13,20 @@ onnxruntime                          pywrapper                          script
      |       call with tensor(s)        | ------------------------------> |
      |                                  |         call with numpy(s)      | 
      |                                  |                                 | compute
-     |                                  |  <----------------------------- |
+     |                                  | <------------------------------ |
      | <------------------------------  |           return numpys(s)      |
      |         return tensor(s)         |                                 |
 
-## Usage -Step 1, build onnxruntime with“--config Release --enable_language_interop_ops --build_shared_lib” and override existing onnxruntime binary with the latest, then copy onnxruntime_pywrapper.dll or libonnxruntime_pywrapper.so or libonnxruntime_pywrapper.dylib to the path where onnxruntime binary is placed. -Note: -* It is suggested to compile within the Python environment where inferencing will happen. For example, if inferencing will happen in a conda env named myconda1, please compile the binary within that environment as well; -* If "--numpy_version=..." is specified, Python operator will build with that version. +## How to Use +### Step 1 +Build onnxruntime with `--config Release --enable_language_interop_ops --build_shared_lib` and override the existing onnxruntime binary with the latest. Then, copy onnxruntime_pywrapper.dll, libonnxruntime_pywrapper.so, or libonnxruntime_pywrapper.dylib to the path where the onnxruntime binary is located. +**Notes:** +* It is recommended to compile within the Python environment where inferencing will happen. For example, if inferencing will happen in a conda env named myconda1, please compile the binary within that environment as well +* If `--numpy_version=...` is specified, the Python operator will build with that version. -Step 2, create an onnx model containing Python operator nodes: +### Step 2 +Create an onnx model containing Python operator nodes: ```python ad1_node = helper.make_node('Add', ['A','B'], ['S']) mul_node = helper.make_node('Mul', ['C','D'], ['P']) @@ -48,7 +50,8 @@ graph = helper.make_graph([ad1_node,mul_node,py1_node,ad2_node,py2_node,sub_node model = helper.make_model(graph, producer_name = 'pyop_model') onnx.save(model, './model.onnx') ``` -Step 3, implement mymodule.py: +### Step 3 +Implement mymodule.py: ```python class Multi_1: def __init__(self, W1, W2, W3): @@ -63,23 +66,24 @@ class Multi_2: r1, r2 = H + N, N + E return r1, r2 ``` -Step 4, copy mymodule.py into Python sys.path, then reference with onnxruntime. On Windows, please set PYTHONHOME beforehand. It should point to directory where the python is installed, such as C:\Python37 or C:\ProgramData\Anaconda3\envs\myconda1 if it is in conda. +### Step 4 +Copy mymodule.py into Python sys.path, then reference with onnxruntime. On Windows, please set PYTHONHOME beforehand. It should point to directory where the python is installed, such as C:\Python37 or C:\ProgramData\Anaconda3\envs\myconda1 if it is in conda. ## Supported Data Types -* TensorProto.BOOL, -* TensorProto.UINT8, -* TensorProto.UINT16, -* TensorProto.UINT32, -* TensorProto.INT16, -* TensorProto.INT32, -* TensorProto.FLOAT, +* TensorProto.BOOL +* TensorProto.UINT8 +* TensorProto.UINT16 +* TensorProto.UINT32 +* TensorProto.INT16 +* TensorProto.INT32 +* TensorProto.FLOAT * TensorProto.DOUBLE ## Limitations -* On Windows, "--config Debug" has known issues, build with "--config RelWithDebInfo" if need debugging symbols; -* Due to python C API restrictions, multi-threading is disabled, meaning Python operators will run sequentially. +* On Windows, `--config Debug` has known issues. Please build with `--config RelWithDebInfo` if debugging symbols are needed. +* Due to Python C API restrictions, multi-threading is disabled so Python operators will run sequentially. 
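Putting Steps 1-4 together, a minimal sketch of running the resulting model through the Python API is shown below. It assumes the Step 1 build with `--enable_language_interop_ops`, that `mymodule.py` from Step 3 is importable, and that the graph's inputs accept the (2, 2) float32 tensors used here; adjust the shapes to match your own graph.

```python
import numpy as np
import onnxruntime as rt

# Assumes mymodule.py is on sys.path / PYTHONPATH (see Step 4).
sess = rt.InferenceSession('./model.onnx')

# The (2, 2) float32 shape is an assumption; match the shapes declared in Step 2.
feeds = {i.name: np.ones((2, 2), dtype=np.float32) for i in sess.get_inputs()}
outputs = sess.run(None, feeds)   # Python operator nodes execute inside mymodule.py
print([o.shape for o in outputs])
```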
-## Test +## Test Coverage The operator has been tested on multiple platforms, with or without conda: Platform | Python 3.5 | Python 3.6 | Python 3.7 @@ -88,3 +92,48 @@ Windows | (conda) passed | (conda) passed | passed Linux | (conda) passed | (conda) passed | passed Mac | (conda) passed | (conda) passed | (conda) passed +## Example +Developers can fall back to PyOp during model conversion when an operator is missing: +```python +import os +import numpy as np +from onnx import * +from sklearn.decomposition import NMF +from skl2onnx import convert_sklearn +from skl2onnx.common.data_types import FloatTensorType +from skl2onnx.common.utils import check_input_and_output_numbers + +X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]],dtype=np.single) +nmf = NMF(n_components=2, init='random', random_state=0) +W = np.array(nmf.fit_transform(X), dtype=np.single) + +def calculate_sklearn_nmf_output_shapes(operator): + check_input_and_output_numbers(operator, output_count_range=1, input_count_range=1) + operator.outputs[0].type.shape = operator.inputs[0].type.shape + +def convert_nmf(scope, operator, container): + ws = [str(w) for w in W.flatten()] + attrs = {'W':'|'.join(ws)} + container.add_node(op_type='PyOp', name='nmf', inputs=['X'], outputs=['variable'], + op_version=10, op_domain='MyDomain', module='mymodule', class_name='MyNmf', + input_types=[TensorProto.FLOAT], output_types=[TensorProto.FLOAT], **attrs) + +custom_shape_calculators = {type(nmf): calculate_sklearn_nmf_output_shapes} +custom_conversion_functions = {type(nmf): convert_nmf} +initial_types = [('X', FloatTensorType([6,2]))] +onx = convert_sklearn(nmf, '', initial_types, '', None, custom_conversion_functions, custom_shape_calculators) +with open("model.onnx", "wb") as f: + f.write(onx.SerializeToString()) +``` +mymodule.py: +```python +import numpy as np +class MyNmf: + def __init__(self,W): + A = [] + for w in W.split('|'): + A.append(float(w)) + self.__W = np.array(A,dtype=np.single).reshape(6,2) + def compute(self,X): + return self.__W +``` diff --git a/docs/Versioning.md b/docs/Versioning.md index cf503df1d820a..d646d777d8335 100644 --- a/docs/Versioning.md +++ b/docs/Versioning.md @@ -7,12 +7,13 @@ same as what is described in the semantic versioning doc linked above. ## Current stable release version The version number of the current stable release can be found -[here](../VERSION_NUMBER) +[here](../VERSION_NUMBER). ## Release cadence See [Release Management](ReleaseManagement.md) -## Compatibility with ONNX opsets +# Compatibility +## ONNX Compatibility ONNX Runtime supports both backwards and forward compatibility. ### Backwards compatibility @@ -26,14 +27,31 @@ the model doesn't use ops that were newly introduced in opset ver 9. ### Version matrix Following table summarizes the relationship between the ONNX Runtime version and the ONNX -opset version implemented in that release. - -| ONNX Runtime release version | ONNX opset version
implemented in this release | ONNX ML opset version
 implemented in this release | Supported ONNX IR version | -|------------------------------|--------------------|----------------------|------------------| -| 0.4.0 | 10 | 1 | 5 | -| 0.3.1 | 9 | 1 | 3 | -| 0.3.0 | 9 | 1 | 3 | -| 0.2.1 | 8 | 1 | 3 | -| 0.2.0 | 8 | 1 | 3 | -| 0.1.5 | 8 | 1 | 3 | -| 0.1.4 | 8 | 1 | 3 | +opset version implemented in that release. Please note the Backwards and Forward compatibility notes above. +For more details on ONNX Release versions, see [this page](https://github.com/onnx/onnx/blob/master/docs/Versioning.md). + +| ONNX Runtime release version | ONNX release version | ONNX opset version | ONNX ML opset version | Supported ONNX IR version | [WinML compatibility](https://docs.microsoft.com/en-us/windows/ai/windows-ml/)| +|------------------------------|--------------------|--------------------|----------------------|------------------|------------------| +| 0.5.0 | 1.5 | 10 | 1 | 5 | -- | +| 0.4.0 | 1.5 | 10 | 1 | 5 | -- | +| 0.3.1
0.3.0 | 1.4 | 9 | 1 | 3 | -- | +| 0.2.1
0.2.0 | 1.3 | 8 | 1 | 3 | 1903 (19H1)+ | +| 0.1.5
0.1.4 | 1.3 | 8 | 1 | 3 | 1809 (RS5)+ | + + +## Tool Compatibility +A variety of tools can be used to create ONNX models. Unless otherwise noted, please use the latest released version of the tools to convert/export the ONNX model. Many tools are backwards compatible and support multiple ONNX versions. Join this with the table above to evaluate ONNX Runtime compatibility. + + +|Tool|Recommended Version|Supported ONNX version(s)| +|---|---|---| +|[PyTorch](https://pytorch.org/)|[Latest stable](https://pytorch.org/get-started/locally/)|1.2-1.5*
*may require [ONNX version converter](https://github.com/onnx/onnx/blob/master/docs/VersionConverter.md) to convert to desired opset #*| +|[ONNXMLTools](https://pypi.org/project/onnxmltools/)
CoreML, LightGBM, XGBoost, LibSVM|[Latest stable](https://github.com/onnx/onnxmltools/releases)|1.2-1.5| +|[ONNXMLTools](https://pypi.org/project/onnxmltools/)
SparkML|[Latest stable](https://github.com/onnx/onnxmltools/releases)|1.4-1.5| +|[SKLearn-ONNX](https://pypi.org/project/skl2onnx/)|[Latest stable](https://github.com/onnx/sklearn-onnx/releases)|1.2-1.5| +|[Keras-ONNX](https://pypi.org/project/keras2onnx/)|[Latest stable](https://github.com/onnx/keras-onnx/releases)|1.2-1.5| +|[Tensorflow-ONNX](https://pypi.org/project/tf2onnx/)|[Latest stable](https://github.com/onnx/tensorflow-onnx/releases)|1.2-1.5| +|[WinMLTools](https://docs.microsoft.com/en-us/windows/ai/windows-ml/convert-model-winmltools)|[Latest stable](https://pypi.org/project/winmltools/)|1.2-1.4| +|[AutoML](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-automated-ml)|[1.0.39+](https://pypi.org/project/azureml-automl-core)|1.5| +| |[1.0.33](https://pypi.org/project/azureml-automl-core/1.0.33/)|1.4| + diff --git a/docs/execution_providers/OpenVINO-ExecutionProvider.md b/docs/execution_providers/OpenVINO-ExecutionProvider.md index 5cfa516af3dca..1d5838268d3f6 100644 --- a/docs/execution_providers/OpenVINO-ExecutionProvider.md +++ b/docs/execution_providers/OpenVINO-ExecutionProvider.md @@ -6,9 +6,9 @@ OpenVINO Execution Provider enables deep learning inference on Intel CPUs, Intel Below table shows the ONNX layers supported using OpenVINO Execution Provider and the mapping between ONNX layers and OpenVINO layers. The below table also lists the Intel hardware support for each of the layers. CPU refers to Intel® Atom, Core, and Xeon processors. GPU refers to the Intel Integrated Graphics. VPU refers to USB based Intel® MovidiusTM -VPUs as well as Intel® Vision accelerator Design with Intel Movidius TM MyriadX VPU. +VPUs as well as Intel® Vision accelerator Design with Intel Movidius TM MyriadX VPU. -| **ONNX Layers** | **OpenVINO Layers** | **CPU** | **GPU** | **VPU** | +| **ONNX Layers** | **OpenVINO Layers** | **CPU** | **GPU** | **VPU** | | --- | --- | --- | --- | --- | | Add | Eltwise (operation=sum) | Yes | Yes | Yes | AveragePool | Pooling(pool\_method=avg) | Yes | Yes | Yes @@ -33,7 +33,7 @@ VPUs as well as Intel® Vision accelerator Design with Intel Movidiu | UnSqueeze | Reshape | Yes | Yes | Yes | LeakyRelu | ReLU | Yes | Yes | Yes -*MatMul is supported in GPU only when the following layer is an Add layer in the topology. +*MatMul is supported in GPU only when the following layer is an Add layer in the topology. 
# Topology Support @@ -41,17 +41,17 @@ Below topologies are supported from ONNX open model zoo using OpenVINO Execution ## Image Classification Networks -| **Topology** | **CPU** | **GPU** | **VPU** | -| --- | --- | --- | --- | +| **Topology** | **CPU** | **GPU** | **VPU** | +| --- | --- | --- | --- | | bvlc\_alexnet | Yes | Yes | Yes | bvlc\_googlenet | Yes | Yes | Yes -| bvlc\_reference\_caffenet | Yes | Yes | Yes -| bvlc\_reference\_rcnn\_ilsvrc13 | Yes | Yes | Yes +| bvlc\_reference\_caffenet | Yes | Yes | Yes +| bvlc\_reference\_rcnn\_ilsvrc13 | Yes | Yes | Yes | densenet121 | Yes | Yes | Yes -| Inception\_v1 | Yes | Yes | No +| Inception\_v1 | Yes | Yes | Yes** | Inception\_v2 | Yes | Yes | Yes | Shufflenet | Yes | Yes | Yes -| Zfnet512 | Yes | Yes | Yes +| Zfnet512 | Yes | Yes | Yes | Squeeznet 1.1 | Yes | Yes | Yes | Resnet18v1 | Yes | Yes | Yes | Resnet34v1 | Yes | Yes | Yes @@ -62,29 +62,32 @@ Below topologies are supported from ONNX open model zoo using OpenVINO Execution | Resnet34v2 | Yes | Yes | Yes | Resnet50v2 | Yes | Yes | Yes | Resnet101v2 | Yes | Yes | Yes -| Resnet152v2 | Yes | Yes | Yes +| Resnet152v2 | Yes | Yes | Yes | Mobilenetv2 | Yes | Yes | Yes | vgg16 | Yes | Yes | Yes | vgg19 | Yes | Yes | Yes + ## Image Recognition Networks -| **Topology** | **CPU** | **GPU** | **VPU** | -| --- | --- | --- | --- | -| MNIST | Yes | Yes | No +| **Topology** | **CPU** | **GPU** | **VPU** | +| --- | --- | --- | --- | +| MNIST | Yes | Yes | Yes** + +**Inception_v1 and MNIST are supported in OpenVINO R1.1 and are not supported in OpenVINO R5.0.1. ## Object Detection Networks -| **Topology** | **CPU** | **GPU** | **VPU** | -| --- | --- | --- | --- | +| **Topology** | **CPU** | **GPU** | **VPU** | +| --- | --- | --- | --- | |TinyYOLOv2 | Yes | Yes | Yes -| ResNet101\_DUC\_HDC | Yes | Yes | No +| ResNet101\_DUC\_HDC | Yes | No | No -# Application code changes for VAD-R performance scaling +# Application code changes for VAD-M performance scaling -VAD-R has 8 VPUs and is suitable for applications that require multiple inferences to run in parallel. We use batching approach for performance scaling on VAD-R. +VAD-M has 8 VPUs and is suitable for applications that require multiple inferences to run in parallel. We use batching approach for performance scaling on VAD-M. -Below python code snippets provide sample classification code to batch input images, load a model and process the output results. +Below python code snippets provide sample classification code to batch input images, load a model and process the output results. 
~~~ import onnxruntime as rt @@ -95,7 +98,7 @@ import sys import cv2 import numpy import time -import glob +import glob ~~~ ### Load the input onnx model @@ -111,19 +114,19 @@ for i in range(iters): images = [cv2.imread(file) for file in glob.glob(str(sys.argv[2])+'/*.jpg')] for img in images: # resizing the image - img = cv2.resize(img, (224,224)) - # convert image to numpy - x = numpy.asarray(img).astype(numpy.float32) - x = numpy.transpose(x, (2,0,1)) + img = cv2.resize(img, (224,224)) + # convert image to numpy + x = numpy.asarray(img).astype(numpy.float32) + x = numpy.transpose(x, (2,0,1)) # expand the dimension and batch the images - x = numpy.expand_dims(x,axis=0) - if y is None: - y = x - else: - y = numpy.concatenate((y,x), axis=0) + x = numpy.expand_dims(x,axis=0) + if y is None: + y = x + else: + y = numpy.concatenate((y,x), axis=0) ~~~ -### Start Inference +### Start Inference ~~~ res = sess.run([sess.get_outputs()[0].name], {sess.get_inputs()[0].name: y}) ~~~ diff --git a/docs/python/README.rst b/docs/python/README.rst index 756383579ee45..0fe76b1624ef3 100644 --- a/docs/python/README.rst +++ b/docs/python/README.rst @@ -52,6 +52,11 @@ replaces *scikit-learn* to compute the predictions. Changes ------- +0.5.0 +^^^^^ + +Release Notes : https://github.com/Microsoft/onnxruntime/releases/tag/v0.5.0 + 0.4.0 ^^^^^ diff --git a/docs/python/examples/plot_pipeline.py b/docs/python/examples/plot_pipeline.py index 5063479492429..0a002f6223e1b 100644 --- a/docs/python/examples/plot_pipeline.py +++ b/docs/python/examples/plot_pipeline.py @@ -21,7 +21,7 @@ """ from onnxruntime.datasets import get_example -example1 = get_example("mul_1.pb") +example1 = get_example("mul_1.onnx") import onnx model = onnx.load(example1) # model is a ModelProto protobuf message diff --git a/docs/python/examples/plot_profiling.py b/docs/python/examples/plot_profiling.py index 3844962033f9d..d5617d41726c5 100644 --- a/docs/python/examples/plot_profiling.py +++ b/docs/python/examples/plot_profiling.py @@ -19,7 +19,7 @@ ######################### # Let's load a very simple model and compute some prediction. -example1 = get_example("mul_1.pb") +example1 = get_example("mul_1.onnx") sess = rt.InferenceSession(example1) input_name = sess.get_inputs()[0].name diff --git a/include/onnxruntime/core/framework/allocator.h b/include/onnxruntime/core/framework/allocator.h index 462aed63f1d68..8a37553ea976b 100644 --- a/include/onnxruntime/core/framework/allocator.h +++ b/include/onnxruntime/core/framework/allocator.h @@ -15,21 +15,80 @@ #include "core/framework/fence.h" #include "core/session/onnxruntime_c_api.h" +// Struct to represent a physical device. +struct OrtDevice { + using DeviceType = int8_t; + using MemoryType = int8_t; + using DeviceId = int16_t; + + // Pre-defined device types. + static const DeviceType CPU = 0; + static const DeviceType GPU = 1; //CUDA + static const DeviceType FPGA = 2; + + struct MemType { + // Pre-defined memory types. 
+ static const MemoryType DEFAULT = 0; + static const MemoryType CUDA_PINNED = 1; + }; + + constexpr OrtDevice(DeviceType device_type_, MemoryType memory_type_, DeviceId device_id_) + : device_type(device_type_), + memory_type(memory_type_), + device_id(device_id_) {} + + constexpr OrtDevice() : OrtDevice(CPU, MemType::DEFAULT, 0) {} + + DeviceType Type() const { + return device_type; + } + + MemoryType MemType() const { + return memory_type; + } + + DeviceId Id() const { + return device_id; + } + + std::string ToString() const { + std::ostringstream ostr; + ostr << "Device: [" + << " type:" << static_cast(device_type) + << " memory_type:" << static_cast(memory_type) + << " device_id:" << device_id + << "]"; + return ostr.str(); + } + + private: + // Device type. + DeviceType device_type; + + // Memory type. + MemoryType memory_type; + + // Device index. + DeviceId device_id; +}; + struct OrtAllocatorInfo { // use string for name, so we could have customized allocator in execution provider. const char* name; int id; OrtMemType mem_type; OrtAllocatorType type; + OrtDevice device; - constexpr OrtAllocatorInfo(const char* name_, OrtAllocatorType type_, int id_ = 0, OrtMemType mem_type_ = OrtMemTypeDefault) + constexpr OrtAllocatorInfo(const char* name_, OrtAllocatorType type_, OrtDevice device_ = OrtDevice(), int id_ = 0, OrtMemType mem_type_ = OrtMemTypeDefault) #if (defined(__GNUC__) || defined(__clang__)) __attribute__((nonnull)) #endif : name(name_), id(id_), mem_type(mem_type_), - type(type_) { + type(type_), + device(device_) { } // To make OrtAllocatorInfo become a valid key in std map @@ -67,6 +126,8 @@ std::ostream& operator<<(std::ostream& out, const OrtAllocatorInfo& info); namespace onnxruntime { constexpr const char* CPU = "Cpu"; +constexpr const char* CUDA = "Cuda"; +constexpr const char* CUDA_PINNED = "CudaPinned"; // forward declaration class SessionState; diff --git a/include/onnxruntime/core/framework/execution_provider.h b/include/onnxruntime/core/framework/execution_provider.h index d3b1aa5d75020..6e7c919601060 100644 --- a/include/onnxruntime/core/framework/execution_provider.h +++ b/include/onnxruntime/core/framework/execution_provider.h @@ -84,20 +84,6 @@ class IExecutionProvider { */ virtual std::shared_ptr GetKernelRegistry() const; - /** - * Copy tensor between execution providers. It's always a deep copy - * Either src.location is CPU, or dst.location is CPU. They can't be both on CPU. - */ - virtual common::Status CopyTensor(const Tensor& src, Tensor& dst) const = 0; - - /** - * Copy tensor between execution providers on specified exec queue - * It's always a deep copy - * Either src.location is CPU, or dst.location is CPU. They can't be both on CPU. - */ - virtual common::Status CopyTensor(const Tensor& src, Tensor& dst, - int exec_queue_id) const; - /** Returns an opaque handle whose exact type varies based on the provider and is interpreted accordingly by the corresponding kernel implementation. diff --git a/include/onnxruntime/core/framework/kernel_registry.h b/include/onnxruntime/core/framework/kernel_registry.h index b7b41ebcf70c9..3a0d35e298f98 100644 --- a/include/onnxruntime/core/framework/kernel_registry.h +++ b/include/onnxruntime/core/framework/kernel_registry.h @@ -24,9 +24,12 @@ class KernelRegistry { // for its clients unless the factory is managing the lifecycle of the pointer // itself. 
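A minimal usage sketch of the new OrtDevice type and its OrtAllocatorInfo hookup, exercising only the declarations added above together with the existing OrtAllocatorType/OrtMemType enums; the helper name and device id below are illustrative.
~~~
// Sketch: constructing allocator infos with the new OrtDevice field.
#include "core/framework/allocator.h"

void DescribeAllocators() {
  // Defaults: OrtDevice() == OrtDevice(CPU, MemType::DEFAULT, 0).
  OrtAllocatorInfo cpu_info(onnxruntime::CPU, OrtArenaAllocator);

  // A device allocator on GPU 1 using default device memory; "Cuda" matches
  // the constant added to the onnxruntime namespace above.
  OrtDevice gpu(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, /*device_id*/ 1);
  OrtAllocatorInfo cuda_info(onnxruntime::CUDA, OrtDeviceAllocator, gpu, /*id*/ 1);

  // MemType::CUDA_PINNED is available for pinned host buffers, and
  // OrtDevice::ToString() gives a readable summary for logging, e.g.
  // "Device: [ type:1 memory_type:0 device_id:1 ]".
  (void)cpu_info;
  (void)cuda_info;
}
~~~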
// TODO(Task:132) Make usage of unique_ptr/shared_ptr as out param consistent - Status TryCreateKernel(const onnxruntime::Node& node, const IExecutionProvider& execution_provider, - const std::unordered_map& initialized_tensors, - const OrtValueNameIdxMap& mlvalue_name_idx_map, const FuncManager& funcs_mgr, + Status TryCreateKernel(const onnxruntime::Node& node, + const IExecutionProvider& execution_provider, + const std::unordered_map& constant_initialized_tensors, + const OrtValueNameIdxMap& mlvalue_name_idx_map, + const FuncManager& funcs_mgr, + const DataTransferManager& data_transfer_mgr, std::unique_ptr& op_kernel) const; // Check if an execution provider can create kernel for a node and return diff --git a/include/onnxruntime/core/framework/op_kernel_info.h b/include/onnxruntime/core/framework/op_kernel_info.h index f38e6858847ee..e377f0d4e4239 100644 --- a/include/onnxruntime/core/framework/op_kernel_info.h +++ b/include/onnxruntime/core/framework/op_kernel_info.h @@ -15,16 +15,20 @@ namespace onnxruntime { class OrtValueNameIdxMap; class FuncManager; +class DataTransferManager; // A very light-weight class, which works as an aggregated // view of all data needed for constructing a Kernel instance. // NOTE: it does not own/hold any objects. class OpKernelInfo : public OpNodeProtoHelper { public: - explicit OpKernelInfo(const onnxruntime::Node& node, const KernelDef& kernel_def, + explicit OpKernelInfo(const onnxruntime::Node& node, + const KernelDef& kernel_def, const IExecutionProvider& execution_provider, - const std::unordered_map& initialized_tensors, - const OrtValueNameIdxMap& mlvalue_name_idx_map, const FuncManager& funcs_mgr); + const std::unordered_map& constant_initialized_tensors, + const OrtValueNameIdxMap& mlvalue_name_idx_map, + const FuncManager& funcs_mgr, + const DataTransferManager& data_transfer_mgr); OpKernelInfo(const OpKernelInfo& other); @@ -36,6 +40,8 @@ class OpKernelInfo : public OpNodeProtoHelper { const IExecutionProvider* GetExecutionProvider() const noexcept; + const DataTransferManager& GetDataTransferManager() const noexcept; + const onnxruntime::Node& node() const noexcept; bool TryGetConstantInput(int input_index, const Tensor** constant_input_value) const; @@ -53,9 +59,10 @@ class OpKernelInfo : public OpNodeProtoHelper { // For non cpu/cuda case, this pointer should be set so that function kernel // will delegate kernel compute call to compute call. gsl::not_null execution_provider_; - const std::unordered_map& initialized_tensors_; + const std::unordered_map& constant_initialized_tensors_; const OrtValueNameIdxMap& ort_value_name_idx_map_; const FuncManager& funcs_mgr_; + const DataTransferManager& data_transfer_mgr_; ProtoHelperNodeContext proto_helper_context_; }; diff --git a/include/onnxruntime/core/framework/run_options.h b/include/onnxruntime/core/framework/run_options.h index 52285311e5254..b66607853856a 100644 --- a/include/onnxruntime/core/framework/run_options.h +++ b/include/onnxruntime/core/framework/run_options.h @@ -14,8 +14,8 @@ struct OrtRunOptions { /// Log severity. See https://github.com/microsoft/onnxruntime/blob/master/include/onnxruntime/core/common/logging/severity.h /// Default = -1 (use the log severity from the InferenceSession that the Run is for). int run_log_severity_level = -1; - unsigned run_log_verbosity_level = 0; ///< VLOG level if debug build and run_log_severity_level is 0 (VERBOSE). - std::string run_tag; ///< A tag for the Run() calls using this. 
+ int run_log_verbosity_level = 0; ///< VLOG level if debug build and run_log_severity_level is 0 (VERBOSE). + std::string run_tag; ///< A tag for the Run() calls using this. // Set to 'true' to ensure the termination of all the outstanding Run() calls // that use this OrtRunOptions instance. Some of the outstanding Run() calls may diff --git a/include/onnxruntime/core/framework/tensor.h b/include/onnxruntime/core/framework/tensor.h index 260d1731bc6c0..35eb359c714a3 100644 --- a/include/onnxruntime/core/framework/tensor.h +++ b/include/onnxruntime/core/framework/tensor.h @@ -170,7 +170,7 @@ class Tensor final { /** The number of bytes of data. */ - size_t Size() const { + size_t SizeInBytes() const { size_t ret; int64_t l = shape_.Size(); if (l >= static_cast(std::numeric_limits::max())) { diff --git a/include/onnxruntime/core/framework/tensor_shape.h b/include/onnxruntime/core/framework/tensor_shape.h index 5cf9cf08e0868..acf39638fe0db 100644 --- a/include/onnxruntime/core/framework/tensor_shape.h +++ b/include/onnxruntime/core/framework/tensor_shape.h @@ -37,6 +37,7 @@ class TensorShape : private std::vector { TensorShape(const int64_t* dimension_sizes, size_t dimension_count); TensorShape(const std::vector& dims); + TensorShape(std::vector&& dims); TensorShape(const std::initializer_list& dims); diff --git a/include/onnxruntime/core/graph/constants.h b/include/onnxruntime/core/graph/constants.h index 639ff301ff08f..5872228f383d2 100644 --- a/include/onnxruntime/core/graph/constants.h +++ b/include/onnxruntime/core/graph/constants.h @@ -18,6 +18,7 @@ constexpr const char* kOnnxDomain = ""; constexpr const char* kOnnxDomainAlias = "ai.onnx"; constexpr const char* kMLDomain = "ai.onnx.ml"; constexpr const char* kMSDomain = "com.microsoft"; +constexpr const char* kMSNchwcDomain = "com.microsoft.nchwc"; constexpr const char* kNGraphDomain = "com.intel.ai"; constexpr const char* kCpuExecutionProvider = "CPUExecutionProvider"; constexpr const char* kCudaExecutionProvider = "CUDAExecutionProvider"; @@ -27,5 +28,6 @@ constexpr const char* kOpenVINOExecutionProvider = "OpenVINOExecutionProvider"; constexpr const char* kNupharExecutionProvider = "NupharExecutionProvider"; constexpr const char* kBrainSliceExecutionProvider = "BrainSliceExecutionProvider"; constexpr const char* kTensorrtExecutionProvider = "TensorrtExecutionProvider"; +constexpr const char* kNnapiExecutionProvider = "NnapiExecutionProvider"; } // namespace onnxruntime diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h index 66b5954cf5177..b626a7541713f 100644 --- a/include/onnxruntime/core/graph/graph.h +++ b/include/onnxruntime/core/graph/graph.h @@ -279,6 +279,10 @@ class Node { return !attr_to_subgraph_map_.empty(); } + /** Get the const subgraphs from a node. + @remarks Creates a new vector so calling ContainsSubgraphs first is preferred. */ + std::vector> GetSubgraphs() const; + /** Gets a map of attribute name to the mutable Graph instances for all subgraphs of the Node. @returns Map of the attribute name that defines the subgraph to the subgraph's Graph instance. nullptr if the Node has no subgraphs. @@ -500,6 +504,9 @@ class Graph { /** Removes all initializer tensors from this Graph and releases the memory they were using. */ void CleanAllInitializedTensors() noexcept; + /** Returns true if an initializer value can be overridden by a graph input with the same name. 
*/ + bool CanOverrideInitializer() const noexcept { return ir_version_ >= 4; } + /** Gets the Graph inputs excluding initializers. These are the required inputs to the Graph as the initializers can be optionally overridden via graph inputs. @remarks Contains no nullptr values. */ @@ -750,6 +757,12 @@ class Graph { /** Returns true if this is a subgraph or fase if it is a high-level graph. */ bool IsSubgraph() const { return parent_graph_ != nullptr; } + /** Returns the parent graph if this is a subgraph */ + const Graph* ParentGraph() const { return parent_graph_; } + + /** Returns the mutable parent graph if this is a subgraph */ + Graph* MutableParentGraph() { return parent_graph_; } + /** Construct a Graph instance for a subgraph that is created from a GraphProto attribute in a Node. Inherits some properties from the parent graph. @param parent_graph The Graph containing the Node which has a GraphProto attribute. @@ -840,7 +853,7 @@ class Graph { // Build and verify node connection (edges). // Verify NodeArg name/type/shape matching correctly. - common::Status BuildConnections(std::vector& outer_scope_node_args_consumed); + common::Status BuildConnections(std::unordered_set& outer_scope_node_args_consumed); common::Status VerifyNoDuplicateName(); @@ -962,7 +975,7 @@ class Graph { std::unordered_map model_functions_; // Model IR version. - Version ir_version_{}; + Version ir_version_{ONNX_NAMESPACE::Version::IR_VERSION}; int name_generator_ = 0; @@ -974,6 +987,9 @@ class Graph { // NodeArgs that come from outer scope. Used when building a graph so that // these don't get recorded as graph inputs in the GraphProto. std::unordered_set outer_scope_node_arg_names_; + + // number of times Resolve has run. + int num_resolves_ = 0; }; } // namespace onnxruntime diff --git a/include/onnxruntime/core/optimizer/graph_transformer_level.h b/include/onnxruntime/core/optimizer/graph_transformer_level.h index ad7d71096ef69..4f2d5b305ce1d 100644 --- a/include/onnxruntime/core/optimizer/graph_transformer_level.h +++ b/include/onnxruntime/core/optimizer/graph_transformer_level.h @@ -7,11 +7,12 @@ namespace onnxruntime { -enum class TransformerLevel : uint32_t { +enum class TransformerLevel : int { Default = 0, Level1, Level2, - // Convenience enum to always get the max available value. + Level3, + // Convenience enum to always get the max available value. // This way when we add more levels code which iterates over this enum does not need to change. MaxTransformerLevel }; diff --git a/include/onnxruntime/core/providers/cpu/cpu_provider_factory.h b/include/onnxruntime/core/providers/cpu/cpu_provider_factory.h index 360de99b5cf62..66f258922c1f4 100644 --- a/include/onnxruntime/core/providers/cpu/cpu_provider_factory.h +++ b/include/onnxruntime/core/providers/cpu/cpu_provider_factory.h @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/session/onnxruntime_c_api.h" +#include "onnxruntime_c_api.h" #ifdef __cplusplus extern "C" { diff --git a/include/onnxruntime/core/providers/cuda/cuda_provider_factory.h b/include/onnxruntime/core/providers/cuda/cuda_provider_factory.h index 3fc4b7b51f4f3..81b5477b3cb4d 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_provider_factory.h +++ b/include/onnxruntime/core/providers/cuda/cuda_provider_factory.h @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
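A hypothetical helper that exercises several of the framework changes in the hunks above: the Tensor::Size() to SizeInBytes() rename, the new TensorShape move constructor, and the Graph::CanOverrideInitializer()/ParentGraph() accessors. The function name and shape values are illustrative only.
~~~
#include <cstdint>
#include <utility>
#include <vector>
#include "core/framework/tensor.h"
#include "core/framework/tensor_shape.h"
#include "core/graph/graph.h"

namespace onnxruntime {
size_t Inspect(const Tensor& tensor, const Graph& graph) {
  // The rename makes the unit explicit: bytes, not element count.
  size_t bytes = tensor.SizeInBytes();

  // A temporary dims vector can now be moved straight into TensorShape.
  std::vector<int64_t> dims{1, 3, 224, 224};
  TensorShape shape(std::move(dims));
  int64_t element_count = shape.Size();  // element count, unlike SizeInBytes()
  (void)element_count;

  // IR version 4+ graphs may override an initializer via a graph input of the
  // same name, and subgraphs can now reach their enclosing graph.
  if (graph.IsSubgraph() && graph.CanOverrideInitializer()) {
    const Graph* parent = graph.ParentGraph();
    (void)parent;
  }
  return bytes;
}
}  // namespace onnxruntime
~~~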
-#include "core/session/onnxruntime_c_api.h" +#include "onnxruntime_c_api.h" #ifdef __cplusplus extern "C" { diff --git a/include/onnxruntime/core/providers/mkldnn/mkldnn_provider_factory.h b/include/onnxruntime/core/providers/mkldnn/mkldnn_provider_factory.h index 03ef1158eeef5..a54b522d9e79f 100644 --- a/include/onnxruntime/core/providers/mkldnn/mkldnn_provider_factory.h +++ b/include/onnxruntime/core/providers/mkldnn/mkldnn_provider_factory.h @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/session/onnxruntime_c_api.h" +#include "onnxruntime_c_api.h" #ifdef __cplusplus extern "C" { diff --git a/include/onnxruntime/core/providers/ngraph/ngraph_provider_factory.h b/include/onnxruntime/core/providers/ngraph/ngraph_provider_factory.h index 0970362a2b557..87d98cdbdd34a 100644 --- a/include/onnxruntime/core/providers/ngraph/ngraph_provider_factory.h +++ b/include/onnxruntime/core/providers/ngraph/ngraph_provider_factory.h @@ -1,7 +1,7 @@ // Copyright(C) 2019 Intel Corporation // Licensed under the MIT License -#include "core/session/onnxruntime_c_api.h" +#include "onnxruntime_c_api.h" #ifdef __cplusplus extern "C" { diff --git a/include/onnxruntime/core/providers/nnapi/nnapi_provider_factory.h b/include/onnxruntime/core/providers/nnapi/nnapi_provider_factory.h new file mode 100644 index 0000000000000..d8b6a1ec27634 --- /dev/null +++ b/include/onnxruntime/core/providers/nnapi/nnapi_provider_factory.h @@ -0,0 +1,15 @@ +// Copyright 2019 JD.com Inc. JD AI + +#include "onnxruntime_c_api.h" + +#ifdef __cplusplus +extern "C" { +#endif + +ORT_API_STATUS(OrtSessionOptionsAppendExecutionProvider_Nnapi, _In_ OrtSessionOptions* options); + +#ifdef __cplusplus +} +#endif + + diff --git a/include/onnxruntime/core/providers/openvino/openvino_provider_factory.h b/include/onnxruntime/core/providers/openvino/openvino_provider_factory.h index eadcd45603762..08200319c71a2 100644 --- a/include/onnxruntime/core/providers/openvino/openvino_provider_factory.h +++ b/include/onnxruntime/core/providers/openvino/openvino_provider_factory.h @@ -1,7 +1,7 @@ // Copyright(C) 2019 Intel Corporation // Licensed under the MIT License -#include "core/session/onnxruntime_c_api.h" +#include "onnxruntime_c_api.h" #ifdef __cplusplus extern "C" { diff --git a/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.h b/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.h index bee1ae1b0939c..fb077fc5ff41d 100644 --- a/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.h +++ b/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.h @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#include "core/session/onnxruntime_c_api.h" +#include "onnxruntime_c_api.h" #ifdef __cplusplus extern "C" { diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index 03bf3a4467df3..6848fc31e453c 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -18,6 +18,7 @@ extern "C" { #define _In_ #define _In_opt_ #define _Out_ +#define _Outptr_ #define _Out_opt_ #define _Inout_ #define _Inout_opt_ @@ -58,7 +59,6 @@ extern "C" { #ifdef __cplusplus // Windows users should use unicode paths when possible to bypass the MAX_PATH limitation -// Every type name starting with 'P' is a pointer type, an opaque handler // Every pointer marked with _In_ or _Out_, cannot be NULL. Caller should ensure that. // for ReleaseXXX(...) functions, they can accept NULL pointer. #define NO_EXCEPTION noexcept @@ -152,6 +152,7 @@ ORT_RUNTIME_CLASS(TensorTypeAndShapeInfo); ORT_RUNTIME_CLASS(SessionOptions); ORT_RUNTIME_CLASS(Callback); ORT_RUNTIME_CLASS(CustomOpDomain); +ORT_RUNTIME_CLASS(Allocator); // When passing in an allocator to any ORT function, be sure that the allocator object // is not destroyed until the last allocated object using it is freed. @@ -169,76 +170,76 @@ typedef void(ORT_API_CALL* OrtLoggingFunction)( /** * \param out Should be freed by `OrtReleaseEnv` after use */ -ORT_API_STATUS(OrtCreateEnv, OrtLoggingLevel default_warning_level, _In_ const char* logid, _Out_ OrtEnv** out) +ORT_API_STATUS(OrtCreateEnv, OrtLoggingLevel default_logging_level, _In_ const char* logid, _Outptr_ OrtEnv** out) ORT_ALL_ARGS_NONNULL; /** * \param out Should be freed by `OrtReleaseEnv` after use */ ORT_API_STATUS(OrtCreateEnvWithCustomLogger, OrtLoggingFunction logging_function, - _In_opt_ void* logger_param, OrtLoggingLevel default_warning_level, + _In_opt_ void* logger_param, OrtLoggingLevel default_logging_level, _In_ const char* logid, - _Out_ OrtEnv** out); + _Outptr_ OrtEnv** out); // TODO: document the path separator convention? '/' vs '\' // TODO: should specify the access characteristics of model_path. Is this read only during the // execution of OrtCreateSession, or does the OrtSession retain a handle to the file/directory // and continue to access throughout the OrtSession lifetime? // What sort of access is needed to model_path : read or read/write? 
-ORT_API_STATUS(OrtCreateSession, _In_ OrtEnv* env, _In_ const ORTCHAR_T* model_path, - _In_ const OrtSessionOptions* options, _Out_ OrtSession** out); +ORT_API_STATUS(OrtCreateSession, _In_ const OrtEnv* env, _In_ const ORTCHAR_T* model_path, + _In_ const OrtSessionOptions* options, _Outptr_ OrtSession** out); -ORT_API_STATUS(OrtCreateSessionFromArray, _In_ OrtEnv* env, _In_ const void* model_data, size_t model_data_length, - _In_ const OrtSessionOptions* options, _Out_ OrtSession** out); +ORT_API_STATUS(OrtCreateSessionFromArray, _In_ const OrtEnv* env, _In_ const void* model_data, size_t model_data_length, + _In_ const OrtSessionOptions* options, _Outptr_ OrtSession** out); ORT_API_STATUS(OrtRun, _Inout_ OrtSession* sess, - _In_ const OrtRunOptions* run_options, + _In_opt_ const OrtRunOptions* run_options, _In_ const char* const* input_names, _In_ const OrtValue* const* input, size_t input_len, - _In_ const char* const* output_names, size_t output_names_len, _Out_ OrtValue** output); + _In_ const char* const* output_names, size_t output_names_len, _Outptr_ OrtValue** out); /** * \return A pointer of the newly created object. The pointer should be freed by OrtReleaseSessionOptions after use */ -ORT_API_STATUS(OrtCreateSessionOptions, _Out_ OrtSessionOptions** output); +ORT_API_STATUS(OrtCreateSessionOptions, _Outptr_ OrtSessionOptions** options); // create a copy of an existing OrtSessionOptions -ORT_API_STATUS(OrtCloneSessionOptions, _In_ OrtSessionOptions* in, _Out_ OrtSessionOptions** output); -ORT_API_STATUS(OrtEnableSequentialExecution, _In_ OrtSessionOptions* options); -ORT_API_STATUS(OrtDisableSequentialExecution, _In_ OrtSessionOptions* options); +ORT_API_STATUS(OrtCloneSessionOptions, _In_ const OrtSessionOptions* in_options, _Outptr_ OrtSessionOptions** out_options); +ORT_API_STATUS(OrtEnableSequentialExecution, _Inout_ OrtSessionOptions* options); +ORT_API_STATUS(OrtDisableSequentialExecution, _Inout_ OrtSessionOptions* options); // Enable profiling for this session. -ORT_API_STATUS(OrtEnableProfiling, _In_ OrtSessionOptions* options, _In_ const ORTCHAR_T* profile_file_prefix); -ORT_API_STATUS(OrtDisableProfiling, _In_ OrtSessionOptions* options); +ORT_API_STATUS(OrtEnableProfiling, _Inout_ OrtSessionOptions* options, _In_ const ORTCHAR_T* profile_file_prefix); +ORT_API_STATUS(OrtDisableProfiling, _Inout_ OrtSessionOptions* options); // Enable the memory pattern optimization. // The idea is if the input shapes are the same, we could trace the internal memory allocation // and generate a memory pattern for future request. So next time we could just do one allocation // with a big chunk for all the internal memory allocation. // Note: memory pattern optimization is only available when SequentialExecution enabled. -ORT_API_STATUS(OrtEnableMemPattern, _In_ OrtSessionOptions* options); -ORT_API_STATUS(OrtDisableMemPattern, _In_ OrtSessionOptions* options); +ORT_API_STATUS(OrtEnableMemPattern, _Inout_ OrtSessionOptions* options); +ORT_API_STATUS(OrtDisableMemPattern, _Inout_ OrtSessionOptions* options); // Enable the memory arena on CPU // Arena may pre-allocate memory for future usage. // set this option to false if you don't want it. 
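A compressed sketch of driving the revised C API (const OrtEnv*, _Outptr_ out parameters): create an environment and session options, enable the memory-pattern optimization, and open a session. RunModel, the Check helper, and "model.onnx" are placeholders, and the narrow model path assumes a non-Windows build (ORTCHAR_T is wchar_t on Windows).
~~~
#include <stdio.h>
#include "onnxruntime_c_api.h"

static int Check(OrtStatus* status) {
  if (status != NULL) {
    fprintf(stderr, "%s\n", OrtGetErrorMessage(status));
    OrtReleaseStatus(status);
    return 0;
  }
  return 1;
}

int RunModel(void) {
  OrtEnv* env = NULL;
  OrtSessionOptions* so = NULL;
  OrtSession* session = NULL;

  if (!Check(OrtCreateEnv(ORT_LOGGING_LEVEL_WARNING, "demo", &env))) return 1;
  if (!Check(OrtCreateSessionOptions(&so))) return 1;

  // Memory-pattern optimization requires sequential execution (the default).
  if (!Check(OrtEnableSequentialExecution(so))) return 1;
  if (!Check(OrtEnableMemPattern(so))) return 1;

  if (!Check(OrtCreateSession(env, "model.onnx", so, &session))) return 1;

  // ... build inputs and call OrtRun(session, NULL, ...) here; run_options is
  // now _In_opt_ and may be NULL.

  OrtReleaseSession(session);
  OrtReleaseSessionOptions(so);
  OrtReleaseEnv(env);
  return 0;
}
~~~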
-ORT_API_STATUS(OrtEnableCpuMemArena, _In_ OrtSessionOptions* options); -ORT_API_STATUS(OrtDisableCpuMemArena, _In_ OrtSessionOptions* options); +ORT_API_STATUS(OrtEnableCpuMemArena, _Inout_ OrtSessionOptions* options); +ORT_API_STATUS(OrtDisableCpuMemArena, _Inout_ OrtSessionOptions* options); // < logger id to use for session output -ORT_API_STATUS(OrtSetSessionLogId, _In_ OrtSessionOptions* options, const char* logid); +ORT_API_STATUS(OrtSetSessionLogId, _Inout_ OrtSessionOptions* options, const char* logid); // < applies to session load, initialization, etc -ORT_API_STATUS(OrtSetSessionLogVerbosityLevel, _In_ OrtSessionOptions* options, uint32_t session_log_verbosity_level); +ORT_API_STATUS(OrtSetSessionLogVerbosityLevel, _Inout_ OrtSessionOptions* options, int session_log_verbosity_level); // Set Graph optimization level. // Available options are : 0, 1, 2. // 0 -> Disable all optimizations // 1 -> Enable basic optimizations // 2 -> Enable all optimizations -ORT_API_STATUS(OrtSetSessionGraphOptimizationLevel, _In_ OrtSessionOptions* options, uint32_t graph_optimization_level); +ORT_API_STATUS(OrtSetSessionGraphOptimizationLevel, _Inout_ OrtSessionOptions* options, int graph_optimization_level); // How many threads in the session thread pool. -ORT_API_STATUS(OrtSetSessionThreadPoolSize, _In_ OrtSessionOptions* options, int session_thread_pool_size); +ORT_API_STATUS(OrtSetSessionThreadPoolSize, _Inout_ OrtSessionOptions* options, int session_thread_pool_size); /** * To use additional providers, you must build ORT with the extra providers enabled. Then call one of these @@ -257,35 +258,36 @@ ORT_API_STATUS(OrtSessionGetOutputCount, _In_ const OrtSession* sess, _Out_ size /** * \param out should be freed by OrtReleaseTypeInfo after use */ -ORT_API_STATUS(OrtSessionGetInputTypeInfo, _In_ const OrtSession* sess, size_t index, _Out_ OrtTypeInfo** out); +ORT_API_STATUS(OrtSessionGetInputTypeInfo, _In_ const OrtSession* sess, size_t index, _Outptr_ OrtTypeInfo** type_info); /** * \param out should be freed by OrtReleaseTypeInfo after use */ -ORT_API_STATUS(OrtSessionGetOutputTypeInfo, _In_ const OrtSession* sess, size_t index, _Out_ OrtTypeInfo** out); +ORT_API_STATUS(OrtSessionGetOutputTypeInfo, _In_ const OrtSession* sess, size_t index, _Outptr_ OrtTypeInfo** type_info); /** * \param value is set to a null terminated string allocated using 'allocator'. The caller is responsible in freeing it. */ ORT_API_STATUS(OrtSessionGetInputName, _In_ const OrtSession* sess, size_t index, - _Inout_ OrtAllocator* allocator, _Out_ char** value); + _Inout_ OrtAllocator* allocator, _Outptr_ char** value); ORT_API_STATUS(OrtSessionGetOutputName, _In_ const OrtSession* sess, size_t index, - _Inout_ OrtAllocator* allocator, _Out_ char** value); + _Inout_ OrtAllocator* allocator, _Outptr_ char** value); /** * \return A pointer to the newly created object. 
The pointer should be freed by OrtReleaseRunOptions after use */ -ORT_API_STATUS(OrtCreateRunOptions, _Out_ OrtRunOptions** out); +ORT_API_STATUS(OrtCreateRunOptions, _Outptr_ OrtRunOptions** out); -ORT_API_STATUS(OrtRunOptionsSetRunLogVerbosityLevel, _In_ OrtRunOptions*, unsigned int); +ORT_API_STATUS(OrtRunOptionsSetRunLogVerbosityLevel, _Inout_ OrtRunOptions* options, int value); ORT_API_STATUS(OrtRunOptionsSetRunTag, _In_ OrtRunOptions*, _In_ const char* run_tag); -ORT_API_STATUS(OrtRunOptionsGetRunLogVerbosityLevel, _In_ OrtRunOptions*, _Out_ unsigned int* out); -ORT_API_STATUS(OrtRunOptionsGetRunTag, _In_ OrtRunOptions*, _Out_ const char** out); +ORT_API_STATUS(OrtRunOptionsGetRunLogVerbosityLevel, _In_ const OrtRunOptions* options, _Out_ int* out); +ORT_API_STATUS(OrtRunOptionsGetRunTag, _In_ const OrtRunOptions*, _Out_ const char** out); // Set a flag so that any running OrtRun* calls that are using this instance of OrtRunOptions // will exit as soon as possible if the flag is true. -ORT_API_STATUS(OrtRunOptionsSetTerminate, _In_ OrtRunOptions*, _In_ int flag); +ORT_API_STATUS(OrtRunOptionsEnableTerminate, _Inout_ OrtRunOptions* options); +ORT_API_STATUS(OrtRunOptionsDisableTerminate, _Inout_ OrtRunOptions* options); /** * Create a tensor from an allocator. OrtReleaseValue will also release the buffer inside the output value @@ -294,7 +296,7 @@ ORT_API_STATUS(OrtRunOptionsSetTerminate, _In_ OrtRunOptions*, _In_ int flag); */ ORT_API_STATUS(OrtCreateTensorAsOrtValue, _Inout_ OrtAllocator* allocator, _In_ const int64_t* shape, size_t shape_len, ONNXTensorElementDataType type, - _Out_ OrtValue** out); + _Outptr_ OrtValue** out); /** * Create a tensor with user's buffer. You can fill the buffer either before calling this function or after. @@ -303,11 +305,11 @@ ORT_API_STATUS(OrtCreateTensorAsOrtValue, _Inout_ OrtAllocator* allocator, */ ORT_API_STATUS(OrtCreateTensorWithDataAsOrtValue, _In_ const OrtAllocatorInfo* info, _Inout_ void* p_data, size_t p_data_len, _In_ const int64_t* shape, size_t shape_len, - ONNXTensorElementDataType type, _Out_ OrtValue** out); + ONNXTensorElementDataType type, _Outptr_ OrtValue** out); // This function doesn't work with string tensor // this is a no-copy method whose pointer is only valid until the backing OrtValue is free'd. -ORT_API_STATUS(OrtGetTensorMutableData, _Inout_ OrtValue* value, _Out_ void** out); +ORT_API_STATUS(OrtGetTensorMutableData, _Inout_ OrtValue* value, _Outptr_ void** out); /** * \Sets *out to 1 iff an OrtValue is a tensor, 0 otherwise @@ -319,7 +321,7 @@ ORT_API_STATUS(OrtIsTensor, _In_ const OrtValue* value, _Out_ int* out); * \param s each A string array. Each string in this array must be null terminated. * \param s_len length of s */ -ORT_API_STATUS(OrtFillStringTensor, _In_ OrtValue* value, _In_ const char* const* s, size_t s_len); +ORT_API_STATUS(OrtFillStringTensor, _Inout_ OrtValue* value, _In_ const char* const* s, size_t s_len); /** * \param value A tensor created from OrtCreateTensor... function. * \param len total data length, not including the trailing '\0' chars. 
@@ -350,7 +352,7 @@ ORT_API_STATUS(OrtGetStringTensorContent, _In_ const OrtValue* value, _Out_ void */ ORT_API_STATUS(OrtTensorProtoToOrtValue, _In_ const void* input, int input_len, _In_opt_ const ORTCHAR_T* input_file_path, _Inout_ void* preallocated, size_t preallocated_size, - _Out_ OrtValue** out, _Out_ OrtCallback** deleter); + _Outptr_ OrtValue** out, _Outptr_ OrtCallback** deleter); /** * f will be freed in this call @@ -366,19 +368,19 @@ ORT_API_STATUS(OrtGetTensorMemSizeInBytesFromTensorProto, _In_ const void* input /** * Don't free the 'out' value */ -ORT_API_STATUS(OrtCastTypeInfoToTensorInfo, _In_ OrtTypeInfo*, _Out_ const OrtTensorTypeAndShapeInfo** out); +ORT_API_STATUS(OrtCastTypeInfoToTensorInfo, _In_ const OrtTypeInfo*, _Out_ const OrtTensorTypeAndShapeInfo** out); /** * Return OnnxType from OrtTypeInfo */ -ORT_API_STATUS(OrtOnnxTypeFromTypeInfo, _In_ const OrtTypeInfo*, _Out_ enum ONNXType* out); +ORT_API_STATUS(OrtGetOnnxTypeFromTypeInfo, _In_ const OrtTypeInfo*, _Out_ enum ONNXType* out); /** * The 'out' value should be released by calling OrtReleaseTensorTypeAndShapeInfo */ -ORT_API_STATUS(OrtCreateTensorTypeAndShapeInfo, OrtTensorTypeAndShapeInfo** out); +ORT_API_STATUS(OrtCreateTensorTypeAndShapeInfo, _Outptr_ OrtTensorTypeAndShapeInfo** out); -ORT_API_STATUS(OrtSetTensorElementType, _In_ OrtTensorTypeAndShapeInfo*, enum ONNXTensorElementDataType type); +ORT_API_STATUS(OrtSetTensorElementType, _Inout_ OrtTensorTypeAndShapeInfo*, enum ONNXTensorElementDataType type); /** * \param info Created from OrtCreateTensorTypeAndShapeInfo() function @@ -405,14 +407,14 @@ ORT_API_STATUS(OrtGetTensorShapeElementCount, _In_ const OrtTensorTypeAndShapeIn /** * \param out Should be freed by OrtReleaseTensorTypeAndShapeInfo after use */ -ORT_API_STATUS(OrtGetTensorTypeAndShape, _In_ const OrtValue* value, _Out_ OrtTensorTypeAndShapeInfo** out); +ORT_API_STATUS(OrtGetTensorTypeAndShape, _In_ const OrtValue* value, _Outptr_ OrtTensorTypeAndShapeInfo** out); /** * Get the type information of an OrtValue * \param value * \param out The returned value should be freed by OrtReleaseTypeInfo after use */ -ORT_API_STATUS(OrtGetTypeInfo, _In_ const OrtValue* value, OrtTypeInfo** out); +ORT_API_STATUS(OrtGetTypeInfo, _In_ const OrtValue* value, _Outptr_ OrtTypeInfo** out); ORT_API_STATUS(OrtGetValueType, _In_ const OrtValue* value, _Out_ enum ONNXType* out); @@ -432,12 +434,12 @@ typedef enum OrtMemType { OrtMemTypeDefault = 0, // the default allocator for execution provider } OrtMemType; -ORT_API_STATUS(OrtCreateAllocatorInfo, _In_ const char* name1, enum OrtAllocatorType type, int id1, enum OrtMemType mem_type1, _Out_ OrtAllocatorInfo** out); +ORT_API_STATUS(OrtCreateAllocatorInfo, _In_ const char* name1, enum OrtAllocatorType type, int id1, enum OrtMemType mem_type1, _Outptr_ OrtAllocatorInfo** out); /** * Convenience function for special case of OrtCreateAllocatorInfo, for the CPU allocator. Uses name = "Cpu" and id = 0. 
*/ -ORT_API_STATUS(OrtCreateCpuAllocatorInfo, enum OrtAllocatorType type, enum OrtMemType mem_type1, _Out_ OrtAllocatorInfo** out) +ORT_API_STATUS(OrtCreateCpuAllocatorInfo, enum OrtAllocatorType type, enum OrtMemType mem_type1, _Outptr_ OrtAllocatorInfo** out) ORT_ALL_ARGS_NONNULL; /** @@ -450,17 +452,16 @@ ORT_ALL_ARGS_NONNULL; /** * Do not free the returned value */ -ORT_API_STATUS(OrtAllocatorInfoGetName, _In_ OrtAllocatorInfo* ptr, _Out_ const char** out); -ORT_API_STATUS(OrtAllocatorInfoGetId, _In_ OrtAllocatorInfo* ptr, _Out_ int* out); -ORT_API_STATUS(OrtAllocatorInfoGetMemType, _In_ OrtAllocatorInfo* ptr, _Out_ OrtMemType* out); -ORT_API_STATUS(OrtAllocatorInfoGetType, _In_ OrtAllocatorInfo* ptr, _Out_ OrtAllocatorType* out); +ORT_API_STATUS(OrtAllocatorInfoGetName, _In_ const OrtAllocatorInfo* ptr, _Out_ const char** out); +ORT_API_STATUS(OrtAllocatorInfoGetId, _In_ const OrtAllocatorInfo* ptr, _Out_ int* out); +ORT_API_STATUS(OrtAllocatorInfoGetMemType, _In_ const OrtAllocatorInfo* ptr, _Out_ OrtMemType* out); +ORT_API_STATUS(OrtAllocatorInfoGetType, _In_ const OrtAllocatorInfo* ptr, _Out_ OrtAllocatorType* out); -ORT_API_STATUS(OrtAllocatorAlloc, _Inout_ OrtAllocator* ptr, size_t size, _Out_ void** out); +ORT_API_STATUS(OrtAllocatorAlloc, _Inout_ OrtAllocator* ptr, size_t size, _Outptr_ void** out); ORT_API_STATUS(OrtAllocatorFree, _Inout_ OrtAllocator* ptr, void* p); ORT_API_STATUS(OrtAllocatorGetInfo, _In_ const OrtAllocator* ptr, _Out_ const OrtAllocatorInfo** out); -ORT_API_STATUS(OrtCreateDefaultAllocator, _Out_ OrtAllocator** out); -ORT_API(void, OrtReleaseAllocator, _In_ OrtAllocator* allocator); +ORT_API_STATUS(OrtCreateDefaultAllocator, _Outptr_ OrtAllocator** out); ORT_API(const char*, OrtGetVersionString); /** @@ -509,13 +510,13 @@ ORT_ALL_ARGS_NONNULL; * If input OrtValue represents a sequence, use index to retrieve the index'th element * of the sequence. */ -ORT_API_STATUS(OrtGetValue, const OrtValue* value, int index, OrtAllocator* allocator, OrtValue** out); +ORT_API_STATUS(OrtGetValue, _In_ const OrtValue* value, int index, _Inout_ OrtAllocator* allocator, _Outptr_ OrtValue** out); /** * Returns 2 for type map and N for sequence where N is the number of elements * in the sequence. */ -ORT_API_STATUS(OrtGetValueCount, const OrtValue* value, size_t* out); +ORT_API_STATUS(OrtGetValueCount, _In_ const OrtValue* value, _Out_ size_t* out); /** * To construct a map, use num_values = 2 and 'in' should be an arrary of 2 OrtValues @@ -524,8 +525,8 @@ ORT_API_STATUS(OrtGetValueCount, const OrtValue* value, size_t* out); * sequence. 'in' should be an arrary of N OrtValues. * \value_type should be either map or sequence. */ -ORT_API_STATUS(OrtCreateValue, OrtValue** in, size_t num_values, enum ONNXType value_type, - OrtValue** out); +ORT_API_STATUS(OrtCreateValue, _In_ const OrtValue* const* in, size_t num_values, enum ONNXType value_type, + _Outptr_ OrtValue** out); /* * EXPERIMENTAL APIS - Subject to change. 
Released as a preview to get feedback and enable early testing @@ -548,8 +549,9 @@ struct OrtCustomOpApi { */ OrtStatus*(ORT_API_CALL* KernelInfoGetAttribute_float)(_In_ const OrtKernelInfo* info, _In_ const char* name, _Out_ float* out); OrtStatus*(ORT_API_CALL* KernelInfoGetAttribute_int64)(_In_ const OrtKernelInfo* info, _In_ const char* name, _Out_ int64_t* out); + OrtStatus*(ORT_API_CALL* KernelInfoGetAttribute_string)(_In_ const OrtKernelInfo* info, _In_ const char* name, _Out_ char* out, _Inout_ size_t* size); - OrtStatus*(ORT_API_CALL* GetTensorTypeAndShape)(_In_ const OrtValue* value, _Out_ OrtTensorTypeAndShapeInfo** out); + OrtStatus*(ORT_API_CALL* GetTensorTypeAndShape)(_In_ const OrtValue* value, _Outptr_ OrtTensorTypeAndShapeInfo** out); OrtStatus*(ORT_API_CALL* GetTensorShapeElementCount)(_In_ const OrtTensorTypeAndShapeInfo* info, _Out_ size_t* out); OrtStatus*(ORT_API_CALL* GetTensorElementType)(_In_ const OrtTensorTypeAndShapeInfo*, _Out_ enum ONNXTensorElementDataType* out); @@ -557,14 +559,14 @@ struct OrtCustomOpApi { OrtStatus*(ORT_API_CALL* GetDimensionCount)(_In_ const OrtTensorTypeAndShapeInfo* info, _Out_ size_t* out); OrtStatus*(ORT_API_CALL* GetDimensions)(_In_ const OrtTensorTypeAndShapeInfo* info, _Out_ int64_t* dim_values, size_t dim_values_length); OrtStatus*(ORT_API_CALL* SetDimensions)(OrtTensorTypeAndShapeInfo* info, _In_ const int64_t* dim_values, size_t dim_count); - OrtStatus*(ORT_API_CALL* GetTensorMutableData)(_Inout_ OrtValue* value, _Out_ void** data); + OrtStatus*(ORT_API_CALL* GetTensorMutableData)(_Inout_ OrtValue* value, _Outptr_ void** data); - void(ORT_API_CALL* ReleaseTensorTypeAndShapeInfo)(OrtTensorTypeAndShapeInfo* input); + void(ORT_API_CALL* ReleaseTensorTypeAndShapeInfo)(_In_ OrtTensorTypeAndShapeInfo* input); - OrtStatus*(ORT_API_CALL* KernelContext_GetInputCount)(const OrtKernelContext* context, _Out_ size_t* out); - OrtStatus*(ORT_API_CALL* KernelContext_GetInput)(const OrtKernelContext* context, _In_ size_t index, _Out_ const OrtValue** out); - OrtStatus*(ORT_API_CALL* KernelContext_GetOutputCount)(const OrtKernelContext* context, _Out_ size_t* out); - OrtStatus*(ORT_API_CALL* KernelContext_GetOutput)(OrtKernelContext* context, _In_ size_t index, _In_ const int64_t* dim_values, size_t dim_count, _Out_ OrtValue** out); + OrtStatus*(ORT_API_CALL* KernelContext_GetInputCount)(_In_ const OrtKernelContext* context, _Out_ size_t* out); + OrtStatus*(ORT_API_CALL* KernelContext_GetInput)(_In_ const OrtKernelContext* context, _In_ size_t index, _Out_ const OrtValue** out); + OrtStatus*(ORT_API_CALL* KernelContext_GetOutputCount)(_In_ const OrtKernelContext* context, _Out_ size_t* out); + OrtStatus*(ORT_API_CALL* KernelContext_GetOutput)(_Inout_ OrtKernelContext* context, _In_ size_t index, _In_ const int64_t* dim_values, size_t dim_count, _Outptr_ OrtValue** out); }; typedef struct OrtCustomOpApi OrtCustomOpApi; @@ -599,19 +601,19 @@ typedef struct OrtCustomOp OrtCustomOp; /* * Create a custom op domain. 
After all sessions using it are released, call OrtReleaseCustomOpDomain */ -ORT_API_STATUS(OrtCreateCustomOpDomain, _In_ const char* domain, _Out_ OrtCustomOpDomain** out); +ORT_API_STATUS(OrtCreateCustomOpDomain, _In_ const char* domain, _Outptr_ OrtCustomOpDomain** out); /* * Add custom ops to the OrtCustomOpDomain * Note: The OrtCustomOp* pointer must remain valid until the OrtCustomOpDomain using it is released */ -ORT_API_STATUS(OrtCustomOpDomain_Add, _In_ OrtCustomOpDomain* custom_op_domain, _In_ OrtCustomOp* op); +ORT_API_STATUS(OrtCustomOpDomain_Add, _Inout_ OrtCustomOpDomain* custom_op_domain, _In_ OrtCustomOp* op); /* * Add a custom op domain to the OrtSessionOptions * Note: The OrtCustomOpDomain* must not be deleted until the sessions using it are released */ -ORT_API_STATUS(OrtAddCustomOpDomain, _In_ OrtSessionOptions* options, OrtCustomOpDomain* custom_op_domain); +ORT_API_STATUS(OrtAddCustomOpDomain, _Inout_ OrtSessionOptions* options, _In_ OrtCustomOpDomain* custom_op_domain); /* * END EXPERIMENTAL */ diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h index df15d2d2ecde6..e21e87596781e 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h @@ -73,8 +73,8 @@ struct Base { protected: Base(const Base&) = delete; - Base(Base&& v) : p_{v.p_} { v.p_ = nullptr; } - void operator=(Base&& v) { + Base(Base&& v) noexcept : p_{v.p_} { v.p_ = nullptr; } + void operator=(Base&& v) noexcept { OrtRelease(p_); p_ = v.p_; v.p_ = nullptr; @@ -101,8 +101,8 @@ struct Value; struct Env : Base { Env(nullptr_t) {} - Env(OrtLoggingLevel default_warning_level, _In_ const char* logid); - Env(OrtLoggingLevel default_warning_level, const char* logid, OrtLoggingFunction logging_function, void* logger_param); + Env(OrtLoggingLevel default_logging_level, _In_ const char* logid); + Env(OrtLoggingLevel default_logging_level, const char* logid, OrtLoggingFunction logging_function, void* logger_param); explicit Env(OrtEnv* p) : Base{p} {} }; @@ -117,13 +117,14 @@ struct RunOptions : Base { RunOptions(nullptr_t) {} RunOptions(); - RunOptions& SetRunLogVerbosityLevel(unsigned int); - unsigned int GetRunLogVerbosityLevel() const; + RunOptions& SetRunLogVerbosityLevel(int); + int GetRunLogVerbosityLevel() const; RunOptions& SetRunTag(const char* run_tag); const char* GetRunTag() const; - RunOptions& SetTerminate(bool flag); + RunOptions& EnableTerminate(); + RunOptions& DisableTerminate(); }; struct SessionOptions : Base { @@ -134,7 +135,7 @@ struct SessionOptions : Base { SessionOptions Clone() const; SessionOptions& SetThreadPoolSize(int session_thread_pool_size); - SessionOptions& SetGraphOptimizationLevel(uint32_t graph_optimization_level); + SessionOptions& SetGraphOptimizationLevel(int graph_optimization_level); SessionOptions& EnableCpuMemArena(); SessionOptions& DisableCpuMemArena(); @@ -252,7 +253,7 @@ struct AllocatorInfo : Base { struct CustomOpApi { CustomOpApi(const OrtCustomOpApi& api) : api_(api) {} - template // T is only implemented for float and int64_t + template // T is only implemented for float, int64_t, and string T KernelInfoGetAttribute(_In_ const OrtKernelInfo* info, _In_ const char* name); OrtTensorTypeAndShapeInfo* GetTensorTypeAndShape(_In_ const OrtValue* value); diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h index 970155aeaa383..0fbbbde445b16 100644 --- 
a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +// Don't include this file directly. Please include "onnxruntime_cxx_api.h" instead. // These are the inline implementations of the C++ header APIs. They're in this separate file as to not clutter // the main C++ file with implementation details. @@ -90,13 +91,13 @@ inline RunOptions::RunOptions() { ORT_THROW_ON_ERROR(OrtCreateRunOptions(&p_)); } -inline RunOptions& RunOptions::SetRunLogVerbosityLevel(unsigned int level) { +inline RunOptions& RunOptions::SetRunLogVerbosityLevel(int level) { ORT_THROW_ON_ERROR(OrtRunOptionsSetRunLogVerbosityLevel(p_, level)); return *this; } -inline unsigned int RunOptions::GetRunLogVerbosityLevel() const { - unsigned int out; +inline int RunOptions::GetRunLogVerbosityLevel() const { + int out; ORT_THROW_ON_ERROR(OrtRunOptionsGetRunLogVerbosityLevel(p_, &out)); return out; } @@ -112,8 +113,13 @@ inline const char* RunOptions::GetRunTag() const { return out; } -inline RunOptions& RunOptions::SetTerminate(bool flag) { - ORT_THROW_ON_ERROR(OrtRunOptionsSetTerminate(p_, flag ? 1 : 0)); +inline RunOptions& RunOptions::EnableTerminate() { + ORT_THROW_ON_ERROR(OrtRunOptionsEnableTerminate(p_)); + return *this; +} + +inline RunOptions& RunOptions::DisableTerminate() { + ORT_THROW_ON_ERROR(OrtRunOptionsDisableTerminate(p_)); return *this; } @@ -132,7 +138,7 @@ inline SessionOptions& SessionOptions::SetThreadPoolSize(int session_thread_pool return *this; } -inline SessionOptions& SessionOptions::SetGraphOptimizationLevel(uint32_t graph_optimization_level) { +inline SessionOptions& SessionOptions::SetGraphOptimizationLevel(int graph_optimization_level) { ORT_THROW_ON_ERROR(OrtSetSessionGraphOptimizationLevel(p_, graph_optimization_level)); return *this; } @@ -283,7 +289,7 @@ inline Unowned TypeInfo::GetTensorTypeAndShapeInfo() con inline ONNXType TypeInfo::GetONNXType() const { ONNXType out; - ORT_THROW_ON_ERROR(OrtOnnxTypeFromTypeInfo(p_, &out)); + ORT_THROW_ON_ERROR(OrtGetOnnxTypeFromTypeInfo(p_, &out)); return out; } @@ -393,6 +399,24 @@ inline int64_t CustomOpApi::KernelInfoGetAttribute(_In_ const OrtKernel return out; } +template <> +inline std::string CustomOpApi::KernelInfoGetAttribute(_In_ const OrtKernelInfo* info, _In_ const char* name) { + size_t size = 0; + std::string out; + OrtStatus* status = api_.KernelInfoGetAttribute_string(info, name, nullptr, &size); + + // The status should be ORT_INVALID_ARGUMENT because the size is insufficient to hold the string + if (OrtGetErrorCode(status) == ORT_INVALID_ARGUMENT) { + OrtReleaseStatus(status); + out.resize(size); + ORT_THROW_ON_ERROR(api_.KernelInfoGetAttribute_string(info, name, &out[0], &size)); + out.resize(size - 1); // remove the terminating character '\0' + } else { + ORT_THROW_ON_ERROR(status); + } + return out; +} + inline OrtTensorTypeAndShapeInfo* CustomOpApi::GetTensorTypeAndShape(_In_ const OrtValue* value) { OrtTensorTypeAndShapeInfo* out; ORT_THROW_ON_ERROR(api_.GetTensorTypeAndShape(value, &out)); diff --git a/onnxruntime/__init__.py b/onnxruntime/__init__.py index ad73baf144239..29e8f5fb33ebf 100644 --- a/onnxruntime/__init__.py +++ b/onnxruntime/__init__.py @@ -12,7 +12,7 @@ as Deep Learning algorithms in the `ONNX-ML format `_. 
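A short sketch of the reworked RunOptions surface and the new string attribute support, using the C++ wrapper declared above; ConfigureRun, ReadModeAttribute, and the "mode" attribute name are hypothetical.
~~~
#include <string>
#include "onnxruntime_cxx_api.h"

void ConfigureRun() {
  Ort::RunOptions run_options;             // wraps OrtCreateRunOptions
  run_options.SetRunLogVerbosityLevel(1);  // now takes a plain int
  run_options.SetRunTag("request-42");

  run_options.EnableTerminate();           // replaces SetTerminate(true)
  run_options.DisableTerminate();          // replaces SetTerminate(false)
}

std::string ReadModeAttribute(Ort::CustomOpApi& ort, const OrtKernelInfo* info) {
  // Uses the new KernelInfoGetAttribute_string plumbing; throws if the
  // attribute is missing.
  return ort.KernelInfoGetAttribute<std::string>(info, "mode");
}
~~~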
""" -__version__ = "0.4.0" +__version__ = "0.5.0" __author__ = "Microsoft" from onnxruntime.capi import onnxruntime_validation diff --git a/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.cc b/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.cc index 23eb0cc8e1424..7f7102475c620 100644 --- a/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.cc +++ b/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.cc @@ -228,77 +228,122 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const { gsl::span last_cell_2 = last_cell.subspan(last_cell_size_per_direction, last_cell_size_per_direction); - auto fam = std::make_unique>( - alloc, logger, batch_size, max_memory_step, memory_depth, query_depth, am_attn_size, false); - fam->SetWeights( + BahdanauAttention fam( + alloc, + logger, + batch_size, + max_memory_step, + memory_depth, + query_depth, + am_attn_size, + false); + + fam.SetWeights( FirstHalfSpan(am_v_weights.DataAsSpan()), FirstHalfSpan(am_query_layer_weights.DataAsSpan()), FirstHalfSpan(am_memory_layer_weights.DataAsSpan())); - fam->PrepareMemory(attn_memory.DataAsSpan(), memory_seq_lens_span); - - auto faw = std::make_unique>( - alloc, logger, batch_size, memory_depth, attn_layer_depth, hidden_size_, has_attention_layer, *fam); - faw->SetWeights(FirstHalfSpan(attn_layer_weights_span)); - - auto fw = std::make_unique>( + fam.PrepareMemory(attn_memory.DataAsSpan(), memory_seq_lens_span); + + AttentionWrapper faw( + alloc, + logger, + batch_size, + memory_depth, + attn_layer_depth, + hidden_size_, + has_attention_layer, + fam); + faw.SetWeights(FirstHalfSpan(attn_layer_weights_span)); + + UniDirectionalAttnLstm fw( alloc, logger, seq_length, batch_size, input_size, - hidden_size_, Direction::kForward, input_forget_, *faw, + hidden_size_, Direction::kForward, input_forget_, faw, bias_1, peephole_weights_1, initial_hidden_1, initial_cell_1, activation_funcs_.Entries()[0], activation_funcs_.Entries()[1], activation_funcs_.Entries()[2], clip_, ttp_); - auto bam = std::make_unique>( - alloc, logger, batch_size, max_memory_step, memory_depth, query_depth, am_attn_size, false); - bam->SetWeights( + BahdanauAttention bam( + alloc, + logger, + batch_size, + max_memory_step, + memory_depth, + query_depth, + am_attn_size, + false); + bam.SetWeights( SecondHalfSpan(am_v_weights.DataAsSpan()), SecondHalfSpan(am_query_layer_weights.DataAsSpan()), SecondHalfSpan(am_memory_layer_weights.DataAsSpan())); - bam->PrepareMemory(attn_memory.DataAsSpan(), memory_seq_lens_span); - - auto baw = std::make_unique>( - alloc, logger, batch_size, memory_depth, attn_layer_depth, hidden_size_, has_attention_layer, *bam); - baw->SetWeights(SecondHalfSpan(attn_layer_weights_span)); - - auto bw = std::make_unique>( + bam.PrepareMemory(attn_memory.DataAsSpan(), memory_seq_lens_span); + + AttentionWrapper baw( + alloc, + logger, + batch_size, + memory_depth, + attn_layer_depth, + hidden_size_, + has_attention_layer, + bam); + baw.SetWeights(SecondHalfSpan(attn_layer_weights_span)); + + UniDirectionalAttnLstm bw( alloc, logger, seq_length, batch_size, input_size, - hidden_size_, Direction::kReverse, input_forget_, *baw, + hidden_size_, Direction::kReverse, input_forget_, baw, bias_2, peephole_weights_2, initial_hidden_2, initial_cell_2, activation_funcs_.Entries()[3], activation_funcs_.Entries()[4], activation_funcs_.Entries()[5], clip_, ttp_); - fw->Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1, output_1, hidden_output_1, last_cell_1); - 
bw->Compute(input, sequence_lens_span, num_directions_, input_weights_2, hidden_weights_2, output_2, hidden_output_2, last_cell_2); + fw.Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1, output_1, hidden_output_1, last_cell_1); + bw.Compute(input, sequence_lens_span, num_directions_, input_weights_2, hidden_weights_2, output_2, hidden_output_2, last_cell_2); } else { - auto fam = std::make_unique>( - alloc, logger, batch_size, max_memory_step, memory_depth, query_depth, am_attn_size, false); - fam->SetWeights( + BahdanauAttention fam( + alloc, + logger, + batch_size, + max_memory_step, + memory_depth, + query_depth, + am_attn_size, + false); + + fam.SetWeights( am_v_weights.DataAsSpan(), am_query_layer_weights.DataAsSpan(), am_memory_layer_weights.DataAsSpan()); - fam->PrepareMemory(attn_memory.DataAsSpan(), memory_seq_lens_span); + fam.PrepareMemory(attn_memory.DataAsSpan(), memory_seq_lens_span); + + AttentionWrapper faw( + alloc, + logger, + batch_size, + memory_depth, + attn_layer_depth, + hidden_size_, + has_attention_layer, + fam); - auto faw = std::make_unique>( - alloc, logger, batch_size, memory_depth, attn_layer_depth, hidden_size_, has_attention_layer, *fam); - faw->SetWeights(attn_layer_weights_span); + faw.SetWeights(attn_layer_weights_span); - auto fw = std::make_unique>( + UniDirectionalAttnLstm fw( alloc, logger, seq_length, batch_size, input_size, - hidden_size_, direction_, input_forget_, *faw, + hidden_size_, direction_, input_forget_, faw, bias_1, peephole_weights_1, initial_hidden_1, initial_cell_1, activation_funcs_.Entries()[0], activation_funcs_.Entries()[1], activation_funcs_.Entries()[2], clip_, ttp_); - fw->Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1, output_1, hidden_output_1, last_cell_1); + fw.Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1, output_1, hidden_output_1, last_cell_1); } if (!output.empty()) { diff --git a/onnxruntime/contrib_ops/cpu/fused_activation.cc b/onnxruntime/contrib_ops/cpu/fused_activation.cc new file mode 100644 index 0000000000000..d63e19991e754 --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/fused_activation.cc @@ -0,0 +1,49 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "contrib_ops/cpu/fused_activation.h" + +namespace onnxruntime { + +common::Status GetFusedActivationAttr(const OpKernelInfo& info, MLAS_ACTIVATION& activation) { + // Convert the activation parameters from the node into a MLAS_ACTIVATION. + activation.ActivationKind = MlasIdentityActivation; + + std::string activation_type; + if (info.GetAttr("activation", &activation_type).IsOK()) { + if (activation_type == "Relu") { + activation.ActivationKind = MlasReluActivation; + } else if (activation_type == "Tanh") { + activation.ActivationKind = MlasTanhActivation; + } else if (activation_type == "Sigmoid") { + activation.ActivationKind = MlasLogisticActivation; + } else { + // The remaining activation types have additional parameters to be pulled out. 
+ size_t activation_params_count; + if (activation_type == "LeakyRelu") { + activation.ActivationKind = MlasLeakyReluActivation; + activation_params_count = 1; + } else if (activation_type == "Clip") { + activation.ActivationKind = MlasClipActivation; + activation_params_count = 2; + } else { + return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "unimplemented activation: " + activation_type); + } + + std::vector activation_params; + common::Status status = info.GetAttrs("activation_params", activation_params); + if (!status.IsOK()) { + return status; + } else if (activation_params_count != activation_params.size()) { + return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "activation_params count mismatch"); + } + for (size_t i = 0; i < activation_params_count; i++) { + activation.Parameters.Values[i] = activation_params[i]; + } + } + } + + return Status::OK(); +} + +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/fused_activation.h b/onnxruntime/contrib_ops/cpu/fused_activation.h new file mode 100644 index 0000000000000..0121a2038e1cb --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/fused_activation.h @@ -0,0 +1,14 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include "core/common/common.h" +#include "core/framework/op_kernel.h" +#include "core/util/math.h" +#include "core/mlas/inc/mlas.h" + +namespace onnxruntime { + +common::Status GetFusedActivationAttr(const OpKernelInfo& info, MLAS_ACTIVATION& activation); + +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/fused_conv.cc b/onnxruntime/contrib_ops/cpu/fused_conv.cc index ae8f81e8129ce..2e07fa27d7cbb 100644 --- a/onnxruntime/contrib_ops/cpu/fused_conv.cc +++ b/onnxruntime/contrib_ops/cpu/fused_conv.cc @@ -1,16 +1,26 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "fused_conv.h" +#include "core/providers/cpu/nn/conv.h" +#include "contrib_ops/cpu/fused_activation.h" namespace onnxruntime { namespace contrib { + +class FusedConvFloat final : public Conv { + public: + FusedConvFloat(const OpKernelInfo& info) : Conv(info) { + ORT_ENFORCE(GetFusedActivationAttr(info, activation_).IsOK()); + } +}; + ONNX_CPU_OPERATOR_TYPED_MS_KERNEL( FusedConv, 1, float, KernelDefBuilder() .TypeConstraint("T", DataTypeImpl::GetTensorType()), - FusedConv); + FusedConvFloat); + } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/fused_conv.h b/onnxruntime/contrib_ops/cpu/fused_conv.h deleted file mode 100644 index 329eb82990838..0000000000000 --- a/onnxruntime/contrib_ops/cpu/fused_conv.h +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -#pragma once - -#include "core/providers/cpu/nn/conv_impl.h" - -namespace onnxruntime { -namespace contrib { - -template -class FusedConv : public Conv { - public: - FusedConv(const OpKernelInfo& info) : Conv(info) { - Conv::activation_ = info.GetAttrOrDefault("activation", ""); - Conv::alpha_ = info.GetAttrOrDefault("alpha", 0.01f); - } - - Status Compute(OpKernelContext* context) const override { - return Conv::Compute(context); - } -}; -} // namespace contrib -} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/fused_gemm.cc b/onnxruntime/contrib_ops/cpu/fused_gemm.cc index e3bfe5b3881ce..d743a3fcad7be 100644 --- a/onnxruntime/contrib_ops/cpu/fused_gemm.cc +++ b/onnxruntime/contrib_ops/cpu/fused_gemm.cc @@ -1,15 +1,26 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "fused_gemm.h" +#include "core/providers/cpu/math/gemm.h" namespace onnxruntime { namespace contrib { + +template +class FusedGemm final : public Gemm { + public: + FusedGemm(const OpKernelInfo& info) : Gemm(info) { + Gemm::activation_ = info.GetAttrOrDefault("activation", ""); + Gemm::leaky_relu_alpha_ = info.GetAttrOrDefault("leaky_relu_alpha", 0.01f); + } +}; + ONNX_CPU_OPERATOR_TYPED_MS_KERNEL( FusedGemm, 1, float, KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - FusedGemm); + FusedGemm); + } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/fused_gemm.h b/onnxruntime/contrib_ops/cpu/fused_gemm.h deleted file mode 100644 index 5be1b34cb41c4..0000000000000 --- a/onnxruntime/contrib_ops/cpu/fused_gemm.h +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once - -#include "core/providers/cpu/math/gemm.h" - -namespace onnxruntime { -namespace contrib { -template -class FusedGemm : public Gemm { - public: - FusedGemm(const OpKernelInfo& info) : Gemm(info) { - Gemm::activation_ = info.GetAttrOrDefault("activation", ""); - Gemm::leaky_relu_alpha_ = info.GetAttrOrDefault("leaky_relu_alpha", 0.01f); - } - - Status Compute(OpKernelContext* context) const override { - return Gemm::Compute(context); - } -}; -} // namespace contrib -} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/nchwc_ops.cc b/onnxruntime/contrib_ops/cpu/nchwc_ops.cc new file mode 100644 index 0000000000000..b5625551ad104 --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/nchwc_ops.cc @@ -0,0 +1,205 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/framework/op_kernel_context_internal.h" +#include "nchwc_ops.h" +#include "core/mlas/inc/mlas.h" + +namespace onnxruntime { +namespace contrib { + +#define ONNX_CPU_OPERATOR_TYPED_NCHWC_KERNEL(name, ver, type, builder, ...) 
\ + ONNX_OPERATOR_TYPED_KERNEL_EX(name, kMSNchwcDomain, ver, type, kCpuExecutionProvider, builder, __VA_ARGS__) + +ONNX_CPU_OPERATOR_TYPED_NCHWC_KERNEL( + ReorderInput, + 1, + float, + KernelDefBuilder() + .TypeConstraint("T", DataTypeImpl::GetTensorType()), + ReorderInput); + +ONNX_CPU_OPERATOR_TYPED_NCHWC_KERNEL( + ReorderOutput, + 1, + float, + KernelDefBuilder() + .TypeConstraint("T", DataTypeImpl::GetTensorType()), + ReorderOutput); + +ONNX_CPU_OPERATOR_TYPED_NCHWC_KERNEL( + Conv, + 1, + float, + KernelDefBuilder() + .MayInplace(3, 0) + .TypeConstraint("T", DataTypeImpl::GetTensorType()), + NchwcConv); + +ONNX_CPU_OPERATOR_TYPED_NCHWC_KERNEL( + MaxPool, + 1, + float, + KernelDefBuilder() + .TypeConstraint("T", DataTypeImpl::GetTensorType()), + NchwcMaxPool); + +ONNX_CPU_OPERATOR_TYPED_NCHWC_KERNEL( + GlobalMaxPool, + 1, + float, + KernelDefBuilder() + .TypeConstraint("T", DataTypeImpl::GetTensorType()), + NchwcMaxPool); + +ONNX_CPU_OPERATOR_TYPED_NCHWC_KERNEL( + AveragePool, + 1, + float, + KernelDefBuilder() + .TypeConstraint("T", DataTypeImpl::GetTensorType()), + NchwcAveragePool); + +ONNX_CPU_OPERATOR_TYPED_NCHWC_KERNEL( + GlobalAveragePool, + 1, + float, + KernelDefBuilder() + .TypeConstraint("T", DataTypeImpl::GetTensorType()), + NchwcAveragePool); + +template +Status ReorderInput::Compute(OpKernelContext* context) const { + const auto* X = context->Input(0); + const auto& X_shape = X->Shape(); + ORT_ENFORCE(X_shape.NumDimensions() == 4); + ORT_ENFORCE((X_shape[1] % MlasNchwcGetBlockSize()) == 0); + auto* Y = context->Output(0, X_shape); + MlasReorderInput(X_shape.GetDims().data(), X->template Data(), Y->template MutableData()); + return Status::OK(); +} + +template +Status ReorderOutput::Compute(OpKernelContext* context) const { + const auto* X = context->Input(0); + const auto& X_shape = X->Shape(); + ORT_ENFORCE(X_shape.NumDimensions() == 4); + std::vector Y_shape(X_shape.GetDims()); + ORT_ENFORCE(channels_ <= Y_shape[1]); + Y_shape[1] = channels_; + auto* Y = context->Output(0, Y_shape); + MlasReorderOutput(Y_shape.data(), X->template Data(), Y->template MutableData()); + return Status::OK(); +} + +Status NchwcConv::Compute(OpKernelContext* context) const { + const auto* X = context->Input(0); + const auto* W = context->Input(1); + const auto* B = context->Input(2); + const auto* Sum = context->Input(3); + + ORT_RETURN_IF_ERROR(ConvBase::ValidateInputShape(X, W)); + + const auto& X_shape = X->Shape(); + const auto& W_shape = W->Shape(); + ORT_ENFORCE(X_shape.NumDimensions() == 4); + + const size_t nchwc_block_size = MlasNchwcGetBlockSize(); + ORT_ENFORCE((static_cast(X_shape[1]) < nchwc_block_size) || ((X_shape[1] % nchwc_block_size) == 0)); + + std::vector kernel_shape; + ORT_RETURN_IF_ERROR(ConvBase::ComputeKernelShape(W_shape, kernel_shape)); + if (kernel_shape.size() != 2) { + return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Unsupported convolution size."); + } + + std::vector pads(ConvBase::pads_); + if (pads.empty()) { + pads.resize(kernel_shape.size() * 2, 0); + } + std::vector dilations(ConvBase::dilations_); + if (dilations.empty()) { + dilations.resize(kernel_shape.size(), 1); + } + std::vector strides(ConvBase::strides_); + if (strides.empty()) { + strides.resize(kernel_shape.size(), 1); + } + + std::vector Y_dims; + Y_dims.insert(Y_dims.begin(), {X_shape[0], W_shape[0]}); + TensorShape input_shape = X->Shape().Slice(2); + ORT_RETURN_IF_ERROR(ConvBase::InferOutputShape(input_shape, kernel_shape, strides, dilations, &pads, &Y_dims)); + auto* Y = 
context->Output(0, Y_dims); + auto* y_data = Y->template MutableData(); + + // Check for the optional Conv/Sum fusion. + if (Sum != nullptr) { + const auto& sum_shape = Sum->Shape(); + ORT_RETURN_IF_NOT(Y->Shape() == sum_shape, "output and sum shape must match"); + // If the output was not allocated inplace with the sum tensor, then copy here. + const auto* sum_data = Sum->template Data(); + if (y_data != sum_data) { + memcpy(y_data, sum_data, sum_shape.Size() * sizeof(float)); + } + } + + MlasNchwcConv(kernel_shape.size(), + X_shape.GetDims().data(), + kernel_shape.data(), + dilations.data(), + pads.data(), + strides.data(), + Y_dims.data(), + static_cast(ConvBase::group_), + X->template Data(), + W->template Data(), + B != nullptr ? B->template Data() : nullptr, + y_data, + &activation_, + Sum == nullptr, + const_cast(static_cast(context)->GetOperatorThreadPool())); + + return Status::OK(); +} + +Status NchwcPoolBase::NchwcPool(OpKernelContext* context, MLAS_POOLING_KIND kind) const { + const auto* X = context->Input(0); + + const auto& X_shape = X->Shape(); + ORT_ENFORCE(X_shape.NumDimensions() == 4); + ORT_ENFORCE((X_shape[1] % MlasNchwcGetBlockSize()) == 0); + + if (!global_pooling_) { + ORT_RETURN_IF_NOT(kernel_shape_.size() == 2, "kernel_shape num_dims is not compatible with X num_dims."); + } + + std::vector pads = pads_; + std::vector output_dims = PoolBase::SetOutputSize(X_shape, X_shape[1], &pads, dilations_, ceil_mode_); + auto* Y = context->Output(0, output_dims); + + MlasNchwcPool(kind, + 2, + X_shape.GetDims().data(), + global_pooling_ ? nullptr : kernel_shape_.data(), + global_pooling_ ? nullptr : dilations_.data(), + global_pooling_ ? nullptr : pads.data(), + global_pooling_ ? nullptr : strides_.data(), + output_dims.data(), + X->template Data(), + Y->template MutableData(), + const_cast(static_cast(context)->GetOperatorThreadPool())); + + return Status::OK(); +} + +Status NchwcMaxPool::Compute(OpKernelContext* context) const { + return NchwcPoolBase::NchwcPool(context, MlasMaximumPooling); +} + +Status NchwcAveragePool::Compute(OpKernelContext* context) const { + return NchwcPoolBase::NchwcPool(context, count_include_pad_ ? MlasAveragePoolingIncludePad : MlasAveragePoolingExcludePad); +} + +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/nchwc_ops.h b/onnxruntime/contrib_ops/cpu/nchwc_ops.h new file mode 100644 index 0000000000000..65045cd0eeb85 --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/nchwc_ops.h @@ -0,0 +1,75 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
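The ReorderInput/ReorderOutput kernels above convert tensors between plain NCHW and the blocked NCHWc layout used by the MLAS NCHWc kernels, where channels are grouped into blocks of MlasNchwcGetBlockSize(). The MLAS reorder routines themselves are not part of this diff; the snippet below is only a minimal, self-contained sketch of the layout transform, assuming a hypothetical block size of 4 and a channel count that divides evenly (real code must also handle channel padding).
```
#include <cstdio>
#include <vector>

// Sketch: repack NCHW data into NCHWc (blocked channels), assuming C % block == 0.
// Destination layout: out[n][cb][h][w][ci], where the original channel c = cb * block + ci.
std::vector<float> ReorderNchwToNchwc(const std::vector<float>& src,
                                      int N, int C, int H, int W, int block) {
  std::vector<float> dst(src.size());
  for (int n = 0; n < N; ++n)
    for (int c = 0; c < C; ++c)
      for (int h = 0; h < H; ++h)
        for (int w = 0; w < W; ++w) {
          const int cb = c / block, ci = c % block;
          const size_t src_idx = ((static_cast<size_t>(n) * C + c) * H + h) * W + w;
          const size_t dst_idx =
              (((static_cast<size_t>(n) * (C / block) + cb) * H + h) * W + w) * block + ci;
          dst[dst_idx] = src[src_idx];
        }
  return dst;
}

int main() {
  const int N = 1, C = 8, H = 2, W = 2, block = 4;  // block size is hypothetical
  std::vector<float> x(N * C * H * W);
  for (size_t i = 0; i < x.size(); ++i) x[i] = static_cast<float>(i);
  const auto y = ReorderNchwToNchwc(x, N, C, H, W, block);
  // The first NCHWc block at (h=0, w=0) gathers channels 0..3: prints 0 4 8 12.
  std::printf("first NCHWc block at (h=0,w=0): %g %g %g %g\n", y[0], y[1], y[2], y[3]);
  return 0;
}
```
ReorderOutput performs the inverse transform and additionally trims the channel dimension back to the `channels` attribute, which is why its output shape copies the input shape except for dimension 1.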
+ +#pragma once + +#include "core/common/common.h" +#include "core/framework/op_kernel.h" +#include "core/providers/cpu/nn/conv_base.h" +#include "core/providers/cpu/nn/pool.h" +#include "contrib_ops/cpu/fused_activation.h" + +namespace onnxruntime { +namespace contrib { + +template +class ReorderInput : public OpKernel { + public: + ReorderInput(const OpKernelInfo& info) : OpKernel(info) { + } + + Status Compute(OpKernelContext* context) const override; +}; + +template +class ReorderOutput : public OpKernel { + public: + ReorderOutput(const OpKernelInfo& info) : OpKernel(info) { + ORT_ENFORCE(info.GetAttr("channels", &channels_).IsOK()); + ORT_ENFORCE(channels_ > 0, "invalid channel count"); + } + + Status Compute(OpKernelContext* context) const override; + + private: + int64_t channels_; +}; + +class NchwcConv : public OpKernel, public ConvBase { + public: + NchwcConv(const OpKernelInfo& info) : OpKernel(info), ConvBase(info) { + ORT_ENFORCE(GetFusedActivationAttr(info, activation_).IsOK()); + } + + Status Compute(OpKernelContext* context) const override; + + private: + MLAS_ACTIVATION activation_; +}; + +class NchwcPoolBase : public PoolBase { + public: + NchwcPoolBase(const OpKernelInfo& info) : PoolBase(info) { + } + + Status NchwcPool(OpKernelContext* context, MLAS_POOLING_KIND kind) const; +}; + +class NchwcMaxPool : public OpKernel, public NchwcPoolBase { + public: + NchwcMaxPool(const OpKernelInfo& info) : OpKernel(info), NchwcPoolBase(info) { + } + + Status Compute(OpKernelContext* context) const override; +}; + +class NchwcAveragePool : public OpKernel, public NchwcPoolBase { + public: + NchwcAveragePool(const OpKernelInfo& info) : OpKernel(info), NchwcPoolBase(info) { + } + + Status Compute(OpKernelContext* context) const override; +}; + +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu_contrib_kernels.cc b/onnxruntime/contrib_ops/cpu_contrib_kernels.cc index e9994011aa039..8446a35bd8947 100644 --- a/onnxruntime/contrib_ops/cpu_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/cpu_contrib_kernels.cc @@ -3,6 +3,7 @@ #include "contrib_ops/cpu_contrib_kernels.h" #include "core/graph/constants.h" +#include "core/mlas/inc/mlas.h" namespace onnxruntime { namespace contrib { @@ -49,6 +50,29 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, Sca class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, ThresholdedRelu); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, Scale); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSNchwcDomain, 1, float, ReorderInput); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSNchwcDomain, 1, float, ReorderOutput); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSNchwcDomain, 1, float, Conv); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSNchwcDomain, 1, float, MaxPool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSNchwcDomain, 1, float, GlobalMaxPool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSNchwcDomain, 1, float, AveragePool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSNchwcDomain, 1, float, GlobalAveragePool); + +void RegisterNchwcKernels(KernelRegistry& kernel_registry) { + static const BuildKernelCreateInfoFn function_table[] = { + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + 
BuildKernelCreateInfo, + BuildKernelCreateInfo}; + + for (auto& function_table_entry : function_table) { + kernel_registry.Register(function_table_entry()); + } +} + void RegisterCpuContribKernels(KernelRegistry& kernel_registry) { static const BuildKernelCreateInfoFn function_table[] = { BuildKernelCreateInfo, @@ -96,6 +120,12 @@ void RegisterCpuContribKernels(KernelRegistry& kernel_registry) { for (auto& function_table_entry : function_table) { kernel_registry.Register(function_table_entry()); } + + // Register the NCHWc kernels if supported by the platform. + if (MlasNchwcGetBlockSize() > 1) { + RegisterNchwcKernels(kernel_registry); + } } + } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/core/codegen/common/common.cc b/onnxruntime/core/codegen/common/common.cc new file mode 100644 index 0000000000000..757c1677dd2e5 --- /dev/null +++ b/onnxruntime/core/codegen/common/common.cc @@ -0,0 +1,258 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/common/common.h" + +#include "core/framework/tensorprotoutils.h" +#include "core/graph/graph.h" +#include "core/graph/schema_registry.h" +#include +#include + +namespace onnxruntime { + +NodeKey GetKey(const onnxruntime::Node* node) { + ORT_ENFORCE(nullptr != node); + ORT_ENFORCE(node->OutputDefs().size() > 0); + return node->OutputDefs()[0]->Name(); +} + +NodeKey GetKey(const onnxruntime::Node& node) { + ORT_ENFORCE(node.OutputDefs().size() > 0); + return node.OutputDefs()[0]->Name(); +} + +NodeKey GetKey(const onnxruntime::NodeArg* def) { + // NodeArg's name is unique. + ORT_ENFORCE(nullptr != def); + return def->Name(); +} + +bool IsRecurrentNode(const onnxruntime::Node& node) { + auto op_type = node.OpType(); + return (op_type == "LSTM" || op_type == "RNN" || op_type == "GRU" || + op_type == "Scan" || op_type == "Loop"); +} + +bool IsAliasNode(const onnxruntime::Node& node) { + auto op_type = node.OpType(); + return (op_type == "Flatten" || op_type == "Identity" || op_type == "Reshape" || + op_type == "Squeeze" || op_type == "Unsqueeze"); +} + +std::string NormalizeCppName(const std::string& name) { + std::string normalized_name = name; + for (char c : {'.', ' ', '+', '-', '*', '/', '\\', '='}) + std::replace(normalized_name.begin(), normalized_name.end(), c, '_'); + return normalized_name; +} + +std::string NormalizeNodeArgName(const NodeArg* def) { + return NormalizeCppName(def->Name()); +} + +bool IsFusedNode(const Node& node) { + if (node.NodeType() == Node::Type::Fused) { + return true; + } + return false; +} + +// A unified API to get Subgraph +const Graph* GetSubgraph(const Node& node) { + if (node.NodeType() == Node::Type::Fused) { + return &(node.GetFunctionBody()->Body()); + } else if (node.OpType() == "Scan") { + return node.GetGraphAttribute("body"); + } + // return nullptr implying no subgraph + return nullptr; +} + +bool HasLoop(const Node& node) { + auto op_type = node.OpType(); + if (op_type == "LSTM" || + op_type == "GRU" || + op_type == "RNN" || + op_type == "Scan") { + return true; + } + return false; +} + +// Return the corresponding input node for the NodeArg of the given node +const onnxruntime::Node* GetInputNode(const Node& node, const NodeArg* def) { + const auto& input_name = def->Name(); + const onnxruntime::Node* input_node = nullptr; + // search input node set to see if input_name is in their outputs (weights are not from node) + for (auto iter = node.InputNodesBegin(); iter != node.InputNodesEnd(); ++iter) { + 
const onnxruntime::Node& p = *iter; + bool found = false; + p.ForEachWithIndex( + p.OutputDefs(), + [&found, &input_name](const onnxruntime::NodeArg& out_def, size_t) { + if (input_name == out_def.Name()) { + found = true; + } + return Status::OK(); + }); + if (found) + input_node = &p; + } + return input_node; +} + +// create capacity from subgraph +std::unique_ptr ToCapacity(const onnxruntime::GraphViewer& graph, + std::unique_ptr& subgraph) { + auto meta_def = std::make_unique<::onnxruntime::IndexedSubGraph::MetaDef>(); + static int fuse_count = 0; + meta_def->name = "Fuse" + std::to_string(fuse_count++); + meta_def->domain = "Fuse"; + + std::set node_indices(subgraph->nodes.begin(), subgraph->nodes.end()); + + const auto& start_node_index = subgraph->nodes.front(); + const auto& start_node = *graph.GetNode(start_node_index); + const auto& end_node_index = subgraph->nodes.back(); + const auto& end_node = *graph.GetNode(end_node_index); + meta_def->name += start_node.OpType() + std::to_string(start_node_index); + meta_def->name += "_With" + std::to_string(subgraph->nodes.size()) + "Nodes_"; + meta_def->name += end_node.OpType() + std::to_string(end_node_index); + + for (const auto& node_index : subgraph->nodes) { + const auto& node = *graph.GetNode(node_index); + // handle current graph's inputs + node.ForEachWithIndex( + node.InputDefs(), + [&meta_def, &node, &node_indices](const onnxruntime::NodeArg& def, size_t) { + const onnxruntime::Node* input_node = GetInputNode(node, &def); + bool input_from_subgraph = (input_node && node_indices.count(input_node->Index())); + if (!input_from_subgraph) { + // input is from weights or outside of graph + meta_def->inputs.push_back(def.Name()); + } + return Status::OK(); + }); + + // Handle outouts + // two cases are considerd as outputs + // 1. Output NodeArg is not used by any Node + // 2. Output NodeArg is used by at least one Node out of this subgraph. + // Note a NodeArg can be used by Nodes in and out of the subgraph at the same time. + + auto InsertOutputToSubgraph = [&meta_def](const NodeArg* def) { + if (std::find(meta_def->outputs.begin(), meta_def->outputs.end(), def->Name()) == + meta_def->outputs.end()) { + meta_def->outputs.push_back(def->Name()); + } + }; + + std::unordered_set input_names_from_the_output_node; + + for (auto o_iter = node.OutputEdgesBegin(); o_iter != node.OutputEdgesEnd(); ++o_iter) { + const auto& p = *o_iter; + const Node& out_node = p.GetNode(); + + // preprocess for the case 1 + out_node.ForEachWithIndex( + out_node.InputDefs(), + [&input_names_from_the_output_node](const onnxruntime::NodeArg& in_def, size_t) { + input_names_from_the_output_node.insert(in_def.Name()); + return Status::OK(); + }); + + // handle the case 2 + if (node_indices.count(out_node.Index()) == 0) { + const NodeArg* def = node.OutputDefs()[p.GetSrcArgIndex()]; + InsertOutputToSubgraph(def); + } + } + + // handle case 1 + node.ForEachWithIndex( + node.OutputDefs(), + [&](const onnxruntime::NodeArg& def, size_t) { + if (input_names_from_the_output_node.count(def.Name()) == 0) { + InsertOutputToSubgraph(&def); + } + return Status::OK(); + }); + } + + // Handle subgraph's initializers + const auto& all_initializers = graph.GetAllInitializedTensors(); + for (const auto& node_index : subgraph->nodes) { + const auto& node = *graph.GetNode(node_index); + // check whether it is an immediate nested subgraph + auto immediate_nested_subgraph = GetSubgraph(node); + // If so, copy the immediate nested subgraph's initializers to meta_def->inputs. 
+ // Note we don't need recursion here, since Ort did recursion for us by handling subgraph early than the current graph. + // Therefore, the all inner nested subgraph's initializers should be already in the immediate nested subgraph's inputs. + if (nullptr != immediate_nested_subgraph) { + for (auto& n : immediate_nested_subgraph->Nodes()) { + n.ForEachWithIndex( + n.InputDefs(), + [&meta_def, &all_initializers](const onnxruntime::NodeArg& def, size_t) { + auto iter = all_initializers.find(def.Name()); + if (iter != all_initializers.end()) { + meta_def->inputs.push_back(def.Name()); + } + return Status::OK(); + }); + } + } + } + + meta_def->since_version = 1; + meta_def->status = ONNX_NAMESPACE::EXPERIMENTAL; + std::unique_ptr finished_subgraph(subgraph.release()); + finished_subgraph->SetMetaDef(meta_def); + return std::make_unique(std::move(finished_subgraph)); +} + +int64_t ShapeRank(const NodeArg* def) { + ORT_ENFORCE_DEBUG(nullptr != def); + return gsl::narrow_cast(def->Shape()->dim_size()); +} + +bool ShapeHasValue(const NodeArg* def, int i) { + ORT_ENFORCE_DEBUG(nullptr != def); + ORT_ENFORCE_DEBUG(i >= 0); + ORT_ENFORCE_DEBUG(i < def->Shape()->dim_size()); + return def->Shape()->dim(i).has_dim_value(); +} + +bool ShapeHasSymbol(const NodeArg* def, int i) { + ORT_ENFORCE_DEBUG(nullptr != def); + ORT_ENFORCE_DEBUG(i >= 0); + ORT_ENFORCE_DEBUG(i < def->Shape()->dim_size()); + return def->Shape()->dim(i).has_dim_param(); +} + +int64_t ShapeValue(const NodeArg* def, int i) { + ORT_ENFORCE_DEBUG(ShapeHasValue(def, i)); + return def->Shape()->dim(i).dim_value(); +} + +const std::string& ShapeSymbol(const NodeArg* def, int i) { + ORT_ENFORCE_DEBUG(ShapeHasSymbol(def, i)); + return def->Shape()->dim(i).dim_param(); +} + +ONNX_NAMESPACE::TensorProto_DataType TensorProtoDataType(const NodeArg* def) { + ORT_ENFORCE_DEBUG(nullptr != def); + return static_cast(def->TypeAsProto()->tensor_type().elem_type()); +} + +// Convert GraphNodes to internal NodePtrs without check lifetime. +// Please use it only locally when GraphNodes still exist +std::vector ConvertGraphNodesToNodePtrs(const GraphNodes& graph_nodes) { + std::vector nodes; + for (auto& node : graph_nodes) { + nodes.push_back(&node); + } + return nodes; +} + +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/common/common.h b/onnxruntime/core/codegen/common/common.h new file mode 100644 index 0000000000000..11ad05325a381 --- /dev/null +++ b/onnxruntime/core/codegen/common/common.h @@ -0,0 +1,151 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include "core/framework/compute_capability.h" +#include "core/framework/tensor.h" +#include "core/graph/graph_nodes.h" +#include "core/graph/graph_viewer.h" + +#ifndef NDEBUG +#define ORT_ENFORCE_DEBUG(...) ORT_ENFORCE(__VA_ARGS__) +#else +#define ORT_ENFORCE_DEBUG(...) +#endif // !NDEBUG + +// DYN_PROMOTE is a simplified llvm::dyn_cast, which does not need RTTI +// DYN_PROMOTE is faster than dynamic_cast and also has smaller binary size +// Please use DYN_PROMOTE in a critical path. 
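The DYN_PROMOTE machinery defined immediately below provides an llvm::dyn_cast-style checked downcast driven by a stored type id rather than RTTI. As a rough illustration of the pattern (IRNode/AddNode/MulNode are hypothetical names, not types from this PR), this is approximately what the generated Promote looks like when written out by hand:
```
#include <cassert>
#include <cstdio>

enum class IRNodeType { Add, Mul };

struct IRNode {
  explicit IRNode(IRNodeType t) : type_id(t) {}
  IRNodeType TypeID() const { return type_id; }  // what DYN_PROMOTE_BASE provides
  IRNodeType type_id;
};

struct AddNode : IRNode {
  AddNode() : IRNode(IRNodeType::Add) {}
  static bool IsType(const IRNode* n) { return n->TypeID() == IRNodeType::Add; }  // DYN_PROMOTE_DERIVED
};

struct MulNode : IRNode {
  MulNode() : IRNode(IRNodeType::Mul) {}
  static bool IsType(const IRNode* n) { return n->TypeID() == IRNodeType::Mul; }
};

// Roughly what DYN_PROMOTE(IRNode) generates: a checked downcast without dynamic_cast/RTTI.
template <typename ToType>
const ToType* Promote(const IRNode* base) {
  return ToType::IsType(base) ? static_cast<const ToType*>(base) : nullptr;
}

int main() {
  AddNode add;
  const IRNode* n = &add;
  assert(Promote<AddNode>(n) != nullptr);  // matching type id: valid pointer
  assert(Promote<MulNode>(n) == nullptr);  // mismatch: nullptr, like llvm::dyn_cast
  std::printf("promote checks passed\n");
  return 0;
}
```
DYN_PROMOTE_BASE supplies the TypeID accessor on the base class and DYN_PROMOTE_DERIVED supplies the per-class IsType check that the generated Promote consults.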
+#define DYN_PROMOTE(BASE) \ + template \ + inline const ToType* Promote(const BASE* base) { \ + if (ToType::IsType(base)) \ + return static_cast(base); \ + return nullptr; \ + } \ + \ + template \ + inline ToType* Promote(BASE* base) { \ + if (ToType::IsType(base)) \ + return static_cast(base); \ + return nullptr; \ + } \ + \ + template \ + inline ToType* Promote(const std::unique_ptr& base) { \ + if (ToType::IsType(base.get())) \ + return static_cast(base); \ + return nullptr; \ + } \ + \ + template \ + inline ToType* Promote(const std::shared_ptr& base) { \ + if (ToType::IsType(base.get())) \ + return static_cast(base); \ + return nullptr; \ + } + +// DYN_PROMOTE_BASE is a macro inserted in the base class to support DYN_PROMOTE +// TYPE_ID is required for DYN_PROMOTE and TYPE_ID is a enum class +// TYPE_ID_VAR is a corresponding variable name for in the base class +#define DYN_PROMOTE_BASE(BASE, TYPE_ID, TYPE_ID_VAR) \ + inline const TYPE_ID TypeID() const { \ + return TYPE_ID_VAR; \ + } \ + \ + static inline bool IsType(const BASE*) { \ + return true; \ + } + +// DYN_PROMOTE_DERIVED is a macro inserted in a derived class to support DYN_PROMOTE +// TYPE_ID is required for DYN_PROMOTE and TYPE_ID is a enum class +// TYPE_ID_VALUE is corresponding TYPE_ID::value of a derived class. +#define DYN_PROMOTE_DERIVED(BASE, TYPE_ID, TYPE_ID_VALUE) \ + static inline bool IsType(const BASE* base) { \ + ORT_ENFORCE_DEBUG(nullptr != base); \ + return base->TypeID() == TYPE_ID::TYPE_ID_VALUE; \ + } + +// DYNAMIC_PROMOTE is a dynamic_cast needing RTTI +// DYNAMIC_PROMOTE is usually slower than than DYN_PROMOTE. +// Please use DYNAMIC_PROMOTE in a non-critical path. +#define DYNAMIC_PROMOTE(BASE) \ + template \ + inline const X* Promote(const BASE* base) { \ + auto derived = dynamic_cast(base); \ + ORT_ENFORCE(nullptr != derived); \ + return derived; \ + } \ + \ + template \ + inline X* Promote(BASE* base) { \ + auto derived = dynamic_cast(base); \ + ORT_ENFORCE(nullptr != derived); \ + return derived; \ + } \ + \ + template \ + inline X* Promote(const std::unique_ptr& base) { \ + auto derived = dynamic_cast(base.get()); \ + ORT_ENFORCE(nullptr != derived); \ + return derived; \ + } \ + \ + template \ + inline X* Promote(const std::shared_ptr& base) { \ + auto derived = dynamic_cast(base.get()); \ + ORT_ENFORCE(nullptr != derived); \ + return derived; \ + } + +namespace onnxruntime { + +// Nodekey is used as a key for maps +using NodeKey = std::string; + +NodeKey GetKey(const onnxruntime::Node* node); +NodeKey GetKey(const onnxruntime::Node& node); +NodeKey GetKey(const onnxruntime::NodeArg* def); + +bool IsRecurrentNode(const onnxruntime::Node& node); + +bool IsAliasNode(const onnxruntime::Node& node); + +// Helper function that creates ComputeCapability for subgraphs +std::unique_ptr ToCapacity(const onnxruntime::GraphViewer& graph, + std::unique_ptr& subgraph); + +bool IsFusedNode(const Node& node); + +bool HasLoop(const Node& node); + +const Graph* GetSubgraph(const Node& node); + +std::string NormalizeCppName(const std::string& name); + +std::string NormalizeNodeArgName(const NodeArg* def); + +// Return the corresponding input node for the NodeArg of the given node +const onnxruntime::Node* GetInputNode(const Node& node, const NodeArg* def); + +int64_t ShapeRank(const NodeArg* def); + +bool ShapeHasValue(const NodeArg* def, int i); + +bool ShapeHasSymbol(const NodeArg* def, int i); + +int64_t ShapeValue(const NodeArg* def, int i); + +const std::string& ShapeSymbol(const NodeArg* def, int i); + 
+ONNX_NAMESPACE::TensorProto_DataType TensorProtoDataType(const NodeArg* def); + +// Convert GraphNodes to internal NodePtrs without check lifetime. +// Please use it only locally when GraphNodes still exist +std::vector ConvertGraphNodesToNodePtrs(const GraphNodes& graph_nodes); + +enum : int { + Dimension_Unknown = -1, +}; + +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/common/creator.h b/onnxruntime/core/codegen/common/creator.h new file mode 100644 index 0000000000000..d15e86b5a481f --- /dev/null +++ b/onnxruntime/core/codegen/common/creator.h @@ -0,0 +1,76 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include "core/codegen/common/dispatcher.h" + +// TODO rename this file to creator_base +namespace onnxruntime { +namespace codegen { + +// It is a base class for TVM Op IR builder, weight layout builder, TVM scheduler +// CreatorBase is a template class of compiler pass +// for 1) TVM IR builder +// 2) Weight layout transformer +// 3) TVM Scheduler, etc. +// CreatorBase is similor to OpXXCreate in llvm IR builder + +template +class CreatorBase { + public: + CreatorBase(const std::string& name) + : name_(name) {} + + ~CreatorBase() = default; + + virtual RETURN_TYPE Evaluate(INPUT_TYPE, + NODE_TYPE, + CONTEXT_TYPE, + OUTPUT_TYPE) = 0; + + const std::string& Name() const { + return name_; + } + + protected: + std::string name_; + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(CreatorBase); +}; + +// macro to stringize +#define STRINGIZE_NX(OP) #OP +#define STRINGIZE(OP) STRINGIZE_NX(OP) + +// macro returns class name +#define CREATOR_CLASS(OP, POSTFIX) \ + OP##POSTFIX + +// macro returns class name as string +#define CREATOR_STRING(OP, POSTFIX) \ + STRINGIZE(CREATOR_CLASS(OP, POSTFIX)) + +// macro returns class constructor name +#define CREATOR_CLASS_FUNC(OP, POSTFIX) \ + OP##POSTFIX() + +// macro declares a creator class inheriting the template class CreatorBase +// with corresponding template parameters +#define DECLARE_CREATOR_CLASS(OP, POSTFIX, INPUT, NODE, CONTEXT, OUTPUT, RETURN) \ + class CREATOR_CLASS(OP, POSTFIX) : public onnxruntime::codegen::CreatorBase { \ + public: \ + CREATOR_CLASS_FUNC(OP, POSTFIX) : CreatorBase(CREATOR_STRING(OP, POSTFIX)) {} \ + RETURN Evaluate(INPUT, \ + NODE, \ + CONTEXT, \ + OUTPUT) override; \ + \ + private: \ + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(CREATOR_CLASS(OP, POSTFIX)); \ + }; + +} // namespace codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/common/dispatcher.h b/onnxruntime/core/codegen/common/dispatcher.h new file mode 100644 index 0000000000000..b4313cecad3a8 --- /dev/null +++ b/onnxruntime/core/codegen/common/dispatcher.h @@ -0,0 +1,74 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include "core/common/common.h" +#include +#include +#include + +namespace onnxruntime { +namespace codegen { + +// DispatcherBase is a customized unordered_map +// that provides all codegen-related functionality +// including 1) dispatching a pass +// 2) dump corresponding name +// DispatcherBase may or may not keep ownership, +// depending on the template parameter, CONTENT_TYPE. 
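DispatcherBase, declared just below, is essentially a name-keyed registry used to route an op (for example by op type or domain) to the compiler pass that handles it, with Register/Contains/Get as the main surface. A standalone sketch of the same register-then-dispatch pattern, using an illustrative handler type rather than the real builder/scheduler classes:
```
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>

// Toy dispatcher: maps an op name to a handler; lookup misses return nullptr.
class ToyDispatcher {
 public:
  using Handler = std::function<void(const std::string&)>;

  bool Register(const std::string& name, Handler h) {
    return handlers_.emplace(name, std::move(h)).second;  // false if already registered
  }

  const Handler* Get(const std::string& name) const {
    auto it = handlers_.find(name);
    return it == handlers_.end() ? nullptr : &it->second;
  }

 private:
  std::unordered_map<std::string, Handler> handlers_;
};

int main() {
  ToyDispatcher dispatcher;
  dispatcher.Register("Conv", [](const std::string& node) {
    std::cout << "build Conv IR for " << node << "\n";
  });

  if (const auto* handler = dispatcher.Get("Conv"))
    (*handler)("node_0");
  if (dispatcher.Get("Gemm") == nullptr)
    std::cout << "Gemm falls back to the default path\n";
  return 0;
}
```
Whether the real dispatcher owns its entries depends on CONTENT_TYPE, as the comment above notes; this toy version simply stores copies of std::function objects.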
+ +template +class DispatcherBase { + public: + DispatcherBase(const std::string& name) + : name_(name) {} + + const std::string& Name() const { + return name_; + } + + bool Contains(const std::string& name) const { + return contents_.count(name) > 0; + } + + void ForEach(std::function + func) { + for (auto& p : contents_) { + func(p.first, p.second); + } + } + + bool Register(const std::string& name, + CONTENT_TYPE op) { + if (!Contains(name)) { + contents_.emplace(name, op); + return true; + } + return false; + } + + CONTENT_TYPE Get(const std::string& key) const { + auto iter = contents_.find(key); + if (iter != contents_.end()) { + return iter->second; + } + return nullptr; + } + + const std::unordered_map GetContents() const { + return contents_; + } + + std::unordered_map GetMutableContents() { + return contents_; + } + + protected: + std::string name_; + std::unordered_map contents_; + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(DispatcherBase); +}; + +} // namespace codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/common/dump_array.h b/onnxruntime/core/codegen/common/dump_array.h new file mode 100644 index 0000000000000..8e51cd36d0087 --- /dev/null +++ b/onnxruntime/core/codegen/common/dump_array.h @@ -0,0 +1,62 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include +#include +#include +#include +#include + +namespace onnxruntime { + +template +void DumpArrayRecursive(const T1* data, int64_t& data_offset, const std::vector& shape, int idx) { + int dim = static_cast(shape.size()); + if (dim == 0) { + std::cout << "[]\n"; + return; + } + + assert(idx < dim); + int sz = shape[idx]; + + std::cout << "["; + if (idx < dim - 1) { + for (auto i = 0; i < sz; ++i) { + DumpArrayRecursive(data, data_offset, shape, idx + 1); + if (i < sz - 1) { + std::cout << ","; + // print multiple newlines after ',' when necessary + for (int j = idx + 1; j < dim; j++) + std::cout << "\n"; + // print leading spaces before "[" when necessary + for (int j = 0; j < idx + 1; ++j) + std::cout << " "; + } + } + } else { + for (auto i = 0; i < sz; ++i) { + if (std::is_same::value || std::is_same::value) + std::cout << std::setw(3) << static_cast(*(data + data_offset)); + else + std::cout << std::setw(12) << std::setprecision(8) << *(data + data_offset); + data_offset++; + if (i < sz - 1) + std::cout << ","; + } + } + std::cout << "]"; +} + +// A helper function to dump multidimensional arrays in a way similar to numpy +template +void DumpArray(const std::string& tag, const T1* data, const std::vector& shape) { + std::cout << tag << "\n"; + int64_t data_offset = 0; + DumpArrayRecursive(data, data_offset, shape, 0); + assert(data_offset == TotalSize(shape)); + std::cout << std::endl; +} + +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/common/handle.h b/onnxruntime/core/codegen/common/handle.h new file mode 100644 index 0000000000000..7caad27dcbe01 --- /dev/null +++ b/onnxruntime/core/codegen/common/handle.h @@ -0,0 +1,22 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
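DumpArray above prints a flat buffer as nested, numpy-style brackets driven by a shape vector, which is handy when debugging generated kernels. A hypothetical caller is shown below; the include paths come from this PR, the shape element type is assumed to be int64_t (template arguments are not visible in this diff), and utils.h is included for the TotalSize helper that DumpArray's internal assert references.
```
#include "core/codegen/common/dump_array.h"
#include "core/codegen/common/utils.h"  // TotalSize(), used by DumpArray's assert
#include <cstdint>
#include <vector>

int main() {
  std::vector<float> data = {1, 2, 3, 4, 5, 6};
  std::vector<int64_t> shape = {2, 3};  // assumed shape element type
  // Prints a nested, numpy-like view of the 2x3 buffer under the given tag.
  onnxruntime::DumpArray("debug_tensor", data.data(), shape);
  return 0;
}
```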
+ +#pragma once +#include "core/codegen/common/target_info.h" +#include +#include + +namespace onnxruntime { +namespace codegen { + +using DomainVersionLookupFunc = std::function; + +struct CodeGenHandle { + CodeGenTarget* codegen_target; + DomainVersionLookupFunc domain_version_lookup_func = + // by default, always uses the latest opset implemented + [](const std::string&) { return INT_MAX; }; +}; + +} // namespace codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/common/op_macro.h b/onnxruntime/core/codegen/common/op_macro.h new file mode 100644 index 0000000000000..91a0e803e521a --- /dev/null +++ b/onnxruntime/core/codegen/common/op_macro.h @@ -0,0 +1,98 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +namespace onnxruntime { + +#define LIST_BINARY_OPS() \ + BINARY_OP(Add) \ + BINARY_OP(Div) \ + BINARY_OP(Mul) \ + BINARY_OP(PRelu) \ + BINARY_OP(Sub) + +#define LIST_BINARY_CMP_OPS() \ + BINARY_CMP_OP(Equal) \ + BINARY_CMP_OP(Greater) \ + BINARY_CMP_OP(Less) + +#define LIST_POOL_OPS() \ + POOL_OP(MaxPool) \ + POOL_OP(AveragePool) \ + POOL_OP(GlobalMaxPool) \ + POOL_OP(GlobalAveragePool) + +#define LIST_REDUCE_OPS() \ + REDUCE_INDEXED_OP(ArgMax) \ + REDUCE_INDEXED_OP(ArgMin) \ + REDUCE_OP(ReduceL1) \ + REDUCE_OP(ReduceL2) \ + REDUCE_OP(ReduceLogSum) \ + REDUCE_OP(ReduceLogSumExp) \ + REDUCE_OP(ReduceMax) \ + REDUCE_OP(ReduceMean) \ + REDUCE_OP(ReduceMin) \ + REDUCE_OP(ReduceProd) \ + REDUCE_OP(ReduceSum) \ + REDUCE_OP(ReduceSumSquare) + +#define LIST_UNARY_OPS() \ + UNARY_OP(Abs) \ + UNARY_OP(Affine) \ + UNARY_OP(Ceil) \ + UNARY_OP(Elu) \ + UNARY_OP(Exp) \ + UNARY_OP(Floor) \ + UNARY_OP(HardSigmoid) \ + UNARY_OP(LeakyRelu) \ + UNARY_OP(Log) \ + UNARY_OP(Neg) \ + UNARY_OP(ParametricSoftplus) \ + UNARY_OP(Reciprocal) \ + UNARY_OP(Relu) \ + UNARY_OP(ScaledTanh) \ + UNARY_OP(Selu) \ + UNARY_OP(Sigmoid) \ + UNARY_OP(Softplus) \ + UNARY_OP(Softsign) \ + UNARY_OP(Sqrt) \ + UNARY_OP(Tanh) \ + UNARY_OP(ThresholdedRelu) + +#define LIST_VARIADIC_OPS() \ + VARIADIC_OP(Max) \ + VARIADIC_OP(Min) \ + VARIADIC_OP(Sum) + +#define LIST_ALL_GENERIC_OPS() \ + LIST_BINARY_OPS() \ + LIST_BINARY_CMP_OPS() \ + LIST_REDUCE_OPS() \ + LIST_POOL_OPS() \ + LIST_UNARY_OPS() \ + LIST_VARIADIC_OPS() \ + ADD_OP_ITEM(Cast) \ + ADD_OP_ITEM(Clip) \ + ADD_OP_ITEM(Concat) \ + ADD_OP_ITEM(Conv) \ + ADD_OP_ITEM(Crop) \ + ADD_OP_ITEM(Dropout) \ + ADD_OP_ITEM(Flatten) \ + ADD_OP_ITEM(Gather) \ + ADD_OP_ITEM(Gemm) \ + ADD_OP_ITEM(Identity) \ + ADD_OP_ITEM(LogSoftmax) \ + ADD_OP_ITEM(LSTM) \ + ADD_OP_ITEM(MatMul) \ + ADD_OP_ITEM(MatMulInteger) \ + ADD_OP_ITEM(Pad) \ + ADD_OP_ITEM(Reshape) \ + ADD_OP_ITEM(Slice) \ + ADD_OP_ITEM(Softmax) \ + ADD_OP_ITEM(Split) \ + ADD_OP_ITEM(Squeeze) \ + ADD_OP_ITEM(Transpose) \ + ADD_OP_ITEM(Unsqueeze) \ + ADD_OP_ITEM(Where) + +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/common/profile.h b/onnxruntime/core/codegen/common/profile.h new file mode 100644 index 0000000000000..642ae83db723b --- /dev/null +++ b/onnxruntime/core/codegen/common/profile.h @@ -0,0 +1,37 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
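op_macro.h above uses the X-macro idiom: each LIST_*_OPS() invocation expands a client-supplied hook macro (BINARY_OP, UNARY_OP, ADD_OP_ITEM, and so on) once per op name, so one central list can drive registration tables, declarations, and dispatch code. A small, hypothetical consumer that turns LIST_UNARY_OPS() into a set of op names:
```
#include "core/codegen/common/op_macro.h"
#include <iostream>
#include <string>
#include <unordered_set>

// Expand the unary-op list into string literals, then discard the hook macro.
static const std::unordered_set<std::string> kUnaryOps = {
#define UNARY_OP(name) #name,
    LIST_UNARY_OPS()
#undef UNARY_OP
};

int main() {
  std::cout << "Tanh handled as unary: " << (kUnaryOps.count("Tanh") ? "yes" : "no") << "\n";
  std::cout << "unary op count: " << kUnaryOps.size() << "\n";
  return 0;
}
```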
+ +#pragma once + +// uncomment this line or use -DCODEGEN_ENABLE_PROFILER in compiler options to enable profiler events in codegen +//#define CODEGEN_ENABLE_PROFILER + +#ifdef CODEGEN_ENABLE_PROFILER +#include "core/common/profiler.h" + +namespace onnxruntime { + +class ProfilerEvent { + public: + ProfilerEvent(const std::string& name) : name_(name) { + ts_ = profiling::Profiler::Instance().StartTime(); + } + + ~ProfilerEvent() { + profiling::Profiler::Instance().EndTimeAndRecordEvent(profiling::EventCategory::NODE_EVENT, name_, ts_); + } + + private: + TimePoint ts_; + const std::string name_; +}; + +} // namespace onnxruntime + +#define CODEGEN_PROFILER_EVENT(name) onnxruntime::ProfilerEvent name##_profiler_event(#name) + +#else + +#define CODEGEN_PROFILER_EVENT(name) + +#endif diff --git a/onnxruntime/core/codegen/common/registry.h b/onnxruntime/core/codegen/common/registry.h new file mode 100644 index 0000000000000..1ec06d4d8d96c --- /dev/null +++ b/onnxruntime/core/codegen/common/registry.h @@ -0,0 +1,70 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include "core/common/common.h" +#include +#include +#include + +namespace onnxruntime { +namespace codegen { + +// RegistryBase is a customized unordered_map +// that keep ownership of passes, +// including 1) IR builder passes +// 2) Weight layout transformer passes +// 3) Scheduler passses, etc. + +template +class RegistryBase { + public: + RegistryBase() = default; + + bool Contains(const std::string& name) const { + return contents_.count(name) > 0; + } + + CONTENT_TYPE* Get(const std::string& name) const { + if (contents_.find(name) != contents_.end()) + return contents_.at(name).get(); + return nullptr; + } + + CONTENT_TYPE* RegisterOrGet( + const std::string& name, + std::unique_ptr&& ptr) { + if (!Contains(name)) + contents_.emplace(name, std::move(ptr)); + return Get(name); + } + + CONTENT_TYPE* RegisterOrGet( + std::unique_ptr&& ptr) { + return RegisterOrGet(ptr->Name(), std::move(ptr)); + } + + bool Register( + const std::string& name, + std::unique_ptr&& ptr) { + if (!Contains(name)) { + contents_.emplace(name, std::move(ptr)); + return true; + } + return false; + } + + bool Register( + std::unique_ptr&& ptr) { + return Register(ptr->Name(), std::move(ptr)); + } + + protected: + std::unordered_map> contents_; + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(RegistryBase); +}; + +// Put common Registry Management utilities if these is any + +} // namespace codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/common/settings.cc b/onnxruntime/core/codegen/common/settings.cc new file mode 100644 index 0000000000000..c046f2892088d --- /dev/null +++ b/onnxruntime/core/codegen/common/settings.cc @@ -0,0 +1,74 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
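ProfilerEvent above is a scope guard: the constructor captures a start timestamp and the destructor records the elapsed interval under the event name, which is what lets CODEGEN_PROFILER_EVENT(name) time an entire block with one line. The ORT profiler APIs are outside this diff, so the sketch below reproduces only the RAII shape with std::chrono and stderr output; ScopedTimer and SCOPED_TIMER are illustrative names.
```
#include <chrono>
#include <cstdio>
#include <string>
#include <thread>

class ScopedTimer {
 public:
  explicit ScopedTimer(std::string name)
      : name_(std::move(name)), start_(std::chrono::steady_clock::now()) {}

  ~ScopedTimer() {
    const auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                        std::chrono::steady_clock::now() - start_).count();
    std::fprintf(stderr, "[profile] %s took %lld us\n", name_.c_str(),
                 static_cast<long long>(us));
  }

 private:
  std::string name_;
  std::chrono::steady_clock::time_point start_;
};

// One-line scoped timing, mirroring the CODEGEN_PROFILER_EVENT usage pattern.
#define SCOPED_TIMER(name) ScopedTimer name##_scoped_timer(#name)

int main() {
  SCOPED_TIMER(lower_to_tvm);  // destructor fires at the end of main and prints the duration
  std::this_thread::sleep_for(std::chrono::milliseconds(5));
  return 0;
}
```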
+ +#include "core/codegen/common/settings.h" + +#include "core/common/logging/logging.h" +#include +#include + +namespace onnxruntime { +namespace codegen { + +CodeGenSettings& CodeGenSettings::Instance() { + static CodeGenSettings settings; + return settings; +} + +CodeGenSettings::CodeGenSettings() {} + +void CodeGenSettings::InsertOptions(const std::map& options) { + for (const auto& option : options) { + const auto& key = option.first; + const auto& value = option.second; + + auto iter = options_.find(key); + // found existing ones + if (iter != options_.end()) { + if (iter->second != value) { + LOGS_DEFAULT(CODEGEN_SETTINGS_LOG_LEVEL) << "CodeGenSettings: option" + << key << " is overridded from: " + << iter->second << " to: " << value; + iter->second = value; + } + } else { + options_.insert(std::make_pair(key, value)); + } + } +} + +void CodeGenSettings::DumpOptions() const { + std::ostringstream stream; + stream << "CodeGenSettings: dump all options" << std::endl; + for (const auto& option : options_) { + stream << " " << option.first << " = " << option.second << std::endl; + } + LOGS_DEFAULT(CODEGEN_SETTINGS_LOG_LEVEL) << stream.str(); +} + +std::string CodeGenSettings::GetOptionValue(const std::string& key) const { + const auto& iter = options_.find(key); + if (iter == options_.end()) { + LOGS_DEFAULT(CODEGEN_SETTINGS_LOG_LEVEL) << "CodeGenSettings::GetOptionValue: unrecognized option" << key; + return ""; + } + return iter->second; +} + +bool CodeGenSettings::HasOption(const std::string& key) const { + return options_.count(key) > 0; +} + +bool CodeGenSettings::OptionMatches(const std::string& key, const std::string& value) const { + if (!HasOption(key)) + return false; + +#ifdef _WIN32 + return 0 == _stricmp(options_.at(key).c_str(), value.c_str()); +#else + return 0 == strcasecmp(options_.at(key).c_str(), value.c_str()); +#endif +} + +} // namespace codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/common/settings.h b/onnxruntime/core/codegen/common/settings.h new file mode 100644 index 0000000000000..95a2282ccb1ff --- /dev/null +++ b/onnxruntime/core/codegen/common/settings.h @@ -0,0 +1,38 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include + +namespace onnxruntime { +namespace codegen { + +// use log level warning as default to make sure logs are outputted +#define CODEGEN_SETTINGS_LOG_LEVEL WARNING + +// This stores codegen settings to control dumps, execution preference, etc. 
+// CodeGenSettings could come from command line options or environment variables +// Or could come from a static variables in source code +class CodeGenSettings { + public: + // generic built-in options + constexpr static const char* kDumpAllOptions = "dump_all_options"; + constexpr static const char* kCodeGenDumpModule = "codegen_dump_module"; // dump tvm module + constexpr static const char* kCodeGenDumpLower = "codegen_dump_lower"; // dump lowered func + constexpr static const char* kCodeGenDumpSchedule = "codegen_dump_schedule"; // dump scheduler + + void InsertOptions(const std::map& options); + void DumpOptions() const; + std::string GetOptionValue(const std::string& key) const; + bool HasOption(const std::string& key) const; + bool OptionMatches(const std::string& key, const std::string& value) const; + static CodeGenSettings& Instance(); + + private: + CodeGenSettings(); + + std::map options_; +}; + +} // namespace codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/common/target_info.h b/onnxruntime/core/codegen/common/target_info.h new file mode 100644 index 0000000000000..da063545f0a1e --- /dev/null +++ b/onnxruntime/core/codegen/common/target_info.h @@ -0,0 +1,33 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include +#include + +namespace onnxruntime { + +// CodeGenTarget holds meta info for backend code generation +// and will be lowered to a target of corresponding backend +// code generation, e.g. TVM's Target. +class CodeGenTarget { + public: + CodeGenTarget() {} + CodeGenTarget(const std::string& target_name) + : target_name_(target_name) {} + + virtual int NaturalVectorWidth(int /*bits*/) const { + return 1; + } + + const std::string& GetTargetName() const { + return target_name_; + } + + virtual ~CodeGenTarget() = default; + + private: + std::string target_name_{"unknown"}; // default name is unknown +}; + +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/common/utils.cc b/onnxruntime/core/codegen/common/utils.cc new file mode 100644 index 0000000000000..45c2436a18a82 --- /dev/null +++ b/onnxruntime/core/codegen/common/utils.cc @@ -0,0 +1,54 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/common/utils.h" + +#include +#include + +namespace onnxruntime { + +std::unique_ptr GetEnv(const char* var) { + char* val = nullptr; +#if _MSC_VER + size_t len; + + if (_dupenv_s(&val, &len, var)) { + // Something went wrong, just return nullptr. + return nullptr; + } +#else + val = getenv(var); +#endif // _MSC_VER + + if (val == nullptr) { + return nullptr; + } + + // On windows, we will have to explicitly free val. Instead of returning val + // to its caller and make distinguish between windows and linux, we return + // a unique_ptr, and it will be destroyed automatically after the caller + // completes. 
+ size_t len_val = strlen(val) + 1; + auto p = std::make_unique(len_val); + // use explicit loop to get ride of VC's warning on unsafe copy + for (size_t i = 0; i < len_val; ++i) { + p[i] = val[i]; + } + return p; +} + +bool IsEnvVarDefined(const char* var) { + auto val = GetEnv(var); + return val != nullptr; +} + +int64_t TotalSize(const std::vector& shape) { + int64_t total = 1; + for (auto s : shape) { + total *= s; + } + return total; +} + +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/common/utils.h b/onnxruntime/core/codegen/common/utils.h new file mode 100644 index 0000000000000..40f300888d680 --- /dev/null +++ b/onnxruntime/core/codegen/common/utils.h @@ -0,0 +1,20 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include +#include +#include + +namespace onnxruntime { + +// Holding utility functions that are not tied to TVM and ORT + +std::unique_ptr GetEnv(const char* var); + +// Check if an environment variable is set +bool IsEnvVarDefined(const char* var); + +int64_t TotalSize(const std::vector& shape); + +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/common.h b/onnxruntime/core/codegen/mti/common.h new file mode 100644 index 0000000000000..be3896bed48fb --- /dev/null +++ b/onnxruntime/core/codegen/mti/common.h @@ -0,0 +1,16 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include +#include + +#define MTI_ASSERT(condition) \ + if (!(condition)) { \ + std::string error_msg = "Not satsified: " #condition \ + ": line " + std::to_string(__LINE__) + \ + " in file " + std::string(__FILE__) + "\n"; \ + throw std::runtime_error(error_msg); \ + } + diff --git a/onnxruntime/core/codegen/mti/debug/tvm_print.cc b/onnxruntime/core/codegen/mti/debug/tvm_print.cc new file mode 100644 index 0000000000000..0491636032b47 --- /dev/null +++ b/onnxruntime/core/codegen/mti/debug/tvm_print.cc @@ -0,0 +1,83 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
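GetEnv above hides the difference between POSIX getenv and MSVC's _dupenv_s by always returning an owning smart pointer, so callers never need to know whether the string has to be freed. A hypothetical caller looks like the following; the pointer's element type is assumed to be char[], since template arguments are not visible in this diff.
```
#include "core/codegen/common/utils.h"
#include <cstdio>

int main() {
  if (onnxruntime::IsEnvVarDefined("PATH")) {
    auto value = onnxruntime::GetEnv("PATH");  // owning copy; released automatically
    std::printf("PATH = %s\n", value.get());
  } else {
    std::printf("PATH is not set\n");
  }
  return 0;
}
```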
+ +#include "core/codegen/mti/debug/tvm_print.h" + +#include "core/codegen/common/utils.h" +#include "core/codegen/common/dump_array.h" +#include "core/codegen/mti/common.h" +#include + +namespace onnxruntime { +namespace tvm_codegen { + +TVM_REGISTER_GLOBAL("tvm.contrib.onnxruntime.print") + .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* /*ret*/) { + DLTensor* X = args[0]; + DLTensor* Y = args[1]; + + DLDataType dtype = X->dtype; + std::vector shape; + int64_t total_size = 1; + for (int i = 0; i < X->ndim; ++i) { + shape.push_back(X->shape[i]); + total_size *= X->shape[i]; + } + + // pass X to Y + memcpy(static_cast(Y->data) + Y->byte_offset, + static_cast(X->data) + X->byte_offset, + total_size * dtype.bits / 8); + + if (tvm::runtime::TypeMatch(dtype, kDLFloat, 32)) { + float* data = reinterpret_cast(static_cast(X->data) + X->byte_offset); + DumpArray("float tensor:", data, shape); + } else if (tvm::runtime::TypeMatch(dtype, kDLInt, 8)) { + int8_t* data = reinterpret_cast(static_cast(X->data) + X->byte_offset); + DumpArray("int8 tensor:", data, shape); + } else if (tvm::runtime::TypeMatch(dtype, kDLInt, 16)) { + int16_t* data = reinterpret_cast(static_cast(X->data) + X->byte_offset); + DumpArray("int16 tensor:", data, shape); + } else if (tvm::runtime::TypeMatch(dtype, kDLInt, 32)) { + int32_t* data = reinterpret_cast(static_cast(X->data) + X->byte_offset); + DumpArray("int32 tensor:", data, shape); + } else if (tvm::runtime::TypeMatch(dtype, kDLUInt, 8)) { + uint8_t* data = reinterpret_cast(static_cast(X->data) + X->byte_offset); + DumpArray("uint8 tensor:", data, shape); + } else if (tvm::runtime::TypeMatch(dtype, kDLUInt, 16)) { + uint16_t* data = reinterpret_cast(static_cast(X->data) + X->byte_offset); + DumpArray("uint16 tensor:", data, shape); + } else if (tvm::runtime::TypeMatch(dtype, kDLUInt, 32)) { + uint32_t* data = reinterpret_cast(static_cast(X->data) + X->byte_offset); + DumpArray("uint32 tensor:", data, shape); + } else { + MTI_ASSERT(0 && "not implemented!"); + } + }); + +tvm::Array +PrintTVMTensorExtern(const tvm::Tensor& X, + const std::string& name) { + return topi::detail::make_extern( + {X->shape}, + {X->dtype}, + {X}, + [&](tvm::Array ins, tvm::Array outs) { + return topi::detail::call_packed({tvm::Expr("tvm.contrib.onnxruntime.print"), + topi::detail::pack_buffer(ins[0]), + topi::detail::pack_buffer(outs[0])}); + }, + name + "_print", "", {}); +} + +tvm::Tensor PrintImmutable(const tvm::Tensor& X) { + auto outputs = PrintTVMTensorExtern(X, X->op->name + "_print"); + return outputs[0]; +} + +void Print(tvm::Tensor& X) { + X = PrintImmutable(X); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/debug/tvm_print.h b/onnxruntime/core/codegen/mti/debug/tvm_print.h new file mode 100644 index 0000000000000..91a334785a2a4 --- /dev/null +++ b/onnxruntime/core/codegen/mti/debug/tvm_print.h @@ -0,0 +1,19 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once +#include + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Array PrintTVMTensorExtern( + const tvm::Tensor& X, + const std::string& name = "PrintTVM2DTensorExtern"); + +tvm::Tensor PrintImmutable(const tvm::Tensor& X); + +void Print(tvm::Tensor& X); + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/math/binary_ops.cc b/onnxruntime/core/codegen/mti/math/binary_ops.cc new file mode 100644 index 0000000000000..f3048799458f4 --- /dev/null +++ b/onnxruntime/core/codegen/mti/math/binary_ops.cc @@ -0,0 +1,70 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/mti/math/binary_ops.h" + +#include "core/codegen/mti/math/unary_ops.h" +#include "core/codegen/mti/mti_tvm_utils.h" +#include "core/codegen/mti/tensor/cast_ops.h" +#include + +// Using namespace topi for override operator +-*/ +using namespace topi; + +namespace onnxruntime { +namespace tvm_codegen { + +#define TVM_BINARY_OP1(op, expr) \ + tvm::Tensor op(const tvm::Tensor& lhs, const tvm::Tensor& rhs, const std::string& name) { \ + return Rename(expr, name); \ + } \ + tvm::Tensor op(const tvm::Tensor& lhs, const tvm::Expr& rhs, const std::string& name) { \ + return Rename(expr, name); \ + } + +#define TVM_BINARY_OP(op, expr) \ + TVM_BINARY_OP1(op, expr) \ + tvm::Tensor op(const tvm::Expr& lhs, const tvm::Tensor& rhs, const std::string& name) { \ + return Rename(expr, name); \ + } + +TVM_BINARY_OP(Add, lhs + rhs); +TVM_BINARY_OP(Div, lhs / rhs); +TVM_BINARY_OP(Max, maximum(lhs, rhs)); +TVM_BINARY_OP(Min, minimum(lhs, rhs)); +TVM_BINARY_OP(Mul, lhs* rhs); +TVM_BINARY_OP1(PRelu, Relu(lhs) - rhs * Relu(0 - lhs)); +TVM_BINARY_OP(Sub, lhs - rhs); + +tvm::Tensor Equal(const tvm::Tensor& lhs, const tvm::Tensor& rhs, const std::string& name) { + return topi::equal(lhs, rhs, name); +} +tvm::Tensor Equal(const tvm::Tensor& lhs, const tvm::Expr& rhs, const std::string& name) { + return topi::equal(lhs, rhs, name); +} +tvm::Tensor Equal(const tvm::Expr& lhs, const tvm::Tensor& rhs, const std::string& name) { + return topi::equal(lhs, rhs, name); +} + +tvm::Tensor Greater(const tvm::Tensor& lhs, const tvm::Tensor& rhs, const std::string& name) { + return topi::greater(lhs, rhs, name); +} +tvm::Tensor Greater(const tvm::Tensor& lhs, const tvm::Expr& rhs, const std::string& name) { + return topi::greater(lhs, rhs, name); +} +tvm::Tensor Greater(const tvm::Expr& lhs, const tvm::Tensor& rhs, const std::string& name) { + return topi::greater(lhs, rhs, name); +} + +tvm::Tensor Less(const tvm::Tensor& lhs, const tvm::Tensor& rhs, const std::string& name) { + return topi::less(lhs, rhs, name); +} +tvm::Tensor Less(const tvm::Tensor& lhs, const tvm::Expr& rhs, const std::string& name) { + return topi::less(lhs, rhs, name); +} +tvm::Tensor Less(const tvm::Expr& lhs, const tvm::Tensor& rhs, const std::string& name) { + return topi::less(lhs, rhs, name); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/math/binary_ops.h b/onnxruntime/core/codegen/mti/math/binary_ops.h new file mode 100644 index 0000000000000..dd51ce5e7917d --- /dev/null +++ b/onnxruntime/core/codegen/mti/math/binary_ops.h @@ -0,0 +1,42 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
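binary_ops.cc above composes PRelu from two Relu calls through the rewrite prelu(x, a) = relu(x) - a * relu(-x), keeping the op expressible with the existing unary and binary building blocks. The scalar check below verifies that identity in plain C++; the reference formula and tolerance are mine, not part of the PR.
```
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdio>

int main() {
  auto relu = [](float x) { return std::max(x, 0.0f); };
  auto prelu_reference = [](float x, float a) { return x >= 0.0f ? x : a * x; };

  const float a = 0.1f;
  for (float x : {-2.0f, -0.5f, 0.0f, 0.75f, 3.0f}) {
    const float rewritten = relu(x) - a * relu(-x);  // the form used by the TVM op above
    assert(std::fabs(rewritten - prelu_reference(x, a)) < 1e-6f);
  }
  std::printf("PRelu rewrite matches the reference on all samples\n");
  return 0;
}
```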
+ +#pragma once +#include +#include + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Tensor Add(const tvm::Tensor& lhs, const tvm::Tensor& rhs, const std::string& name = "add"); +tvm::Tensor Add(const tvm::Tensor& lhs, const tvm::Expr& rhs, const std::string& name = "add"); +tvm::Tensor Add(const tvm::Expr& lhs, const tvm::Tensor& rhs, const std::string& name = "add"); +tvm::Tensor Div(const tvm::Tensor& lhs, const tvm::Tensor& rhs, const std::string& name = "div"); +tvm::Tensor Div(const tvm::Tensor& lhs, const tvm::Expr& rhs, const std::string& name = "div"); +tvm::Tensor Div(const tvm::Expr& lhs, const tvm::Tensor& rhs, const std::string& name = "div"); +tvm::Tensor Equal(const tvm::Tensor& lhs, const tvm::Tensor& rhs, const std::string& name = "equal"); +tvm::Tensor Equal(const tvm::Tensor& lhs, const tvm::Expr& rhs, const std::string& name = "equal"); +tvm::Tensor Equal(const tvm::Expr& lhs, const tvm::Tensor& rhs, const std::string& name = "equal"); +tvm::Tensor Greater(const tvm::Tensor& lhs, const tvm::Tensor& rhs, const std::string& name = "greater"); +tvm::Tensor Greater(const tvm::Tensor& lhs, const tvm::Expr& rhs, const std::string& name = "greater"); +tvm::Tensor Greater(const tvm::Expr& lhs, const tvm::Tensor& rhs, const std::string& name = "greater"); +tvm::Tensor Less(const tvm::Tensor& lhs, const tvm::Tensor& rhs, const std::string& name = "less"); +tvm::Tensor Less(const tvm::Tensor& lhs, const tvm::Expr& rhs, const std::string& name = "less"); +tvm::Tensor Less(const tvm::Expr& lhs, const tvm::Tensor& rhs, const std::string& name = "less"); +tvm::Tensor Max(const tvm::Tensor& lhs, const tvm::Tensor& rhs, const std::string& name = "max"); +tvm::Tensor Max(const tvm::Tensor& lhs, const tvm::Expr& rhs, const std::string& name = "max"); +tvm::Tensor Max(const tvm::Expr& lhs, const tvm::Tensor& rhs, const std::string& name = "max"); +tvm::Tensor Min(const tvm::Tensor& lhs, const tvm::Tensor& rhs, const std::string& name = "min"); +tvm::Tensor Min(const tvm::Tensor& lhs, const tvm::Expr& rhs, const std::string& name = "min"); +tvm::Tensor Min(const tvm::Expr& lhs, const tvm::Tensor& rhs, const std::string& name = "min"); +tvm::Tensor Mul(const tvm::Tensor& lhs, const tvm::Tensor& rhs, const std::string& name = "mul"); +tvm::Tensor Mul(const tvm::Tensor& lhs, const tvm::Expr& rhs, const std::string& name = "mul"); +tvm::Tensor Mul(const tvm::Expr& lhs, const tvm::Tensor& rhs, const std::string& name = "mul"); +tvm::Tensor PRelu(const tvm::Tensor& lhs, const tvm::Tensor& rhs, const std::string& name = "prelu"); +tvm::Tensor PRelu(const tvm::Tensor& lhs, const tvm::Expr& rhs, const std::string& name = "prelu"); +tvm::Tensor Sub(const tvm::Tensor& lhs, const tvm::Tensor& rhs, const std::string& name = "sub"); +tvm::Tensor Sub(const tvm::Tensor& lhs, const tvm::Expr& rhs, const std::string& name = "sub"); +tvm::Tensor Sub(const tvm::Expr& lhs, const tvm::Tensor& rhs, const std::string& name = "sub"); + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/math/gemm.cc b/onnxruntime/core/codegen/mti/math/gemm.cc new file mode 100644 index 0000000000000..b5e5da5301775 --- /dev/null +++ b/onnxruntime/core/codegen/mti/math/gemm.cc @@ -0,0 +1,28 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
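The Gemm helper defined just below in gemm.cc lowers ONNX Gemm as alpha * matmul(A, B) + beta * C on top of MatMul2D, and drops the beta term entirely when beta is zero. The tiny scalar reference below pins down that arithmetic for a non-transposed 2x2 case; the values are arbitrary.
```
#include <array>
#include <cstdio>

// Reference: Y[i][j] = alpha * sum_k A[i][k] * B[k][j] + beta * C[i][j]
int main() {
  const float alpha = 2.0f, beta = 0.5f;
  const std::array<std::array<float, 2>, 2> A = {{{1, 2}, {3, 4}}};
  const std::array<std::array<float, 2>, 2> B = {{{5, 6}, {7, 8}}};
  const std::array<std::array<float, 2>, 2> C = {{{1, 1}, {1, 1}}};

  for (int i = 0; i < 2; ++i) {
    for (int j = 0; j < 2; ++j) {
      float acc = 0.0f;
      for (int k = 0; k < 2; ++k) acc += A[i][k] * B[k][j];
      std::printf("%g ", alpha * acc + beta * C[i][j]);
    }
    std::printf("\n");
  }
  return 0;
}
```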
+ +#include "core/codegen/mti/math/gemm.h" + +#include "core/codegen/mti/math/matmul_ops.h" +#include "core/codegen/mti/mti_tvm_utils.h" +#include + +// Using namespace topi for override operator +-*/ +using namespace topi; + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Tensor Gemm(const tvm::Tensor& A, const tvm::Tensor& B, const tvm::Tensor& C, + bool trans_A, bool trans_B, float alpha, float beta, + const std::string& name) { + auto A_dot_B = MatMul2D(A, B, trans_A, trans_B, name + "_matmul2d"); + if (beta != 0) { + return Rename(alpha * A_dot_B + (beta * C), name); + } else { + return Rename(alpha * A_dot_B, name); + } +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/math/gemm.h b/onnxruntime/core/codegen/mti/math/gemm.h new file mode 100644 index 0000000000000..3bb205c13fdc9 --- /dev/null +++ b/onnxruntime/core/codegen/mti/math/gemm.h @@ -0,0 +1,16 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include +#include + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Tensor Gemm(const tvm::Tensor& p_A, const tvm::Tensor& p_B, const tvm::Tensor& p_C, + bool trans_A, bool trans_B, float alpha, float beta, + const std::string& name = "gemm"); + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/math/logsoftmax.cc b/onnxruntime/core/codegen/mti/math/logsoftmax.cc new file mode 100644 index 0000000000000..cd8c2edae6959 --- /dev/null +++ b/onnxruntime/core/codegen/mti/math/logsoftmax.cc @@ -0,0 +1,18 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/mti/math/logsoftmax.h" + +#include "core/codegen/mti/tensor/reshape_ops.h" +#include + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Tensor LogSoftmax(const tvm::Tensor& input, int64_t axis, const std::string& name) { + tvm::Tensor flatten_t = Flatten(input, axis, "logsoftmax_flatten"); + return Reshape(topi::nn::log_softmax(flatten_t, name), input->shape, "logsoftmax_reshape"); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/math/logsoftmax.h b/onnxruntime/core/codegen/mti/math/logsoftmax.h new file mode 100644 index 0000000000000..606a32806434b --- /dev/null +++ b/onnxruntime/core/codegen/mti/math/logsoftmax.h @@ -0,0 +1,11 @@ +#pragma once +#include +#include + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Tensor LogSoftmax(const tvm::Tensor& input, int64_t axis, const std::string& name = "logsoftmax"); + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/math/matmul_ops.cc b/onnxruntime/core/codegen/mti/math/matmul_ops.cc new file mode 100644 index 0000000000000..672aa3a6cf8db --- /dev/null +++ b/onnxruntime/core/codegen/mti/math/matmul_ops.cc @@ -0,0 +1,138 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/mti/math/matmul_ops.h" + +#include "core/codegen/mti/common.h" +#include "core/codegen/mti/mti_tvm_utils.h" +#include + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Tensor MatMul2D(const tvm::Tensor& A, const tvm::Tensor& B, bool trans_a, bool trans_b, const std::string& name) { + return topi::matmul(A, B, trans_a, trans_b, name); +} + +/* + * Generic Matrix Multiplication + * + * If both arguments are 2-D, they are multiplied like conventional matrices. 
+ * + * If either argument is N-D and N > 2, it is treated as a stack of matrices residing in the last two indexes and broadcast accordingly. + * + * If the first argument is 1-D, it is promoted to a matrix by prepending a 1 to its dimensions. + * After matrix multiplication the prepended 1 is removed. + * + * If the second argument is 1-D, it is promoted to a matrix by appending a 1 to its dimensions. + * After matrix multiplication the appended 1 is removed. + */ +tvm::Tensor MatMul(const tvm::Tensor& A, const tvm::Tensor& B, const std::string& name) { + int64_t a_rank = static_cast(A->shape.size()); + int64_t b_rank = static_cast(B->shape.size()); + const auto& A_shape = A->shape; + const auto& B_shape = B->shape; + if (a_rank == 2 && b_rank == 2) { + // 2-D X 2-D + return MatMul2D(A, B); + } else if (a_rank == 1 && b_rank == 1) { + // 1-D X 1-D + auto k = tvm::reduce_axis(tvm::Range(0, A_shape[0]), "k"); + + return tvm::compute( + {}, + [&](const tvm::Array& /*indices*/) { + return tvm::sum(A[k] * B[k], {k}); + }, + name); + } else if (a_rank == 1) { + // 1-D X n-D + auto k = tvm::reduce_axis(tvm::Range(0, A_shape[0]), "k"); + + auto l = [&](const tvm::Array& indices) { + auto ndims = indices.size(); + MTI_ASSERT(ndims >= 1); + tvm::Array b_indices; + for (size_t bi = 0; bi < ndims - 1; ++bi) { + b_indices.push_back(indices[bi]); + } + b_indices.push_back(k); + b_indices.push_back(indices[ndims - 1]); + return tvm::sum(A({k}) * B(b_indices), {k}); + }; + return tvm::compute(ConcatShapes(SliceShapeToDimension(B_shape, -2), SliceShapeFromDimension(B_shape, -1)), l, name); + } else if (b_rank == 1) { + // n-D X 1-D + auto k = tvm::reduce_axis(tvm::Range(0, B_shape[0]), "k"); + + auto l = [&](const tvm::Array& indices) { + tvm::Array a_indices(indices.begin(), indices.end()); + a_indices.push_back(k); + return tvm::sum(A(a_indices) * B({k}), {k}); + }; + return tvm::compute(SliceShapeToDimension(A->shape, -1), l, name); + } else { + // n-D X m-D + MTI_ASSERT(a_rank >= 2 && b_rank >= 2); + auto k = tvm::reduce_axis(tvm::Range(0, A_shape[a_rank - 1]), "k"); + + auto l = [&](const tvm::Array& indices) { + auto ndims = static_cast(indices.size()); + MTI_ASSERT(ndims > 2); + tvm::Array a_indices, b_indices; + + // handle broadcasting + int i = 0, a_idx = 0, b_idx = 0; + bool a_greater = a_rank > b_rank; + for (; i < std::abs(a_rank - b_rank); ++i) { + if (a_greater) { + a_indices.push_back(indices[i]); + a_idx++; + } else { + b_indices.push_back(indices[i]); + b_idx++; + } + } + for (; i < ndims - 2; ++i, ++a_idx, ++b_idx) { + auto tp = indices[i].type(); + if (IsOne(A_shape, a_idx)) { + a_indices.push_back(tvm::make_zero(tp)); + b_indices.push_back(indices[i]); + } else if (IsOne(B_shape, b_idx)) { + b_indices.push_back(tvm::make_zero(tp)); + a_indices.push_back(indices[i]); + } else { + a_indices.push_back(indices[i]); + b_indices.push_back(indices[i]); + } + } + + MTI_ASSERT(a_idx == a_rank - 2 && b_idx == b_rank - 2); + a_indices.push_back(indices[ndims - 2]); + a_indices.push_back(k); + + b_indices.push_back(k); + b_indices.push_back(indices[ndims - 1]); + + return tvm::sum(A(a_indices) * B(b_indices), {k}); + }; + + tvm::Array output_shape; + int64_t output_rank = std::max(a_rank, b_rank); + MTI_ASSERT(tvm::ir::Equal(A_shape[a_rank - 1], B_shape[b_rank - 2])); + for (int64_t i = 0; i < output_rank - 2; i++) { + tvm::Expr broadcasted_dim = tvm::make_const(HalideIR::Int(32), 1); + bool broadcasted = + BroadcastDim(A_shape, i, output_rank, broadcasted_dim) && + BroadcastDim(B_shape, i, 
output_rank, broadcasted_dim); + MTI_ASSERT(broadcasted); + output_shape.push_back(broadcasted_dim); + } + output_shape.push_back(A_shape[a_rank - 2]); + output_shape.push_back(B_shape[b_rank - 1]); + return tvm::compute(output_shape, l, name); + } +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/math/matmul_ops.h b/onnxruntime/core/codegen/mti/math/matmul_ops.h new file mode 100644 index 0000000000000..c149486a87fab --- /dev/null +++ b/onnxruntime/core/codegen/mti/math/matmul_ops.h @@ -0,0 +1,16 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include +#include + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Tensor MatMul2D(const tvm::Tensor& A, const tvm::Tensor& B, bool trans_a = false, bool trans_b = false, const std::string& name = "matmul2d"); + +tvm::Tensor MatMul(const tvm::Tensor& A, const tvm::Tensor& B, const std::string& name = "matmul"); + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/math/reduce_ops.cc b/onnxruntime/core/codegen/mti/math/reduce_ops.cc new file mode 100644 index 0000000000000..7d179e2b04316 --- /dev/null +++ b/onnxruntime/core/codegen/mti/math/reduce_ops.cc @@ -0,0 +1,90 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/mti/math/reduce_ops.h" + +#include "core/codegen/mti/math/binary_ops.h" +#include "core/codegen/mti/math/unary_ops.h" +#include "core/codegen/mti/mti_tvm_utils.h" +#include + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Tensor ArgMax(const tvm::Tensor& X, int64_t axis, bool keep_dims, const std::string& name) { + return Rename(topi::argmax(X, ToTvmArrayInt({axis}), keep_dims), name); +} + +tvm::Tensor ArgMin(const tvm::Tensor& X, int64_t axis, bool keep_dims, const std::string& name) { + return Rename(topi::argmin(X, ToTvmArrayInt({axis}), keep_dims), name); +} + +tvm::Tensor ReduceL1(const tvm::Tensor& X, const std::vector& axes, bool keep_dims, const std::string& name) { + return ReduceSum(Abs(X), axes, keep_dims, name); +} + +tvm::Tensor ReduceL2(const tvm::Tensor& X, const std::vector& axes, bool keep_dims, const std::string& name) { + return Sqrt(ReduceSumSquare(X, axes, keep_dims), name); +} + +tvm::Tensor ReduceLogSum(const tvm::Tensor& X, const std::vector& axes, bool keep_dims, const std::string& name) { + return Log(ReduceSum(X, axes, keep_dims), name); +} + +tvm::Tensor ReduceLogSumExp(const tvm::Tensor& X, const std::vector& axes, bool keep_dims, const std::string& name) { + tvm::Tensor reduce_max = ReduceMax(X, axes, true); + tvm::Tensor exp_delta = Exp(Sub(X, reduce_max)); + tvm::Tensor reduce_max_keep_dims = ReduceMax(X, axes, keep_dims); + return Add(ReduceLogSum(exp_delta, axes, keep_dims), reduce_max_keep_dims, name); +} + +tvm::Tensor ReduceMax(const tvm::Tensor& X, const std::vector& axes, bool keep_dims, const std::string& name) { + return Rename(topi::max(X, ToTvmArrayInt(axes), keep_dims), name); +} + +tvm::Tensor ReduceMean(const tvm::Tensor& X, const std::vector& axes, bool keep_dims, const std::string& name) { + tvm::Tensor reduce_sum = ReduceSum(X, axes, keep_dims); + tvm::Expr count = tvm::make_const(reduce_sum->dtype, 1.0f); + if (axes.empty()) { + for (const auto& dim : X->shape) + count = count * dim; + } else { + for (int64_t axis : axes) { + int64_t i = HandleNegativeAxis(axis, X->shape.size()); + count = count * X->shape[i]; + } + } + 
return tvm::compute( + reduce_sum->shape, + [&](const tvm::Array& i) { + return reduce_sum(i) / count; + }, + name); +} + +tvm::Tensor ReduceMin(const tvm::Tensor& X, const std::vector& axes, bool keep_dims, const std::string& name) { + return Rename(topi::min(X, ToTvmArrayInt(axes), keep_dims), name); +} + +tvm::Tensor ReduceProd(const tvm::Tensor& X, const std::vector& axes, bool keep_dims, const std::string& name) { + auto prod = [](tvm::Expr source, tvm::Array rdom) { + tvm::Var x("x", source.type()), y("y", source.type()); + tvm::Expr Rename_element = tvm::make_const(source.type(), 1.0f); + tvm::ir::CommReducer combiner = + tvm::ir::CommReducerNode::make({x}, {y}, {x * y}, {Rename_element}); + return tvm::ir::Reduce::make(combiner, {source}, rdom, tvm::make_const(tvm::Bool(1), true), 0); + }; + + return Rename(topi::CommReduce(X, ToTvmArrayInt(axes), prod, keep_dims, true), name); +} + +tvm::Tensor ReduceSum(const tvm::Tensor& X, const std::vector& axes, bool keep_dims, const std::string& name) { + return Rename(topi::sum(X, ToTvmArrayInt(axes), keep_dims), name); +} + +tvm::Tensor ReduceSumSquare(const tvm::Tensor& X, const std::vector& axes, bool keep_dims, const std::string& name) { + return Rename(topi::sum(Mul(X, X), ToTvmArrayInt(axes), keep_dims), name); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/math/reduce_ops.h b/onnxruntime/core/codegen/mti/math/reduce_ops.h new file mode 100644 index 0000000000000..f782df5e6515f --- /dev/null +++ b/onnxruntime/core/codegen/mti/math/reduce_ops.h @@ -0,0 +1,72 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include +#include + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Tensor ArgMax(const tvm::Tensor& X, + int64_t axis, + bool keep_dims, + const std::string& name = "argmax"); + +tvm::Tensor ArgMin(const tvm::Tensor& X, + int64_t axis, + bool keep_dims, + const std::string& name = "argmin"); + +tvm::Tensor ReduceL1(const tvm::Tensor& X, + const std::vector& axes, + bool keep_dims, + const std::string& name = "reduce_l1"); + +tvm::Tensor ReduceL2(const tvm::Tensor& X, + const std::vector& axes, + bool keep_dims, + const std::string& name = "reduce_l2"); + +tvm::Tensor ReduceLogSum(const tvm::Tensor& X, + const std::vector& axes, + bool keep_dims, + const std::string& name = "reduce_log_sum"); + +tvm::Tensor ReduceLogSumExp(const tvm::Tensor& X, + const std::vector& axes, + bool keep_dims, + const std::string& name = "argmareduce_log_sum_exp"); + +tvm::Tensor ReduceMax(const tvm::Tensor& X, + const std::vector& axes, + bool keep_dims, + const std::string& name = "reduce_max"); + +tvm::Tensor ReduceMean(const tvm::Tensor& X, + const std::vector& axes, + bool keep_dims, + const std::string& name = "reduce_mean"); + +tvm::Tensor ReduceMin(const tvm::Tensor& X, + const std::vector& axes, + bool keep_dims, + const std::string& name = "reduce_min"); + +tvm::Tensor ReduceProd(const tvm::Tensor& X, + const std::vector& axes, + bool keep_dims, + const std::string& name = "reduce_prod"); + +tvm::Tensor ReduceSum(const tvm::Tensor& X, + const std::vector& axes, + bool keep_dims, + const std::string& name = "reduce_sum"); + +tvm::Tensor ReduceSumSquare(const tvm::Tensor& X, + const std::vector& axes, + bool keep_dims, + const std::string& name = "reduce_sum_square"); + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/math/softmax.cc 
b/onnxruntime/core/codegen/mti/math/softmax.cc new file mode 100644 index 0000000000000..d7404137bb873 --- /dev/null +++ b/onnxruntime/core/codegen/mti/math/softmax.cc @@ -0,0 +1,18 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/mti/math/softmax.h" + +#include "core/codegen/mti/tensor/reshape_ops.h" +#include + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Tensor Softmax(const tvm::Tensor& input, int64_t axis, const std::string& name) { + tvm::Tensor flatten_t = Flatten(input, axis, "softmax_flatten"); + return Reshape(topi::nn::softmax(flatten_t, 1, name), input->shape, "softmax_reshape"); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/math/softmax.h b/onnxruntime/core/codegen/mti/math/softmax.h new file mode 100644 index 0000000000000..fb16fbaeb56a2 --- /dev/null +++ b/onnxruntime/core/codegen/mti/math/softmax.h @@ -0,0 +1,11 @@ +#pragma once +#include +#include + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Tensor Softmax(const tvm::Tensor& input, int64_t axis, const std::string& name = "softmax"); + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/math/unary_ops.cc b/onnxruntime/core/codegen/mti/math/unary_ops.cc new file mode 100644 index 0000000000000..7f45a9115fb0b --- /dev/null +++ b/onnxruntime/core/codegen/mti/math/unary_ops.cc @@ -0,0 +1,142 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/mti/math/unary_ops.h" + +#include "core/codegen/common/settings.h" +#include "core/codegen/mti/mti_tvm_utils.h" +#include +#include +#include +#include + +// Using namespace topi for override operator +-*/ +using namespace topi; + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Tensor Abs(const tvm::Tensor& X, const std::string& name) { + return abs(X, name); +} + +tvm::Tensor Affine(const tvm::Tensor& X, float alpha, float beta, const std::string& name) { + return Rename(alpha * X + beta, name); +} + +tvm::Tensor Ceil(const tvm::Tensor& X, const std::string& name) { + return topi::ceil(X, name); +} + +tvm::Tensor Clip(const tvm::Tensor& X, float min_value, float max_value, const std::string& name) { + auto Y = tvm::compute( + X->shape, + [&](const tvm::Array& indices) { + return tvm::min(tvm::max(X(indices), min_value), max_value); + }, + name); + return Y; +} + +tvm::Tensor Elu(const tvm::Tensor& X, float alpha, const std::string& name) { + return Rename(Relu(X) - alpha * Relu(1 - Exp(X)), name); +} + +tvm::Tensor Exp(const tvm::Tensor& X, const std::string& name) { + return tvm::compute( + X->shape, + [&](const tvm::Array& indices) { + return tvm::exp(X(indices)); + }, + name); +} + +tvm::Tensor Floor(const tvm::Tensor& X, const std::string& name) { + return topi::floor(X, name); +} + +tvm::Tensor HardSigmoid(const tvm::Tensor& X, float alpha, float beta, const std::string& name) { + return maximum(0, minimum(1, alpha * X + beta), name); +} + +tvm::Tensor LeakyRelu(const tvm::Tensor& X, float alpha, const std::string& name) { + return Rename(Relu(X) - alpha * Relu(0 - X), name); +} + +tvm::Tensor Log(const tvm::Tensor& X, const std::string& name) { + return tvm::compute( + X->shape, + [&](const tvm::Array& indices) { + return tvm::log(X(indices)); + }, + name); +} + +tvm::Tensor Neg(const tvm::Tensor& X, const std::string& name) { + return negative(X, name); +} + +tvm::Tensor 
ParametricSoftplus(const tvm::Tensor& X, float alpha, float beta, const std::string& name) { + return Rename(alpha * Softplus(beta * X), name); +} + +tvm::Tensor Reciprocal(const tvm::Tensor& X, const std::string& name) { + return Rename(1 / X, name); +} + +tvm::Tensor Relu(const tvm::Tensor& X, const std::string& name) { + return maximum(X, 0, name); +} + +tvm::Tensor ScaledTanh(const tvm::Tensor& X, float alpha, float beta, const std::string& name) { + return Rename(alpha * Tanh(beta * X), name); +} + +tvm::Tensor Selu(const tvm::Tensor& X, float alpha, float gamma, const std::string& name) { + return Rename(gamma * (-alpha * Relu(1 - Exp(X)) + Relu(X)), name); +} + +tvm::Tensor Sigmoid(const tvm::Tensor& X, const std::string& name) { + return tvm::compute( + X->shape, + [&](const tvm::Array& indices) { + return tvm::ir::Select::make(X(indices) > 0, + 1 / (1 + tvm::exp(-X(indices))), + tvm::exp(X(indices)) / (tvm::exp(X(indices)) + 1)); + }, + name); +} + +tvm::Tensor SignNoZero(const tvm::Tensor& X, const std::string& name) { + return Rename(greater_equal(X, 0) * 2 - 1, name); +} + +tvm::Tensor Softplus(const tvm::Tensor& X, const std::string& name) { + return Rename(Log(1 + Exp(Neg(Abs(X)))) + Relu(X), name); +} + +tvm::Tensor Softsign(const tvm::Tensor& X, const std::string& name) { + return Rename(X / (1 + Abs(X)), name); +} + +tvm::Tensor Sqrt(const tvm::Tensor& X, const std::string& name) { + return sqrt(X, name); +} + +tvm::Tensor Tanh(const tvm::Tensor& X, const std::string& name) { + return tvm::compute( + X->shape, + [&](const tvm::Array& indices) { + return tvm::ir::Select::make(X(indices) < 0, + (tvm::exp(2 * X(indices)) - 1) / (tvm::exp(2 * X(indices)) + 1), + (1 - tvm::exp(-2 * X(indices))) / (1 + tvm::exp(-2 * X(indices)))); + }, + name); +} + +tvm::Tensor ThresholdedRelu(const tvm::Tensor& X, float alpha, const std::string& name) { + return topi::where(greater(X, alpha), X, topi::full_like(X, tvm::make_zero(X->dtype)), name); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/math/unary_ops.h b/onnxruntime/core/codegen/mti/math/unary_ops.h new file mode 100644 index 0000000000000..ae1f17099fa7e --- /dev/null +++ b/onnxruntime/core/codegen/mti/math/unary_ops.h @@ -0,0 +1,36 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
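ReduceLogSumExp and the Select-based Sigmoid/Tanh in the files above all follow the same numerical-stability pattern: rewrite the expression so that exp() never sees a large positive argument. A minimal plain-C++ sketch of those scalar identities, independent of TVM (the function names here are illustrative only, not part of the MTI API):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// log(sum(exp(x))) computed as max(x) + log(sum(exp(x - max(x)))),
// mirroring ReduceLogSumExp above: exp() only ever sees non-positive values.
double LogSumExp(const std::vector<double>& x) {
  double m = *std::max_element(x.begin(), x.end());
  double s = 0.0;
  for (double v : x) s += std::exp(v - m);
  return m + std::log(s);
}

// Stable sigmoid: pick the form whose exp() argument is negative,
// as the tvm::ir::Select in Sigmoid() above does.
double StableSigmoid(double x) {
  return x > 0 ? 1.0 / (1.0 + std::exp(-x))
               : std::exp(x) / (std::exp(x) + 1.0);
}

// Same idea for tanh, matching the Select in Tanh() above.
double StableTanh(double x) {
  return x < 0 ? (std::exp(2 * x) - 1) / (std::exp(2 * x) + 1)
               : (1 - std::exp(-2 * x)) / (1 + std::exp(-2 * x));
}

int main() {
  std::printf("logsumexp {1000, 1000} = %f (the naive form would overflow)\n",
              LogSumExp({1000.0, 1000.0}));
  std::printf("sigmoid(-1000) = %f, tanh(1000) = %f\n",
              StableSigmoid(-1000.0), StableTanh(1000.0));
}
```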
+ +#pragma once +#include +#include + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Tensor Abs(const tvm::Tensor& X, const std::string& name = "abs"); +tvm::Tensor Affine(const tvm::Tensor& X, float alpha, float beta, const std::string& name = "affine"); +tvm::Tensor Ceil(const tvm::Tensor& X, const std::string& name = "ceil"); +tvm::Tensor Clip(const tvm::Tensor& X, float min_value, float max_value, const std::string& name = "clip"); +tvm::Tensor Elu(const tvm::Tensor& X, float alpha, const std::string& name = "elu"); +tvm::Tensor Exp(const tvm::Tensor& X, const std::string& name = "exp"); +tvm::Tensor Floor(const tvm::Tensor& X, const std::string& name = "floor"); +tvm::Tensor HardSigmoid(const tvm::Tensor& X, float alpha, float beta, const std::string& name = "hard_sigmoid"); +tvm::Tensor LeakyRelu(const tvm::Tensor& X, float alpha, const std::string& name = "leaky_relu"); +tvm::Tensor Log(const tvm::Tensor& X, const std::string& name = "log"); +tvm::Tensor Neg(const tvm::Tensor& X, const std::string& name = "neg"); +tvm::Tensor ParametricSoftplus(const tvm::Tensor& X, float alpha, float beta, const std::string& name = "parametric_softplus"); +tvm::Tensor Reciprocal(const tvm::Tensor& X, const std::string& name = "reciprocal"); +tvm::Tensor Relu(const tvm::Tensor& X, const std::string& name = "relu"); +tvm::Tensor ScaledTanh(const tvm::Tensor& X, float alpha, float beta, const std::string& name = "scaled_tanh"); +tvm::Tensor Selu(const tvm::Tensor& X, float alpha, float gamma, const std::string& name = "selu"); +tvm::Tensor Sigmoid(const tvm::Tensor& X, const std::string& name = "sigmoid"); +tvm::Tensor SignNoZero(const tvm::Tensor& X, const std::string& name = "sign_no_zero"); +tvm::Tensor Softplus(const tvm::Tensor& X, const std::string& name = "softplus"); +tvm::Tensor Softsign(const tvm::Tensor& X, const std::string& name = "softsign"); +tvm::Tensor Sqrt(const tvm::Tensor& X, const std::string& name = "sqrt"); +tvm::Tensor Tanh(const tvm::Tensor& X, const std::string& name = "tanh"); +tvm::Tensor ThresholdedRelu(const tvm::Tensor& X, float alpha, const std::string& name = "thresholded_relu"); + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/mti_tvm_utils.cc b/onnxruntime/core/codegen/mti/mti_tvm_utils.cc new file mode 100644 index 0000000000000..e905a34432a6e --- /dev/null +++ b/onnxruntime/core/codegen/mti/mti_tvm_utils.cc @@ -0,0 +1,162 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
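The generic MatMul earlier in matmul_ops.cc follows numpy semantics: the last two dimensions are the matrix dimensions and every leading dimension is broadcast. A small sketch of just the output-shape rule for constant dimensions (illustrative only, assuming the usual numpy broadcasting convention):

```cpp
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <vector>

// Output shape of a batched matmul A(..., M, K) x B(..., K, N), broadcasting
// the leading dimensions the way MatMul() in matmul_ops.cc does.
std::vector<long> MatMulShape(std::vector<long> a, std::vector<long> b) {
  assert(a.size() >= 2 && b.size() >= 2);
  assert(a[a.size() - 1] == b[b.size() - 2]);  // inner dimensions must match
  size_t rank = std::max(a.size(), b.size());
  // Left-pad the shorter shape with 1s so both have the same rank.
  a.insert(a.begin(), rank - a.size(), 1L);
  b.insert(b.begin(), rank - b.size(), 1L);
  std::vector<long> out(rank);
  for (size_t i = 0; i + 2 < rank; ++i) {
    assert(a[i] == b[i] || a[i] == 1 || b[i] == 1);  // broadcastable
    out[i] = std::max(a[i], b[i]);
  }
  out[rank - 2] = a[rank - 2];  // M
  out[rank - 1] = b[rank - 1];  // N
  return out;
}

int main() {
  auto s = MatMulShape({5, 1, 3, 4}, {7, 4, 2});  // -> {5, 7, 3, 2}
  for (long d : s) std::printf("%ld ", d);
  std::printf("\n");
}
```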
+ +#include "core/codegen/mti/mti_tvm_utils.h" + +#include "core/codegen/common/settings.h" +#include +#include + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Array ToTvmArray(const std::vector& shape) { + tvm::Array arr; + for (size_t i = 0; i < shape.size(); ++i) { + arr.push_back(tvm::Expr(static_cast(shape[i]))); + } + return arr; +} + +tvm::Array ToTvmArrayInt(const std::vector& shape) { + tvm::Array arr; + for (size_t i = 0; i < shape.size(); ++i) { + arr.push_back(shape[i]); + } + return arr; +} + +tvm::Expr SizeToDimension(const tvm::Array& shape, int64_t axis) { + tvm::Expr size(1); + auto rank = shape.size(); + if (static_cast(axis) != rank) { + axis = HandleNegativeAxis(axis, rank); + } + for (size_t d = 0; d < std::min(rank, static_cast(axis)); ++d) + size = tvm::ir::Simplify(size * shape[d]); + return size; +} + +tvm::Expr SizeFromDimension(const tvm::Array& shape, int64_t axis) { + tvm::Expr size(1); + auto rank = shape.size(); + if (static_cast(axis) != rank) { + axis = HandleNegativeAxis(axis, rank); + } + for (size_t d = static_cast(axis); d < rank; ++d) + size = tvm::ir::Simplify(size * shape[d]); + return size; +} + +tvm::Expr RoundUp(tvm::Expr value, tvm::Expr alignment) { + return tvm::ir::Simplify((value + alignment - 1) / alignment * alignment); +} + +tvm::Array ConcatShapes( + const tvm::Array& shape1, + const tvm::Array& shape2) { + tvm::Array result; + for (size_t i = 0; i < shape1.size(); i++) + result.push_back(shape1[i]); + for (size_t i = 0; i < shape2.size(); i++) + result.push_back(shape2[i]); + return result; +} + +tvm::Tensor Rename(tvm::Tensor X, const std::string& name) { + const_cast(X->op->name) = name; + return X; +} + +tvm::Array SliceShape(const tvm::Array& shape, const std::vector& axes) { + tvm::Array new_shape; + for (auto axis : axes) { + CHECK(axis < static_cast(shape.size())); + new_shape.push_back(shape[axis]); + } + return new_shape; +} + +tvm::Array SliceShapeFromDimension(const tvm::Array& shape, int64_t axis) { + int64_t rank = static_cast(shape.size()); + axis = HandleNegativeAxis(axis, rank); + std::vector axes; + for (auto i = axis; i < rank; ++i) + axes.push_back(i); + return SliceShape(shape, axes); +} + +tvm::Array SliceShapeToDimension(const tvm::Array& shape, int64_t axis) { + int64_t rank = static_cast(shape.size()); + axis = HandleNegativeAxis(axis, rank); + std::vector axes; + for (auto i = 0; i < axis; ++i) + axes.push_back(i); + return SliceShape(shape, axes); +} + +bool IsOne(const tvm::Array& shape, int64_t axis) { + int64_t rank = static_cast(shape.size()); + axis = HandleNegativeAxis(axis, rank); + const auto& dim = shape[axis]; + auto* p = tvm::as_const_int(dim); + return p != nullptr && *p == 1; +} + +tvm::Tensor Promote(const tvm::Expr& expr, const tvm::Array& shape, const std::string& name) { + return tvm::compute( + shape, + [&](const tvm::Array&) { + return expr; + }, + name); +} + +void DumpTVMModuleToFile(const std::string& filename_prefix, tvm::runtime::Module& module) { + const codegen::CodeGenSettings& settings = codegen::CodeGenSettings::Instance(); + if (!settings.HasOption(codegen::CodeGenSettings::kCodeGenDumpModule)) + return; + + static int dump_module_cnt = 0; + // ISSUE: note that all option values are converted to lower case. It doesn't cause + // any issue currently, because all supported formats (i.e. file exts) are of lower case. + // Just keep in mind that we might have issue if somehow we started to support dump + // formats with upper case, although it's quite unlikely. 
+ std::string format = settings.GetOptionValue(codegen::CodeGenSettings::kCodeGenDumpModule); + std::string module_filename = filename_prefix + "_" + std::to_string(dump_module_cnt++) + "." + format; + module->SaveToFile(module_filename, format); +} + +tvm::Tensor MakeZeroTensor(const tvm::Array& shape, + HalideIR::Type type, + const std::string& name) { + auto l = [&](const tvm::Array& /*indices*/) { + return tvm::make_zero(type); + }; + return tvm::compute(shape, l, name); +} + +bool BroadcastDim(const tvm::Array& shape, size_t i, size_t output_rank, tvm::Expr& dim) { + if (i >= output_rank - shape.size()) { + auto new_dim = shape[shape.size() - output_rank + i]; + if (tvm::ir::Equal(new_dim, dim)) + return true; + + const int64_t* p_new = tvm::as_const_int(new_dim); + if (p_new != nullptr && *p_new == 1) { + return true; + } else { + const int64_t* p_old = tvm::as_const_int(dim); + if (p_old != nullptr && *p_old == 1) { + dim = new_dim; + return true; + } + } + return false; + } + // auto broadcast to outer dims + return true; +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/mti_tvm_utils.h b/onnxruntime/core/codegen/mti/mti_tvm_utils.h new file mode 100644 index 0000000000000..3f65658554f2c --- /dev/null +++ b/onnxruntime/core/codegen/mti/mti_tvm_utils.h @@ -0,0 +1,64 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include +#include +#include +#include "core/codegen/mti/common.h" + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Array ToTvmArray(const std::vector& shape); + +tvm::Array ToTvmArrayInt(const std::vector& shape); + +// Helper function to compute sub shape size to axis (not included) +tvm::Expr SizeToDimension(const tvm::Array& shape, int64_t axis); + +// Helper function to compute sub shape size from axis (included) +tvm::Expr SizeFromDimension(const tvm::Array& shape, int64_t axis); + +// Helper function to align +tvm::Expr RoundUp(tvm::Expr value, tvm::Expr alignment); + +tvm::Array ConcatShapes( + const tvm::Array& shape1, + const tvm::Array& shape2); + +// Helper function to rename tvm::Tensor +tvm::Tensor Rename(tvm::Tensor X, const std::string& name); + +// Helper function to slice TVM shape +tvm::Array SliceShape(const tvm::Array& shape, const std::vector& axes); + +// Helper function to slice TVM shape from axis (inclusive). +// Basically, this function returns the shape of [axis, shape.size()-1] +tvm::Array SliceShapeFromDimension(const tvm::Array& shape, int64_t axis); + +// this function returns the shape of [0, axis-1] +tvm::Array SliceShapeToDimension(const tvm::Array& shape, int64_t axis); + +// check if dimension is 1 +bool IsOne(const tvm::Array& shape, int64_t axis); + +// Helper function to convert tvm::Expr to tvm::Tensor +tvm::Tensor Promote(const tvm::Expr& expr, + const tvm::Array& shape, + const std::string& name = "PromoteExpr"); + +tvm::Tensor MakeZeroTensor(const tvm::Array& shape, HalideIR::Type type, const std::string& name); + +void DumpTVMModuleToFile(const std::string& filename_prefix, tvm::runtime::Module& module); + +bool BroadcastDim(const tvm::Array& shape, size_t i, size_t output_rank, tvm::Expr& dim); + +inline int64_t HandleNegativeAxis(int64_t axis, int64_t rank) { + MTI_ASSERT(axis >= -rank && axis <= rank - 1); + return axis = axis < 0 ? 
(axis + rank) : axis; +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/nn/conv_ops.cc b/onnxruntime/core/codegen/mti/nn/conv_ops.cc new file mode 100644 index 0000000000000..e2d4acc8843ad --- /dev/null +++ b/onnxruntime/core/codegen/mti/nn/conv_ops.cc @@ -0,0 +1,193 @@ +#include "core/codegen/mti/nn/conv_ops.h" + +#include "core/codegen/mti/math/matmul_ops.h" +#include "core/codegen/mti/tensor/pad_ops.h" +#include "core/codegen/mti/tensor/reshape_ops.h" +#include "core/codegen/mti/tensor/transpose.h" + +namespace onnxruntime { +namespace tvm_codegen { + +static tvm::Tensor PadTensor1D(const tvm::Tensor& input, + const tvm::Array& padding, + size_t width_axis, + const std::string& name) { + auto pad_left = padding[0]; + auto pad_right = padding[1]; + + tvm::Array pad_before(std::vector(input->shape.size(), 0)); + pad_before.Set(width_axis, pad_left); + tvm::Array pad_after(std::vector(input->shape.size(), 0)); + pad_after.Set(width_axis, pad_right); + + const int64_t* padding_w0 = tvm::as_const_int(pad_left); + const int64_t* padding_w1 = tvm::as_const_int(pad_right); + + const bool do_pad = ((padding_w0 != nullptr && *padding_w0) || + (padding_w1 != nullptr && *padding_w1)); + + return do_pad ? Pad(input, pad_before, pad_after, + 0, "constant", name + "_input_padded") + : input; +} + +tvm::Tensor Conv1D(const tvm::Tensor& input, + const tvm::Tensor& filter, + const tvm::Array& out_shape, + const tvm::Array& stride, + const tvm::Array& padding, + const std::string& name) { + size_t channel_axis = 1; + size_t width_axis = 2; + + auto stride_width = stride[width_axis - 2]; + + auto input_padded = PadTensor1D(input, padding, width_axis, name); + auto rc = tvm::reduce_axis((tvm::Range(0, filter->shape[1])), "rc"); + auto rx = tvm::reduce_axis((tvm::Range(0, filter->shape[2])), "rx"); + + return tvm::compute( + out_shape, + [&](const tvm::Array& output) { + tvm::Array indices; + for (const tvm::Var& var : output) { + indices.push_back(var); + } + indices.Set(channel_axis, rc); + indices.Set(width_axis, output[width_axis] * stride_width + rx); + + return tvm::sum(input_padded(indices) * filter({output[1], rc, rx}), + {rc, rx}); + }, + name); +} + +tvm::Tensor Conv2D(const tvm::Tensor& input, + const tvm::Tensor& filter, + const tvm::Array& output_shape, + const tvm::Array& stride, + const tvm::Array& padding, + const std::string& name) { + return Conv2D_native(input, filter, output_shape, stride, padding); +} + +static tvm::Tensor PadTensor2D(const tvm::Tensor& input, + const tvm::Array& padding, + size_t height_axis, + size_t width_axis, + const std::string& name) { + auto pad_top = padding[0]; + auto pad_left = padding[1]; + auto pad_bottom = padding[2]; + auto pad_right = padding[3]; + + tvm::Array pad_before(std::vector(input->shape.size(), 0)); + pad_before.Set(height_axis, pad_top); + pad_before.Set(width_axis, pad_left); + + tvm::Array pad_after(std::vector(input->shape.size(), 0)); + pad_after.Set(height_axis, pad_bottom); + pad_after.Set(width_axis, pad_right); + + const int64_t* padding_h0 = tvm::as_const_int(pad_top); + const int64_t* padding_w0 = tvm::as_const_int(pad_left); + const int64_t* padding_h1 = tvm::as_const_int(pad_bottom); + const int64_t* padding_w1 = tvm::as_const_int(pad_right); + + const bool do_pad = ((padding_h0 != nullptr && *padding_h0) || + (padding_w0 != nullptr && *padding_w0)) || + ((padding_h1 != nullptr && *padding_h1) || + (padding_w1 != nullptr && *padding_w1)); + + return do_pad ? 
Pad(input, pad_before, pad_after, + 0, "constant", name + "_input_padded") + : input; +} + +tvm::Tensor Conv2D_native(const tvm::Tensor& input, + const tvm::Tensor& filter, + const tvm::Array& out_shape, + const tvm::Array& stride, + const tvm::Array& padding, + const std::string& name) { + size_t channel_axis = 1; + size_t height_axis = 2; + size_t width_axis = 3; + + auto stride_height = stride[height_axis - 2]; + auto stride_width = stride[width_axis - 2]; + + auto input_padded = PadTensor2D(input, padding, height_axis, width_axis, name); + + auto rc = tvm::reduce_axis((tvm::Range(0, filter->shape[1])), "rc"); + auto ry = tvm::reduce_axis((tvm::Range(0, filter->shape[2])), "ry"); + auto rx = tvm::reduce_axis((tvm::Range(0, filter->shape[3])), "rx"); + + return tvm::compute( + out_shape, + [&](const tvm::Array& output) { + tvm::Array indices; + for (const tvm::Var& var : output) { + indices.push_back(var); + } + indices.Set(channel_axis, rc); + indices.Set(height_axis, output[height_axis] * stride_height + ry); + indices.Set(width_axis, output[width_axis] * stride_width + rx); + + return tvm::sum(input_padded(indices) * filter({output[1], rc, ry, rx}), + {rc, ry, rx}); + }, + name); +} + +tvm::Tensor Conv2D_gemm(const tvm::Tensor& input, + const tvm::Tensor& filter, + const tvm::Array& out_shape, + const tvm::Array& stride, + const tvm::Array& padding, + const std::string& name) { + size_t height_axis = 2; + size_t width_axis = 3; + + auto stride_height = stride[height_axis - 2]; + auto stride_width = stride[width_axis - 2]; + + auto input_padded = PadTensor2D(input, padding, height_axis, width_axis, name); + + tvm::Array img_col_tmp(std::vector(6, 0)); + img_col_tmp.Set(0, out_shape[0]); + img_col_tmp.Set(1, out_shape[2]); + img_col_tmp.Set(2, out_shape[3]); + img_col_tmp.Set(3, filter->shape[1]); + img_col_tmp.Set(4, filter->shape[2]); + img_col_tmp.Set(5, filter->shape[3]); + + auto img_col = tvm::compute( + img_col_tmp, + [&](const tvm::Array& output) { + tvm::Array indices; + indices.push_back(output[0]); + indices.push_back(output[3]); + indices.push_back(output[1] * stride_height + output[4]); + indices.push_back(output[2] * stride_width + output[5]); + return input_padded(indices); + }, + name); + + tvm::Array input_col_shape(std::vector(2, 0)); + input_col_shape.Set(0, img_col_tmp[1] * img_col_tmp[2]); + input_col_shape.Set(1, img_col_tmp[3] * img_col_tmp[4] * img_col_tmp[5]); + auto input_col = Reshape(img_col, input_col_shape); + + tvm::Array filter_row_shape(std::vector(2, 0)); + filter_row_shape.Set(0, filter->shape[0]); + filter_row_shape.Set(1, filter->shape[1] * filter->shape[2] * filter->shape[3]); + auto filter_row = Reshape(filter, filter_row_shape, name); + + auto Y = MatMul2D(input_col, filter_row, false, true, name); + auto Y_T = Transpose(Y, /*axes=*/{}, name); + return Reshape(Y_T, out_shape, name); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/nn/conv_ops.h b/onnxruntime/core/codegen/mti/nn/conv_ops.h new file mode 100644 index 0000000000000..1396c216865a7 --- /dev/null +++ b/onnxruntime/core/codegen/mti/nn/conv_ops.h @@ -0,0 +1,39 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
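Conv2D_gemm above lowers convolution to a matrix multiply by unfolding input patches (im2col), which is what the img_col/input_col reshapes express. A tiny single-channel, stride-1, no-padding sketch of the same idea in plain C++ (all names and sizes here are illustrative):

```cpp
#include <cstdio>
#include <vector>

// im2col for a single-channel H x W image and a kh x kw kernel, stride 1, no padding.
// Each output row holds one kh*kw patch, so convolution becomes one dot product per row.
std::vector<std::vector<float>> Im2Col(const std::vector<float>& img, int H, int W,
                                       int kh, int kw) {
  int out_h = H - kh + 1, out_w = W - kw + 1;
  std::vector<std::vector<float>> cols;
  for (int oy = 0; oy < out_h; ++oy)
    for (int ox = 0; ox < out_w; ++ox) {
      std::vector<float> patch;
      for (int ky = 0; ky < kh; ++ky)
        for (int kx = 0; kx < kw; ++kx)
          patch.push_back(img[(oy + ky) * W + (ox + kx)]);
      cols.push_back(patch);
    }
  return cols;
}

int main() {
  // 3x3 image, 2x2 kernel of ones -> each output is the sum of a 2x2 patch.
  std::vector<float> img = {1, 2, 3, 4, 5, 6, 7, 8, 9};
  std::vector<float> kernel = {1, 1, 1, 1};
  auto cols = Im2Col(img, 3, 3, 2, 2);
  for (const auto& patch : cols) {  // one dot product per output pixel
    float acc = 0;
    for (size_t i = 0; i < patch.size(); ++i) acc += patch[i] * kernel[i];
    std::printf("%g ", acc);        // prints 12 16 24 28
  }
  std::printf("\n");
}
```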
+ +#pragma once +#include + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Tensor Conv1D(const tvm::Tensor& input, + const tvm::Tensor& filter, + const tvm::Array& output_shape, + const tvm::Array& stride, + const tvm::Array& padding, + const std::string& name = "conv1d"); + +tvm::Tensor Conv2D(const tvm::Tensor& input, + const tvm::Tensor& filter, + const tvm::Array& output_shape, + const tvm::Array& stride, + const tvm::Array& padding, + const std::string& name = "conv2d"); + +tvm::Tensor Conv2D_native(const tvm::Tensor& input, + const tvm::Tensor& filter, + const tvm::Array& output_shape, + const tvm::Array& stride, + const tvm::Array& padding, + const std::string& name = "conv2d_native"); + +tvm::Tensor Conv2D_gemm(const tvm::Tensor& input, + const tvm::Tensor& filter, + const tvm::Array& output_shape, + const tvm::Array& stride, + const tvm::Array& padding, + const std::string& name = "conv2d_gemm"); + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/nn/lstm.cc b/onnxruntime/core/codegen/mti/nn/lstm.cc new file mode 100644 index 0000000000000..1148b0924e869 --- /dev/null +++ b/onnxruntime/core/codegen/mti/nn/lstm.cc @@ -0,0 +1,140 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/mti/nn/lstm.h" + +#include "core/codegen/mti/math/binary_ops.h" +#include "core/codegen/mti/math/unary_ops.h" +#include "core/codegen/mti/math/matmul_ops.h" +#include "core/codegen/mti/math/reduce_ops.h" +#include "core/codegen/mti/mti_tvm_utils.h" +#include "core/codegen/mti/tensor/reshape_ops.h" +#include "core/codegen/mti/tensor/split.h" + +namespace onnxruntime { +namespace tvm_codegen { + +/* +`X` - input tensor +`i` - input gate +`o` - output gate +`f` - forget gate +`c` - cell gate +`t` - time step (t-1 means previous time step) + +`W[iofc]` - W parameter weight matrix for input, output, forget, and cell gates +`R[iofc]` - R recurrence weight matrix for input, output, forget, and cell gates +`Wb[iofc]` - W bias vectors for input, output, forget, and cell gates +`Rb[iofc]` - R bias vectors for input, output, forget, and cell gates +`P[iof]` - P peephole weight vector for input, output, and forget gates +`WB[iofc]` - W parameter weight matrix for backward input, output, forget, and cell gates +`RB[iofc]` - R recurrence weight matrix for backward input, output, forget, and cell gates +`WBb[iofc]` - W bias vectors for backward input, output, forget, and cell gates +`RBb[iofc]` - R bias vectors for backward input, output, forget, and cell gates +`PB[iof]` - P peephole weight vector for backward input, output, and forget gates + +`H` - Hidden state +`num_directions` - 2 if direction == bidirectional else 1 + +Equations (Default: f=Sigmoid, g=Tanh, h=Tanh): + it = f(Xt*(Wi^T) + Ht-1*(Ri^T) + Pi (.) Ct-1 + Wbi + Rbi) + ft = f(Xt*(Wf^T) + Ht-1*(Rf^T) + Pf (.) Ct-1 + Wbf + Rbf) + ct = g(Xt*(Wc^T) + Ht-1*(Rc^T) + Wbc + Rbc) + Ct = ft (.) Ct-1 + it (.) ct + ot = f(Xt*(Wo^T) + Ht-1*(Ro^T) + Po (.) Ct + Wbo + Rbo) + Ht = ot (.) 
h(Ct) +*/ + +void LSTM_cell( + const LSTMAttributes& lstm_attrs, + const tvm::Tensor& X, + const tvm::Tensor& W, + const tvm::Tensor& R, + const tvm::Tensor& B, + bool has_B, + const tvm::Tensor& prev_H, + const tvm::Tensor& prev_C, + const tvm::Tensor& P, + bool has_P, + tvm::Tensor& Y_h, + tvm::Tensor& Y_c) { + // Input projection: Xt*(W[iofc]^T) for forward direction or Xt*(WB[iofc]^T) for reverse direction + // (batch_size, input_size) * trans(4 * hidden_size, input_size) => (batch_size, 4 * hidden_size) + tvm::Tensor input_proj = MatMul2D(X, W, /*trans_a*/ false, /*trans_b*/ true); + + // Hidden projection: Ht-1*(R[iofc]^T) for forward direction or Ht-1*(RB[iofc]^T) for reverse direction + // (batch_size, hidden_size) * trans(4 * hidden_size, hidden_size) => (batch_size, 4 * hidden_size) + tvm::Tensor hidden_proj = MatMul2D(prev_H, R, /*trans_a*/ false, /*trans_b*/ true); + + // (batch_size, 4 * hidden_size) + tvm::Tensor sum_proj = Add(input_proj, hidden_proj); + + // Concatenation of [Wb[iofc], Rb[iofc]] or [WBb[iofc], RBb[iofc]] + if (has_B) { + // (8 * hidden_size) -> (2, 4 * hidden_size) -> (1, 4 * hidden_size), should be done in const folding + tvm::Tensor reduce_B = + ReduceSum(Reshape(B, {2, 4 * static_cast(lstm_attrs.hidden_size)}), {0}, /*keep_dims*/ true); + // (batch_size, 4 * hidden_size) via broadcasting reduce_B + sum_proj = Add(sum_proj, reduce_B); + } + + std::vector iofc_sum_split_sizes(4, lstm_attrs.hidden_size); + // Split sum_proj into iofc, where each gate proj is of (batch_size, hidden_size) + tvm::Array iofc_sum_projs = Split(sum_proj, ToTvmArray(iofc_sum_split_sizes), /*axis*/ 1); + MTI_ASSERT(iofc_sum_projs.size() == 4); + tvm::Tensor i_proj = iofc_sum_projs[0], + o_proj = iofc_sum_projs[1], + f_proj = iofc_sum_projs[2], + c_proj = iofc_sum_projs[3]; + + tvm::Tensor P_i, P_o, P_f; + if (has_P) { + std::vector iof_p_split_sizes(3, lstm_attrs.hidden_size); + // Split P into P_i, P_o, P_f, in const pre-processing (P_i, P_f might be merged?) + // where each P_[iof] has the shape of (hidden_size) + tvm::Array iof_P_projs = Split(P, ToTvmArray(iof_p_split_sizes), /*axis*/ 0); + MTI_ASSERT(iof_P_projs.size() == 3); + P_i = iof_P_projs[0], + P_o = iof_P_projs[1], + P_f = iof_P_projs[2]; + + // (batch_size, hidden_size) via broadcasting P_[if] + i_proj = Add(i_proj, Mul(P_i, prev_C)); + f_proj = Add(f_proj, Mul(P_f, prev_C)); + } + + // TODO: handle more general cases for activations f, h, g and activation_alpha and + // activation_beta. We may consider to move some code such as ActivationInfo from deep_cpu_lstm + // into a common header file, because the code can be used here. + + // Note that by default f = Sigmoid, g = Tanh, h = Tanh + + // it = f(Xt*(Wi^T) + Ht-1*(Ri^T) + Pi (.) Ct-1 + Wbi + Rbi) + // shape: (batch_size, hidden_size) + tvm::Tensor i_t = Sigmoid(i_proj); + // ft = f(Xt*(Wf^T) + Ht-1*(Rf^T) + Pf (.) Ct-1 + Wbf + Rbf) + // shape: (batch_size, hidden_size) + tvm::Tensor f_t = Sigmoid(f_proj); + // ct = g(Xt*(Wc^T) + Ht-1*(Rc^T) + Wbc + Rbc) + // shape: (batch_size, hidden_size) + tvm::Tensor c_t = Tanh(c_proj); + + // Ct = ft (.) Ct-1 + it (.) ct + // shape: (batch_size, hidden_size) + Y_c = Add(Mul(f_t, prev_C), Mul(i_t, c_t), Y_c->op->name); + + // ot = f(Xt*(Wo^T) + Ht-1*(Ro^T) + Po (.) Ct + Wbo + Rbo) + // shape: (batch_size, hidden_size) + if (has_P) { + o_proj = Add(o_proj, Mul(P_o, Y_c)); + } + // ot = f(Xt*(Wo^T) + Ht-1*(Ro^T) + Po (.) Ct + Wbo + Rbo) + // shape: (batch_size, hidden_size) + o_proj = Sigmoid(o_proj); + // Ht = ot (.) 
h(Ct) + // shape: (batch_size, hidden_size) + Y_h = Mul(o_proj, Tanh(Y_c), Y_h->op->name); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/nn/lstm.h b/onnxruntime/core/codegen/mti/nn/lstm.h new file mode 100644 index 0000000000000..851fa880c4427 --- /dev/null +++ b/onnxruntime/core/codegen/mti/nn/lstm.h @@ -0,0 +1,35 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include +#include + +// A bubble now. But don't remove it +// TODO: refactor the LSTMcell building to a tvm function +// and move it here + +namespace onnxruntime { +namespace tvm_codegen { + +struct LSTMAttributes { + LSTMAttributes(int64_t hidden_size_p) : hidden_size(hidden_size_p) {} + int64_t hidden_size; +}; + +void LSTM_cell( + const LSTMAttributes& lstm_attrs, + const tvm::Tensor& X, + const tvm::Tensor& W, + const tvm::Tensor& R, + const tvm::Tensor& B, + bool has_B, + const tvm::Tensor& prev_H, + const tvm::Tensor& prev_C, + const tvm::Tensor& P, + bool has_P, + tvm::Tensor& Y_h, + tvm::Tensor& Y_c); + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/nn/pool_ops.cc b/onnxruntime/core/codegen/mti/nn/pool_ops.cc new file mode 100644 index 0000000000000..5af944186c178 --- /dev/null +++ b/onnxruntime/core/codegen/mti/nn/pool_ops.cc @@ -0,0 +1,58 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/mti/nn/pool_ops.h" + +#include + +namespace onnxruntime { +namespace tvm_codegen { + +// TODO: topi only support 2d-pool, MaxPool1d and MaxPool3d will need to be added if necessary. +// only support version < 8 for topi doesn't come with implementation to output index tensor +tvm::Tensor MaxPool( + const tvm::Tensor& input, + const tvm::Array& kernel_size, + const tvm::Array& stride_size, + const tvm::Array& padding_size, + const std::string& layout, + bool count_include_pad) { + return topi::nn::pool(input, kernel_size, stride_size, padding_size, + topi::nn::kMaxPool, + false, + layout, + count_include_pad); +} + +tvm::Tensor AveragePool( + const tvm::Tensor& input, + const tvm::Array& kernel_size, + const tvm::Array& stride_size, + const tvm::Array& padding_size, + const std::string& layout, + bool count_include_pad) { + return topi::nn::pool(input, kernel_size, stride_size, padding_size, + topi::nn::kAvgPool, + false, + layout, + count_include_pad); +} + +tvm::Tensor GlobalMaxPool( + const tvm::Tensor& input, + const std::string& layout) { + return topi::nn::global_pool(input, + topi::nn::kMaxPool, + layout); +} + +tvm::Tensor GlobalAveragePool( + const tvm::Tensor& input, + const std::string& layout) { + return topi::nn::global_pool(input, + topi::nn::kAvgPool, + layout); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/nn/pool_ops.h b/onnxruntime/core/codegen/mti/nn/pool_ops.h new file mode 100644 index 0000000000000..23fbda913e277 --- /dev/null +++ b/onnxruntime/core/codegen/mti/nn/pool_ops.h @@ -0,0 +1,32 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
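The gate math spelled out in the lstm.cc comment block above can be checked against a minimal dense implementation. The sketch below applies the default activations (f = sigmoid, g = h = tanh) to a single time step without peepholes, using the same iofc gate ordering that LSTM_cell splits out of sum_proj; every name and size is illustrative, not the MTI interface:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

using Vec = std::vector<double>;

static double sigmoid(double x) { return 1.0 / (1.0 + std::exp(-x)); }

// One LSTM step for a single example with hidden_size = H.
// gates = Xt*W^T + Ht-1*R^T + Wb + Rb, already summed and laid out as [i, o, f, c].
void LstmCell(const Vec& gates, const Vec& prev_c, Vec& h, Vec& c) {
  size_t H = prev_c.size();
  h.resize(H);
  c.resize(H);
  for (size_t j = 0; j < H; ++j) {
    double i_t = sigmoid(gates[0 * H + j]);    // input gate
    double o_t = sigmoid(gates[1 * H + j]);    // output gate
    double f_t = sigmoid(gates[2 * H + j]);    // forget gate
    double c_t = std::tanh(gates[3 * H + j]);  // cell candidate
    c[j] = f_t * prev_c[j] + i_t * c_t;        // Ct = ft (.) Ct-1 + it (.) ct
    h[j] = o_t * std::tanh(c[j]);              // Ht = ot (.) h(Ct)
  }
}

int main() {
  Vec gates = {0.5, -0.1, 1.0, 0.2, -0.3, 0.7, 0.4, 0.1};  // H = 2, layout [i, o, f, c]
  Vec prev_c = {0.0, 0.0}, h, c;
  LstmCell(gates, prev_c, h, c);
  std::printf("h = [%f, %f], c = [%f, %f]\n", h[0], h[1], c[0], c[1]);
}
```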
+ +#pragma once +#include +#include + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Tensor MaxPool(const tvm::Tensor& input, + const tvm::Array& kernel_size, + const tvm::Array& stride_size, + const tvm::Array& padding_size, + const std::string& layout, + bool count_include_pad); + +tvm::Tensor AveragePool(const tvm::Tensor& input, + const tvm::Array& kernel_size, + const tvm::Array& stride_size, + const tvm::Array& padding_size, + const std::string& layout, + bool count_include_pad); + +tvm::Tensor GlobalMaxPool(const tvm::Tensor& input, + const std::string& layout); + +tvm::Tensor GlobalAveragePool(const tvm::Tensor& input, + const std::string& layout); + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/cast_ops.cc b/onnxruntime/core/codegen/mti/tensor/cast_ops.cc new file mode 100644 index 0000000000000..a8fc86488d82b --- /dev/null +++ b/onnxruntime/core/codegen/mti/tensor/cast_ops.cc @@ -0,0 +1,37 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/mti/tensor/cast_ops.h" + +#include "core/codegen/mti/mti_tvm_utils.h" +#include +#include + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Tensor Cast(const tvm::Tensor& X, tvm::Type type, const std::string& name) { + return topi::cast(X, type, name); +} + +// handle cases where bool is reprented as uint8 (e.g. in ONNX). +tvm::Tensor CastToUInt8Bool(const tvm::Tensor& X, const std::string& name) { + return tvm::compute( + X->shape, + [&](const tvm::Array& indices) { + auto val = X(indices); + // A special cast from float16 to bool, first cast up to float32, + // to workaround a float16 bug in many TVM backends. + // Intel Skylake is one of them. https://github.com/dmlc/tvm/issues/2959 + // TODO: remove it, after TVM is fixed + if (X->dtype == HalideIR::Float(16)) + val = tvm::cast(HalideIR::Float(32), val); + return tvm::ir::Select::make(topi::equal(val, tvm::make_zero(val.type())), + tvm::make_zero(HalideIR::UInt(8)), + tvm::make_const(HalideIR::UInt(8), 1)); + }, + name); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/cast_ops.h b/onnxruntime/core/codegen/mti/tensor/cast_ops.h new file mode 100644 index 0000000000000..02f6f9cb1fde7 --- /dev/null +++ b/onnxruntime/core/codegen/mti/tensor/cast_ops.h @@ -0,0 +1,15 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include +#include + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Tensor Cast(const tvm::Tensor& X, tvm::Type type, const std::string& name = "cast"); +tvm::Tensor CastToUInt8Bool(const tvm::Tensor& X, const std::string& name = "cast_uint8_bool"); + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/concat_ops.cc b/onnxruntime/core/codegen/mti/tensor/concat_ops.cc new file mode 100644 index 0000000000000..13e8a4edc320a --- /dev/null +++ b/onnxruntime/core/codegen/mti/tensor/concat_ops.cc @@ -0,0 +1,83 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
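CastToUInt8Bool above maps any non-zero element to 1 and zero to 0, with float16 inputs first widened to float32 to sidestep the TVM backend issue mentioned in its comment. A scalar sketch of that mapping (the fp16 widening is only noted, since standard C++ has no half type):

```cpp
#include <cstdint>
#include <cstdio>

// ONNX represents bool tensors as uint8: any non-zero element becomes 1, zero stays 0.
// This mirrors the Select(equal(val, 0), 0, 1) expression built in CastToUInt8Bool above.
// (For float16 inputs the kernel first casts up to float32; omitted here.)
template <typename T>
uint8_t ToUInt8Bool(T val) {
  return val == static_cast<T>(0) ? static_cast<uint8_t>(0) : static_cast<uint8_t>(1);
}

int main() {
  std::printf("%d %d %d\n", ToUInt8Bool(0.0f), ToUInt8Bool(-2.5f), ToUInt8Bool(7));  // 0 1 1
}
```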
+ +#include "core/codegen/mti/tensor/concat_ops.h" + +#include "core/codegen/mti/mti_tvm_utils.h" +#include "gsl/gsl_util" +#include + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Tensor Concat(const tvm::Array& inputs, + int64_t axis, + const std::string& name) { + return ConcatSafe(inputs, axis, name); +} + +// Note topi's implementation requires control flow within iterations to avoid out-of-bound access. +// Therefore, MTI implements a ConcatSafe that does not have out-of-bound access, +// and does not requires control or predicate. +tvm::Tensor ConcatSafe(const tvm::Array& inputs, + int64_t axis, + const std::string& name) { + axis = HandleNegativeAxis(axis, gsl::narrow(inputs[0]->shape.size())); + MTI_ASSERT(axis < gsl::narrow(inputs[0]->shape.size()) && "axis out of bounds"); + + tvm::Array axis_sizes; + for (auto t : inputs) { + axis_sizes.push_back(t->shape[axis]); + } + + tvm::Expr join_size = axis_sizes[0]; + for (size_t i = 1; i < axis_sizes.size(); ++i) { + join_size += axis_sizes[i]; + } + join_size = tvm::ir::Simplify(join_size); + tvm::Array out_shape; + for (size_t i = 0; i < inputs[0]->shape.size(); ++i) { + out_shape.push_back(i == gsl::narrow(axis) ? join_size : inputs[0]->shape[i]); + } + + return tvm::compute( + out_shape, [&](const tvm::Array& ovars) { + tvm::Array indices; + + // preset + tvm::Expr min = 0; + tvm::Expr extent = axis_sizes[0]; + tvm::Expr offset = 0; + tvm::Expr ret; + + //input i = 0 + for (size_t j = 0; j < ovars.size(); ++j) { + if (j == gsl::narrow(axis)) { + tvm::Expr ivar = ovars[j]; + indices.push_back(tvm::max(tvm::min(ivar, min + extent - 1), min)); + } else { + indices.push_back(ovars[j]); + } + } + ret = inputs[0](indices); + + for (size_t i = 1; i < inputs.size(); ++i) { + offset += extent; + tvm::Expr min = 0; + extent = axis_sizes[i]; + auto j = gsl::narrow(axis); + tvm::Expr ivar = ovars[j] - offset; + indices.Set(j, tvm::max(tvm::min(ivar, min + extent - 1), min)); + + ret = tvm::ir::Select::make(ivar >= 0, + inputs[i](indices), + ret); + } + + return ret; + }, + name); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/concat_ops.h b/onnxruntime/core/codegen/mti/tensor/concat_ops.h new file mode 100644 index 0000000000000..153afebb44615 --- /dev/null +++ b/onnxruntime/core/codegen/mti/tensor/concat_ops.h @@ -0,0 +1,15 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include +#include + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Tensor Concat(const tvm::Array& inputs, int64_t axis, const std::string& name = "concat"); +tvm::Tensor ConcatSafe(const tvm::Array& inputs, int64_t axis, const std::string& name = "concat_safe"); + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/crop.cc b/onnxruntime/core/codegen/mti/tensor/crop.cc new file mode 100644 index 0000000000000..3fe569100df12 --- /dev/null +++ b/onnxruntime/core/codegen/mti/tensor/crop.cc @@ -0,0 +1,58 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
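ConcatSafe above avoids the out-of-bound reads a naive concat lowering can produce: every input is indexed with a clamped coordinate, and a chain of selects decides which input's (always valid) value is actually used. A plain-C++ sketch of that pattern for 1-D inputs (illustrative only):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// Concat of 1-D inputs along axis 0, written the ConcatSafe way:
// each input is read at a clamped index, and the running offset decides which
// read wins, so no read is ever out of range even for "speculative" branches.
float ConcatSafeAt(const std::vector<std::vector<float>>& inputs, long i) {
  long offset = 0;
  long extent = static_cast<long>(inputs[0].size());
  float ret = inputs[0][std::clamp(i, 0L, extent - 1)];  // input 0, clamped
  for (size_t k = 1; k < inputs.size(); ++k) {
    offset += extent;
    extent = static_cast<long>(inputs[k].size());
    long ivar = i - offset;  // position inside input k
    float candidate = inputs[k][std::clamp(ivar, 0L, extent - 1)];
    ret = (ivar >= 0) ? candidate : ret;  // the Select in ConcatSafe
  }
  return ret;
}

int main() {
  std::vector<std::vector<float>> inputs = {{1, 2}, {3}, {4, 5, 6}};
  for (long i = 0; i < 6; ++i) std::printf("%g ", ConcatSafeAt(inputs, i));  // 1 2 3 4 5 6
  std::printf("\n");
}
```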
+ +#include "core/codegen/mti/tensor/crop.h" + +#include "core/codegen/mti/mti_tvm_utils.h" +#include + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Tensor Crop(const tvm::Tensor& t, + const tvm::Array& border, + const tvm::Array& scale, + const std::string& name) { + MTI_ASSERT(t->shape.size() == 4); + tvm::Expr N = t->shape[0]; + tvm::Expr C = t->shape[1]; + tvm::Expr H = t->shape[2]; + tvm::Expr W = t->shape[3]; + + MTI_ASSERT(border.size() == 4); + tvm::Expr leftBorder = border[0]; + tvm::Expr topBorder = border[1]; + tvm::Expr rightBorder = border[2]; + tvm::Expr bottomBorder = border[3]; + + tvm::Expr bottomLimit = H - bottomBorder; + tvm::Expr rightLimit = W - rightBorder; + + if (!scale.empty()) { + CHECK_EQ(scale.size(), 2); + bottomLimit = topBorder + scale[0]; + rightLimit = leftBorder + scale[1]; + } + + tvm::Array output_shape; + output_shape.push_back(tvm::ir::Simplify(N)); + output_shape.push_back(tvm::ir::Simplify(C)); + output_shape.push_back(tvm::ir::Simplify(bottomLimit - topBorder)); + output_shape.push_back(tvm::ir::Simplify(rightLimit - leftBorder)); + + auto l = [&](const tvm::Array& ovars) { + tvm::Array indices; + + indices.push_back(tvm::min(ovars[0], output_shape[0] - 1)); + indices.push_back(tvm::min(ovars[1], output_shape[1] - 1)); + indices.push_back(tvm::min(topBorder + ovars[2], topBorder + output_shape[2] - 1)); + indices.push_back(tvm::min(leftBorder + ovars[3], leftBorder + output_shape[3] - 1)); + + return t(indices); + }; + + return tvm::compute(output_shape, l, name); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/crop.h b/onnxruntime/core/codegen/mti/tensor/crop.h new file mode 100644 index 0000000000000..ffb6a05c70504 --- /dev/null +++ b/onnxruntime/core/codegen/mti/tensor/crop.h @@ -0,0 +1,17 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include +#include + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Tensor Crop(const tvm::Tensor& t, + const tvm::Array& border, + const tvm::Array& scale = {}, + const std::string& name = "crop"); + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/gather.cc b/onnxruntime/core/codegen/mti/tensor/gather.cc new file mode 100644 index 0000000000000..283d29c6eaa5d --- /dev/null +++ b/onnxruntime/core/codegen/mti/tensor/gather.cc @@ -0,0 +1,53 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
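Crop above takes an NCHW tensor and keeps the spatial window defined by border = [left, top, right, bottom], or, when scale is given, a window of size scale anchored at the top-left border. A sketch of just the window arithmetic on constant dimensions (illustrative, not part of MTI):

```cpp
#include <cstdio>

struct CropWindow {
  long y0, x0, out_h, out_w;  // top-left offset and output spatial size
};

// Mirrors the shape computation in Crop(): border = {left, top, right, bottom},
// scale = {height, width} (pass nullptr to mean "no scale given").
CropWindow CropShape(long H, long W, const long border[4], const long* scale) {
  long left = border[0], top = border[1], right = border[2], bottom = border[3];
  long bottom_limit = scale ? top + scale[0] : H - bottom;
  long right_limit = scale ? left + scale[1] : W - right;
  return {top, left, bottom_limit - top, right_limit - left};
}

int main() {
  long border[4] = {1, 2, 1, 1};
  CropWindow w1 = CropShape(10, 8, border, nullptr);
  long scale[2] = {4, 3};
  CropWindow w2 = CropShape(10, 8, border, scale);
  std::printf("no scale: %ldx%ld at (%ld,%ld); with scale: %ldx%ld at (%ld,%ld)\n",
              w1.out_h, w1.out_w, w1.y0, w1.x0, w2.out_h, w2.out_w, w2.y0, w2.x0);
}
```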
+ +#include "core/codegen/mti/tensor/gather.h" + +#include "core/codegen/mti/mti_tvm_utils.h" +#include "gsl/gsl_util" +#include + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Tensor Gather(const tvm::Tensor& t, + int64_t axis, + const tvm::Tensor& indices, + const std::string& name) { + // handle negative axis + axis = HandleNegativeAxis(axis, gsl::narrow(t->shape.size())); + size_t axis_t = gsl::narrow(axis); + + tvm::Array output_shape; + for (size_t i = 0; i < axis_t; ++i) + output_shape.push_back(t->shape[i]); + + for (size_t i = 0; i < indices->shape.size(); ++i) + output_shape.push_back(indices->shape[i]); + + for (size_t i = axis_t + 1; i < t->shape.size(); ++i) + output_shape.push_back(t->shape[i]); + + auto l = [&](const tvm::Array& ovars) { + tvm::Array ivars; + for (size_t i = 0; i < t->shape.size(); ++i) { + if (i < axis_t) { + ivars.push_back(ovars[i]); + } else if (i == axis_t) { + tvm::Array idx_vars; + for (size_t d = 0; d < indices->shape.size(); ++d) + idx_vars.push_back(ovars[axis_t + d]); + ivars.push_back(tvm::cast(tvm::Int(32), indices(idx_vars))); // tvm indices must be Int32 + } else { + ivars.push_back(ovars[i - 1 + indices->shape.size()]); + } + } + return tvm::ir::Select::make((ivars[axis_t] >= 0) && (ivars[axis_t] < t->shape[axis_t]), + t(ivars), tvm::make_zero(t->dtype)); + }; + + return tvm::compute(output_shape, l, name); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/gather.h b/onnxruntime/core/codegen/mti/tensor/gather.h new file mode 100644 index 0000000000000..a44bf3e4127d5 --- /dev/null +++ b/onnxruntime/core/codegen/mti/tensor/gather.h @@ -0,0 +1,17 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include +#include + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Tensor Gather(const tvm::Tensor& t, + int64_t axis, + const tvm::Tensor& indices, + const std::string& name = "gather"); + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/pad_ops.cc b/onnxruntime/core/codegen/mti/tensor/pad_ops.cc new file mode 100644 index 0000000000000..2f688290d109e --- /dev/null +++ b/onnxruntime/core/codegen/mti/tensor/pad_ops.cc @@ -0,0 +1,121 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
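Gather above builds the output shape from the data's outer dimensions (before axis), the full indices shape, and the data's inner dimensions (after axis), and returns 0 for out-of-range indices instead of faulting. A sketch of the 1-D-indices case (illustrative only):

```cpp
#include <cstdio>
#include <vector>

// Gather along axis 0 of a 2-D matrix with 1-D indices, as a plain loop.
// Output shape is {indices.size(), cols}; out-of-range indices yield rows of zeros,
// mirroring the Select(ivar >= 0 && ivar < dim, t(ivars), 0) built by Gather() above.
std::vector<std::vector<float>> GatherRows(const std::vector<std::vector<float>>& data,
                                           const std::vector<long>& indices) {
  size_t cols = data.empty() ? 0 : data[0].size();
  std::vector<std::vector<float>> out;
  for (long idx : indices) {
    if (idx >= 0 && idx < static_cast<long>(data.size()))
      out.push_back(data[idx]);
    else
      out.push_back(std::vector<float>(cols, 0.0f));  // out of range -> zeros
  }
  return out;
}

int main() {
  std::vector<std::vector<float>> data = {{1, 2}, {3, 4}, {5, 6}};
  auto out = GatherRows(data, {2, 0, 7});  // 7 is out of range
  for (const auto& row : out) std::printf("[%g %g] ", row[0], row[1]);  // [5 6] [1 2] [0 0]
  std::printf("\n");
}
```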
+ +#include "core/codegen/mti/tensor/pad_ops.h" + +#include "core/codegen/mti/mti_tvm_utils.h" +#include + +namespace onnxruntime { +namespace tvm_codegen { + +// Note topi::pad does not support modes {edge, reflect} +// Therefore, MTI implements a generic Pad +tvm::Tensor Pad(const tvm::Tensor& t, + const tvm::Array& pad_before, + const tvm::Array& pad_after, + float pad_value, + const std::string& mode, + const std::string& name) { + MTI_ASSERT(pad_before.size() >= 1); + MTI_ASSERT(pad_before.size() == pad_after.size()); + MTI_ASSERT(pad_before.size() == t->shape.size()); + + tvm::Array output_shape; + for (size_t i = 0; i < t->shape.size(); ++i) { + output_shape.push_back( + tvm::ir::Simplify(t->shape[i] + pad_before[i] + pad_after[i])); + } + + auto l = [&](const tvm::Array& ovars) { + tvm::Array conds; + tvm::Array indices; + tvm::Array coords; + + for (size_t i = 0; i < t->shape.size(); ++i) { + tvm::Expr ivar = ovars[i] - pad_before[i]; + tvm::Expr min = 0; + tvm::Expr extent = t->shape[i]; + + conds.push_back(ivar < min); + conds.push_back(ivar >= min + extent); + indices.push_back(tvm::max(tvm::min(ivar, min + extent - 1), min)); + + if (mode == "reflect") { + // calculate indices for reflect mode + tvm::Expr limit = extent - 1; + tvm::Expr coord = ivar - min; + // Avoid mod zero when tensor shape has 1, + // e.g. input shape is [1, 3, 3] instead of [3, 3] + auto* p_limit = tvm::as_const_int(limit); + if (p_limit != nullptr && *p_limit != 0) + coord = (coord + 2 * limit) % (2 * limit); // avoid negative value + coord = coord - limit; + coord = tvm::abs(coord); + coord = limit - coord; + coord = coord + min; + coords.push_back(coord); + } + } + + if (mode == "reflect") { + return tvm::ir::Select::make(topi::detail::Map(conds, tvm::ir::Or::make), + t(coords), t(indices)); + } else if (mode == "constant") { + return tvm::ir::Select::make(topi::detail::Map(conds, tvm::ir::Or::make), + tvm::make_const(t->dtype, pad_value), t(indices)); + } + + // default mode is edge + return t(indices); + }; + + return tvm::compute(output_shape, l, name); +} + +tvm::Tensor Pad(const tvm::Tensor& t, + const tvm::Array& output_shape, + const tvm::Expr& pad_value, + const std::string& name) { + MTI_ASSERT(t->dtype == pad_value.type()); + + auto l = [&](const tvm::Array& ovars) { + tvm::Array conds; + tvm::Array indices; + + for (size_t i = 0; i < t->shape.size(); ++i) { + tvm::Expr ivar = ovars[i]; + tvm::Expr min = 0; + tvm::Expr extent = t->shape[i]; + + conds.push_back(ivar < min); + conds.push_back(ivar >= min + extent); + indices.push_back(tvm::max(tvm::min(ivar, min + extent - 1), min)); + } + + return tvm::ir::Select::make(topi::detail::Map(conds, tvm::ir::Or::make), + pad_value, t(indices)); + }; + + return tvm::compute(output_shape, l, name); +} + +tvm::Tensor PadLastDim(const tvm::Tensor& t, + const int32_t align_size, + const tvm::Expr& pad_value, + const std::string& name) { + auto input_shape = t->shape; + tvm::Array out_shape; + size_t input_shape_rank = input_shape.size(); + for (size_t i = 0; i < input_shape_rank - 1; ++i) { + out_shape.push_back(input_shape[i]); + } + out_shape.push_back( + (input_shape[input_shape_rank - 1] + align_size - 1) / + align_size * align_size); + + return Pad(t, out_shape, pad_value, name + "_pad"); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/pad_ops.h b/onnxruntime/core/codegen/mti/tensor/pad_ops.h new file mode 100644 index 0000000000000..6e8e350d71e97 --- /dev/null +++ 
b/onnxruntime/core/codegen/mti/tensor/pad_ops.h @@ -0,0 +1,34 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include +#include + +namespace onnxruntime { +namespace tvm_codegen { + +// ONNX Pad semantics +tvm::Tensor Pad(const tvm::Tensor& t, + const tvm::Array& pad_before, + const tvm::Array& pad_after, + float pad_value = 0.0f, + const std::string& mode = "constant", + const std::string& name = "pad"); + +// Other common Pad interfaces +// Pad for a given shape +tvm::Tensor Pad(const tvm::Tensor& t, + const tvm::Array& output_shape, + const tvm::Expr& pad_value, + const std::string& name = "pad"); + +// Pad for the last dim only. +// This is widely used for weight layout to guard alignment +tvm::Tensor PadLastDim(const tvm::Tensor& t, + const int32_t align_size, + const tvm::Expr& pad_value, + const std::string& name = "pad_last_dim"); + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/reshape_ops.cc b/onnxruntime/core/codegen/mti/tensor/reshape_ops.cc new file mode 100644 index 0000000000000..817fb32c2837a --- /dev/null +++ b/onnxruntime/core/codegen/mti/tensor/reshape_ops.cc @@ -0,0 +1,48 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/mti/tensor/reshape_ops.h" + +#include "core/codegen/mti/common.h" +#include "core/codegen/mti/mti_tvm_utils.h" +#include + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Tensor Flatten(const tvm::Tensor& X, int64_t axis, const std::string& name) { + const auto& input_shape = X->shape; + return Reshape(X, {SizeToDimension(input_shape, axis), SizeFromDimension(input_shape, axis)}, name); +} + +tvm::Tensor Identity(const tvm::Tensor& X, const std::string& name) { + return Reshape(X, X->shape, name); +} + +tvm::Tensor Reshape(const tvm::Tensor& X, const tvm::Array& new_shape, const std::string& name) { + if (new_shape.size() > 0) { + auto X_dim = SizeToDimension(X->shape, X->shape.size()); + auto new_dim = SizeToDimension(new_shape, new_shape.size()); + auto* pX_dim = tvm::as_const_int(X_dim); + auto* pNew_dim = tvm::as_const_int(new_dim); + + if (pX_dim != nullptr && pNew_dim != nullptr) { + MTI_ASSERT(*pX_dim == *pNew_dim); + } + return topi::reshape(X, new_shape, name); + } else { + // generate empty dim tensor with origial input data value + tvm::Array tmp_shape; + tmp_shape.push_back(1); + auto tmp_tensor = topi::reshape(X, tmp_shape); + return tvm::compute( + new_shape, + [&](const tvm::Array&) { + return tmp_tensor[0]; + }, + name); + } +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/reshape_ops.h b/onnxruntime/core/codegen/mti/tensor/reshape_ops.h new file mode 100644 index 0000000000000..e23d62e4c57b0 --- /dev/null +++ b/onnxruntime/core/codegen/mti/tensor/reshape_ops.h @@ -0,0 +1,16 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
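The reflect mode of Pad in pad_ops.cc above folds an out-of-range coordinate back into the tensor with the mirror formula coord = limit - |((coord + 2*limit) mod (2*limit)) - limit|, where limit = extent - 1. The scalar sketch below evaluates that formula directly so the mapping is easy to verify (illustrative only):

```cpp
#include <cstdio>
#include <cstdlib>

// Map a (possibly negative or too large) padded coordinate back into [0, extent-1]
// using the same arithmetic as the reflect branch in Pad() above.
long ReflectIndex(long coord, long extent) {
  long limit = extent - 1;
  if (limit == 0) return 0;                    // degenerate dim of size 1
  coord = (coord + 2 * limit) % (2 * limit);   // shift to avoid a negative modulus
  return limit - std::labs(coord - limit);
}

int main() {
  // For extent 4 (valid indices 0..3), padded coordinates -2..5 reflect to:
  // 2 1 0 1 2 3 2 1
  for (long c = -2; c <= 5; ++c) std::printf("%ld ", ReflectIndex(c, 4));
  std::printf("\n");
}
```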
+ +#pragma once +#include +#include + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Tensor Flatten(const tvm::Tensor& X, int64_t axis, const std::string& name = "flatten"); +tvm::Tensor Identity(const tvm::Tensor& X, const std::string& name = "identity"); +tvm::Tensor Reshape(const tvm::Tensor& X, const tvm::Array& new_shape, const std::string& name = "reshape"); + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/slice.cc b/onnxruntime/core/codegen/mti/tensor/slice.cc new file mode 100644 index 0000000000000..4caf3946ce6a9 --- /dev/null +++ b/onnxruntime/core/codegen/mti/tensor/slice.cc @@ -0,0 +1,48 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/mti/tensor/slice.h" + +#include "core/codegen/mti/mti_tvm_utils.h" +#include +#include +#include + +namespace onnxruntime { +namespace tvm_codegen { + +static const int64_t max_range = INT_MAX; + +tvm::Expr position(const tvm::Expr& dim, const tvm::Integer& offset) { + if (offset->value >= max_range) + return dim; + else if (offset->value < 0) + return dim + offset; + else + return offset; +} + +tvm::Tensor Slice(const tvm::Tensor& X, + const tvm::Array& starts, + const tvm::Array& ends, + const std::string& name) { + tvm::Array output_shape; + for (size_t i = 0; i < X->shape.size(); ++i) { + tvm::Expr start = position(X->shape[i], starts[i]); + tvm::Expr end = position(X->shape[i], ends[i]); + output_shape.push_back(tvm::ir::Simplify(end - start)); + } + return tvm::compute( + output_shape, + [&](const tvm::Array& ovars) { + tvm::Array ivars; + for (size_t i = 0; i < X->shape.size(); ++i) + ivars.push_back(ovars[i] + tvm::ir::Simplify(position(X->shape[i], starts[i]))); + + return X(ivars); + }, + name); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/slice.h b/onnxruntime/core/codegen/mti/tensor/slice.h new file mode 100644 index 0000000000000..26f53650b1b6d --- /dev/null +++ b/onnxruntime/core/codegen/mti/tensor/slice.h @@ -0,0 +1,17 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include +#include + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Tensor Slice(const tvm::Tensor& X, + const tvm::Array& starts, + const tvm::Array& ends, + const std::string& name = "slice"); + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/split.cc b/onnxruntime/core/codegen/mti/tensor/split.cc new file mode 100644 index 0000000000000..7264f94e390b4 --- /dev/null +++ b/onnxruntime/core/codegen/mti/tensor/split.cc @@ -0,0 +1,72 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/mti/tensor/split.h" + +#include "core/codegen/mti/mti_tvm_utils.h" +#include "gsl/gsl_util" +#include + +namespace onnxruntime { +namespace tvm_codegen { + +// Similar to numpy, topi::split takes split indices rather than the +// sizes of the splits. Thus we implement our own. 
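Slice's `position` helper resolves ONNX start/end values against a dimension, treating INT_MAX as an "until the end" sentinel and negative values as offsets from the back. A standalone sketch of the same rule (`resolve_position` is an illustrative name, not the MTI function):

```cpp
#include <cassert>
#include <climits>
#include <cstdint>

// Illustrative standalone version of the position() helper above.
int64_t resolve_position(int64_t dim, int64_t offset) {
  const int64_t kMaxRange = INT_MAX;   // sentinel meaning "to the end of the dim"
  if (offset >= kMaxRange) return dim;
  if (offset < 0) return dim + offset; // count from the back
  return offset;
}

int main() {
  // For a dimension of size 10: start=-3 means 7, end=INT_MAX means 10.
  assert(resolve_position(10, -3) == 7);
  assert(resolve_position(10, INT_MAX) == 10);
  assert(resolve_position(10, 4) == 4);
  return 0;
}
```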
+tvm::Array Split(const tvm::Tensor& X, + const tvm::Array& split_sizes, + int64_t axis, + const std::string& name) { + MTI_ASSERT(axis < gsl::narrow(X->shape.size())); + size_t axis_t = gsl::narrow(axis); + + tvm::Array> output_shapes; + int num_splits = gsl::narrow(split_sizes.size()); + for (auto& s : split_sizes) { + tvm::Array shape; + for (size_t i = 0; i < axis_t; i++) { + shape.push_back(X->shape[i]); + } + shape.push_back(s); + for (size_t i = axis_t + 1; i < X->shape.size(); i++) { + shape.push_back(X->shape[i]); + } + output_shapes.push_back(shape); + } + + tvm::Array res; + int idx = 0; + for (int i_split = 0; i_split < num_splits; ++i_split) { + tvm::Expr s = split_sizes[i_split]; + auto l = [&](const tvm::Array& indices) { + tvm::Array new_indices; + for (size_t i = 0; i < axis_t; i++) { + new_indices.push_back(indices[i]); + } + new_indices.push_back(indices[axis_t] + idx); + for (size_t i = axis_t + 1; i < X->shape.size(); i++) { + new_indices.push_back(indices[i]); + } + MTI_ASSERT(topi::detail::IsConstInt(s)); + MTI_ASSERT(new_indices.size() == X->shape.size()); + int size = topi::detail::GetConstInt(s); + idx += size; + return X(new_indices); + }; + res.push_back(tvm::compute(output_shapes[i_split], l, name)); + } + + MTI_ASSERT(topi::detail::IsConstInt(X->shape[axis_t])); + int size_of_splitted_axis = static_cast(topi::detail::GetConstInt(X->shape[axis_t])); + MTI_ASSERT(idx == size_of_splitted_axis); + return res; +} + +tvm::Array SplitWithIndices(const tvm::Tensor& X, + const tvm::Array& split_sizes, + int64_t axis, + const std::string& name) { + return topi::split(X, split_sizes, gsl::narrow(axis), name); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/split.h b/onnxruntime/core/codegen/mti/tensor/split.h new file mode 100644 index 0000000000000..bcb9c47d936dd --- /dev/null +++ b/onnxruntime/core/codegen/mti/tensor/split.h @@ -0,0 +1,25 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include +#include + +namespace onnxruntime { +namespace tvm_codegen { + +// ONNX Split semantics +tvm::Array Split(const tvm::Tensor& X, + const tvm::Array& split_sizes, + int64_t axis, + const std::string& name = "split"); + +// Another common Split interface +// Split with chunck indices +tvm::Array SplitWithIndices(const tvm::Tensor& X, + const tvm::Array& split_sizes, + int64_t axis, + const std::string& name = "split_with_indices"); + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/tile.cc b/onnxruntime/core/codegen/mti/tensor/tile.cc new file mode 100644 index 0000000000000..57cf7097c5ae7 --- /dev/null +++ b/onnxruntime/core/codegen/mti/tensor/tile.cc @@ -0,0 +1,40 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
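As the comment above notes, topi::split expects cut positions while ONNX Split provides chunk sizes, which is why MTI carries both Split and SplitWithIndices. A standalone sketch of the size-to-index conversion a caller would otherwise have to perform (`sizes_to_indices` is illustrative, not MTI code):

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Converting chunk sizes {2, 3, 5} along an axis of length 10 yields cut points
// {2, 5}; the last chunk is implied by the axis length.
std::vector<int64_t> sizes_to_indices(const std::vector<int64_t>& sizes) {
  std::vector<int64_t> indices;
  int64_t running = 0;
  for (size_t i = 0; i + 1 < sizes.size(); ++i) {  // skip the last chunk
    running += sizes[i];
    indices.push_back(running);
  }
  return indices;
}

int main() {
  assert((sizes_to_indices({2, 3, 5}) == std::vector<int64_t>{2, 5}));
  return 0;
}
```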
+ +#include "core/codegen/mti/tensor/tile.h" +#include "core/codegen/mti/mti_tvm_utils.h" +#include "gsl/gsl_util" + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Tensor Tile(const tvm::Tensor& t, + const std::vector& repeats, + const std::string& name) { + MTI_ASSERT(repeats.size() == t->shape.size()); + tvm::Array output_shape; + + bool repeats_zero = false; + for (size_t i = 0; i < t->shape.size(); ++i) { + if (repeats[i] == 0) + repeats_zero = true; + output_shape.push_back(t->shape[i] * gsl::narrow(repeats[i])); + } + + auto l = [&](const tvm::Array& ovars) { + if (repeats_zero) + return tvm::make_zero(t->dtype); + + tvm::Array ivars; + for (size_t i = 0; i < t->shape.size(); ++i) { + tvm::Expr ovar = ovars[i]; + ivars.push_back(ovar % t->shape[i]); + } + return t(ivars); + }; + + return tvm::compute(output_shape, l, name); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/tile.h b/onnxruntime/core/codegen/mti/tensor/tile.h new file mode 100644 index 0000000000000..7ce331fb5ea95 --- /dev/null +++ b/onnxruntime/core/codegen/mti/tensor/tile.h @@ -0,0 +1,16 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include +#include + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Tensor Tile(const tvm::Tensor& t, + const std::vector& repeats, + const std::string& name = "tile"); + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/transpose.cc b/onnxruntime/core/codegen/mti/tensor/transpose.cc new file mode 100644 index 0000000000000..873ff8d7f1708 --- /dev/null +++ b/onnxruntime/core/codegen/mti/tensor/transpose.cc @@ -0,0 +1,16 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/mti/tensor/transpose.h" + +#include + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Tensor Transpose(const tvm::Tensor& X, const tvm::Array& axes, const std::string& name) { + return topi::transpose(X, axes, name); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/transpose.h b/onnxruntime/core/codegen/mti/tensor/transpose.h new file mode 100644 index 0000000000000..a2a98fedf1e79 --- /dev/null +++ b/onnxruntime/core/codegen/mti/tensor/transpose.h @@ -0,0 +1,16 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include +#include + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Tensor Transpose(const tvm::Tensor& X, + const tvm::Array& axes, + const std::string& name = "transpose"); + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/where.cc b/onnxruntime/core/codegen/mti/tensor/where.cc new file mode 100644 index 0000000000000..2bdac3cae7ef5 --- /dev/null +++ b/onnxruntime/core/codegen/mti/tensor/where.cc @@ -0,0 +1,36 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/codegen/mti/mti_tvm_utils.h" +#include "core/codegen/mti/tensor/where.h" + +#include +#include + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Tensor Where(const tvm::Tensor& B, + const tvm::Tensor& X, + const tvm::Tensor& Y, + const std::string& name) { + size_t rank = std::max(std::max(B->shape.size(), X->shape.size()), Y->shape.size()); + tvm::Array output_shape; + for (size_t i = 0; i < rank; ++i) { + tvm::Expr dim = tvm::make_const(HalideIR::Int(32), 1); + bool broadcasted = + BroadcastDim(B->shape, i, rank, dim) && + BroadcastDim(X->shape, i, rank, dim) && + BroadcastDim(Y->shape, i, rank, dim); + MTI_ASSERT(broadcasted); + output_shape.push_back(dim); + } + + return topi::where(topi::broadcast_to(B, output_shape), + topi::broadcast_to(X, output_shape), + topi::broadcast_to(Y, output_shape), + name); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/where.h b/onnxruntime/core/codegen/mti/tensor/where.h new file mode 100644 index 0000000000000..68c5288eb3580 --- /dev/null +++ b/onnxruntime/core/codegen/mti/tensor/where.h @@ -0,0 +1,17 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include +#include + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Tensor Where(const tvm::Tensor& B, + const tvm::Tensor& X, + const tvm::Tensor& Y, + const std::string& name = "where"); + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/all_ops.h b/onnxruntime/core/codegen/passes/op_ir_creator/all_ops.h new file mode 100644 index 0000000000000..1463e50bd72fb --- /dev/null +++ b/onnxruntime/core/codegen/passes/op_ir_creator/all_ops.h @@ -0,0 +1,47 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once +#include "core/codegen/passes/utils/codegen_context.h" +#include "core/codegen/common/op_macro.h" +#include "core/codegen/passes/op_ir_creator/tvm_op_creator.h" + +namespace onnxruntime { +namespace tvm_codegen { + +// This macro declares a TVM IR builder +// based on ORT OP type with postfix DefaultTVM +#define DECLARE_GENERIC_OP_IR_CREATOR_CLASS(OP) \ + DECLARE_OP_IR_CREATOR_CLASS(OP, DefaultTVM) + +// This macro returns a TVM IR builder class name +// based ORT OP type with postfix DefaultTVM +#define GENERIC_OP_IR_CREATOR_CLASS(OP) \ + CREATOR_CLASS(OP, DefaultTVM##IRCreator) + +#define GENERIC_OP_IR_CREATOR_STRING(OP) \ + STRINGIZE(GENERIC_OP_IR_CREATOR_CLASS(OP)) + +// define all ops for DefaultTVM +#define ADD_OP_ITEM(OP) DECLARE_GENERIC_OP_IR_CREATOR_CLASS(OP) +#define BINARY_OP(OP) ADD_OP_ITEM(OP) +#define BINARY_CMP_OP(OP) ADD_OP_ITEM(OP) +#define POOL_OP(OP) ADD_OP_ITEM(OP) +#define UNARY_OP(OP) ADD_OP_ITEM(OP) +#define VARIADIC_OP(OP) ADD_OP_ITEM(OP) +#define REDUCE_INDEXED_OP(OP) ADD_OP_ITEM(OP) +#define REDUCE_OP(OP) ADD_OP_ITEM(OP) + +LIST_ALL_GENERIC_OPS() + +#undef ADD_OP_ITEM +#undef BINARY_OP +#undef BINARY_CMP_OP +#undef POOL_OP +#undef REDUCE_OP +#undef REDUCE_INDEXED_OP +#undef UNARY_OP +#undef VARIADIC_OP + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/math/binary_ops.cc b/onnxruntime/core/codegen/passes/op_ir_creator/math/binary_ops.cc new file mode 100644 index 0000000000000..9452146621ac7 --- /dev/null +++ b/onnxruntime/core/codegen/passes/op_ir_creator/math/binary_ops.cc @@ -0,0 +1,46 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/passes/op_ir_creator/all_ops.h" + +#include "core/codegen/common/op_macro.h" +#include "core/codegen/mti/math/binary_ops.h" +#include "core/codegen/mti/tensor/cast_ops.h" + +namespace onnxruntime { +namespace tvm_codegen { + +// helper local macro defines Evaluate of BINARY_OP OpIRCreators +#define BINARY_OP(name) \ + Status GENERIC_OP_IR_CREATOR_CLASS(name)::Evaluate( \ + const tvm::Array& inputs, \ + const Node& node, \ + CodeGenContext&, \ + tvm::Array& outputs) { \ + tvm::Tensor Y = name(inputs[0], inputs[1], node.Name()); \ + outputs.push_back(Y); \ + return Status::OK(); \ + } + +LIST_BINARY_OPS() + +#undef BINARY_OP + +// helper local macro defines Evaluate of BINARY_CMP_OP OpIRCreators +#define BINARY_CMP_OP(name) \ + Status GENERIC_OP_IR_CREATOR_CLASS(name)::Evaluate( \ + const tvm::Array& inputs, \ + const Node& node, \ + CodeGenContext&, \ + tvm::Array& outputs) { \ + tvm::Tensor Y = Cast(name(inputs[0], inputs[1], node.Name()), HalideIR::UInt(8), "cast_bool_" #name); \ + outputs.push_back(Y); \ + return Status::OK(); \ + } + +LIST_BINARY_CMP_OPS() + +#undef BINARY_CMP_OP + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/math/clip.cc b/onnxruntime/core/codegen/passes/op_ir_creator/math/clip.cc new file mode 100644 index 0000000000000..88383624f87b5 --- /dev/null +++ b/onnxruntime/core/codegen/passes/op_ir_creator/math/clip.cc @@ -0,0 +1,31 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/codegen/passes/op_ir_creator/all_ops.h" + +#include "core/codegen/mti/math/unary_ops.h" +#include "core/framework/op_kernel_info.h" + +namespace onnxruntime { +namespace tvm_codegen { + +// Evaluate of Clip OpIRCreator +Status GENERIC_OP_IR_CREATOR_CLASS(Clip)::Evaluate( + const tvm::Array& inputs, + const Node& node, + CodeGenContext&, + tvm::Array& outputs) { + ProtoHelperNodeContext ctx(node); + OpNodeProtoHelper info(&ctx); + + float max_value, min_value; + ORT_RETURN_IF_ERROR(info.GetAttr("max", &max_value)); + ORT_RETURN_IF_ERROR(info.GetAttr("min", &min_value)); + + tvm::Tensor Y = Clip(inputs[0], min_value, max_value, node.Name() + "_Clip"); + outputs.push_back(Y); + return Status::OK(); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/math/gemm.cc b/onnxruntime/core/codegen/passes/op_ir_creator/math/gemm.cc new file mode 100644 index 0000000000000..64f995076e1bb --- /dev/null +++ b/onnxruntime/core/codegen/passes/op_ir_creator/math/gemm.cc @@ -0,0 +1,39 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/passes/op_ir_creator/all_ops.h" + +#include "core/codegen/mti/math/gemm.h" +#include "core/framework/op_kernel_info.h" + +namespace onnxruntime { +namespace tvm_codegen { + +// Evaluate of Gemm OpIRCreator +Status GENERIC_OP_IR_CREATOR_CLASS(Gemm)::Evaluate( + const tvm::Array& inputs, + const Node& node, + CodeGenContext& /*ctx_codegen*/, + tvm::Array& outputs) { + ProtoHelperNodeContext ctx(node); + OpNodeProtoHelper attrs(&ctx); + + tvm::Tensor A = inputs[0]; + tvm::Tensor B = inputs[1]; + tvm::Tensor C = inputs[2]; + + int64_t trans_A, trans_B; + ORT_RETURN_IF_ERROR(attrs.GetAttr("transA", &trans_A)); + ORT_RETURN_IF_ERROR(attrs.GetAttr("transB", &trans_B)); + + float alpha, beta; + ORT_ENFORCE(attrs.GetAttr("alpha", &alpha).IsOK()); + ORT_ENFORCE(attrs.GetAttr("beta", &beta).IsOK()); + + tvm::Tensor Y = Gemm(A, B, C, trans_A != 0, trans_B != 0, alpha, beta, node.Name() + "_Gemm"); + outputs.push_back(Y); + return Status::OK(); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/math/logsoftmax.cc b/onnxruntime/core/codegen/passes/op_ir_creator/math/logsoftmax.cc new file mode 100644 index 0000000000000..cb09518bf63d1 --- /dev/null +++ b/onnxruntime/core/codegen/passes/op_ir_creator/math/logsoftmax.cc @@ -0,0 +1,32 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/codegen/passes/op_ir_creator/all_ops.h" + +#include "core/codegen/mti/math/logsoftmax.h" +#include "core/framework/op_kernel_info.h" +#include "core/providers/common.h" + +namespace onnxruntime { +namespace tvm_codegen { + +// Evaluate of LogSoftmax OpIRCreator +Status GENERIC_OP_IR_CREATOR_CLASS(LogSoftmax)::Evaluate( + const tvm::Array& inputs, + const Node& node, + CodeGenContext& ctx_codegen, + tvm::Array& outputs) { + ProtoHelperNodeContext ctx(node); + OpNodeProtoHelper info(&ctx); + + int64_t axis_i64; + ORT_RETURN_IF_ERROR(info.GetAttr("axis", &axis_i64)); + axis_i64 = HandleNegativeAxis(axis_i64, gsl::narrow_cast(inputs[0]->shape.size())); + + tvm::Tensor Y = LogSoftmax(inputs[0], axis_i64, node.Name() + "_LogSoftmax"); + outputs.push_back(Y); + return Status::OK(); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/math/matmul.cc b/onnxruntime/core/codegen/passes/op_ir_creator/math/matmul.cc new file mode 100644 index 0000000000000..ab1ac237bfa5d --- /dev/null +++ b/onnxruntime/core/codegen/passes/op_ir_creator/math/matmul.cc @@ -0,0 +1,23 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/passes/op_ir_creator/all_ops.h" + +#include "core/codegen/mti/math/matmul_ops.h" + +namespace onnxruntime { +namespace tvm_codegen { + +// Evaluate of MatMul OpIRCreator +Status GENERIC_OP_IR_CREATOR_CLASS(MatMul)::Evaluate( + const tvm::Array& inputs, + const Node& node, + CodeGenContext&, + tvm::Array& outputs) { + tvm::Tensor Y = MatMul(inputs[0], inputs[1], node.Name() + "_MatMul"); + outputs.push_back(Y); + return Status::OK(); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/math/quantize/matmul_integer.cc b/onnxruntime/core/codegen/passes/op_ir_creator/math/quantize/matmul_integer.cc new file mode 100644 index 0000000000000..60841d049e734 --- /dev/null +++ b/onnxruntime/core/codegen/passes/op_ir_creator/math/quantize/matmul_integer.cc @@ -0,0 +1,37 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/passes/op_ir_creator/all_ops.h" + +#include "core/codegen/mti/math/binary_ops.h" +#include "core/codegen/mti/math/matmul_ops.h" +#include "core/codegen/mti/tensor/cast_ops.h" + +namespace onnxruntime { +namespace tvm_codegen { + +// Evaluate of MatMulInteger OpIRCreator +Status GENERIC_OP_IR_CREATOR_CLASS(MatMulInteger)::Evaluate( + const tvm::Array& inputs, + const Node& node, + CodeGenContext& ctx_codegen, + tvm::Array& outputs) { + const auto& lhs_tensor = inputs[0]; + const auto& rhs_tensor = inputs[1]; + auto& name = node.Name(); + + // A generic path, cast to int32 + // Support skipped trailing inputs + auto lhs = (node.InputDefs().size() >= 3 && node.InputDefs()[2]->Exists()) + ? Sub(Cast(lhs_tensor, HalideIR::Int(32)), Cast(inputs[2], HalideIR::Int(32))) + : Cast(lhs_tensor, HalideIR::Int(32)); + auto rhs = (node.InputDefs().size() >= 4 && node.InputDefs()[3]->Exists()) + ? 
Sub(Cast(rhs_tensor, HalideIR::Int(32)), Cast(inputs[3], HalideIR::Int(32))) + : Cast(rhs_tensor, HalideIR::Int(32)); + tvm::Tensor Y = MatMul(lhs, rhs, name + "_MatMulInteger"); + outputs.push_back(Y); + return Status::OK(); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/math/reduce_ops.cc b/onnxruntime/core/codegen/passes/op_ir_creator/math/reduce_ops.cc new file mode 100644 index 0000000000000..f29a3f3e7cdf7 --- /dev/null +++ b/onnxruntime/core/codegen/passes/op_ir_creator/math/reduce_ops.cc @@ -0,0 +1,111 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/passes/op_ir_creator/all_ops.h" + +#include "core/codegen/common/op_macro.h" +#include "core/codegen/mti/math/reduce_ops.h" +#include "core/codegen/mti/tensor/cast_ops.h" +#include "core/codegen/mti/tensor/reshape_ops.h" +#include "core/framework/op_kernel_info.h" +#include "core/providers/common.h" + +namespace onnxruntime { +namespace tvm_codegen { + +using ReduceIndexedFunc = tvm::Tensor (*)(const tvm::Tensor& X, int64_t axis, bool keep_dims, const std::string& name); +using ReduceFunc = tvm::Tensor (*)(const tvm::Tensor& X, const std::vector& axes, bool keep_dims, const std::string& name); + +// helper class for for REDUCE_INDEXED_OP +class FuncReduceIndexed { + public: + FuncReduceIndexed(const Node& node, ReduceIndexedFunc func, const std::string& name) { + ProtoHelperNodeContext ctx(node); + OpNodeProtoHelper info(&ctx); + axis_ = info.GetAttrOrDefault("axis", 0); + int64_t keepdims_i = 1; + ORT_ENFORCE(info.GetAttr("keepdims", &keepdims_i).IsOK()); + keep_dims_ = (keepdims_i == 1); + func_ = func; + name_ = name; + } + + tvm::Tensor operator()(const tvm::Tensor& X) const { + auto axis = HandleNegativeAxis(axis_, gsl::narrow_cast(X->shape.size())); + tvm::Tensor index32 = func_(X, axis, keep_dims_, name_); + return Cast(index32, tvm::Int(64)); + } + + private: + int64_t axis_; + bool keep_dims_; + ReduceIndexedFunc func_; + std::string name_; +}; + +// helper class for REDUCE_OP +class FuncReduce { + public: + FuncReduce(const Node& node, ReduceFunc func, const std::string& name) { + ProtoHelperNodeContext ctx(node); + OpNodeProtoHelper info(&ctx); + axes_ = info.GetAttrsOrDefault("axes"); + int64_t keepdims_i = 1; + ORT_ENFORCE(info.GetAttr("keepdims", &keepdims_i).IsOK()); + keep_dims_ = (keepdims_i == 1); + func_ = func; + name_ = name; + } + + tvm::Tensor operator()(const tvm::Tensor& X) const { + std::vector axes; + for (auto i : axes_) + axes.push_back(HandleNegativeAxis(i, gsl::narrow_cast(X->shape.size()))); + + return func_(X, axes, keep_dims_, name_); + } + + private: + std::vector axes_; + bool keep_dims_; + ReduceFunc func_; + std::string name_; +}; + +// helper macro defines Evaluate of REDUCE_OP OpIRCreators +#define REDUCE_OP(name) \ + Status GENERIC_OP_IR_CREATOR_CLASS(name)::Evaluate( \ + const tvm::Array& inputs, \ + const Node& node, \ + CodeGenContext&, \ + tvm::Array& outputs) { \ + tvm::Tensor Y; \ + if (ShapeRank(node.OutputDefs()[0]) == 0) { \ + tvm::Tensor temp = FuncReduce(node, &name, #name)(inputs[0]); \ + Y = Reshape(temp, {}); \ + } else { \ + Y = FuncReduce(node, &name, #name)(inputs[0]); \ + } \ + outputs.push_back(Y); \ + return Status::OK(); \ + } + +// helper macro defines Evaluate of REDUCE_INDEXED_OP OpIRCreators +#define REDUCE_INDEXED_OP(name) \ + Status GENERIC_OP_IR_CREATOR_CLASS(name)::Evaluate( \ + const tvm::Array& inputs, \ + const Node& 
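The MatMulInteger lowering at the top of this hunk follows the ONNX definition: subtract the optional zero points, then multiply in int32. A tiny standalone numeric check of that arithmetic:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // 1x1 "matmul": (A - a_zero_point) * (B - b_zero_point), accumulated in int32.
  uint8_t a = 130, b = 3;          // quantized inputs
  int32_t a_zero = 128, b_zero = 0;
  int32_t y = (static_cast<int32_t>(a) - a_zero) * (static_cast<int32_t>(b) - b_zero);
  assert(y == 6);                  // (130 - 128) * (3 - 0)
  return 0;
}
```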
node, \ + CodeGenContext&, \ + tvm::Array& outputs) { \ + tvm::Tensor Y = FuncReduceIndexed(node, &name, #name)(inputs[0]); \ + outputs.push_back(Y); \ + return Status::OK(); \ + } + +LIST_REDUCE_OPS() + +#undef REDUCE_OP +#undef REDUCE_INDEXED_OP + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/math/softmax.cc b/onnxruntime/core/codegen/passes/op_ir_creator/math/softmax.cc new file mode 100644 index 0000000000000..7b13de5a94e48 --- /dev/null +++ b/onnxruntime/core/codegen/passes/op_ir_creator/math/softmax.cc @@ -0,0 +1,32 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/passes/op_ir_creator/all_ops.h" + +#include "core/codegen/mti/math/softmax.h" +#include "core/framework/op_kernel_info.h" +#include "core/providers/common.h" + +namespace onnxruntime { +namespace tvm_codegen { + +// Evaluate of Softmax OpIRCreator +Status GENERIC_OP_IR_CREATOR_CLASS(Softmax)::Evaluate( + const tvm::Array& inputs, + const Node& node, + CodeGenContext& ctx_codegen, + tvm::Array& outputs) { + ProtoHelperNodeContext ctx(node); + OpNodeProtoHelper info(&ctx); + + int64_t axis_i64; + ORT_RETURN_IF_ERROR(info.GetAttr("axis", &axis_i64)); + + axis_i64 = HandleNegativeAxis(axis_i64, gsl::narrow_cast(inputs[0]->shape.size())); + tvm::Tensor Y = Softmax(inputs[0], axis_i64, node.Name() + "_Softmax"); + outputs.push_back(Y); + return Status::OK(); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/math/unary_ops.cc b/onnxruntime/core/codegen/passes/op_ir_creator/math/unary_ops.cc new file mode 100644 index 0000000000000..bd5b89c718435 --- /dev/null +++ b/onnxruntime/core/codegen/passes/op_ir_creator/math/unary_ops.cc @@ -0,0 +1,136 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/codegen/passes/op_ir_creator/all_ops.h" + +#include "core/codegen/common/op_macro.h" +#include "core/codegen/mti/math/unary_ops.h" +#include "core/framework/op_kernel_info.h" + +namespace onnxruntime { +namespace tvm_codegen { + +// helper class for unary_ops with alpha +class FuncWithAlpha { + public: + FuncWithAlpha(const Node& node) { + ProtoHelperNodeContext ctx(node); + OpNodeProtoHelper attrs(&ctx); + ORT_ENFORCE(attrs.GetAttr("alpha", &alpha_).IsOK()); + } + + protected: + float alpha_; +}; + +// helper class for unary_ops with alpha and beta +class FuncWithAlphaBeta { + public: + FuncWithAlphaBeta(const Node& node) { + ProtoHelperNodeContext ctx(node); + OpNodeProtoHelper attrs(&ctx); + ORT_ENFORCE(attrs.GetAttr("alpha", &alpha_).IsOK()); + ORT_ENFORCE(attrs.GetAttr("beta", &beta_).IsOK()); + } + + protected: + float alpha_; + float beta_; +}; + +// helper class for unary_ops with alpha and gamma +class FuncWithAlphaGamma { + public: + FuncWithAlphaGamma(const Node& node) { + ProtoHelperNodeContext ctx(node); + OpNodeProtoHelper attrs(&ctx); + ORT_ENFORCE(attrs.GetAttr("alpha", &alpha_).IsOK()); + ORT_ENFORCE(attrs.GetAttr("gamma", &gamma_).IsOK()); + } + + protected: + float alpha_; + float gamma_; +}; + +// helper macro declares unary_ops helper class without attribute +#define FuncClass(name) \ + class Func##name { \ + public: \ + Func##name(const Node&) {} \ + tvm::Tensor operator()(const tvm::Tensor& X) const { \ + return name(X); \ + } \ + } + +// helper macro declares unary_ops helper class with alpha +#define FuncClassAlpha(name) \ + class Func##name : public FuncWithAlpha { \ + public: \ + Func##name(const Node& node) : FuncWithAlpha(node) {} \ + tvm::Tensor operator()(const tvm::Tensor& X) const { \ + return name(X, alpha_); \ + } \ + } + +// helper macro declares unary_ops helper class with alpha and beta +#define FuncClassAlphaBeta(name) \ + class Func##name : public FuncWithAlphaBeta { \ + public: \ + Func##name(const Node& node) : FuncWithAlphaBeta(node) {} \ + tvm::Tensor operator()(const tvm::Tensor& X) const { \ + return name(X, alpha_, beta_); \ + } \ + } + +// helper macro declares unary_ops helper class with alpha and gamma +#define FuncClassAlphaGamma(name) \ + class Func##name : public FuncWithAlphaGamma { \ + public: \ + Func##name(const Node& node) : FuncWithAlphaGamma(node) {} \ + tvm::Tensor operator()(const tvm::Tensor& X) const { \ + return name(X, alpha_, gamma_); \ + } \ + } + +FuncClass(Abs); +FuncClassAlphaBeta(Affine); +FuncClass(Ceil); +FuncClassAlpha(Elu); +FuncClass(Exp); +FuncClass(Floor); +FuncClassAlphaBeta(HardSigmoid); +FuncClassAlpha(LeakyRelu); +FuncClass(Log); +FuncClass(Neg); +FuncClassAlphaBeta(ParametricSoftplus); +FuncClass(Reciprocal); +FuncClass(Relu); +FuncClassAlphaBeta(ScaledTanh); +FuncClassAlphaGamma(Selu); +FuncClass(Sigmoid); +FuncClass(Softplus); +FuncClass(Softsign); +FuncClass(Sqrt); +FuncClass(Tanh); +FuncClassAlpha(ThresholdedRelu); + +// helper macro defines Evaluate of UNARY_OP OpIRCreators +#define UNARY_OP(name) \ + Status GENERIC_OP_IR_CREATOR_CLASS(name)::Evaluate( \ + const tvm::Array& inputs, \ + const Node& node, \ + CodeGenContext&, \ + tvm::Array& outputs) { \ + tvm::Tensor Y = Func##name(node)(inputs[0]); \ + outputs.push_back(Y); \ + return Status::OK(); \ + } + +// helper local macros to replace some calls in LIST_UNARY_OPS +LIST_UNARY_OPS() + +#undef UNARY_OP + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git 
a/onnxruntime/core/codegen/passes/op_ir_creator/math/variadic_ops.cc b/onnxruntime/core/codegen/passes/op_ir_creator/math/variadic_ops.cc new file mode 100644 index 0000000000000..9559a713c2876 --- /dev/null +++ b/onnxruntime/core/codegen/passes/op_ir_creator/math/variadic_ops.cc @@ -0,0 +1,36 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/passes/op_ir_creator/all_ops.h" + +#include "core/codegen/mti/math/binary_ops.h" +#include "core/codegen/mti/tensor/reshape_ops.h" +#include "core/framework/op_kernel_info.h" + +namespace onnxruntime { +namespace tvm_codegen { + +tvm::Tensor Sum(const tvm::Tensor& lhs, const tvm::Tensor& rhs, const std::string& name) { + return Add(lhs, rhs, name); +} + +// helper local macro defines Evaluate of BINARY_OP OpIRCreators +#define VARIADIC_OP(name) \ + Status GENERIC_OP_IR_CREATOR_CLASS(name)::Evaluate( \ + const tvm::Array& inputs, \ + const Node& node, \ + CodeGenContext&, \ + tvm::Array& outputs) { \ + tvm::Tensor Y = Identity(inputs[0], node.Name() + "0"); \ + for (size_t i = 1; i < inputs.size(); ++i) \ + Y = name(Y, inputs[i], node.Name() + std::to_string(i)); \ + outputs.push_back(Y); \ + return Status::OK(); \ + } + +LIST_VARIADIC_OPS() + +#undef VARIADIC_OP + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/nn/conv.cc b/onnxruntime/core/codegen/passes/op_ir_creator/nn/conv.cc new file mode 100644 index 0000000000000..c3a9e5950acce --- /dev/null +++ b/onnxruntime/core/codegen/passes/op_ir_creator/nn/conv.cc @@ -0,0 +1,131 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/passes/op_ir_creator/all_ops.h" + +#include "core/codegen/mti/nn/conv_ops.h" +#include "core/codegen/mti/mti_tvm_utils.h" +#include "core/codegen/mti/tensor/concat_ops.h" +#include "core/codegen/mti/tensor/split.h" +#include "core/codegen/passes/utils/ort_tvm_utils.h" +#include "core/framework/op_kernel_info.h" + +namespace onnxruntime { +namespace tvm_codegen { + +Status GENERIC_OP_IR_CREATOR_CLASS(Conv)::Evaluate( + const tvm::Array& inputs, + const Node& node, + CodeGenContext& ctx_codegen, + tvm::Array& outputs) { + ProtoHelperNodeContext ctx(node); + OpNodeProtoHelper info(&ctx); + + // Attributes + int64_t group; + std::string auto_pad; + std::vector kernel_shape, strides, dilations, pads; + + info.GetAttrOrDefault("group", &group, 1); + info.GetAttrOrDefault("auto_pad", &auto_pad, "NOTSET"); + + ORT_ENFORCE(info.GetAttrs("kernel_shape", kernel_shape).IsOK()); + ORT_ENFORCE(kernel_shape.size() <= 2, "Only support 1D/2D convolution currently!"); + ORT_ENFORCE(info.GetAttrs("strides", strides).IsOK()); + + dilations = info.GetAttrs("dilations", dilations).IsOK() ? dilations : std::vector(kernel_shape.size(), 1); + ORT_ENFORCE(dilations == std::vector(kernel_shape.size(), 1), "Only support dilation is 1 currently"); + + pads = info.GetAttrs("pads", pads).IsOK() ? 
pads : std::vector(kernel_shape.size() * 2, 0); + + // auto_pad + if (auto_pad != "NOTSET") { + auto rank = inputs[0]->shape.size() - 2; + ORT_ENFORCE(rank > 0); + for (uint64_t i = 0; i < rank; i++) { + if (auto_pad == "VALID") { + pads[i] = 0; + pads[i + rank] = 0; + } else if (auto_pad == "SAME_UPPER" || auto_pad == "SAME_LOWER") { + // TODO: handle symbolic dim + ORT_ENFORCE(ShapeHasValue(node.InputDefs()[0], 2 + i)); + + int64_t input_dim_value = ShapeValue(node.InputDefs()[0], 2 + i); + int64_t output_dim_value = (input_dim_value + strides[i] - 1) / strides[i]; + int64_t pad_needed = (output_dim_value - 1) * strides[i] + kernel_shape[i] - input_dim_value; + + pads[i] = auto_pad == "SAME_LOWER" ? (pad_needed + 1) / 2 : pad_needed / 2; + pads[i + rank] = pad_needed - pads[i]; + } else { + ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unknown auto_pad value"); + } + } + } + + // Inputs + tvm::Tensor X = inputs[0]; + tvm::Tensor W = inputs[1]; + // Outputs + tvm::Tensor Y; + tvm::Array Y_shape = ShapeToTvmArray(node.OutputDefs()[0], ctx_codegen); + + // 1-D convolution + if (kernel_shape.size() == 1) { + Y = Conv1D(X, W, Y_shape, ToTvmArray(strides), ToTvmArray(pads), node.Name() + "_Conv1D"); + } + // 2-D convolution + else if (kernel_shape.size() == 2) { + if (group == 1) { + Y = Conv2D(X, W, Y_shape, ToTvmArray(strides), ToTvmArray(pads), node.Name() + "_Conv2D"); + } else { + int64_t channel_out = ShapeValue(node.InputDefs()[1], 0); + int64_t channel_in = ShapeValue(node.InputDefs()[1], 1); + ORT_ENFORCE(channel_out % group == 0); + + int64_t cout_group = channel_out / group; + Y_shape.Set(1, Y_shape[1] / gsl::narrow_cast(group)); + + tvm::Array split_index0; + tvm::Array split_index1; + + for (int i = 1; i < group; i++) { + split_index0.push_back(i * channel_in); + split_index1.push_back(i * cout_group); + } + + auto input_groups = SplitWithIndices(X, split_index0, 1); + auto weight_groups = SplitWithIndices(W, split_index1, 0); + + // FIXME: This will trigger a llvm buffer overflow when group is too large + // TODO: fix this change it to batched gemm/conv + tvm::Array output_tensors; + for (int i = 0; i < group; i++) { + auto output_tensor = Conv2D(input_groups[i], + weight_groups[i], + Y_shape, + ToTvmArray(strides), + ToTvmArray(pads), + node.Name() + "_Conv2D"); + output_tensors.push_back(output_tensor); + } + Y = Concat(output_tensors, 1); + } + } + + // Add bias if provided + // Support skipped trailing inputs + if (node.InputDefs().size() > 2 && node.InputDefs()[2]->Exists()) { + tvm::Tensor B = inputs[2]; + Y = tvm::compute( + Y_shape, + [&](const tvm::Array& indices) { + return Y(indices) + B(indices[1]); + }); + } + + outputs.push_back(Y); + return Status::OK(); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/nn/lstm.cc b/onnxruntime/core/codegen/passes/op_ir_creator/nn/lstm.cc new file mode 100644 index 0000000000000..5c2557142dd0e --- /dev/null +++ b/onnxruntime/core/codegen/passes/op_ir_creator/nn/lstm.cc @@ -0,0 +1,64 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/passes/op_ir_creator/all_ops.h" + +#include "core/codegen/mti/nn/lstm.h" +#include "core/framework/op_kernel_info.h" + +namespace onnxruntime { +namespace tvm_codegen { + +// In the cell computation, we don't have the "direction" dimention and sequence dimension, +// which have been processed outside of the cell. +// Here we implement an LTSM cell. 
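The SAME_UPPER/SAME_LOWER branch above computes the output extent as ceil(input/stride) and splits the required padding across the two sides, with SAME_LOWER taking the extra element at the head. A standalone sketch of that computation (`same_pads` is an illustrative name, not part of the creator):

```cpp
#include <cassert>
#include <cstdint>
#include <utility>

// Returns {pad_head, pad_tail} for one spatial dimension under SAME padding.
std::pair<int64_t, int64_t> same_pads(int64_t input, int64_t stride, int64_t kernel,
                                      bool same_lower) {
  int64_t output = (input + stride - 1) / stride;              // ceil(input / stride)
  int64_t pad_needed = (output - 1) * stride + kernel - input;  // total padding
  int64_t pad_head = same_lower ? (pad_needed + 1) / 2 : pad_needed / 2;
  return {pad_head, pad_needed - pad_head};
}

int main() {
  // Input 5, stride 2, kernel 3: output 3, total padding 2, split as (1, 1).
  assert((same_pads(5, 2, 3, false) == std::pair<int64_t, int64_t>{1, 1}));
  // With an odd total, SAME_UPPER pads the tail, SAME_LOWER pads the head.
  assert((same_pads(4, 1, 2, false) == std::pair<int64_t, int64_t>{0, 1}));
  assert((same_pads(4, 1, 2, true) == std::pair<int64_t, int64_t>{1, 0}));
  return 0;
}
```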
+// For those args (inputs/outputs) of hidden states we put AFTER regular args (inputs/outputs) +// with a pre-defined order +// In a LSTM, the order is H and then C. +// Ouputs of LSTM is Y_h and then Y_c +Status GENERIC_OP_IR_CREATOR_CLASS(LSTM)::Evaluate( + const tvm::Array& inputs, + const Node& node, + CodeGenContext& ctx_codegen, + tvm::Array& outputs) { + ProtoHelperNodeContext ctx(node); + OpNodeProtoHelper attrs(&ctx); + + std::string direction_attr; + ORT_RETURN_IF_ERROR(attrs.GetAttr("direction", &direction_attr)); + int64_t hidden_size; + ORT_RETURN_IF_ERROR(attrs.GetAttr("hidden_size", &hidden_size)); + + // input tensor with shape [seq_length, batch_size, input_size] + const tvm::Tensor& X = inputs[0]; // input tensor with shape [seq_length, batch_size, input_size] + const tvm::Tensor& W = inputs[1]; // weights tensor with shape [4*hidden_size, input_size] + const tvm::Tensor& R = inputs[2]; // recurrence tensor with shape [4*hidden_size, hidden_size] + const tvm::Tensor& B = inputs[3]; // optional bias tensor with shape [8*hidden_size] + bool has_B = node.InputDefs()[3]->Exists(); + + // Unsupported the 4th inputs + // optional tensor specifying sequence lengths in a batch, shape: [batch_size] + // const tvm::Tensor* seq_len = inputs[4] ? &inputs[4]->tensor : nullptr; + + const tvm::Tensor& prev_H = inputs[5]; // optional initial H, shape: [batch_size, hidden_size] + const tvm::Tensor& prev_C = inputs[6]; // optional initial C, shape: [batch_size, hidden_size] + + const tvm::Tensor& P = inputs[7]; // optional peepholes tensor with shape [3*hidde_size] + bool has_P = node.InputDefs()[7]->Exists(); + + tvm::Tensor Y_h; // shape: [batch_size, hidden_size] + tvm::Tensor Y_c; // shape: [batch_size, hidden_size] + LSTMAttributes lstm_attrs(hidden_size); + LSTM_cell(lstm_attrs, X, W, R, B, has_B, prev_H, prev_C, P, has_P, Y_h, Y_c); + + // Since we only generate lstm cell, lstm's states need to be always outputs, + // regardless whethere they are skipped or not. + // The skipped trailing outputs need to be handled by Execution + outputs.push_back(Y_h); + outputs.push_back(Y_c); + + return Status::OK(); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/nn/pool_ops.cc b/onnxruntime/core/codegen/passes/op_ir_creator/nn/pool_ops.cc new file mode 100644 index 0000000000000..556d175a96601 --- /dev/null +++ b/onnxruntime/core/codegen/passes/op_ir_creator/nn/pool_ops.cc @@ -0,0 +1,93 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/passes/op_ir_creator/all_ops.h" + +#include "core/codegen/mti/mti_tvm_utils.h" +#include "core/codegen/mti/nn/pool_ops.h" +#include "core/framework/op_kernel_info.h" + +namespace onnxruntime { +namespace tvm_codegen { + +// helper class for pool_ops with arguments +class FuncWithPoolingArgument { + public: + FuncWithPoolingArgument(const Node& node, const std::string& op_name) { + ProtoHelperNodeContext ctx(node); + OpNodeProtoHelper info(&ctx); + int64_t storage_order{0}; // MaxPool_8 only. 0 is row major, and 1 is column major. Default is 0. 
+ + ORT_ENFORCE(info.GetAttrs("kernel_shape", kernel_shape_).IsOK(), "No kernel shape is set."); + if (kernel_shape_.size() != 2) + ORT_NOT_IMPLEMENTED(kernel_shape_.size(), "d pooling is not implementated"); + if (!info.GetAttrs("pads", pads_).IsOK() || pads_.empty()) { + pads_.resize(kernel_shape_.size() * 2, 0); + } + if (!info.GetAttrs("strides", strides_).IsOK() || strides_.empty()) { + strides_.resize(kernel_shape_.size(), 1); + } + if (op_name == "AveragePool") { + int64_t temp; + ORT_ENFORCE(info.GetAttr("count_include_pad", &temp).IsOK()); + count_include_pad_ = (temp != 0); + } + + if (op_name == "MaxPool") { + // TODO: add version check or not? remove version check since only after version 8 would have storage_order, otherwise, it would be zero + storage_order = info.GetAttrOrDefault("storage_order", 0 /*default_value*/); + if (storage_order != 1) { + layout_ = "NCWH"; + } + } + } + + std::vector kernel_shape_; + std::vector pads_; + std::vector strides_; + std::string layout_ = "NCHW"; + bool count_include_pad_ = false; +}; + +// A local macro to create Pool Ops + +// helper macro defines Evaluate of of POOL_OP OpIRCreators +#define POOL_OP(name) \ + Status GENERIC_OP_IR_CREATOR_CLASS(name)::Evaluate( \ + const tvm::Array& inputs, \ + const Node& node, \ + CodeGenContext&, \ + tvm::Array& outputs) { \ + if (outputs.size() > 1) ORT_NOT_IMPLEMENTED("output size = 2 is not implementated"); \ + FuncWithPoolingArgument argment(node, #name); \ + tvm::Tensor Y = name(inputs[0], ToTvmArray(argment.kernel_shape_), ToTvmArray(argment.strides_), ToTvmArray(argment.pads_), argment.layout_, argment.count_include_pad_); \ + outputs.push_back(Y); \ + return Status::OK(); \ + } // namespace tvm_codegen + +POOL_OP(MaxPool) +POOL_OP(AveragePool) + +#undef POOL_OP + +// helper macro defines Evaluate of of GlobalPOOL_OP OpIRCreators +#define POOL_OP(name) \ + Status GENERIC_OP_IR_CREATOR_CLASS(name)::Evaluate( \ + const tvm::Array& inputs, \ + const Node& node, \ + CodeGenContext&, \ + tvm::Array& outputs) { \ + if (inputs[0]->shape.size() != 4) \ + ORT_NOT_IMPLEMENTED(gsl::narrow_cast(inputs[0]->shape.size()) - 2, "d global pooling is not implementated"); \ + tvm::Tensor Y = name(inputs[0], "NCHW"); \ + outputs.push_back(Y); \ + return Status::OK(); \ + } + +POOL_OP(GlobalMaxPool) +POOL_OP(GlobalAveragePool) + +#undef POOL_OP + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/cast.cc b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/cast.cc new file mode 100644 index 0000000000000..bd324fd359edf --- /dev/null +++ b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/cast.cc @@ -0,0 +1,40 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/codegen/passes/op_ir_creator/all_ops.h" + +#include "core/codegen/mti/tensor/cast_ops.h" +#include "core/codegen/passes/utils/ort_tvm_utils.h" +#include "core/framework/op_kernel_info.h" + +namespace onnxruntime { +namespace tvm_codegen { + +// Evaluate of Cast OpIRCreator +Status GENERIC_OP_IR_CREATOR_CLASS(Cast)::Evaluate( + const tvm::Array& inputs, + const Node& node, + CodeGenContext&, + tvm::Array& outputs) { + ProtoHelperNodeContext ctx(node); + OpNodeProtoHelper attrs(&ctx); + + int64_t to; + ORT_RETURN_IF_ERROR(attrs.GetAttr("to", &to)); + auto to_type_proto = gsl::narrow_cast(to); + + tvm::Tensor X = inputs[0]; + tvm::Tensor Y; + if (to_type_proto == ONNX_NAMESPACE::TensorProto_DataType_BOOL) { + // special case for bool as ONNX bool is uint8, while in tvm it's uint1 + Y = CastToUInt8Bool(X, node.Name() + "_Cast"); + } else { + Y = Cast(X, ToTvmType(to_type_proto), node.Name() + "_Cast"); + } + + outputs.push_back(Y); + return Status::OK(); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/concat.cc b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/concat.cc new file mode 100644 index 0000000000000..418296889419e --- /dev/null +++ b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/concat.cc @@ -0,0 +1,30 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/passes/op_ir_creator/all_ops.h" + +#include "core/codegen/mti/tensor/concat_ops.h" +#include "core/framework/op_kernel_info.h" + +namespace onnxruntime { +namespace tvm_codegen { + +// Evaluate of Concat OpIRCreator +Status GENERIC_OP_IR_CREATOR_CLASS(Concat)::Evaluate( + const tvm::Array& inputs, + const Node& node, + CodeGenContext&, + tvm::Array& outputs) { + ProtoHelperNodeContext ctx(node); + OpNodeProtoHelper info(&ctx); + + int64_t axis; + ORT_RETURN_IF_ERROR(info.GetAttr("axis", &axis)); + + tvm::Tensor Y = Concat(inputs, axis, node.Name() + "_Concat"); + outputs.push_back(Y); + return Status::OK(); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/crop.cc b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/crop.cc new file mode 100644 index 0000000000000..46adb7e984f2d --- /dev/null +++ b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/crop.cc @@ -0,0 +1,45 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/codegen/passes/op_ir_creator/all_ops.h" + +#include "core/codegen/mti/mti_tvm_utils.h" +#include "core/codegen/mti/tensor/crop.h" +#include "core/framework/op_kernel_info.h" + +namespace onnxruntime { +namespace tvm_codegen { + +// Evaluate of Crop OpIRCreator +Status GENERIC_OP_IR_CREATOR_CLASS(Crop)::Evaluate( + const tvm::Array& inputs, + const Node& node, + CodeGenContext&, + tvm::Array& outputs) { + ProtoHelperNodeContext ctx(node); + OpNodeProtoHelper attrs(&ctx); + + if (inputs[0]->shape.size() != 4) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input is expected to have four dimensions corresponding to [N,C,H,W]"); + } + + std::vector border; + std::vector scale; + + ORT_ENFORCE(attrs.GetAttrs("border", border).IsOK()); + // scale is optional and status is false when omit + attrs.GetAttrs("scale", scale); + + if (border.size() != 4) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Attribute border needs to be specified with four border elements"); + } + + tvm::Tensor Y = Crop(inputs[0], ToTvmArray(border), ToTvmArray(scale), node.Name() + "_Crop"); + outputs.push_back(Y); + return Status::OK(); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/gather.cc b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/gather.cc new file mode 100644 index 0000000000000..3a5d801b6839f --- /dev/null +++ b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/gather.cc @@ -0,0 +1,30 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/passes/op_ir_creator/all_ops.h" + +#include "core/codegen/mti/tensor/gather.h" +#include "core/framework/op_kernel_info.h" + +namespace onnxruntime { +namespace tvm_codegen { + +// Evaluate of Gather OpIRCreator +Status GENERIC_OP_IR_CREATOR_CLASS(Gather)::Evaluate( + const tvm::Array& inputs, + const Node& node, + CodeGenContext&, + tvm::Array& outputs) { + ProtoHelperNodeContext ctx(node); + OpNodeProtoHelper attrs(&ctx); + + int64_t axis; + ORT_ENFORCE(attrs.GetAttr("axis", &axis).IsOK()); + + tvm::Tensor Y = Gather(inputs[0], axis, inputs[1], node.Name() + "_Gather"); + outputs.push_back(Y); + return Status::OK(); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/pad.cc b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/pad.cc new file mode 100644 index 0000000000000..ecff2c7b73847 --- /dev/null +++ b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/pad.cc @@ -0,0 +1,49 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/codegen/passes/op_ir_creator/all_ops.h" + +#include "core/codegen/mti/mti_tvm_utils.h" +#include "core/codegen/mti/tensor/pad_ops.h" +#include "core/framework/op_kernel_info.h" + +namespace onnxruntime { +namespace tvm_codegen { + +// Evaluate of Pad OpIRCreator +Status GENERIC_OP_IR_CREATOR_CLASS(Pad)::Evaluate( + const tvm::Array& inputs, + const Node& node, + CodeGenContext&, + tvm::Array& outputs) { + ProtoHelperNodeContext ctx(node); + OpNodeProtoHelper attrs(&ctx); + + std::string mode; + std::vector pads; + float value; + + ORT_ENFORCE(attrs.GetAttr("mode", &mode).IsOK()); + ORT_ENFORCE(attrs.GetAttrs("pads", pads).IsOK()); + ORT_ENFORCE(attrs.GetAttr("value", &value).IsOK()); + + if (mode != "constant" && mode != "edge" && mode != "reflect") + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Pad: Unsupported padding mode!"); + + if (pads.size() != 2 * inputs[0]->shape.size()) + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Pad: pads rank does not match inputs rank!"); + + std::vector pad_before, pad_after; + size_t offset = pads.size() / 2; + for (size_t i = 0; i < offset; i++) { + pad_before.push_back(pads[i]); + pad_after.push_back(pads[i + offset]); + } + + tvm::Tensor Y = Pad(inputs[0], ToTvmArray(pad_before), ToTvmArray(pad_after), value, mode, node.Name() + "_Pad"); + outputs.push_back(Y); + return Status::OK(); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/reshape_ops.cc b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/reshape_ops.cc new file mode 100644 index 0000000000000..ec5862a8a688c --- /dev/null +++ b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/reshape_ops.cc @@ -0,0 +1,96 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/codegen/passes/op_ir_creator/all_ops.h" + +#include "core/codegen/mti/mti_tvm_utils.h" +#include "core/codegen/mti/tensor/reshape_ops.h" +#include "core/codegen/passes/utils/ort_tvm_utils.h" +#include "core/framework/op_kernel_info.h" + +namespace onnxruntime { +namespace tvm_codegen { + +// Evaluate of Dropout OpIRCreator +Status GENERIC_OP_IR_CREATOR_CLASS(Dropout)::Evaluate( + const tvm::Array& inputs, + const Node& node, + CodeGenContext& ctx_codegen, + tvm::Array& outputs) { + tvm::Tensor Y = Identity(inputs[0]); + outputs.push_back(Y); + + // optional mask + // Support skipped trailing outputs + if (node.OutputDefs().size() > 1 && node.OutputDefs()[1]->Exists()) { + // A fake mask with all zeros + tvm::Tensor mask = MakeZeroTensor(inputs[0]->shape, inputs[0]->dtype, "mask"); + outputs.push_back(mask); + } + + return Status::OK(); +} + +// Evaluate of Flatten OpIRCreator +Status GENERIC_OP_IR_CREATOR_CLASS(Flatten)::Evaluate( + const tvm::Array& inputs, + const Node& node, + CodeGenContext&, + tvm::Array& outputs) { + ProtoHelperNodeContext ctx(node); + OpNodeProtoHelper attrs(&ctx); + + int64_t axis; + ORT_RETURN_IF_ERROR(attrs.GetAttr("axis", &axis)); + + tvm::Tensor Y = Flatten(inputs[0], axis, node.Name() + "_Flatten"); + outputs.push_back(Y); + return Status::OK(); +} + +// Evaluate of Identity OpIRCreator +Status GENERIC_OP_IR_CREATOR_CLASS(Identity)::Evaluate( + const tvm::Array& inputs, + const Node&, + CodeGenContext&, + tvm::Array& outputs) { + tvm::Tensor Y = Identity(inputs[0]); + outputs.push_back(Y); + return Status::OK(); +} + +// Evaluate of Reshape OpIRCreator +Status GENERIC_OP_IR_CREATOR_CLASS(Reshape)::Evaluate( + const tvm::Array& inputs, + const Node& node, + CodeGenContext& ctx_codegen, + tvm::Array& outputs) { + tvm::Tensor Y = Reshape(inputs[0], ShapeToTvmArray(node.OutputDefs()[0], ctx_codegen), node.Name() + "_Reshape"); + outputs.push_back(Y); + return Status::OK(); +} + +// Evaluate of Squeeze OpIRCreator +Status GENERIC_OP_IR_CREATOR_CLASS(Squeeze)::Evaluate( + const tvm::Array& inputs, + const Node& node, + CodeGenContext& ctx_codegen, + tvm::Array& outputs) { + tvm::Tensor Y = Reshape(inputs[0], ShapeToTvmArray(node.OutputDefs()[0], ctx_codegen), node.Name() + "_Squeeze"); + outputs.push_back(Y); + return Status::OK(); +} + +// Evaluate of Unsqueeze OpIRCreator +Status GENERIC_OP_IR_CREATOR_CLASS(Unsqueeze)::Evaluate( + const tvm::Array& inputs, + const Node& node, + CodeGenContext& ctx_codegen, + tvm::Array& outputs) { + tvm::Tensor Y = Reshape(inputs[0], ShapeToTvmArray(node.OutputDefs()[0], ctx_codegen), node.Name() + "_Unsqueeze"); + outputs.push_back(Y); + return Status::OK(); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/slice.cc b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/slice.cc new file mode 100644 index 0000000000000..4c27da39db0c4 --- /dev/null +++ b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/slice.cc @@ -0,0 +1,107 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/codegen/passes/op_ir_creator/all_ops.h" +#include "core/codegen/passes/utils/ort_tvm_utils.h" +#include "core/codegen/mti/mti_tvm_utils.h" +#include "core/codegen/mti/tensor/slice.h" +#include "core/framework/op_kernel_info.h" + +#include + +namespace onnxruntime { +namespace tvm_codegen { + +// local constexpr for INT_MAX +constexpr int64_t max_range = INT_MAX; + +Status SliceCommon(const tvm::Array& inputs, + const Node& node, + tvm::Array& outputs, + const std::vector& starts, + const std::vector& ends, + const std::vector& axes1) { + ORT_RETURN_IF_NOT(nullptr != node.InputDefs()[0]); + const ONNX_NAMESPACE::TensorShapeProto* shape_proto = node.InputDefs()[0]->Shape(); + + std::vector axes; + if (axes1.size() == 0) { + for (size_t i = 0; i < starts.size(); ++i) { + axes.push_back(gsl::narrow_cast(i)); + } + } else { + axes = axes1; + } + + tvm::Array tvm_starts, tvm_ends; + bool empty = false; + + for (int dim = 0; dim < shape_proto->dim_size(); ++dim) { + auto axes_iter = std::find(axes.begin(), axes.end(), dim); + const ONNX_NAMESPACE::TensorShapeProto_Dimension& proto_dim = shape_proto->dim(dim); + bool found_in_axes = (axes_iter != axes.end()); + if (!found_in_axes) { + tvm_starts.push_back(0); + if (proto_dim.has_dim_value()) { + tvm_ends.push_back(proto_dim.dim_value()); + } else { + tvm_ends.push_back(max_range); + } + } else { + auto axes_index = axes_iter - axes.begin(); + int64_t start = starts[axes_index]; + int64_t end = ends[axes_index]; + if (proto_dim.has_dim_value()) { + int64_t dim_max = proto_dim.dim_value(); + if (start < 0) start += dim_max; + if (end < 0) end += dim_max; + start = std::min(dim_max, std::max(static_cast(0), start)); + end = std::min(dim_max, std::max(start, end)); + } + tvm_starts.push_back(start); + tvm_ends.push_back(end); + empty = empty || (start == end); + } + } + + tvm::Tensor Y; + if (empty) { + tvm::Array shape; + for (size_t dim = 0; dim < gsl::narrow_cast(shape_proto->dim_size()); ++dim) { + shape.push_back(tvm::ir::Simplify(tvm_ends[dim] - tvm_starts[dim])); + } + Y = MakeZeroTensor(shape, inputs[0]->dtype, node.Name() + "_zeros"); + } else { + Y = Slice(inputs[0], tvm_starts, tvm_ends, node.Name() + "_Slice"); + } + + outputs.push_back(Y); + return Status::OK(); +} + +// Evaluate of Slice OpIRCreator +Status GENERIC_OP_IR_CREATOR_CLASS(Slice)::Evaluate( + const tvm::Array& inputs, + const Node& node, + CodeGenContext& ctx_codegen, + tvm::Array& outputs) { + ProtoHelperNodeContext ctx(node); + OpNodeProtoHelper info(&ctx); + + // NOTE that in opset 10, Slice has changed starts/ends/axes from attribute to input + // which may lead to dynamic output shape. + int version = ctx_codegen.GetCodeGenHandle()->domain_version_lookup_func(node.Domain()); + ORT_RETURN_IF_NOT(version <= 9, "Dynamic Slice is not supported yet"); + + std::vector starts, ends; + ORT_RETURN_IF_ERROR(info.GetAttrs("starts", starts)); + ORT_RETURN_IF_ERROR(info.GetAttrs("ends", ends)); + ORT_RETURN_IF_NOT(starts.size() == ends.size()); + + auto axes = info.GetAttrsOrDefault("axes"); + + return SliceCommon(inputs, node, outputs, starts, ends, axes); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/split.cc b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/split.cc new file mode 100644 index 0000000000000..7a190b5617042 --- /dev/null +++ b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/split.cc @@ -0,0 +1,65 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// Licensed under the MIT License. + +#include "core/codegen/passes/op_ir_creator/all_ops.h" + +#include "core/codegen/mti/mti_tvm_utils.h" +#include "core/codegen/mti/tensor/split.h" +#include "core/framework/op_kernel_info.h" + +namespace onnxruntime { +namespace tvm_codegen { + +// Evaluate of Split OpIRCreator +Status GENERIC_OP_IR_CREATOR_CLASS(Split)::Evaluate( + const tvm::Array& inputs, + const Node& node, + CodeGenContext&, + tvm::Array& outputs) { + ProtoHelperNodeContext ctx(node); + OpNodeProtoHelper info(&ctx); + + int64_t axis; + ORT_RETURN_IF_ERROR(info.GetAttr("axis", &axis)); + axis = HandleNegativeAxis(axis, gsl::narrow_cast(inputs[0]->shape.size())); + std::vector split_sizes; + + int64_t split_size_sum = 0; + if (info.GetAttrs("split", split_sizes).IsOK()) { + // optional + split_size_sum = std::accumulate(split_sizes.cbegin(), split_sizes.cend(), 0LL); + ORT_RETURN_IF_NOT(std::all_of(split_sizes.cbegin(), split_sizes.cend(), [](int64_t value) { return value > 0; }), + "Invalid value in 'split' attribute. All values must be > 0"); + + // check split sizes + for (size_t i = 0; i < node.OutputDefs().size(); ++i) { + ORT_RETURN_IF_NOT(split_sizes[i] == ShapeValue(node.OutputDefs()[i], gsl::narrow(axis))); + } + + } else { + for (size_t i = 0; i < node.OutputDefs().size(); ++i) { + split_sizes.push_back(ShapeValue(node.OutputDefs()[i], gsl::narrow(axis))); + split_size_sum += split_sizes[i]; + } + } + + // check total size + if (ShapeHasValue(node.InputDefs()[0], axis)) { + int64_t input_axis_dim = ShapeValue(node.InputDefs()[0], axis); + if (split_size_sum != input_axis_dim) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "Cannot split using values in 'split' attribute. Axis=", axis, + " Dim being splitted=", input_axis_dim, + " Sum of sizes in 'split' (must equal size of selected axis) was ", split_size_sum); + } + } + + tvm::Array output_tensors = Split(inputs[0], ToTvmArray(split_sizes), axis, node.Name() + "_Split"); + for (size_t i = 0; i < node.OutputDefs().size(); ++i) { + outputs.push_back(output_tensors[i]); + } + return Status::OK(); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/transpose.cc b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/transpose.cc new file mode 100644 index 0000000000000..f4d7bb1da5e97 --- /dev/null +++ b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/transpose.cc @@ -0,0 +1,46 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/codegen/passes/op_ir_creator/all_ops.h" + +#include "core/codegen/mti/mti_tvm_utils.h" +#include "core/codegen/mti/tensor/transpose.h" +#include "core/framework/op_kernel_info.h" + +namespace onnxruntime { +namespace tvm_codegen { + +// Evaluate of Transpose OpIRCreator +Status GENERIC_OP_IR_CREATOR_CLASS(Transpose)::Evaluate( + const tvm::Array& inputs, + const Node& node, + CodeGenContext&, + tvm::Array& outputs) { + ProtoHelperNodeContext ctx(node); + OpNodeProtoHelper attrs(&ctx); + + size_t input_0_shape_rank = inputs[0]->shape.size(); + std::vector permute; + attrs.GetAttrs("perm", permute); + if (permute.size() != 0 && permute.size() != input_0_shape_rank) + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Transpose: Incorrect permute size"); + + std::vector default_permute; + const std::vector* perm; + if (permute.size() > 0) { + perm = &permute; + } else { + default_permute.resize(input_0_shape_rank); + for (size_t i = 0; i < input_0_shape_rank; ++i) { + default_permute[i] = gsl::narrow(input_0_shape_rank - 1 - i); + } + perm = &default_permute; + } + + tvm::Tensor Y = Transpose(inputs[0], ToTvmArrayInt(*perm), node.Name() + "_Transpose"); + outputs.push_back(Y); + return Status::OK(); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/where.cc b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/where.cc new file mode 100644 index 0000000000000..9d6df7c1c430d --- /dev/null +++ b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/where.cc @@ -0,0 +1,28 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/passes/op_ir_creator/all_ops.h" + +#include "core/codegen/mti/mti_tvm_utils.h" +#include "core/codegen/mti/tensor/where.h" +#include "core/framework/op_kernel_info.h" + +namespace onnxruntime { +namespace tvm_codegen { + +// Evaluate of Transpose OpIRCreator +Status GENERIC_OP_IR_CREATOR_CLASS(Where)::Evaluate( + const tvm::Array& inputs, + const Node& node, + CodeGenContext&, + tvm::Array& outputs) { + ProtoHelperNodeContext ctx(node); + OpNodeProtoHelper attrs(&ctx); + + tvm::Tensor Y = Where(inputs[0], inputs[1], inputs[2], node.Name() + "_Where"); + outputs.push_back(Y); + return Status::OK(); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/tvm_ir_builder.cc b/onnxruntime/core/codegen/passes/op_ir_creator/tvm_ir_builder.cc new file mode 100644 index 0000000000000..6933681dda6c0 --- /dev/null +++ b/onnxruntime/core/codegen/passes/op_ir_creator/tvm_ir_builder.cc @@ -0,0 +1,125 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
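
When no 'perm' attribute is given, the Transpose creator above falls back to reversing all axes, matching the ONNX default. A tiny standalone illustration of that default permutation:

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

// Default ONNX Transpose behaviour when 'perm' is absent: reverse the axes.
std::vector<size_t> DefaultPerm(size_t rank) {
  std::vector<size_t> perm(rank);
  for (size_t i = 0; i < rank; ++i) perm[i] = rank - 1 - i;
  return perm;
}

int main() {
  for (size_t axis : DefaultPerm(4)) std::cout << axis << ' ';  // prints: 3 2 1 0
  std::cout << '\n';
}
```
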
+ +#include "core/codegen/passes/op_ir_creator/tvm_ir_builder.h" + +#include "core/codegen/common/op_macro.h" +#include "core/codegen/passes/op_ir_creator/all_ops.h" +#include "core/common/common.h" + +namespace onnxruntime { +namespace tvm_codegen { + +TVMIRBuilder::TVMIRBuilder(const std::string& name) + : name_(name) {} + +const std::string& TVMIRBuilder::Name() const { + return name_; +} + +void TVMIRBuilder::InsertDispatcher(std::unique_ptr&& ptr) { + dispatchers_.push_back(std::move(ptr)); +} + +void TVMIRBuilder::ClearAllDispatchers() { + dispatchers_.clear(); +} + +void TVMIRBuilder::DumpAllOpIRCreators() const { + int count = 0; + for (auto& d : dispatchers_) { + std::cout << "************ TVM OpIRDispatcher " + << count << " : " + << d->Name() + << " ************" << std::endl; + + d->ForEach([](const std::string& key, OpIRCreator* builder) { + std::cout << "Key " << key + << ", Creator " << builder->Name() << std::endl; + }); + + ++count; + } +} + +// Evaluate finds ONE proper OpIRCreator and build the corresponding OpIR +// If a TVMIRBuilder has more than one OpIRCreator for an ORT Op, +// the first one will be used. +// Please adjust registration order and dispatcher in TVMIRBuilder +// to make sure the proper OpIRCreator is called. +Status TVMIRBuilder::Evaluate( + const tvm::Array& inputs, + const Node& node, + CodeGenContext& ctx_codegen, + tvm::Array& outputs) { + OpIRCreator* candidate = nullptr; + for (auto& d : dispatchers_) { + candidate = d->Find(node); + if (nullptr != candidate) + break; + } + + if (nullptr == candidate) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Not implemented: ", node.OpType()); + } + + ORT_RETURN_IF_ERROR(candidate->Evaluate(inputs, node, ctx_codegen, outputs)); + + return Status::OK(); +} + +// BEGIN: Generic IR creator classes +#define ADD_OP_ITEM(name) \ + op_ir_registry->Register(std::move(std::make_unique())); + +#define BINARY_OP(name) ADD_OP_ITEM(name) +#define BINARY_CMP_OP(name) ADD_OP_ITEM(name) +#define POOL_OP(name) ADD_OP_ITEM(name) +#define REDUCE_OP(name) ADD_OP_ITEM(name) +#define REDUCE_INDEXED_OP(name) ADD_OP_ITEM(name) +#define UNARY_OP(name) ADD_OP_ITEM(name) +#define VARIADIC_OP(name) ADD_OP_ITEM(name) + +void RegisterAllGenericOpIRCreators(OpIRRegistry* op_ir_registry) { + LIST_ALL_GENERIC_OPS(); +} + +#undef ADD_OP_ITEM +#undef BINARY_OP +#undef BINARY_CMP_OP +#undef POOL_OP +#undef REDUCE_OP +#undef REDUCE_INDEXED_OP +#undef UNARY_OP +#undef VARIADIC_OP + +// BEGIN: Plugin Generic IR creator classes +#define ADD_OP_ITEM(name) \ + dispatcher->Register(#name, registry->Get(GENERIC_OP_IR_CREATOR_STRING(name))); + +#define BINARY_OP(name) ADD_OP_ITEM(name) +#define BINARY_CMP_OP(name) ADD_OP_ITEM(name) +#define POOL_OP(name) ADD_OP_ITEM(name) +#define REDUCE_OP(name) ADD_OP_ITEM(name) +#define REDUCE_INDEXED_OP(name) ADD_OP_ITEM(name) +#define UNARY_OP(name) ADD_OP_ITEM(name) +#define VARIADIC_OP(name) ADD_OP_ITEM(name) + +void RegisterGenericOrtOpTypeDispatcher(const std::shared_ptr& builder, + const OpIRRegistry* registry) { + auto dispatcher = std::make_unique("GenericOrtOpTypeOpIRCreators"); + LIST_ALL_GENERIC_OPS() + builder->InsertDispatcher(std::move(dispatcher)); +} + +#undef ADD_OP_ITEM +#undef BINARY_OP +#undef BINARY_CMP_OP +#undef POOL_OP +#undef REDUCE_OP +#undef REDUCE_INDEXED_OP +#undef UNARY_OP +// END: Generic IR creators classes + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/tvm_ir_builder.h 
b/onnxruntime/core/codegen/passes/op_ir_creator/tvm_ir_builder.h new file mode 100644 index 0000000000000..c80056e619d6d --- /dev/null +++ b/onnxruntime/core/codegen/passes/op_ir_creator/tvm_ir_builder.h @@ -0,0 +1,64 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include "core/codegen/passes/utils/codegen_context.h" +#include "core/codegen/passes/op_ir_creator/tvm_op_creator.h" +#include "core/common/common.h" + +namespace onnxruntime { +namespace tvm_codegen { + +// TVMIRBuilder contains all applicable TVM OpIRCreators +// OpIRCreators are stored in multiple dispatchers +// that check different conditions of an ORT Node. + +// If an ORT Node satisfies more than one OpIRCreators, +// the first dispatched pass will be applied. + +class TVMIRBuilder { + public: + TVMIRBuilder(const std::string& name); + ~TVMIRBuilder() = default; + + // A debug dumps all existing in this TVMIRBuilders + void DumpAllOpIRCreators() const; + + // Evaluates an OpIRCreator that first satisfies condtions of all dispatchers + Status Evaluate( + const tvm::Array& inputs, + const Node& node, + CodeGenContext& ctx, + tvm::Array& outputs); + + // Inserts a dispatcher and move its ownership to this TVMIRBuilder + void InsertDispatcher(std::unique_ptr&& ptr); + + // Clears all dispatchers in this TVMIRBuilder + void ClearAllDispatchers(); + + // Dumps the name of this TVMIRBuilder + const std::string& Name() const; + + private: + std::vector> dispatchers_; + std::string name_; + + private: + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(TVMIRBuilder); +}; + +// Utility function to register all builtin generic OpIRCreators into an OpIRRegistry. +// It creates instances of all generic OpIRCreators +// and registers them to op_ir_registry +void RegisterAllGenericOpIRCreators(OpIRRegistry* op_ir_registry); + +// Utility function to bind all builtin generic OpIRCreators to a TVMIRBuilder. +// It creates an instance of a Dispatcher that contains all generic OpIRCreators created above +// and uses OrtOpType to dispatch OpIRCreators. +// Then, it registers the created Dispatcher to a TVMIRBuilder, builder. +void RegisterGenericOrtOpTypeDispatcher(const std::shared_ptr& builder, + const OpIRRegistry* registry); + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/tvm_op_creator.cc b/onnxruntime/core/codegen/passes/op_ir_creator/tvm_op_creator.cc new file mode 100644 index 0000000000000..992272753f5a4 --- /dev/null +++ b/onnxruntime/core/codegen/passes/op_ir_creator/tvm_op_creator.cc @@ -0,0 +1,37 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
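
As noted in the comments above, TVMIRBuilder::Evaluate walks its dispatchers in registration order and uses the first OpIRCreator whose Find matches the node, reporting "Not implemented" otherwise. A toy, self-contained model of that first-match policy (plain C++; the map-based Dispatcher here is only an illustration, not the real DispatcherBase):

```cpp
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

// Toy stand-ins for the real types: a "dispatcher" maps an op type to a creator.
using Creator = std::function<void()>;
using Dispatcher = std::unordered_map<std::string, Creator>;

// First-match policy: walk the dispatchers in insertion order and use the
// first creator found for the node's op type.
Creator* FindFirstMatch(std::vector<Dispatcher>& dispatchers, const std::string& op_type) {
  for (auto& d : dispatchers) {
    auto it = d.find(op_type);
    if (it != d.end()) return &it->second;
  }
  return nullptr;  // caller reports "Not implemented: <op_type>"
}

int main() {
  std::vector<Dispatcher> dispatchers(2);
  dispatchers[0]["Add"] = [] { std::cout << "custom Add creator\n"; };
  dispatchers[1]["Add"] = [] { std::cout << "generic Add creator\n"; };
  if (Creator* c = FindFirstMatch(dispatchers, "Add")) (*c)();  // prints the custom one
}
```
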
+
+#include "core/codegen/passes/op_ir_creator/tvm_op_creator.h"
+
+#include "core/codegen/common/common.h"
+#include "core/codegen/common/dispatcher.h"
+#include "core/codegen/passes/utils/codegen_context.h"
+
+namespace onnxruntime {
+namespace codegen {
+// Explicit instantiation for OpIRCreator
+template class CreatorBase<const tvm::Array<tvm::Tensor>&,
+                           const Node&,
+                           tvm_codegen::CodeGenContext&,
+                           tvm::Array<tvm::Tensor>&,
+                           Status>;
+
+// Explicit instantiation for OpIRCreators' dispatcher
+template class DispatcherBase<tvm_codegen::OpIRCreator>;
+
+}  // namespace codegen
+
+namespace tvm_codegen {
+
+// One dispatcher is based on ORT OpType
+OpIRCreator* OP_IR_DISPATCHER_CLASS(OpType)::Find(const Node& node) {
+  return DispatcherBase::Get(node.OpType());
+}
+
+// Another dispatcher is based on ORT NodeArg name (GetKey)
+OpIRCreator* OP_IR_DISPATCHER_CLASS(NodeName)::Find(const Node& node) {
+  return DispatcherBase::Get(GetKey(&node));
+}
+
+}  // namespace tvm_codegen
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/tvm_op_creator.h b/onnxruntime/core/codegen/passes/op_ir_creator/tvm_op_creator.h
new file mode 100644
index 0000000000000..fe2648462e4f5
--- /dev/null
+++ b/onnxruntime/core/codegen/passes/op_ir_creator/tvm_op_creator.h
@@ -0,0 +1,84 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+#include "core/codegen/common/creator.h"
+#include "core/codegen/common/dispatcher.h"
+#include "core/codegen/common/registry.h"
+#include "core/graph/graph.h"
+#include <tvm/tvm.h>
+
+namespace onnxruntime {
+namespace tvm_codegen {
+
+class CodeGenContext;
+
+// OpIRCreator lowers an Ort Node to its corresponding TVM IRs
+using OpIRCreator = codegen::CreatorBase<
+    const tvm::Array<tvm::Tensor>&,
+    const Node&,
+    CodeGenContext&,
+    tvm::Array<tvm::Tensor>&,
+    Status>;
+
+// OpIRDispatcher is the base dispatcher for TVM IR Builder
+// It checks whether an Ort Node satisfies a criterion (in Find)
+// and dispatches a corresponding OpIRCreator.
+class OpIRDispatcher : public codegen::DispatcherBase { + public: + OpIRDispatcher(const std::string& name) + : DispatcherBase(name) {} + + ~OpIRDispatcher() = default; + + virtual OpIRCreator* Find(const Node&) = 0; + + private: + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(OpIRDispatcher); +}; + +// Macro returns an OpIRCreators' dispatcher's name +#define OP_IR_DISPATCHER_CLASS(OP) \ + TVM##OP##IRCreator + +// Macro declares an OpIRCreators' dispatcher +#define DECLARE_OP_IR_DISPATCHER_CLASS(OP) \ + class OP_IR_DISPATCHER_CLASS(OP) : public OpIRDispatcher { \ + public: \ + TVM##OP##IRCreator(const std::string& name) \ + : OpIRDispatcher(name) {} \ + ~TVM##OP##IRCreator() = default; \ + OpIRCreator* Find(const Node&) override; \ + \ + private: \ + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(OP_IR_DISPATCHER_CLASS(OP)); \ + }; + +// Declare two common dispatchers for TVM Op IR builders +// One dispatcher is based on Ort OpType +DECLARE_OP_IR_DISPATCHER_CLASS(OpType) +// Another dispatcher is based Ort NodeArg name +DECLARE_OP_IR_DISPATCHER_CLASS(NodeName) + +// OpIRCreator Registry is a registry holds all OpIRCreators +using OpIRRegistry = codegen::RegistryBase; + +// Macro declares an OpIRCreator +#define DECLARE_OP_IR_CREATOR_CLASS(OP, PREFIX) \ + DECLARE_CREATOR_CLASS(OP, PREFIX##IRCreator, \ + const tvm::Array&, \ + const Node&, \ + tvm_codegen::CodeGenContext&, \ + tvm::Array&, \ + Status) + +// Macro returns an OpIRCreator's name with prefix +#define OP_IR_CREATOR_CLASS_EX(OP, PREFIX, ARCH) \ + CREATOR_CLASS(OP, PREFIX##ARCH##IRCreator) + +// Macro declares an OpIRCreator with prefix and arch +#define DECLARE_OP_IR_CREATOR_CLASS_EX(OP, PREFIX, ARCH) \ + DECLARE_OP_IR_CREATOR_CLASS(OP, PREFIX##ARCH) + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/scheduler/all_schedules.h b/onnxruntime/core/codegen/passes/scheduler/all_schedules.h new file mode 100644 index 0000000000000..cb75c7fa639c0 --- /dev/null +++ b/onnxruntime/core/codegen/passes/scheduler/all_schedules.h @@ -0,0 +1,20 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include "core/codegen/passes/scheduler/tvm_scheduler.h" + +namespace onnxruntime { +namespace tvm_codegen { + +// AlwaysRoot is for debug purpose +DECLARE_TVM_SCHEDULER_CLASS(AlwaysRoot, GenericTVMRule) +// Create schedule for TVM Rule +DECLARE_TVM_SCHEDULER_CLASS(Extern, GenericTVMRule) +DECLARE_TVM_SCHEDULER_CLASS(Reduce, GenericTVMRule) + +//Crete scheduler for ORT OpType, Softmax +DECLARE_TVM_SCHEDULER_CLASS(Softmax, GenericOrtOpType) + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/scheduler/ort_type_schedule.cc b/onnxruntime/core/codegen/passes/scheduler/ort_type_schedule.cc new file mode 100644 index 0000000000000..59f492d164b14 --- /dev/null +++ b/onnxruntime/core/codegen/passes/scheduler/ort_type_schedule.cc @@ -0,0 +1,22 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/codegen/passes/scheduler/all_schedules.h" + +#include "core/codegen/passes/scheduler/schedule_utils.h" + +namespace onnxruntime { +namespace tvm_codegen { + +bool TVM_SCHEDULER_CLASS(Softmax, GenericOrtOpType)::Evaluate( + const tvm::Tensor& tensor, + const Node*, + CodeGenContext&, + ScheduleContext& ctx_sched) { + // compute root the exp since it is reused more than once + auto& tensor_exp = tensor->op->InputTensors()[0]; + return InsertRootSchedule(tensor_exp, ctx_sched); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/scheduler/schedule_utils.cc b/onnxruntime/core/codegen/passes/scheduler/schedule_utils.cc new file mode 100644 index 0000000000000..8f1485235697c --- /dev/null +++ b/onnxruntime/core/codegen/passes/scheduler/schedule_utils.cc @@ -0,0 +1,164 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/passes/scheduler/schedule_utils.h" + +namespace onnxruntime { +namespace tvm_codegen { + +// Check the schedule of tensor +// If it has no compute_root, Insert compute_root to tensor, and record it to ctx.scheduled_tensors +bool InsertRootSchedule( + const tvm::Tensor& tensor, + ScheduleContext& ctx) { + auto it = ctx.scheduled_tensors.find(tensor->op.get()); + if (it != ctx.scheduled_tensors.end()) { + if (it->second == ScheduleType::ScheduleClosure || + it->second == ScheduleType::ScheduleRoot) { + return false; + } + it->second = ScheduleType::ScheduleRoot; + } else { + ctx.scheduled_tensors.insert(std::make_pair(tensor->op.get(), ScheduleType::ScheduleRoot)); + } + ctx.schedule[tensor->op].compute_root(); + return true; +} + +// Check the schedule of tensor +// If it is not labeled as closure, lable it. +bool InsertClosure(const tvm::Tensor& tensor, + ScheduleContext& ctx) { + auto it = ctx.scheduled_tensors.find(tensor->op.get()); + if (it != ctx.scheduled_tensors.end()) { + if (it->second == ScheduleType::ScheduleClosure) + return false; + it->second = ScheduleType::ScheduleClosure; + } else { + ctx.scheduled_tensors.insert(std::make_pair(tensor->op.get(), ScheduleType::ScheduleClosure)); + } + return true; +} + +// Combination of InsertRootSchedule and InsertClosure +bool InsertRootScheduleAndClosure( + const tvm::Tensor& tensor, + ScheduleContext& ctx) { + auto it = ctx.scheduled_tensors.find(tensor->op.get()); + if (it != ctx.scheduled_tensors.end()) { + if (it->second == ScheduleType::ScheduleClosure) { + return false; + } + it->second = ScheduleType::ScheduleClosure; + } else { + ctx.scheduled_tensors.insert(std::make_pair(tensor->op.get(), ScheduleType::ScheduleClosure)); + } + ctx.schedule[tensor->op].compute_root(); + return true; +} + +// Check the schedule of tensor +// If it is not scheduled, try to vectorize it. +// Note TryVectorization has to use with compute_root. 
+// Therefore, there is a safty check of tensor's schedule +bool TryVectorization( + const tvm::Tensor& tensor, + int64_t natural_vector_size, + ScheduleContext& ctx) { + auto it = ctx.scheduled_tensors.find(tensor->op.get()); + if (it != ctx.scheduled_tensors.end()) { + if (it->second > ScheduleType::ScheduleInline) { + return false; + } + } + + auto shape = tensor->shape; + auto rank = shape.size(); + if (rank < 1) { + return false; + } + const int64_t* tail_dim = as_const_int(shape[rank - 1]); + + if (nullptr != tail_dim) { + auto extern_op = tensor->op.as(); + if (nullptr != extern_op) { + return false; + } + + auto compute_op = tensor->op.as(); + + if (nullptr != compute_op) { + auto axis = compute_op->axis; + tvm::IterVar x = axis[rank - 1]; + if ((*tail_dim) > natural_vector_size) { + if ((*tail_dim) % natural_vector_size == 0) { + tvm::IterVar xi, xo; + ctx.schedule[tensor->op].split(x, static_cast(natural_vector_size), &xo, &xi); + ctx.schedule[tensor->op].vectorize(xi); + return true; + } + } else if (*tail_dim > 0) { + // don't vectorize if dim is 0 + ctx.schedule[tensor->op].vectorize(x); + return true; + } + } + } + return false; +} + +// Check the schedule of tensor +// If it is not scheduled, try to add compute_inline on it. +// Note TryInlineSchedule cannot be used with compute_root. +// Therefore, there is a safty check of tensor's schedule. +bool TryInlineSchedule( + const tvm::Tensor& tensor, + ScheduleContext& ctx) { + auto it = ctx.scheduled_tensors.find(tensor->op.get()); + if (it != ctx.scheduled_tensors.end()) { + if ((int)it->second < (int)ScheduleType::ScheduleInline) { + ctx.schedule[tensor->op].compute_inline(); + it->second = ScheduleType::ScheduleInline; + return true; + } else { + return false; + } + } + ctx.schedule[tensor->op].compute_inline(); + ctx.scheduled_tensors.insert(std::make_pair(tensor->op.get(), ScheduleType::ScheduleInline)); + return true; +} + +// Check the schedule of tensor's inputs, and call InsertRootSchedule for each of them +bool InputRootSchedule( + const tvm::Tensor& tensor, + ScheduleContext& ctx) { + bool status = false; + for (auto& t : tensor->op->InputTensors()) { + if (t->op->InputTensors().size() > 0) { + bool status_root = InsertRootSchedule(t, ctx); + status = status || status_root; + } + } + return status; +} + +// Check the schedule of tensor's inputs, +// and call InsertRootSchedule and TryVectorization for each of them +bool InputRootScheduleWithVectorization( + const tvm::Tensor& tensor, + int64_t natural_vector_size, + ScheduleContext& ctx) { + bool status = false; + for (auto& t : tensor->op->InputTensors()) { + if (t->op->InputTensors().size() > 0) { + bool status_vec = TryVectorization(t, natural_vector_size, ctx); + bool status_root = InsertRootSchedule(t, ctx); + status = status || status_root || status_vec; + } + } + return status; +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/scheduler/schedule_utils.h b/onnxruntime/core/codegen/passes/scheduler/schedule_utils.h new file mode 100644 index 0000000000000..f928928f30c57 --- /dev/null +++ b/onnxruntime/core/codegen/passes/scheduler/schedule_utils.h @@ -0,0 +1,60 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
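
TryVectorization above only vectorizes the innermost axis when its extent is statically known: a long axis is split by the natural vector size when it divides evenly (and skipped otherwise), while a short non-zero axis is vectorized whole. A standalone sketch of just that width decision (plain C++; the function name is illustrative):

```cpp
#include <cstdint>
#include <iostream>
#include <optional>

// Returns the vector length to use for an innermost axis of extent `tail_dim`,
// or nullopt when the loop should be left scalar. Mirrors the branches above.
std::optional<int64_t> ChooseVectorWidth(int64_t tail_dim, int64_t natural_vector_size) {
  if (tail_dim > natural_vector_size) {
    if (tail_dim % natural_vector_size == 0) return natural_vector_size;  // split, then vectorize
    return std::nullopt;                                                  // uneven tail: skip
  }
  if (tail_dim > 0) return tail_dim;  // short axis: vectorize it whole
  return std::nullopt;                // extent 0 (or unknown): skip
}

int main() {
  std::cout << ChooseVectorWidth(64, 8).value_or(-1) << '\n';  // 8
  std::cout << ChooseVectorWidth(6, 8).value_or(-1) << '\n';   // 6
  std::cout << ChooseVectorWidth(10, 8).value_or(-1) << '\n';  // -1 (left scalar)
}
```
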
+ +#pragma once +#include +#include + +namespace onnxruntime { +namespace tvm_codegen { + +// Check the schedule of tensor +// If it has no compute_root, Insert compute_root to tensor, +// and record it to ctx.scheduled_tensors +bool InsertRootSchedule( + const tvm::Tensor& tensor, + ScheduleContext& ctx); + +// Check the schedule of tensor +// If it is not labeled as closure, lable it. +bool InsertClosure( + const tvm::Tensor& tensor, + ScheduleContext& ctx); + +// Combination of InsertRootSchedule and InsertClosure +bool InsertRootScheduleAndClosure( + const tvm::Tensor& tensor, + ScheduleContext& ctx); + +// Check the schedule of tensor +// If it is not scheduled, try to vectorize it. +// Note TryVectorization has to use with compute_root. +// Therefore, there is a safty check of tensor's schedule +bool TryVectorization( + const tvm::Tensor& tensor, + int64_t natural_vector_size, + ScheduleContext& ctx); + +// Check the schedule of tensor +// If it is not scheduled, try to add compute_inline on it. +// Note TryInlineSchedule cannot be used with compute_root. +// Therefore, there is a safty check of tensor's schedule. +bool TryInlineSchedule( + const tvm::Tensor& tensor, + ScheduleContext& ctx); + +// Check the schedule of tensor's inputs, +// and call InsertRootSchedule for each of them +bool InputRootSchedule( + const tvm::Tensor& tensor, + ScheduleContext& ctx); + +// Check the schedule of tensor's inputs, +// and call InsertRootSchedule and TryVectorization for each of them +bool InputRootScheduleWithVectorization( + const tvm::Tensor& tensor, + int64_t natural_vector_size, + ScheduleContext& ctx); + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/scheduler/tvm_rule_schedule.cc b/onnxruntime/core/codegen/passes/scheduler/tvm_rule_schedule.cc new file mode 100644 index 0000000000000..33162deddc983 --- /dev/null +++ b/onnxruntime/core/codegen/passes/scheduler/tvm_rule_schedule.cc @@ -0,0 +1,41 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/passes/scheduler/all_schedules.h" + +#include "core/codegen/passes/scheduler/schedule_utils.h" + +namespace onnxruntime { +namespace tvm_codegen { + +// This is for debug +bool TVM_SCHEDULER_CLASS(AlwaysRoot, GenericTVMRule)::Evaluate( + const tvm::Tensor& tensor, + const Node*, + CodeGenContext&, + ScheduleContext& ctx_sched) { + return InsertRootSchedule(tensor, ctx_sched); +} + +// For External tvm::Tensor +bool TVM_SCHEDULER_CLASS(Extern, GenericTVMRule)::Evaluate( + const tvm::Tensor& tensor, + const Node*, + CodeGenContext&, + ScheduleContext& ctx_sched) { + bool status = InsertRootScheduleAndClosure(tensor, ctx_sched); + bool status_input = InputRootSchedule(tensor, ctx_sched); + return status || status_input; +} + +// For Reduce Compute tvm::Tensor +bool TVM_SCHEDULER_CLASS(Reduce, GenericTVMRule)::Evaluate( + const tvm::Tensor& tensor, + const Node*, + CodeGenContext&, + ScheduleContext& ctx_sched) { + return InsertRootScheduleAndClosure(tensor, ctx_sched); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/scheduler/tvm_schedule_builder.cc b/onnxruntime/core/codegen/passes/scheduler/tvm_schedule_builder.cc new file mode 100644 index 0000000000000..6f0ffa14e8abb --- /dev/null +++ b/onnxruntime/core/codegen/passes/scheduler/tvm_schedule_builder.cc @@ -0,0 +1,104 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// Licensed under the MIT License. + +#include "core/codegen/passes/scheduler/tvm_schedule_builder.h" + +#include "core/codegen/common/op_macro.h" +#include "core/codegen/common/settings.h" +#include "core/common/common.h" +#include "core/common/logging/logging.h" + +namespace onnxruntime { +namespace tvm_codegen { + +TVMScheduleBuilder::TVMScheduleBuilder(const std::string& name) + : name_(name) { +} + +const std::string& TVMScheduleBuilder::Name() const { + return name_; +} + +void TVMScheduleBuilder::InsertDispatcher(std::unique_ptr&& ptr) { + dispatchers_.push_back(std::move(ptr)); +} + +void TVMScheduleBuilder::ClearDispatcher() { + dispatchers_.clear(); +} + +void TVMScheduleBuilder::DumpAllSchedulers() const { + std::ostringstream stream; + int count = 0; + stream << "[CODEGEN_DUMP_SCHEDULE]" << std::endl; + for (auto& d : dispatchers_) { + stream << "************ TVM Scheduler Dispatcher " + << count << " : " + << d->Name() + << " ************" << std::endl; + + d->ForEach([&stream](const std::string& key, Scheduler* op) { + stream << "Key " << key + << ", Creater " << op->Name() << std::endl; + }); + + ++count; + } + + LOGS_DEFAULT(CODEGEN_SETTINGS_LOG_LEVEL) << stream.str(); +} + +Status TVMScheduleBuilder::Evaluate( + const tvm::Tensor& tensor, + const Node* node, + CodeGenContext& ctx_codegen, + ScheduleContext& sched) { + Scheduler* candidate = nullptr; + + for (auto& d : dispatchers_) { + candidate = d->Find(tensor, node, ctx_codegen); + if (nullptr != candidate) + break; + } + + bool enable_dump_schedule = codegen::CodeGenSettings::Instance().HasOption(codegen::CodeGenSettings::kCodeGenDumpSchedule); + + if (nullptr == candidate) { + if (nullptr != node) + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Not implemented: ", node->OpType()); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Not implemented an internal tvm::Tensor: ", tensor->op->name); + } + + bool status = candidate->Evaluate(tensor, node, ctx_codegen, sched); + + if (enable_dump_schedule) { + std::ostringstream stream; + if (nullptr != node) { + stream << std::endl; + stream << "[CODEGEN_DUMP_SCHEDULE] " + << "Schedule Node: " << node->Name() << std::endl; + } else { + stream << std::endl; + } + + if (status) { + stream << "[CODEGEN_DUMP_SCHEDULE] " + << "Schedule tvm::Tesnor " + << tensor->op->name + << " with " + << candidate->Name() << std::endl; + } else { + stream << "[CODEGEN_DUMP_SCHEDULE] " + << "Schedule tvm::Tesnor " + << tensor->op->name + << " is suppressed " << std::endl; + } + + LOGS_DEFAULT(CODEGEN_SETTINGS_LOG_LEVEL) << stream.str(); + } + + return Status::OK(); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/scheduler/tvm_schedule_builder.h b/onnxruntime/core/codegen/passes/scheduler/tvm_schedule_builder.h new file mode 100644 index 0000000000000..9f0a1b3ef45c2 --- /dev/null +++ b/onnxruntime/core/codegen/passes/scheduler/tvm_schedule_builder.h @@ -0,0 +1,46 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include "core/codegen/passes/scheduler/tvm_scheduler.h" +#include "core/common/common.h" + +namespace onnxruntime { +namespace tvm_codegen { + +// TVMScheduleBuilder contains all applicable TVM scheduler passes. +// Scheduler passes are stored in multiple dispatchers +// that check different conditions of a tvm::Tensor. + +// If a tvm::Tensor satisfies more than one TVM scheduler passes, +// the first dispatched pass will be applied. 
+ +class TVMScheduleBuilder { + public: + // TODO: add more parameter in consructor to support different target + TVMScheduleBuilder(const std::string& name); + ~TVMScheduleBuilder() = default; + + void DumpAllSchedulers() const; + + Status Evaluate( + const tvm::Tensor& tensor, + const Node* node, + CodeGenContext& ctx, + ScheduleContext& sched); + + void InsertDispatcher(std::unique_ptr&& ptr); + void ClearDispatcher(); + + const std::string& Name() const; + + private: + std::vector> dispatchers_; + std::string name_; + + private: + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(TVMScheduleBuilder); +}; + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/scheduler/tvm_scheduler.cc b/onnxruntime/core/codegen/passes/scheduler/tvm_scheduler.cc new file mode 100644 index 0000000000000..071200a234e33 --- /dev/null +++ b/onnxruntime/core/codegen/passes/scheduler/tvm_scheduler.cc @@ -0,0 +1,79 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/passes/scheduler/tvm_scheduler.h" + +#include "core/codegen/common/common.h" +#include "core/codegen/common/dispatcher.h" +#include "core/codegen/passes/utils/codegen_context.h" + +namespace onnxruntime { +namespace codegen { +// explicit instantiation +template class CreatorBase; + +template class DispatcherBase; + +} // namespace codegen + +namespace tvm_codegen { + +static const std::string TMVOpRuleKey_Extern("TVMOpRule_Extern"); +static const std::string TMVOpRuleKey_ComputeReduce("TVMOpRule_ComputeReduce"); +static const std::string TMVOpRuleKey_ComputeRegular("TVMOpRule_ComputeRegular"); +static const std::string TMVOpRuleKey_AlwaysRoot("TMVOpRuleKey_AlwaysRoot"); +static const std::string TMVOpRuleKey_NoRule("TVMOpRule_NoRule"); + +const std::string& GetTVMOpRule(TVMOpRuleType rule) { + if (rule == TVMOpRuleType::Extern) { + return TMVOpRuleKey_Extern; + } else if (rule == TVMOpRuleType::ComputeReduce) { + return TMVOpRuleKey_ComputeReduce; + } else if (rule == TVMOpRuleType::AlwaysRoot) { + return TMVOpRuleKey_AlwaysRoot; + } + return TMVOpRuleKey_NoRule; +} + +const std::string& GetTVMOpRule(const tvm::Tensor& tensor) { + auto extern_op = tensor->op.as(); + + if (nullptr != extern_op) { + return TMVOpRuleKey_Extern; + } + + auto compute_op = tensor->op.as(); + if (nullptr != compute_op) { + if (compute_op->reduce_axis.size() > 0) { + return TMVOpRuleKey_ComputeReduce; + } + } + + return TMVOpRuleKey_NoRule; +} + +Scheduler* SCHEDULE_DISPATCHER_CLASS(OrtOpType):: + Find(const tvm::Tensor&, const Node* node, tvm_codegen::CodeGenContext&) { + if (nullptr == node) + return nullptr; + return DispatcherBase::Get(node->OpType()); +} + +Scheduler* SCHEDULE_DISPATCHER_CLASS(TVMOpRule):: + Find(const tvm::Tensor& tensor, const Node*, tvm_codegen::CodeGenContext&) { + return DispatcherBase::Get(GetTVMOpRule(tensor)); +} + +Scheduler* SCHEDULE_DISPATCHER_CLASS(OrtOpName):: + Find(const tvm::Tensor&, const Node* node, tvm_codegen::CodeGenContext&) { + if (nullptr == node) + return nullptr; + return DispatcherBase::Get(GetKey(node)); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/scheduler/tvm_scheduler.h b/onnxruntime/core/codegen/passes/scheduler/tvm_scheduler.h new file mode 100644 index 0000000000000..413e0fb504e89 --- /dev/null +++ b/onnxruntime/core/codegen/passes/scheduler/tvm_scheduler.h @@ -0,0 +1,128 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// Licensed under the MIT License. + +#pragma once +#include "core/common/common.h" +#include "core/codegen/common/creator.h" +#include "core/codegen/common/registry.h" +#include "core/codegen/passes/utils/codegen_context.h" +#include "core/graph/graph.h" +#include + +namespace onnxruntime { +namespace tvm_codegen { + +// These are current generic TVMOpRule we used. +enum class TVMOpRuleType : int { + Extern = 0, + ComputeReduce = 1, + ComputeRegular = 2, + AlwaysRoot = 3, // for debug + NoRule, +}; + +const std::string& GetTVMOpRule(const tvm::Tensor& tensor); +const std::string& GetTVMOpRule(TVMOpRuleType rule); + +// These are current generic ScheduleType in tvm_codegen +enum class ScheduleType : int { + ScheduleNone = 0, + ScheduleInline = 1, + ScheduleAt = 2, + ScheduleRoot = 3, + ScheduleClosure = 4, +}; + +// Data struct to bundle tvm::Schedule and scheduled tensor +struct ScheduleContext { + ScheduleContext(const tvm::Array& ops) { + schedule = tvm::create_schedule(ops); + } + tvm::Schedule schedule; + std::map scheduled_tensors; +}; + +// Scheduler inserts a tvm::Schedule content to a tvm::Tensor +using Scheduler = codegen::CreatorBase< + const tvm::Tensor&, + const Node*, + tvm_codegen::CodeGenContext&, + ScheduleContext&, + bool>; + +// TVMScheduleDispatcher is the base dispatcher for TVM Schedule Builder +// It checks whether a pair of {tvm::Tensor, Ort Node} satisfying a criteria (in Find) +// and dispatches a corresponding Scheduler. +class TVMScheduleDispatcher : public codegen::DispatcherBase { + public: + TVMScheduleDispatcher(const std::string& name) + : DispatcherBase(name) {} + + ~TVMScheduleDispatcher() = default; + + virtual Scheduler* Find(const tvm::Tensor&, + const Node*, + tvm_codegen::CodeGenContext&) = 0; + + private: + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(TVMScheduleDispatcher); +}; + +// Macro returns an Schedulers' dispatcher's name +#define SCHEDULE_DISPATCHER_CLASS(TYPE) \ + TVM##TYPE##Schedulers + +// Macro declares an Schedulers' dispatcher +#define DECLARE_SCHEDULE_DISPATCHER_CLASS(TYPE) \ + class SCHEDULE_DISPATCHER_CLASS(TYPE) : public tvm_codegen::TVMScheduleDispatcher { \ + public: \ + TVM##TYPE##Schedulers(const std::string& name) \ + : TVMScheduleDispatcher(name) {} \ + ~TVM##TYPE##Schedulers() = default; \ + tvm_codegen::Scheduler* Find(const tvm::Tensor&, \ + const Node*, \ + tvm_codegen::CodeGenContext&) override; \ + \ + private: \ + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(TVM##TYPE##Schedulers); \ + }; + +// Common dispatchers are listed here +// For a special pattern, it can be created later. 
+// One dispatcher is based on Ort OpType +DECLARE_SCHEDULE_DISPATCHER_CLASS(OrtOpType) +// One dispatcher is based on TVMOpRule +DECLARE_SCHEDULE_DISPATCHER_CLASS(TVMOpRule) +// One dispatcher is based Ort NodeArg name +DECLARE_SCHEDULE_DISPATCHER_CLASS(OrtOpName) + +// Scheduler Registry is a registry holds all Schedulers +using TVMScheduleRegistry = codegen::RegistryBase; + +// Macro declares TVM scheduler class +#define DECLARE_TVM_SCHEDULER_CLASS(OP, PRETFIX) \ + DECLARE_CREATOR_CLASS(OP, PRETFIX##Scheduler, \ + const tvm::Tensor&, \ + const Node*, \ + tvm_codegen::CodeGenContext&, \ + tvm_codegen::ScheduleContext&, \ + bool) + +// Macro returns TVM scheduler's name with prefix +#define TVM_SCHEDULER_CLASS(OP, PREFIX) \ + CREATOR_CLASS(OP, PREFIX##Scheduler) + +// Macro returns TVM scheduler's name as string +#define TVM_SCHEDULER_STRING(OP, PREFIX) \ + STRINGIZE(TVM_SCHEDULER_CLASS(OP, PREFIX)) + +// Macro returns TVM scheduler's name with prefix and arch +#define TVM_SCHEDULER_CLASS_EX(OP, PREFIX, ARCH) \ + CREATOR_CLASS(OP, PREFIX##ARCH##Scheduler) + +// Macro declares TVM scheduler class with prefix and arch +#define DECLARE_TVM_SCHEDULER_CLASS_EX(OP, PREFIX, ARCH) \ + DECLARE_TVM_SCHEDULER_CLASS(OP, PREFIX##ARCH) + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/utils/codegen_context.cc b/onnxruntime/core/codegen/passes/utils/codegen_context.cc new file mode 100644 index 0000000000000..2f1a59b4a92eb --- /dev/null +++ b/onnxruntime/core/codegen/passes/utils/codegen_context.cc @@ -0,0 +1,27 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/passes/utils/codegen_context.h" + +#include "core/codegen/common/common.h" + +namespace onnxruntime { +namespace tvm_codegen { + +CodeGenContext::CodeGenContext( + const codegen::CodeGenHandle* handle) + : handle_(handle), unname_symbol_counter_(0) {} + +tvm::Var CodeGenContext::GetOrCreateDynamicDim(const std::string& name) { + if (dynamic_dims_.count(name) == 0) + dynamic_dims_.emplace(name, tvm::Var(name)); + + return dynamic_dims_.at(name); +} + +std::string CodeGenContext::CreateUnnamedSymbol() { + return "unnamed_" + std::to_string(unname_symbol_counter_++); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/utils/codegen_context.h b/onnxruntime/core/codegen/passes/utils/codegen_context.h new file mode 100644 index 0000000000000..641552bd3b2e8 --- /dev/null +++ b/onnxruntime/core/codegen/passes/utils/codegen_context.h @@ -0,0 +1,44 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
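
GetOrCreateDynamicDim above creates a tvm::Var the first time a symbolic dimension name is seen and returns the cached one afterwards, so every shape that mentions the same dim_param shares a single variable. The same memoization pattern, sketched with plain strings standing in for tvm::Var:

```cpp
#include <iostream>
#include <string>
#include <unordered_map>

// Stand-in for the tvm::Var cache: create a symbol on first request, reuse it afterwards.
class DimCache {
 public:
  const std::string& GetOrCreate(const std::string& name) {
    auto it = dims_.find(name);
    if (it == dims_.end())
      it = dims_.emplace(name, "var_" + name).first;  // created exactly once per name
    return it->second;
  }

 private:
  std::unordered_map<std::string, std::string> dims_;
};

int main() {
  DimCache cache;
  // Both lookups return the same cached symbol, so shapes sharing "batch" stay linked.
  std::cout << (&cache.GetOrCreate("batch") == &cache.GetOrCreate("batch")) << '\n';  // 1
}
```
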
+ +#pragma once +#include "core/codegen/common/handle.h" +#include "core/codegen/common/common.h" +#include "core/common/common.h" +#include "core/framework/data_types.h" +#include + +namespace onnxruntime { +namespace tvm_codegen { + +// CodeGenContext is a data structure involving across passes +// Compiler developers can use it to store meta data +// to support fine-grained control of code generation +class CodeGenContext { + public: + CodeGenContext(const codegen::CodeGenHandle* handle); + + virtual ~CodeGenContext() = default; + + // returns tvm::Var for the dynamic dim + tvm::Var GetOrCreateDynamicDim(const std::string& name); + + const codegen::CodeGenHandle* GetCodeGenHandle() const { + return handle_; + } + + std::string CreateUnnamedSymbol(); + + protected: + std::unordered_map dynamic_dims_; + + const codegen::CodeGenHandle* handle_; + + int unname_symbol_counter_; +}; + +// Add Promote for CodeGenContext +DYNAMIC_PROMOTE(CodeGenContext) + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/utils/ort_tvm_utils.cc b/onnxruntime/core/codegen/passes/utils/ort_tvm_utils.cc new file mode 100644 index 0000000000000..f7906b71e1189 --- /dev/null +++ b/onnxruntime/core/codegen/passes/utils/ort_tvm_utils.cc @@ -0,0 +1,186 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/passes/utils/ort_tvm_utils.h" + +#include "core/codegen/common/profile.h" +#include "core/codegen/passes/utils/codegen_context.h" +#include "core/providers/common.h" +#include "gsl/gsl_util" + +#include + +namespace onnxruntime { +namespace tvm_codegen { + +#define RETURN_DLDATATYPE_IF_MATCH(type, type_code) \ + if (ml_type == DataTypeImpl::GetType()) { \ + return {type_code, sizeof(type) * 8, 1}; \ + } + +// DLDataType: {DLDataTypeCode, bits, lanes} +DLDataType ToTvmDLDataType(MLDataType ml_type) { + if (ml_type->IsTensorType()) + ml_type = static_cast(ml_type)->GetElementType(); + + RETURN_DLDATATYPE_IF_MATCH(int8_t, kDLInt); + RETURN_DLDATATYPE_IF_MATCH(uint8_t, kDLUInt); + RETURN_DLDATATYPE_IF_MATCH(int16_t, kDLInt); + RETURN_DLDATATYPE_IF_MATCH(uint16_t, kDLUInt); + RETURN_DLDATATYPE_IF_MATCH(int32_t, kDLInt); + RETURN_DLDATATYPE_IF_MATCH(uint32_t, kDLUInt); + RETURN_DLDATATYPE_IF_MATCH(int64_t, kDLInt); + RETURN_DLDATATYPE_IF_MATCH(uint64_t, kDLUInt); + RETURN_DLDATATYPE_IF_MATCH(bool, kDLUInt); + + RETURN_DLDATATYPE_IF_MATCH(float, kDLFloat); + RETURN_DLDATATYPE_IF_MATCH(double, kDLFloat); + RETURN_DLDATATYPE_IF_MATCH(MLFloat16, kDLFloat); + + ORT_NOT_IMPLEMENTED("converting MLDataType ", ml_type, " to tvm DLDataType is not implemented"); +} + +tvm::Type ToTvmType(ONNX_NAMESPACE::TensorProto_DataType proto_type) { + switch (proto_type) { + // Note that bool is uint1 in tvm, but uint8 in ONNX, so it always require special handling + //case ONNX_NAMESPACE::TensorProto_DataType_BOOL: + // return tvm::UInt(1); /*break;*/ + case ONNX_NAMESPACE::TensorProto_DataType_INT16: + return tvm::Int(16); /*break;*/ + case ONNX_NAMESPACE::TensorProto_DataType_INT32: + return tvm::Int(32); /*break;*/ + case ONNX_NAMESPACE::TensorProto_DataType_INT64: + return tvm::Int(64); /*break;*/ + case ONNX_NAMESPACE::TensorProto_DataType_UINT8: + return tvm::UInt(8); /*break;*/ + case ONNX_NAMESPACE::TensorProto_DataType_UINT16: + return tvm::UInt(16); /*break;*/ + case ONNX_NAMESPACE::TensorProto_DataType_UINT32: + return tvm::UInt(32); /*break;*/ + case ONNX_NAMESPACE::TensorProto_DataType_UINT64: + return 
tvm::UInt(64); /*break;*/ + case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: + return tvm::Float(32); /*break;*/ + case ONNX_NAMESPACE::TensorProto_DataType_DOUBLE: + return tvm::Float(64); /*break;*/ + case ONNX_NAMESPACE::TensorProto_DataType_INT8: + return tvm::Int(8); /*break;*/ + case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16: + return tvm::Float(16); /*break;*/ + case ONNX_NAMESPACE::TensorProto_DataType_STRING: + ORT_THROW("Casting to and from strings is not supported yet."); /*break;*/ + case ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED: + ORT_THROW("Cast op must have 'to' argument of type DataType"); /*break;*/ + default: + ORT_THROW("Unexpected 'to' argument value: ", proto_type); + } +} + +tvm::Array ShapeToTvmArray(const NodeArg* def, CodeGenContext& ctx) { + ORT_ENFORCE(nullptr != def); + const ONNX_NAMESPACE::TensorShapeProto* shape_proto = def->Shape(); + ORT_ENFORCE(nullptr != shape_proto); + + tvm::Array arr; + for (int i = 0; i < shape_proto->dim_size(); ++i) { + arr.push_back(ShapeDimToTvmDim(shape_proto->dim(i), ctx)); + } + return arr; +} + +tvm::Expr ShapeDimToTvmDim(const ONNX_NAMESPACE::TensorShapeProto_Dimension& dim, CodeGenContext& ctx) { + if (dim.has_dim_param()) { + return ctx.GetOrCreateDynamicDim(dim.dim_param()); + } else if (dim.has_dim_value()) { + return tvm::Expr(gsl::narrow_cast(dim.dim_value())); + } + return ctx.GetOrCreateDynamicDim(ctx.CreateUnnamedSymbol()); +} + +#ifdef CODEGEN_ENABLE_PROFILER +struct event_in_bracket_and_id { + bool in_bracket; + int id; +}; +std::unordered_map g_codegen_profiler_event_ids; +std::vector> g_codegen_profiler_events(1024); + +TVM_REGISTER_GLOBAL("tvm.contrib.onnxruntime.profile_event") + .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* ret) { + DLTensor* X = args[0]; + DLTensor* Y = args[1]; + int event_id = args[2]; + bool is_begin = args[3]; + if (!is_begin) { + DCHECK(event_id < g_codegen_profiler_event_ids.size()); + profiling::Profiler::Instance().EndTimeAndRecordEvent( + profiling::EventCategory::NODE_EVENT, + g_codegen_profiler_events[event_id].first, + g_codegen_profiler_events[event_id].second); + } + + { + CODEGEN_PROFILER_EVENT(profile_stub); + int64_t elem_count = 1; + for (int i = 0; i < X->ndim; ++i) { + elem_count *= X->shape[i]; + } + // there's overhead in this copy, so put begin after copy and end before copy + memcpy(static_cast(Y->data) + Y->byte_offset, + static_cast(X->data) + X->byte_offset, + elem_count * X->dtype.bits / 8); + } + + if (is_begin) { + DCHECK(g_codegen_profiler_events.size() > event_id); + DCHECK(!g_codegen_profiler_events[event_id].first.empty()); + DCHECK(g_codegen_profiler_event_ids[g_codegen_profiler_events[event_id].first].id == event_id); + g_codegen_profiler_events[event_id].second = + profiling::Profiler::Instance().StartTime(); + } + }); + +tvm::Tensor ProfileBegin(tvm::Tensor X, const std::string& event_name) { + int event_id; + if (g_codegen_profiler_event_ids.count(event_name) == 0) { + event_id = g_codegen_profiler_event_ids.size(); + ORT_ENFORCE(event_id < g_codegen_profiler_events.size()); + } else { + ORT_ENFORCE(!g_codegen_profiler_event_ids[event_name].in_bracket); + event_id = g_codegen_profiler_event_ids[event_name].id; + } + g_codegen_profiler_event_ids[event_name] = {true, event_id}; + g_codegen_profiler_events[event_id].first = event_name; + return topi::detail::make_extern( + {X->shape}, {X->dtype}, {X}, + [&](tvm::Array ins, tvm::Array outs) { + return topi::detail::call_packed({tvm::Expr("tvm.contrib.onnxruntime.profile_event"), + 
topi::detail::pack_buffer(ins[0]), + topi::detail::pack_buffer(outs[0]), + event_id, + true}); + }, + event_name + "_begin", "", {})[0]; +} + +tvm::Tensor ProfileEnd(tvm::Tensor X, const std::string& event_name) { + ORT_ENFORCE(g_codegen_profiler_event_ids.at(event_name).in_bracket); + g_codegen_profiler_event_ids.at(event_name).in_bracket = false; + int event_id = g_codegen_profiler_event_ids.at(event_name).id; + ORT_ENFORCE(event_id < g_codegen_profiler_events.size()); + ORT_ENFORCE(g_codegen_profiler_events[event_id].first == event_name); + return topi::detail::make_extern( + {X->shape}, {X->dtype}, {X}, + [&](tvm::Array ins, tvm::Array outs) { + return topi::detail::call_packed({tvm::Expr("tvm.contrib.onnxruntime.profile_event"), + topi::detail::pack_buffer(ins[0]), + topi::detail::pack_buffer(outs[0]), + event_id, + false}); + }, + event_name + "_end", "", {})[0]; +} +#endif + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/utils/ort_tvm_utils.h b/onnxruntime/core/codegen/passes/utils/ort_tvm_utils.h new file mode 100644 index 0000000000000..f13e91a2d5cea --- /dev/null +++ b/onnxruntime/core/codegen/passes/utils/ort_tvm_utils.h @@ -0,0 +1,31 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include "core/codegen/common/common.h" +#include "core/framework/data_types.h" +#include + +namespace onnxruntime { +namespace tvm_codegen { + +class CodeGenContext; + +// Helper function that converts a onnxruntime MLDataType to TVM DLDataType +DLDataType ToTvmDLDataType(MLDataType ml_type); + +tvm::Type ToTvmType(ONNX_NAMESPACE::TensorProto_DataType proto_type); + +tvm::Array ShapeToTvmArray(const NodeArg* def, CodeGenContext& ctx); + +tvm::Expr ShapeDimToTvmDim(const ONNX_NAMESPACE::TensorShapeProto_Dimension& dim, CodeGenContext& ctx); + +#ifdef CODEGEN_ENABLE_PROFILER +// Helper functions to inspect into lowered function +tvm::Tensor ProfileBegin(tvm::Tensor X, const std::string& event_name); + +tvm::Tensor ProfileEnd(tvm::Tensor X, const std::string& event_name); +#endif + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/weight_layout/transpose_2d.cc b/onnxruntime/core/codegen/passes/weight_layout/transpose_2d.cc new file mode 100644 index 0000000000000..df1767b81032a --- /dev/null +++ b/onnxruntime/core/codegen/passes/weight_layout/transpose_2d.cc @@ -0,0 +1,65 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
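
ToTvmDLDataType above maps an ORT element type to a DLPack-style descriptor of {type code, bits, lanes}, with the bit width taken from sizeof(T) * 8, so float becomes a 32-bit, single-lane kDLFloat. A standalone sketch of that mapping pattern; the enum and struct here are illustrative stand-ins, not the real DLPack definitions:

```cpp
#include <cstdint>
#include <iostream>

// Illustrative stand-ins for the DLPack type codes and descriptor.
enum TypeCode : uint8_t { kInt, kUInt, kFloat };
struct DataTypeDesc { TypeCode code; uint8_t bits; uint16_t lanes; };

// Same pattern as the RETURN_DLDATATYPE_IF_MATCH macro: bits come from sizeof(T) * 8.
template <typename T>
DataTypeDesc MakeDesc(TypeCode code) {
  return {code, static_cast<uint8_t>(sizeof(T) * 8), 1};
}

int main() {
  DataTypeDesc f = MakeDesc<float>(kFloat);
  DataTypeDesc i = MakeDesc<int64_t>(kInt);
  std::cout << int(f.bits) << ' ' << int(i.bits) << '\n';  // 32 64
}
```
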
+ +#include "core/codegen/passes/weight_layout/transpose_2d.h" + +#include "core/codegen/passes/utils/codegen_context.h" + +namespace onnxruntime { +namespace tvm_codegen { + +constexpr auto local_layout_name = "transpose_2d"; + +const std::string WeightLayoutTranspose2D::GetKey( + ONNX_NAMESPACE::TensorProto_DataType proto_type) { + return WeightLayout::GetKey(local_layout_name, proto_type, 2, 0.0f); +} + +WeightLayoutTranspose2D::WeightLayoutTranspose2D( + ONNX_NAMESPACE::TensorProto_DataType proto_type) + : WeightLayout(local_layout_name, proto_type, 2, 0.0f) {} + +CoordTransFunc WeightLayoutTranspose2D::ToActual(const tvm::Tensor& /*X*/) const { + return [&](const tvm::Array& nominal_coord) { + ORT_ENFORCE(nominal_coord.size() == 2); + const auto& y = nominal_coord[0]; + const auto& x = nominal_coord[1]; + return tvm::Array{ + x, + y}; + }; +} + +CoordTransFunc WeightLayoutTranspose2D::ToNominal(const tvm::Tensor& /*X*/) const { + return [&](const tvm::Array& actual_coord) { + ORT_ENFORCE(actual_coord.size() == 2); + const auto& y = actual_coord[0]; + const auto& x = actual_coord[1]; + return tvm::Array{ + x, + y}; + }; +} + +tvm::Array WeightLayoutTranspose2D::ToActualShape(const tvm::Tensor& X) const { + tvm::Array new_shape = { + X->shape[1], + X->shape[0]}; + return new_shape; +} + +std::vector WeightLayoutTranspose2D::ToActualShape(const Tensor* X) const { + ORT_ENFORCE(X != nullptr); + auto old_shape = X->Shape().GetDims(); + + ORT_ENFORCE(old_shape.size() == 2); + + std::vector new_shape = { + old_shape[1], + old_shape[0]}; + + return new_shape; +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/weight_layout/transpose_2d.h b/onnxruntime/core/codegen/passes/weight_layout/transpose_2d.h new file mode 100644 index 0000000000000..65babaaec8dac --- /dev/null +++ b/onnxruntime/core/codegen/passes/weight_layout/transpose_2d.h @@ -0,0 +1,33 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/codegen/passes/weight_layout/weight_layout.h" +#include + +namespace onnxruntime { +namespace tvm_codegen { + +// WeightLayoutTranspose2D for transposing a 2D weight +// [W, H] => [H, W] +class WeightLayoutTranspose2D : public WeightLayout { + public: + static const std::string GetKey(ONNX_NAMESPACE::TensorProto_DataType proto_type); + + public: + WeightLayoutTranspose2D(ONNX_NAMESPACE::TensorProto_DataType proto_type); + + ~WeightLayoutTranspose2D() = default; + + CoordTransFunc ToNominal(const tvm::Tensor& X) const override; + CoordTransFunc ToActual(const tvm::Tensor& X) const override; + tvm::Array ToActualShape(const tvm::Tensor& X) const override; + std::vector ToActualShape(const Tensor* X) const override; + + private: + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(WeightLayoutTranspose2D); +}; + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/weight_layout/vertical_stripes_2d.cc b/onnxruntime/core/codegen/passes/weight_layout/vertical_stripes_2d.cc new file mode 100644 index 0000000000000..b1ddb791a3b3d --- /dev/null +++ b/onnxruntime/core/codegen/passes/weight_layout/vertical_stripes_2d.cc @@ -0,0 +1,77 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/codegen/passes/weight_layout/vertical_stripes_2d.h" + +#include "core/codegen/passes/utils/codegen_context.h" + +namespace onnxruntime { +namespace tvm_codegen { + +constexpr auto local_name_prefix = "vertical_stripe_2d_"; + +const std::string WeightLayoutVerticalStripe2D::GetKey( + ONNX_NAMESPACE::TensorProto_DataType proto_type, + int stripe_width) { + return WeightLayout::GetKey( + local_name_prefix + std::to_string(stripe_width), + proto_type, 2, 0.0f); +} + +WeightLayoutVerticalStripe2D::WeightLayoutVerticalStripe2D( + ONNX_NAMESPACE::TensorProto_DataType proto_type, + int stripe_width) + : WeightLayout( + local_name_prefix + std::to_string(stripe_width), + proto_type, 2, 0.0f), + stripe_width_(stripe_width) { +} + +CoordTransFunc WeightLayoutVerticalStripe2D::ToActual(const tvm::Tensor& /*X*/) const { + return [&](const tvm::Array& nominal_coord) { + ORT_ENFORCE(nominal_coord.size() == 2); + const auto& y = nominal_coord[0]; + const auto& x = nominal_coord[1]; + return tvm::Array{ + x / stripe_width_, + y, + x % stripe_width_}; + }; +} + +CoordTransFunc WeightLayoutVerticalStripe2D::ToNominal(const tvm::Tensor& /*X*/) const { + return [&](const tvm::Array& actual_coord) { + ORT_ENFORCE(actual_coord.size() == 3); + const auto& z = actual_coord[0]; + const auto& y = actual_coord[1]; + const auto& x = actual_coord[2]; + return tvm::Array{ + y, + x + stripe_width_ * z}; + }; +} + +tvm::Array WeightLayoutVerticalStripe2D::ToActualShape(const tvm::Tensor& X) const { + tvm::Array new_shape = { + (X->shape[1] + stripe_width_ - 1) / stripe_width_, + X->shape[0], + stripe_width_}; + return new_shape; +} + +std::vector WeightLayoutVerticalStripe2D::ToActualShape(const Tensor* X) const { + ORT_ENFORCE(X != nullptr); + auto old_shape = X->Shape().GetDims(); + + ORT_ENFORCE(old_shape.size() == 2); + + std::vector new_shape = { + (old_shape[1] + stripe_width_ - 1) / stripe_width_, + old_shape[0], + stripe_width_}; + + return new_shape; +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/weight_layout/vertical_stripes_2d.h b/onnxruntime/core/codegen/passes/weight_layout/vertical_stripes_2d.h new file mode 100644 index 0000000000000..b9b65025dc014 --- /dev/null +++ b/onnxruntime/core/codegen/passes/weight_layout/vertical_stripes_2d.h @@ -0,0 +1,40 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
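
The vertical-stripe layout just defined splits the nominal column index x into a stripe index and an offset, so [W, H] becomes [ceil(H/stripe), W, stripe]; ToActual and ToNominal are inverses on in-range coordinates. A standalone round-trip check of that index arithmetic (plain integers instead of tvm::Expr):

```cpp
#include <array>
#include <cassert>
#include <cstdint>

// Nominal [y, x] -> actual [x / stripe, y, x % stripe], as in ToActual above.
std::array<int64_t, 3> ToActual(std::array<int64_t, 2> yx, int64_t stripe) {
  return {yx[1] / stripe, yx[0], yx[1] % stripe};
}

// Actual [z, y, x] -> nominal [y, x + stripe * z], as in ToNominal above.
std::array<int64_t, 2> ToNominal(std::array<int64_t, 3> zyx, int64_t stripe) {
  return {zyx[1], zyx[2] + stripe * zyx[0]};
}

int main() {
  const int64_t stripe = 8;
  for (int64_t y = 0; y < 3; ++y)
    for (int64_t x = 0; x < 20; ++x) {
      auto round_trip = ToNominal(ToActual({y, x}, stripe), stripe);
      assert(round_trip[0] == y && round_trip[1] == x);  // the two maps invert each other
    }
}
```
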
+ +#pragma once +#include "core/codegen/common/common.h" +#include "core/codegen/passes/weight_layout/weight_layout.h" +#include + +namespace onnxruntime { +namespace tvm_codegen { + +// WeightLayoutVerticalStripe2D for making a 2D weight to 3D, by tiling the lowest (verteical) dimension +// [W, H] => [H/stripe, W, stripe] +class WeightLayoutVerticalStripe2D : public WeightLayout { + public: + static const std::string GetKey( + ONNX_NAMESPACE::TensorProto_DataType proto_type, + int stripe_width); + + public: + WeightLayoutVerticalStripe2D( + ONNX_NAMESPACE::TensorProto_DataType proto_type, + int stripe_width); + + ~WeightLayoutVerticalStripe2D() = default; + + virtual CoordTransFunc ToNominal(const tvm::Tensor& X) const override; + virtual CoordTransFunc ToActual(const tvm::Tensor& X) const override; + tvm::Array ToActualShape(const tvm::Tensor& X) const override; + std::vector ToActualShape(const Tensor* X) const override; + + private: + int stripe_width_; + + private: + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(WeightLayoutVerticalStripe2D); +}; + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/weight_layout/weight_layout.cc b/onnxruntime/core/codegen/passes/weight_layout/weight_layout.cc new file mode 100644 index 0000000000000..0b8ae71030779 --- /dev/null +++ b/onnxruntime/core/codegen/passes/weight_layout/weight_layout.cc @@ -0,0 +1,92 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/codegen/passes/weight_layout/weight_layout.h" + +#include "core/codegen/common/common.h" +#include "core/codegen/common/utils.h" +#include "core/codegen/mti/mti_tvm_utils.h" +#include "core/codegen/passes/utils/ort_tvm_utils.h" + +namespace onnxruntime { +namespace tvm_codegen { + +static tvm::Tensor CreateTVMPlaceholder( + const std::string& name, + HalideIR::Type type, + int dim) { + tvm::Array shape; + if (dim > 0) { + for (int i = 0; i < dim; ++i) { + shape.push_back(tvm::Var(name + "_v" + std::to_string(i))); + } + } else { + shape.push_back(1); + } + return tvm::placeholder(shape, type, name + "_placeholder"); +} + +const std::string WeightLayout::GetKey( + const std::string& name, + ONNX_NAMESPACE::TensorProto_DataType proto_type, + int input_dim, + float pad_zero) { + std::string key = name; + key += "_type_" + std::to_string(static_cast(proto_type)); + key += "_dim_" + input_dim; + key += "_pad_zero_" + std::to_string(pad_zero); + key = NormalizeCppName(key); + return key; +} + +WeightLayout::WeightLayout( + const std::string& name, + ONNX_NAMESPACE::TensorProto_DataType proto_type, + int input_dim, + float pad_zero) + : name_(GetKey(name, proto_type, input_dim, pad_zero)), + proto_type_(proto_type), + input_dim_(input_dim), + pad_zero_(pad_zero) {} + +const std::string& WeightLayout::Name() const { + return name_; +} + +void WeightLayout::CreateLayoutMarshallingTVMOp(tvm::Array& inputs, + tvm::Array& outputs) const { + HalideIR::Type halide_type = ToTvmType(proto_type_); + + tvm::Tensor placeholder = CreateTVMPlaceholder(name_, halide_type, input_dim_); + inputs.push_back(placeholder); + + tvm::Array new_shape = ToActualShape(placeholder); + CoordTransFunc new_coord_to_old_coord_func = ToNominal(placeholder); + tvm::Expr pad_zero_expr = tvm::make_const(halide_type, pad_zero_); + + tvm::Tensor output = tvm::compute( + new_shape, + [&](const tvm::Array& output_coord) { + tvm::Array output_coord1; + for (const auto& coord : output_coord) + output_coord1.push_back(coord); + auto 
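
ToActualShape above rounds the column count up to a whole number of stripes, and the layout-marshalling op defined in weight_layout.cc below fills the out-of-range tail with pad_zero_. The ceiling division this relies on, in isolation:

```cpp
#include <cassert>
#include <cstdint>

// (h + stripe - 1) / stripe rounds up, so a partial final stripe still gets storage.
int64_t NumStripes(int64_t h, int64_t stripe) { return (h + stripe - 1) / stripe; }

int main() {
  assert(NumStripes(16, 8) == 2);  // exact fit
  assert(NumStripes(17, 8) == 3);  // one extra, partially padded stripe
}
```
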
input_coord = new_coord_to_old_coord_func(output_coord1); + ORT_ENFORCE(input_coord.size() == placeholder->shape.size()); + + if (input_coord.size() > 0) { + auto in_range = (input_coord[0] >= 0) && (input_coord[0] < placeholder->shape[0]); + for (size_t dim = 1; dim < input_coord.size(); ++dim) + in_range = in_range && (input_coord[dim] >= 0) && (input_coord[dim] < placeholder->shape[dim]); + + return tvm::ir::Select::make(in_range, placeholder(input_coord), pad_zero_expr); + } else { + // scalar + return placeholder(input_coord); + } + }); + + outputs.push_back(output); +} + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/weight_layout/weight_layout.h b/onnxruntime/core/codegen/passes/weight_layout/weight_layout.h new file mode 100644 index 0000000000000..bcd9b229b5a3d --- /dev/null +++ b/onnxruntime/core/codegen/passes/weight_layout/weight_layout.h @@ -0,0 +1,68 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include "core/codegen/common/common.h" +#include "core/codegen/common/registry.h" +#include "core/common/common.h" +#include "core/framework/tensor.h" +#include + +namespace onnxruntime { +namespace tvm_codegen { + +using CoordTransFunc = std::function(const tvm::Array&)>; + +// WeightLayout is data layout trasnformer for weight/initializer +class WeightLayout { + public: + // Static function to return unique string as a key + static const std::string GetKey( + const std::string& name, + ONNX_NAMESPACE::TensorProto_DataType proto_type, + int input_dim, + float pad_zero); + + public: + WeightLayout( + const std::string& name, + ONNX_NAMESPACE::TensorProto_DataType proto_type, + int input_dim, + float pad_zero); + + ~WeightLayout() = default; + + // Return a CoordTransFunc from actual (transformed) coordinate to normial (original) coordinate + virtual CoordTransFunc ToNominal(const tvm::Tensor& X) const = 0; + + // Return a CoordTransFunc from normial (original) coordinate to actual (transformed) coordinate + virtual CoordTransFunc ToActual(const tvm::Tensor& X) const = 0; + + // Return actual (transformed) shape in tvm::Array (tvm_codegen) + virtual tvm::Array ToActualShape(const tvm::Tensor& X) const = 0; + + // Return actual (transformed) shape in vector (ort) + virtual std::vector ToActualShape(const Tensor* X) const = 0; + + // Create Layout Marshalling op in outputs + void CreateLayoutMarshallingTVMOp(tvm::Array& inputs, + tvm::Array& outputs) const; + + // Layout name + const std::string& Name() const; + + protected: + std::string name_; + ONNX_NAMESPACE::TensorProto_DataType proto_type_; + int input_dim_; + float pad_zero_; + + private: + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(WeightLayout); +}; + +// Weight Layout Registry is a registry holds all WeightLayout +using WeightLayoutRegistry = codegen::RegistryBase; + +} // namespace tvm_codegen +} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/tvm/tvm_compiler.cc b/onnxruntime/core/codegen/tvm/tvm_compiler.cc deleted file mode 100644 index a3ae548f70363..0000000000000 --- a/onnxruntime/core/codegen/tvm/tvm_compiler.cc +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -#include -#include -#include "core/codegen/tvm/tvm_compiler.h" -namespace onnxruntime { - -TVMGraph::TensorDescriptor::TensorDescriptor(MLDataType type, onnxruntime::ProviderType execution_provider_type, tvm::Tensor tvm_tensor) : tvm_tensor_(tvm_tensor) { - if (execution_provider_type == onnxruntime::kCpuExecutionProvider) { - ctx_.device_type = DLDeviceType::kDLCPU; - ctx_.device_id = 0; - } else { - ORT_NOT_IMPLEMENTED("Non-cpu execution provider not supported on TVM now."); - } - - if (DataTypeImpl::GetTensorType() == type) { - dtype_.code = kDLFloat; - dtype_.bits = 64; - dtype_.lanes = 1; - } else { - ORT_NOT_IMPLEMENTED("Non-double type not supported on TVM now."); - } -} - -class IdGenerator { - public: - IdGenerator() {} - int GetNext() { - return cur_++; - } - - private: - int cur_{0}; -}; - -// This is a special compiler step for the test case that sum two 1-D tensors -static void Compile1DAddToTVM(const onnxruntime::Node& node, std::unordered_map& tvm_tensors, onnxruntime::ProviderType execution_provider_type, IdGenerator& generator) { - ORT_ENFORCE(node.OpType() == "Add"); - tvm::Array shape; - shape.push_back(tvm::var("n1")); - - tvm::Tensor t1; - tvm::Tensor t2; - auto it = tvm_tensors.find(node.InputDefs()[0]->Name()); - if (it == tvm_tensors.end()) { - tvm_tensors[node.InputDefs()[0]->Name()] = TVMGraph::TensorDescriptor( - DataTypeImpl::TypeFromProto(*node.InputDefs()[0]->TypeAsProto()), - execution_provider_type, - tvm::placeholder(shape, tvm::Float(64), "T" + std::to_string(generator.GetNext()))); - } - t1 = tvm_tensors[node.InputDefs()[0]->Name()].tvm_tensor_; - it = tvm_tensors.find(node.InputDefs()[1]->Name()); - if (it == tvm_tensors.end()) { - tvm_tensors[node.InputDefs()[1]->Name()] = TVMGraph::TensorDescriptor( - DataTypeImpl::TypeFromProto(*node.InputDefs()[1]->TypeAsProto()), - execution_provider_type, - tvm::placeholder(shape, tvm::Float(64), "T" + std::to_string(generator.GetNext()))); - } - t2 = tvm_tensors[node.InputDefs()[1]->Name()].tvm_tensor_; - - tvm_tensors[node.OutputDefs()[0]->Name()] = TVMGraph::TensorDescriptor( - DataTypeImpl::TypeFromProto(*node.InputDefs()[1]->TypeAsProto()), - execution_provider_type, - tvm::compute( - t1->shape, [&t1, &t2](tvm::Expr i) { - return t1[i] + t2[i]; - }, - "T" + std::to_string(generator.GetNext()))); -} - -TVMGraph CompileToTVM(const onnxruntime::Graph& graph, onnxruntime::ProviderType execution_provider_type) { - TVMGraph result; - std::unordered_map tvm_tensors; - IdGenerator generator; - for (auto& node : graph.Nodes()) { - Compile1DAddToTVM(node, tvm_tensors, execution_provider_type, generator); - } - - for (auto& input : graph.GetInputs()) { - result.inputs_.push_back(tvm_tensors[input->Name()]); - } - - // check initializer - for (auto& initializer : graph.GetAllInitializedTensors()) { - result.inputs_.push_back(tvm_tensors[initializer.first]); - } - - auto& output = graph.GetOutputs()[0]; - result.outputs_.push_back(tvm_tensors[output->Name()]); - return result; -} -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/tvm/tvm_compiler.h b/onnxruntime/core/codegen/tvm/tvm_compiler.h deleted file mode 100644 index e4eed0dc80d94..0000000000000 --- a/onnxruntime/core/codegen/tvm/tvm_compiler.h +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -#pragma once -#include -#include -#include "core/common/common.h" -#include "core/framework/data_types.h" -#include "core/graph/function.h" -#include "core/graph/constants.h" -#include "core/graph/graph_viewer.h" -namespace onnxruntime { - -//TODO: this is just initial design to represent TVM Graph, to make the basic test work. -//We may need to revisit it later to finialize it. -struct TVMGraph { - struct TensorDescriptor { - tvm::Tensor tvm_tensor_; - DLContext ctx_; - DLDataType dtype_; - - public: - TensorDescriptor(MLDataType type, onnxruntime::ProviderType execution_provider_type, tvm::Tensor tvm_tensor); - - TensorDescriptor() = default; - }; - std::vector inputs_; - std::vector outputs_; -}; - -//TODO: compile a onnxruntime graph to tvm's tensor expression is a common logic for all hardwares -//onnxruntime framework should provide this functionality to executionp providers. -//We will need to register how to compiler it for each node. A detail design is needed. -//Here for testing we just provide the functionality that compile add 1D tensors. -TVMGraph CompileToTVM(const onnxruntime::Graph& graph, onnxruntime::ProviderType execution_provider_type); -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/tvm/tvm_kernel.h b/onnxruntime/core/codegen/tvm/tvm_kernel.h deleted file mode 100644 index 59d997e02d2f1..0000000000000 --- a/onnxruntime/core/codegen/tvm/tvm_kernel.h +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include "core/codegen/tvm/tvm_compiler.h" -#include "core/graph/function.h" -#include "core/framework/op_kernel.h" - -namespace onnxruntime { - -// TVMScheduleCreator is the function that create a tvm schedule based on given TVM graph. -// Different hardware may have different schedule strategy. -typedef tvm::Schedule (*TVMScheduleCreator)(const TVMGraph&); -// TVMModuleBuilder is the function that build a tvm module, given a schedule and args. -// Different tvm kernel may chose different way to build the module, like target to LLVM or other backend. -typedef tvm::runtime::Module (*TVMModuleBuilder)(tvm::Schedule schedule, tvm::BuildConfig config, tvm::Array args, std::vector& target_func_names); - -template -class TVMKernel : public OpKernel { - public: - explicit TVMKernel(const OpKernelInfo& info) : OpKernel(info), tvm_values_(nullptr), dl_tensors_(nullptr), tvm_type_codes_(nullptr) { - auto& node = info.node(); - ORT_ENFORCE(node.NodeType() == Node::Type::Fused); - auto func = node.GetFunctionBody(); - const onnxruntime::Graph& func_body = func->Body(); - //1. compile the onnxruntime Graph to tvm graph. This step is common for all hardware, and provided by onnxruntime framework. - tvm_graph_ = CompileToTVM(func_body, node.GetExecutionProviderType()); - //2. create schedule for tvm graph, this step is depends on the execution provider/hardware. - auto s = S(tvm_graph_); - //3. Build module - std::vector tvm_args; - for (auto& t : tvm_graph_.inputs_) { - tvm_args.push_back(t.tvm_tensor_); - } - for (auto& t : tvm_graph_.outputs_) { - tvm_args.push_back(t.tvm_tensor_); - } - - std::vector func_names; - tvm_module_ = M(s, tvm::build_config(), tvm_args, func_names); - //TODO: do we have case that need more than 1 evaluation function? - evaluate_func_ = tvm_module_.GetFunction(func_names[0]); - //4. 
prepare args according to the type - n_args_ = tvm_args.size(); - tvm_values_ = new TVMValue[n_args_]; - tvm_type_codes_ = new int[n_args_]; - dl_tensors_ = new DLTensor[n_args_]; - int i = 0; - for (auto& tensor : tvm_graph_.inputs_) { - tvm_type_codes_[i] = kNDArrayContainer; - dl_tensors_[i].ctx = tensor.ctx_; - dl_tensors_[i].dtype = tensor.dtype_; - dl_tensors_[i].strides = nullptr; - dl_tensors_[i].byte_offset = 0; - tvm_values_[i].v_handle = &dl_tensors_[i]; - i++; - } - - for (auto& tensor : tvm_graph_.outputs_) { - tvm_type_codes_[i] = kNDArrayContainer; - dl_tensors_[i].ctx = tensor.ctx_; - dl_tensors_[i].dtype = tensor.dtype_; - dl_tensors_[i].strides = nullptr; - dl_tensors_[i].byte_offset = 0; - tvm_values_[i].v_handle = &dl_tensors_[i]; - i++; - } - ORT_ENFORCE(i == n_args_); - } - - virtual ~TVMKernel() { - if (!tvm_values_) - delete[] tvm_values_; - if (!tvm_type_codes_) - delete[] tvm_type_codes_; - if (!dl_tensors_) - delete[] dl_tensors_; - } - - virtual Status Compute(OpKernelContext* context) const override { - for (int i = 0; i < tvm_graph_.inputs_.size(); ++i) { - auto t = context->Input(i); - dl_tensors_[i].data = const_cast(t)->MutableDataRaw(); - dl_tensors_[i].ndim = static_cast(t->Shape().NumDimensions()); - dl_tensors_[i].shape = dl_tensors_[i].ndim > 0 ? const_cast(&(t->Shape().GetDims()[0])) : nullptr; - } - - int num_inputs = static_cast(tvm_graph_.inputs_.size()); - - for (int i = 0; i < tvm_graph_.outputs_.size(); ++i) { - //TODO: we need to have a shape inference function that could calculate output shape based on the symbolic formular in tvm - //We could build that function as part of tvm module, or reuse the shape inference in onnx funciton. - //Here for testing purpose, assume the output shape is same to input shape. - auto t = context->Output(i, GetOutputShape(context, i)); - dl_tensors_[num_inputs + i].data = t->MutableDataRaw(); - dl_tensors_[num_inputs + i].ndim = static_cast(t->Shape().NumDimensions()); - dl_tensors_[num_inputs + i].shape = dl_tensors_[i].ndim > 0 ? const_cast(&(t->Shape().GetDims()[0])) : nullptr; - } - - tvm::TVMArgs tvm_args(&tvm_values_[0], &tvm_type_codes_[0], static_cast(n_args_)); - tvm::TVMRetValue rvalue; - try { - evaluate_func_.CallPacked(tvm_args, &rvalue); - } catch (std::exception& ex) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "TVM run failed:", ex.what()); - } - if (rvalue.type_code() != kNull) { - return Status(onnxruntime::common::ONNXRUNTIME, onnxruntime::common::FAIL, "TVM return not null"); // TODO: get error code. - } else { - return Status::OK(); - } - } - - protected: - virtual const TensorShape& GetOutputShape(OpKernelContext* context, int i) const = 0; - - TVMGraph tvm_graph_; - tvm::runtime::Module tvm_module_; - tvm::PackedFunc evaluate_func_; - - size_t n_args_; - TVMValue* tvm_values_; - DLTensor* dl_tensors_; - int* tvm_type_codes_; -}; -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/tvm/tvm_utils.cc b/onnxruntime/core/codegen/tvm/tvm_utils.cc deleted file mode 100644 index d1980ca1d10b4..0000000000000 --- a/onnxruntime/core/codegen/tvm/tvm_utils.cc +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -#include "tvm_utils.h" - -namespace onnxruntime { -namespace tvm_codegen { - -#define RETURN_DLDATATYPE_IF_MATCH(type, type_code) \ - if (ml_type == DataTypeImpl::GetType()) { \ - return {type_code, sizeof(type) * 8, 1}; \ - } - -// DLDataType: {DLDataTypeCode, bits, lanes} -DLDataType ToTvmDLDataType(MLDataType ml_type) { - RETURN_DLDATATYPE_IF_MATCH(int8_t, kDLInt); - RETURN_DLDATATYPE_IF_MATCH(uint8_t, kDLInt); - RETURN_DLDATATYPE_IF_MATCH(int16_t, kDLInt); - RETURN_DLDATATYPE_IF_MATCH(uint16_t, kDLInt); - RETURN_DLDATATYPE_IF_MATCH(int32_t, kDLInt); - RETURN_DLDATATYPE_IF_MATCH(uint32_t, kDLInt); - RETURN_DLDATATYPE_IF_MATCH(int64_t, kDLInt); - RETURN_DLDATATYPE_IF_MATCH(uint64_t, kDLInt); - - RETURN_DLDATATYPE_IF_MATCH(float, kDLFloat); - RETURN_DLDATATYPE_IF_MATCH(double, kDLFloat); - - ORT_NOT_IMPLEMENTED("converting MLDataType ", ml_type, " to tvm DLDataType is not implemented"); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/tvm/tvm_utils.h b/onnxruntime/core/codegen/tvm/tvm_utils.h deleted file mode 100644 index abe35753a07e0..0000000000000 --- a/onnxruntime/core/codegen/tvm/tvm_utils.h +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include - -#include "core/framework/data_types.h" - -namespace onnxruntime { - -constexpr const char* TVM_STACKVM = "TvmStackVm"; - -namespace tvm_codegen { - // Helper function that converts a onnxruntime MLDataType to TVM DLDataType - DLDataType ToTvmDLDataType(MLDataType ml_type); -} // namespace tvm -} // namespace onnxruntime diff --git a/onnxruntime/core/common/logging/logging.cc b/onnxruntime/core/common/logging/logging.cc index 63b3bdc51b974..29a4b30532fc9 100644 --- a/onnxruntime/core/common/logging/logging.cc +++ b/onnxruntime/core/common/logging/logging.cc @@ -84,7 +84,7 @@ LoggingManager::LoggingManager(std::unique_ptr sink, Severity default_min default_filter_user_data_{filter_user_data}, default_max_vlog_level_{default_max_vlog_level}, owns_default_logger_{false} { - if (!sink_) { + if (sink_ == nullptr) { throw std::logic_error("ISink must be provided."); } @@ -126,7 +126,6 @@ LoggingManager::~LoggingManager() { void LoggingManager::CreateDefaultLogger(const std::string& logger_id) { // this method is only called from ctor in scope where DefaultLoggerMutex() is already locked - if (s_default_logger_ != nullptr) { throw std::logic_error("Default logger already set. "); } @@ -186,7 +185,8 @@ std::exception LoggingManager::LogFatalAndCreateException(const char* category, // create Capture in separate scope so it gets destructed (leading to log output) before we throw. 
{ ::onnxruntime::logging::Capture c{::onnxruntime::logging::LoggingManager::DefaultLogger(), - ::onnxruntime::logging::Severity::kFATAL, category, ::onnxruntime::logging::DataType::SYSTEM, location}; + ::onnxruntime::logging::Severity::kFATAL, category, + ::onnxruntime::logging::DataType::SYSTEM, location}; va_list args; va_start(args, format_str); diff --git a/onnxruntime/core/common/logging/sinks/ostream_sink.cc b/onnxruntime/core/common/logging/sinks/ostream_sink.cc index 1d689b25d8466..9d8833c067274 100644 --- a/onnxruntime/core/common/logging/sinks/ostream_sink.cc +++ b/onnxruntime/core/common/logging/sinks/ostream_sink.cc @@ -21,9 +21,9 @@ void OStreamSink::SendImpl(const Timestamp& timestamp, const std::string& logger std::ostringstream msg; msg << timestamp << " [" << message.SeverityPrefix() << ":" << message.Category() << ":" << logger_id << ", " - << message.Location().ToString() << "] " << message.Message(); + << message.Location().ToString() << "] " << message.Message() << "\n"; - (*stream_) << msg.str() << "\n"; + (*stream_) << msg.str(); if (flush_) { stream_->flush(); diff --git a/onnxruntime/core/common/profiler.cc b/onnxruntime/core/common/profiler.cc index 18c46a994f4d2..d8eb1b2354027 100644 --- a/onnxruntime/core/common/profiler.cc +++ b/onnxruntime/core/common/profiler.cc @@ -72,6 +72,11 @@ std::string Profiler::EndProfiling() { profile_with_logger_ = false; return std::string(); } + + if (session_logger_) { + LOGS(*session_logger_, INFO) << "Writing profiler data to file " << profile_stream_file_; + } + std::lock_guard lock(mutex_); profile_stream_ << "[\n"; diff --git a/onnxruntime/core/common/profiler.h b/onnxruntime/core/common/profiler.h index 3e0496282719c..48ecf5747467a 100644 --- a/onnxruntime/core/common/profiler.h +++ b/onnxruntime/core/common/profiler.h @@ -44,7 +44,10 @@ class Profiler { */ TimePoint StartTime() const; - bool FEnabled() const { + /* + Whether data collection and output from this profiler is enabled. 
+ */ + bool IsEnabled() const { return enabled_; } diff --git a/onnxruntime/core/framework/allocation_planner.cc b/onnxruntime/core/framework/allocation_planner.cc index 9ffcc9b5557b5..552d702b80e7c 100644 --- a/onnxruntime/core/framework/allocation_planner.cc +++ b/onnxruntime/core/framework/allocation_planner.cc @@ -51,6 +51,7 @@ std::ostream& operator<<(std::ostream& out, std::pair index_to_name; out << "Allocation Plan:\n"; + out << "(ort_value_idx) output_name : \n"; auto plan_size = plan.allocation_plan.size(); for (auto& name_index : session_state.GetOrtValueNameIdxMap()) { @@ -256,8 +257,11 @@ class PlannerImpl { const auto& val2 = shape2.dim(i); if (val1.has_dim_value() && val2.has_dim_value() && (val1.dim_value() == val2.dim_value())) continue; // same known dimension - if (val1.has_dim_param() && val2.has_dim_param() && (val1.dim_param() == val2.dim_param())) - continue; // same unknown dimension + if (val1.has_dim_param() && val2.has_dim_param()) { + const auto& val1_param = val1.dim_param(); + if (val1_param == val2.dim_param() && !val1_param.empty()) + continue; // same unknown dimension + } return false; } return true; diff --git a/onnxruntime/core/framework/allocator.cc b/onnxruntime/core/framework/allocator.cc index c136465b713cf..b8847a00801c3 100644 --- a/onnxruntime/core/framework/allocator.cc +++ b/onnxruntime/core/framework/allocator.cc @@ -3,6 +3,7 @@ #include "core/framework/allocator.h" #include "core/framework/allocatormgr.h" +#include "core/mlas/inc/mlas.h" #include #include @@ -11,15 +12,8 @@ namespace onnxruntime { void* CPUAllocator::Alloc(size_t size) { if (size <= 0) return nullptr; - //default align to 64; void* p; -#if defined(__AVX512F__) - size_t alignment = 64; -#elif defined(__AVX__) - size_t alignment = 32; -#else - size_t alignment = 32; //Indeed, the default one(8 or 16) should be enough -#endif + size_t alignment = MlasGetPreferredBufferAlignment(); #if _MSC_VER p = _aligned_malloc(size, alignment); if (p == nullptr) throw std::bad_alloc(); @@ -52,7 +46,15 @@ std::ostream& operator<<(std::ostream& out, const OrtAllocatorInfo& info) { ORT_API_STATUS_IMPL(OrtCreateAllocatorInfo, _In_ const char* name1, OrtAllocatorType type, int id1, OrtMemType mem_type1, _Out_ OrtAllocatorInfo** out) { - *out = new OrtAllocatorInfo(name1, type, id1, mem_type1); + if (strcmp(name1, onnxruntime::CPU) == 0) { + *out = new OrtAllocatorInfo(name1, type, OrtDevice(), id1, mem_type1); + } else if (strcmp(name1, onnxruntime::CUDA) == 0) { + *out = new OrtAllocatorInfo(name1, type, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, static_cast(id1)), id1, mem_type1); + } else if (strcmp(name1, onnxruntime::CUDA_PINNED) == 0) { + *out = new OrtAllocatorInfo(name1, type, OrtDevice(OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, static_cast(id1)), id1, mem_type1); + } else { + return OrtCreateStatus(ORT_INVALID_ARGUMENT, "Specified device is not supported."); + } return nullptr; } diff --git a/onnxruntime/core/framework/arena.h b/onnxruntime/core/framework/arena.h index a4df32fe95bb0..24872e5026958 100644 --- a/onnxruntime/core/framework/arena.h +++ b/onnxruntime/core/framework/arena.h @@ -37,7 +37,7 @@ class DummyArena : public IArenaAllocator { public: explicit DummyArena(std::unique_ptr resource_allocator) : allocator_(std::move(resource_allocator)), - info_(allocator_->Info().name, OrtAllocatorType::OrtArenaAllocator, allocator_->Info().id) { + info_(allocator_->Info().name, OrtAllocatorType::OrtArenaAllocator, allocator_->Info().device, allocator_->Info().id) { } 
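The allocation-planner tweak above stops treating two symbolic dimensions as equal when their dim_param strings are empty, so a buffer is only shared when the symbolic names genuinely match. A minimal standalone sketch of that comparison, with a simplified `Dim` struct standing in for the ONNX dimension proto (names are illustrative, not the real API):

```
#include <cassert>
#include <string>
#include <vector>

// Simplified stand-in for one dimension of an ONNX shape: either a concrete
// value (>= 0) or a symbolic name; both may be absent/empty.
struct Dim {
  long long value;      // < 0 means "no dim_value"
  std::string param;    // empty means "no dim_param"
  bool has_value() const { return value >= 0; }
};

// Shapes count as identical only if every dimension matches either by concrete
// value or by a *non-empty* symbolic name. Two unnamed symbolic dimensions are
// NOT assumed equal, so their buffers cannot be reused for each other.
bool SameShape(const std::vector<Dim>& a, const std::vector<Dim>& b) {
  if (a.size() != b.size()) return false;
  for (size_t i = 0; i < a.size(); ++i) {
    if (a[i].has_value() && b[i].has_value() && a[i].value == b[i].value) continue;
    if (!a[i].has_value() && !b[i].has_value() &&
        !a[i].param.empty() && a[i].param == b[i].param) continue;
    return false;
  }
  return true;
}

int main() {
  std::vector<Dim> x = {{-1, "batch"}, {128, ""}};
  std::vector<Dim> y = {{-1, "batch"}, {128, ""}};
  std::vector<Dim> z = {{-1, ""}, {128, ""}};  // unnamed symbolic dimension

  assert(SameShape(x, y));    // same named symbolic dimension -> reusable
  assert(!SameShape(x, z));   // blank name does not match a named dimension
  assert(!SameShape(z, z));   // two blank names are not treated as the same size
  return 0;
}
```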
~DummyArena() override = default; diff --git a/onnxruntime/core/framework/bfc_arena.cc b/onnxruntime/core/framework/bfc_arena.cc index eecdac5a6e83c..a795d28b4c0fc 100644 --- a/onnxruntime/core/framework/bfc_arena.cc +++ b/onnxruntime/core/framework/bfc_arena.cc @@ -9,7 +9,7 @@ BFCArena::BFCArena(std::unique_ptr resource_allocator, : device_allocator_(std::move(resource_allocator)), free_chunks_list_(kInvalidChunkHandle), next_allocation_id_(1), - info_(device_allocator_->Info().name, OrtAllocatorType::OrtArenaAllocator, device_allocator_->Info().id, device_allocator_->Info().mem_type) { + info_(device_allocator_->Info().name, OrtAllocatorType::OrtArenaAllocator, device_allocator_->Info().device, device_allocator_->Info().id, device_allocator_->Info().mem_type) { curr_region_allocation_bytes_ = RoundedBytes(std::min(total_memory, size_t{1048576})); // Allocate the requested amount of memory. diff --git a/onnxruntime/core/framework/data_transfer.cc b/onnxruntime/core/framework/data_transfer.cc new file mode 100644 index 0000000000000..465655f03d6ec --- /dev/null +++ b/onnxruntime/core/framework/data_transfer.cc @@ -0,0 +1,29 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/framework/data_transfer.h" + +namespace onnxruntime { + +common::Status IDataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const { + return CopyTensor(src, dst, 0); +} + +bool CPUDataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const { + return src_device.Type() == OrtDevice::CPU && dst_device.Type() == OrtDevice::CPU; +} + +common::Status CPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int /*exec_queue_id*/) const { + const void* src_data = src.DataRaw(); + void* dst_data = dst.MutableDataRaw(); + if (src_data == dst_data) { + // no need copying as both pointers are referring to same piece of memory. + return Status::OK(); + } + // Copying only happens between two same size tensors. + ORT_ENFORCE(src.SizeInBytes() == dst.SizeInBytes()); + memcpy(dst_data, src_data, src.SizeInBytes()); + return Status::OK(); +} + +}; // namespace onnxruntime diff --git a/onnxruntime/core/framework/data_transfer.h b/onnxruntime/core/framework/data_transfer.h new file mode 100644 index 0000000000000..1e707b196babc --- /dev/null +++ b/onnxruntime/core/framework/data_transfer.h @@ -0,0 +1,28 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/common/status.h" +#include "core/framework/tensor.h" + +namespace onnxruntime { + +// Data transfer interface. 
+class IDataTransfer { + public: + virtual ~IDataTransfer() = default; + + virtual bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const = 0; + + virtual common::Status CopyTensor(const Tensor& src, Tensor& dst) const; + virtual common::Status CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const = 0; +}; + +class CPUDataTransfer : public IDataTransfer { + public: + CPUDataTransfer() = default; + bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override; + common::Status CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const override; +}; +} // namespace onnxruntime diff --git a/onnxruntime/core/framework/data_transfer_manager.cc b/onnxruntime/core/framework/data_transfer_manager.cc new file mode 100644 index 0000000000000..3f07ac7559f00 --- /dev/null +++ b/onnxruntime/core/framework/data_transfer_manager.cc @@ -0,0 +1,54 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/framework/data_transfer_manager.h" + +namespace onnxruntime { +using namespace common; + +Status DataTransferManager::RegisterDataTransfer(std::unique_ptr data_transfer) { + if (nullptr == data_transfer) { + return Status(ONNXRUNTIME, INVALID_ARGUMENT, "data_transfer registered is nullptr."); + } + datatransfers_.push_back(std::move(data_transfer)); + return Status::OK(); +} + +const IDataTransfer* DataTransferManager::GetDataTransfer(const OrtDevice& src_device, const OrtDevice& dst_device) const { + for (auto& data_transfer : datatransfers_) { + if (!data_transfer->CanCopy(src_device, dst_device)) { + continue; + } + + return data_transfer.get(); + } + return nullptr; +} + + +Status DataTransferManager::CopyTensor(const Tensor& src, Tensor& dst) const { + return CopyTensor(src, dst, 0); +} + +Status DataTransferManager::CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const { + if (src.Shape().Size() != dst.Shape().Size()) { + return Status(ONNXRUNTIME, FAIL, "Tensor size mismatch"); + } + + for (auto& data_transfer : datatransfers_) { + if (!data_transfer->CanCopy(src.Location().device, dst.Location().device)) { + continue; + } + + return data_transfer->CopyTensor(src, dst, exec_queue_id); + } + + return ORT_MAKE_STATUS(ONNXRUNTIME, + FAIL, + "There's no data transfer registered for copying tensors from ", + src.Location().device.ToString(), + " to ", + dst.Location().device.ToString()); +} + +} // namespace onnxruntime diff --git a/onnxruntime/core/framework/data_transfer_manager.h b/onnxruntime/core/framework/data_transfer_manager.h new file mode 100644 index 0000000000000..35c85a0ba16c5 --- /dev/null +++ b/onnxruntime/core/framework/data_transfer_manager.h @@ -0,0 +1,32 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/common/status.h" +#include "core/framework/data_transfer.h" +#include "core/framework/tensor.h" + +namespace onnxruntime { + +// Data transfer manager, which has all functions registered to copy tensors with different location. +// It's not thread-safe. 
+class DataTransferManager { + public: + DataTransferManager() = default; + //static DataTransferManager& Instance(); + + common::Status RegisterDataTransfer(std::unique_ptr data_transfer); + + const IDataTransfer* GetDataTransfer(const OrtDevice& src_device, const OrtDevice& dst_device) const; + + common::Status CopyTensor(const Tensor& src, Tensor& dst) const; + common::Status CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const; + + private: + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(DataTransferManager); + + // It's assumed that data transfers in this array have no overlap in terms of copying functionality. + std::vector> datatransfers_; +}; +} // namespace onnxruntime diff --git a/onnxruntime/core/framework/execution_frame.cc b/onnxruntime/core/framework/execution_frame.cc index b0831b465e80e..c44bb3e0497a3 100644 --- a/onnxruntime/core/framework/execution_frame.cc +++ b/onnxruntime/core/framework/execution_frame.cc @@ -24,9 +24,9 @@ IExecutionFrame::IExecutionFrame(const std::vector& feed_mlvalue_idxs, cons const OrtValueNameIdxMap& ort_value_idx_map, const NodeIndexInfo& node_index_info) : node_index_info_{node_index_info}, fetch_mlvalue_idxs_{fetch_mlvalue_idxs} { ORT_ENFORCE(feeds.size() == feed_mlvalue_idxs.size()); - ORT_ENFORCE(fetches.empty() || fetches.size() == fetch_mlvalue_idxs.size()); + ORT_ENFORCE(fetches.empty() || fetches.size() == fetch_mlvalue_idxs_.size()); - Init(feed_mlvalue_idxs, feeds, initializers, fetch_mlvalue_idxs, fetches, ort_value_idx_map); + Init(feed_mlvalue_idxs, feeds, initializers, fetches, ort_value_idx_map); } IExecutionFrame::~IExecutionFrame() = default; @@ -104,17 +104,17 @@ int IExecutionFrame::GetNodeIdxToMLValueIdx(int index) const { void IExecutionFrame::Init(const std::vector& feed_mlvalue_idxs, const std::vector& feeds, const std::unordered_map& initializers, - const std::vector& fetch_mlvalue_idxs, const std::vector& fetches, + const std::vector& fetches, const OrtValueNameIdxMap& ort_value_idx_map) { // 1. resize the all_value_ vector all_values_.resize(ort_value_idx_map.MaxIdx() + 1); // 2. Handle non-empty output vector if (!fetches.empty()) { - auto num_fetches = fetch_mlvalue_idxs.size(); + auto num_fetches = fetch_mlvalue_idxs_.size(); for (size_t idx = 0; idx < num_fetches; ++idx) { - int ort_value_idx = fetch_mlvalue_idxs[idx]; + int ort_value_idx = fetch_mlvalue_idxs_[idx]; all_values_[ort_value_idx] = fetches[idx]; } } @@ -189,7 +189,7 @@ ExecutionFrame::ExecutionFrame(const std::vector& feed_mlvalue_idxs, const // and we have execution plan generated, try to setup // memory pattern optimization. if (session_state.GetEnableMemoryPattern() && session_state.GetExecutionPlan()) { - std::vector input_shapes; + std::vector> input_shapes; bool all_tensors = true; // Reserve mem to avoid re-allocation. input_shapes.reserve(feeds.size()); @@ -199,7 +199,7 @@ ExecutionFrame::ExecutionFrame(const std::vector& feed_mlvalue_idxs, const break; } auto& tensor = feed.Get(); - input_shapes.push_back(tensor.Shape()); + input_shapes.push_back(std::cref(tensor.Shape())); } //if there are some traditional ml value type in inputs disable the memory pattern optimization. @@ -316,6 +316,27 @@ Status ExecutionFrame::AllocateMLValueTensorPreAllocateBuffer(OrtValue& ort_valu OrtValue& ort_value_reuse = GetMutableMLValue(ort_value_index_reuse); auto* reuse_tensor = ort_value_reuse.GetMutable(); + auto buffer_num_elements = reuse_tensor->Shape().Size(); + auto required_num_elements = shape.Size(); + + // check number of elements matches. 
shape may not be an exact match (e.g. Reshape op) + if (buffer_num_elements != required_num_elements) { + // could be an allocation planner bug (less likely) or the model incorrectly uses something like 'None' + // as a dim_param, or -1 in dim_value in multiple places making the planner think those shapes are equal. + auto message = onnxruntime::MakeString( + "Shape mismatch attempting to re-use buffer. ", + reuse_tensor->Shape(), " != ", shape, + ". Validate usage of dim_value (values should be > 0) and " + "dim_param (all values with the same string should equate to the same size) in shapes in the model."); + + // be generous and use the buffer if it's large enough. log a warning though as it indicates a bad model + if (buffer_num_elements >= required_num_elements) { + LOGS_DEFAULT(WARNING) << message; + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, message); + } + } + void* reuse_buffer = reuse_tensor->MutableDataRaw(); // create fence on reused ort_value if needed diff --git a/onnxruntime/core/framework/execution_frame.h b/onnxruntime/core/framework/execution_frame.h index a46e62aedc487..c99979edb7eba 100644 --- a/onnxruntime/core/framework/execution_frame.h +++ b/onnxruntime/core/framework/execution_frame.h @@ -73,7 +73,7 @@ class IExecutionFrame { ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(IExecutionFrame); void Init(const std::vector& feed_mlvalue_idxs, const std::vector& feeds, - const std::unordered_map& initializers, const std::vector& fetch_mlvalue_idxs, + const std::unordered_map& initializers, const std::vector& fetches, const OrtValueNameIdxMap& ort_value_idx_map); const OrtValue& GetMLValue(int ort_value_index) const { diff --git a/onnxruntime/core/framework/execution_provider.cc b/onnxruntime/core/framework/execution_provider.cc index 02d894967543d..c86f3e225e4e5 100644 --- a/onnxruntime/core/framework/execution_provider.cc +++ b/onnxruntime/core/framework/execution_provider.cc @@ -43,14 +43,6 @@ IExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, return result; } -common::Status IExecutionProvider::CopyTensor(const Tensor& src, - Tensor& dst, - int exec_queue_id) const { - // execution provider may override this to support different exec queues - ORT_ENFORCE(exec_queue_id == 0); - return CopyTensor(src, dst); -} - common::Status IExecutionProvider::Sync() const { return Status::OK(); }; common::Status IExecutionProvider::OnRunStart() { return Status::OK(); } diff --git a/onnxruntime/core/framework/feeds_fetches_manager.cc b/onnxruntime/core/framework/feeds_fetches_manager.cc index cedc0eefb1e98..7371469297848 100644 --- a/onnxruntime/core/framework/feeds_fetches_manager.cc +++ b/onnxruntime/core/framework/feeds_fetches_manager.cc @@ -43,9 +43,7 @@ Status FeedsFetchesManager::Create(const std::vector& feed_names, const std::vector& output_names, const OrtValueNameIdxMap& ort_value_name_idx_map, std::unique_ptr& feed_fetch_manager) { - FeedsFetchesInfo info; - info.feed_names = feed_names; - info.output_names = output_names; + FeedsFetchesInfo info{feed_names, output_names}; ORT_RETURN_IF_ERROR(info.SetMLValueIdxs(ort_value_name_idx_map)); diff --git a/onnxruntime/core/framework/graph_partitioner.cc b/onnxruntime/core/framework/graph_partitioner.cc index d4d250027cb1c..5b0cba6c3b0d8 100644 --- a/onnxruntime/core/framework/graph_partitioner.cc +++ b/onnxruntime/core/framework/graph_partitioner.cc @@ -176,7 +176,7 @@ Status GraphPartitioner::Partition(Graph& graph, bool export_dll, FuncManager& f //prepare the func kernel KernelDefBuilder builder; 
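Tensor copying now flows through the IDataTransfer/DataTransferManager pair added above rather than the removed IExecutionProvider::CopyTensor. The sketch below mirrors that register-then-dispatch pattern with simplified stand-in types (`Device`, `Buffer` and the exception-based error handling are placeholders, not the real ORT types or Status plumbing): the manager walks its registered transfers and uses the first one whose CanCopy accepts the source/destination device pair.

```
#include <cstring>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <vector>

enum class Device { CPU, GPU };

struct Buffer {            // stand-in for a Tensor: raw bytes plus a device tag
  Device device;
  std::vector<unsigned char> bytes;
};

class IDataTransfer {
 public:
  virtual ~IDataTransfer() = default;
  virtual bool CanCopy(Device src, Device dst) const = 0;
  virtual void CopyTensor(const Buffer& src, Buffer& dst) const = 0;
};

class CpuDataTransfer : public IDataTransfer {
 public:
  bool CanCopy(Device src, Device dst) const override {
    return src == Device::CPU && dst == Device::CPU;
  }
  void CopyTensor(const Buffer& src, Buffer& dst) const override {
    if (src.bytes.data() == dst.bytes.data()) return;  // same memory, nothing to copy
    if (src.bytes.size() != dst.bytes.size()) throw std::runtime_error("size mismatch");
    std::memcpy(dst.bytes.data(), src.bytes.data(), src.bytes.size());
  }
};

class DataTransferManager {
 public:
  void RegisterDataTransfer(std::unique_ptr<IDataTransfer> t) { transfers_.push_back(std::move(t)); }

  // Dispatch to the first registered transfer that can handle the device pair.
  void CopyTensor(const Buffer& src, Buffer& dst) const {
    for (const auto& t : transfers_)
      if (t->CanCopy(src.device, dst.device)) return t->CopyTensor(src, dst);
    throw std::runtime_error("no data transfer registered for this device pair");
  }

 private:
  std::vector<std::unique_ptr<IDataTransfer>> transfers_;
};

int main() {
  DataTransferManager mgr;
  mgr.RegisterDataTransfer(std::make_unique<CpuDataTransfer>());

  Buffer src{Device::CPU, {1, 2, 3, 4}};
  Buffer dst{Device::CPU, std::vector<unsigned char>(4)};
  mgr.CopyTensor(src, dst);
  std::cout << "copied byte[2] = " << static_cast<int>(dst.bytes[2]) << "\n";  // prints 3
  return 0;
}
```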
BuildFusedKernelDef(builder, *node); - if (node->GetExecutionProviderType() == onnxruntime::kTensorrtExecutionProvider || node->GetExecutionProviderType() == onnxruntime::kNGraphExecutionProvider) { + if (node->GetExecutionProviderType() == onnxruntime::kTensorrtExecutionProvider || node->GetExecutionProviderType() == onnxruntime::kNGraphExecutionProvider || node->GetExecutionProviderType() == onnxruntime::kNnapiExecutionProvider) { builder.SetDefaultInputsMemoryType(OrtMemTypeCPUInput); builder.SetDefaultOutputMemoryType(OrtMemTypeCPUOutput); } diff --git a/onnxruntime/core/framework/kernel_registry.cc b/onnxruntime/core/framework/kernel_registry.cc index 4dacf10346570..cfc545d367c26 100644 --- a/onnxruntime/core/framework/kernel_registry.cc +++ b/onnxruntime/core/framework/kernel_registry.cc @@ -240,9 +240,12 @@ Status KernelRegistry::Register(KernelCreateInfo&& create_info) { return Status::OK(); } -Status KernelRegistry::TryCreateKernel(const onnxruntime::Node& node, const IExecutionProvider& execution_provider, - const std::unordered_map& initialized_tensors, - const OrtValueNameIdxMap& ort_value_name_idx_map, const FuncManager& funcs_mgr, +Status KernelRegistry::TryCreateKernel(const onnxruntime::Node& node, + const IExecutionProvider& execution_provider, + const std::unordered_map& constant_initialized_tensors, + const OrtValueNameIdxMap& ort_value_name_idx_map, + const FuncManager& funcs_mgr, + const DataTransferManager& data_transfer_mgr, /*out*/ std::unique_ptr& op_kernel) const { const KernelCreateInfo* kernel_create_info = TryFindKernel(node, execution_provider.Type()); @@ -250,8 +253,13 @@ Status KernelRegistry::TryCreateKernel(const onnxruntime::Node& node, const IExe return Status(ONNXRUNTIME, FAIL, "Failed to find kernel for " + node.OpType()); } - OpKernelInfo kernel_info(node, *kernel_create_info->kernel_def, execution_provider, initialized_tensors, - ort_value_name_idx_map, funcs_mgr); + OpKernelInfo kernel_info(node, + *kernel_create_info->kernel_def, + execution_provider, + constant_initialized_tensors, + ort_value_name_idx_map, + funcs_mgr, + data_transfer_mgr); op_kernel.reset(kernel_create_info->kernel_create_func(kernel_info)); return Status::OK(); } diff --git a/onnxruntime/core/framework/kernel_registry_manager.cc b/onnxruntime/core/framework/kernel_registry_manager.cc index c254273342a58..5fe803b368022 100644 --- a/onnxruntime/core/framework/kernel_registry_manager.cc +++ b/onnxruntime/core/framework/kernel_registry_manager.cc @@ -22,8 +22,8 @@ Status KernelRegistryManager::CreateKernel(const onnxruntime::Node& node, Status status; { for (auto& registry : custom_kernel_registries_) { - status = registry->TryCreateKernel(node, execution_provider, session_state.GetInitializedTensors(), - session_state.GetOrtValueNameIdxMap(), session_state.GetFuncMgr(), op_kernel); + status = registry->TryCreateKernel(node, execution_provider, session_state.GetConstantInitializedTensors(), + session_state.GetOrtValueNameIdxMap(), session_state.GetFuncMgr(), session_state.GetDataTransferMgr(), op_kernel); if (status.IsOK()) { return status; } @@ -34,8 +34,8 @@ Status KernelRegistryManager::CreateKernel(const onnxruntime::Node& node, auto iter = provider_type_to_registry_.find(ptype); if (iter != provider_type_to_registry_.end()) p = iter->second.get(); if (p != nullptr) { - status = p->TryCreateKernel(node, execution_provider, session_state.GetInitializedTensors(), - session_state.GetOrtValueNameIdxMap(), session_state.GetFuncMgr(), op_kernel); + status = p->TryCreateKernel(node, 
execution_provider, session_state.GetConstantInitializedTensors(), + session_state.GetOrtValueNameIdxMap(), session_state.GetFuncMgr(), session_state.GetDataTransferMgr(), op_kernel); if (status.IsOK()) { return status; } diff --git a/onnxruntime/core/framework/memcpy.cc b/onnxruntime/core/framework/memcpy.cc index a7847c2f0746f..3c63cf233d042 100644 --- a/onnxruntime/core/framework/memcpy.cc +++ b/onnxruntime/core/framework/memcpy.cc @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include "core/framework/data_transfer_manager.h" #include "memcpy.h" using namespace ONNX_NAMESPACE; namespace onnxruntime { @@ -13,7 +14,7 @@ Memcpy::Memcpy(const OpKernelInfo& info) Status Memcpy::Compute(OpKernelContext* ctx) const { const auto* X = ctx->Input(0); Tensor* Y = ctx->Output(0, X->Shape()); - Status retval = provider_->CopyTensor(*X, *Y, Info().GetKernelDef().ExecQueueId()); + Status retval = Info().GetDataTransferManager().CopyTensor(*X, *Y, Info().GetKernelDef().ExecQueueId()); return retval; } diff --git a/onnxruntime/core/framework/onnxruntime_typeinfo.cc b/onnxruntime/core/framework/onnxruntime_typeinfo.cc index fcb6c143e5397..4f00ec89dde4a 100644 --- a/onnxruntime/core/framework/onnxruntime_typeinfo.cc +++ b/onnxruntime/core/framework/onnxruntime_typeinfo.cc @@ -21,12 +21,12 @@ OrtTypeInfo::~OrtTypeInfo() { OrtReleaseTensorTypeAndShapeInfo(data); } -ORT_API_STATUS_IMPL(OrtOnnxTypeFromTypeInfo, _In_ const struct OrtTypeInfo* input, ONNXType* out) { +ORT_API_STATUS_IMPL(OrtGetOnnxTypeFromTypeInfo, _In_ const struct OrtTypeInfo* input, ONNXType* out) { *out = input->type; return nullptr; } -ORT_API_STATUS_IMPL(OrtCastTypeInfoToTensorInfo, _In_ struct OrtTypeInfo* input, const struct OrtTensorTypeAndShapeInfo** out) { +ORT_API_STATUS_IMPL(OrtCastTypeInfoToTensorInfo, _In_ const struct OrtTypeInfo* input, const struct OrtTensorTypeAndShapeInfo** out) { *out = input->type == ONNX_TYPE_TENSOR ? 
input->data : nullptr; return nullptr; } diff --git a/onnxruntime/core/framework/op_kernel_info.cc b/onnxruntime/core/framework/op_kernel_info.cc index ec2c990ad1398..bb3c101e365cf 100644 --- a/onnxruntime/core/framework/op_kernel_info.cc +++ b/onnxruntime/core/framework/op_kernel_info.cc @@ -8,22 +8,26 @@ namespace onnxruntime { -OpKernelInfo::OpKernelInfo(const onnxruntime::Node& node, const KernelDef& kernel_def, +OpKernelInfo::OpKernelInfo(const onnxruntime::Node& node, + const KernelDef& kernel_def, const IExecutionProvider& execution_provider, - const std::unordered_map& initialized_tensors, - const OrtValueNameIdxMap& ort_value_name_idx_map, const FuncManager& funcs_mgr) + const std::unordered_map& constant_initialized_tensors, + const OrtValueNameIdxMap& ort_value_name_idx_map, + const FuncManager& funcs_mgr, + const DataTransferManager& data_transfer_mgr) : OpNodeProtoHelper(&proto_helper_context_), node_(node), kernel_def_(kernel_def), execution_provider_(&execution_provider), - initialized_tensors_(initialized_tensors), + constant_initialized_tensors_(constant_initialized_tensors), ort_value_name_idx_map_(ort_value_name_idx_map), funcs_mgr_(funcs_mgr), + data_transfer_mgr_(data_transfer_mgr), proto_helper_context_(node) {} OpKernelInfo::OpKernelInfo(const OpKernelInfo& other) - : OpKernelInfo(other.node_, other.kernel_def_, *other.execution_provider_, other.initialized_tensors_, - other.ort_value_name_idx_map_, other.funcs_mgr_) {} + : OpKernelInfo(other.node_, other.kernel_def_, *other.execution_provider_, other.constant_initialized_tensors_, + other.ort_value_name_idx_map_, other.funcs_mgr_, other.data_transfer_mgr_) {} const OrtAllocatorInfo& OpKernelInfo::GetAllocatorInfo(int device_id, OrtMemType mem_type) const { AllocatorPtr alloc = GetAllocator(device_id, mem_type); @@ -43,6 +47,10 @@ const IExecutionProvider* OpKernelInfo::GetExecutionProvider() const noexcept { return execution_provider_; } +const DataTransferManager& OpKernelInfo::GetDataTransferManager() const noexcept { + return data_transfer_mgr_; +} + const onnxruntime::Node& OpKernelInfo::node() const noexcept { return node_; } @@ -57,14 +65,16 @@ bool OpKernelInfo::TryGetConstantInput(int input_index, const Tensor** constant_ return false; } - auto iter = initialized_tensors_.find(input_arg_index); - if (initialized_tensors_.end() == iter) { + auto iter = constant_initialized_tensors_.find(input_arg_index); + if (constant_initialized_tensors_.end() == iter) { return false; } + if (!iter->second.IsTensor()) { - // Only constant Tensor input is support right now, since we're using initializers to store the data. + // Only constant Tensor input is supported right now, since we're using initializers to store the data. 
return false; } + *constant_input_value = &iter->second.Get(); return true; } diff --git a/onnxruntime/core/framework/parallel_executor.cc b/onnxruntime/core/framework/parallel_executor.cc index 9c9091987c23a..72ee80cd421ee 100644 --- a/onnxruntime/core/framework/parallel_executor.cc +++ b/onnxruntime/core/framework/parallel_executor.cc @@ -35,8 +35,8 @@ Status ParallelExecutor::Execute(const SessionState& session_state, const std::v const std::unordered_map& fetch_allocators, const logging::Logger& logger) { TimePoint tp; - bool f_profiler_enabled = session_state.Profiler().FEnabled(); - if (f_profiler_enabled) { + const bool is_profiler_enabled = session_state.Profiler().IsEnabled(); + if (is_profiler_enabled) { tp = session_state.Profiler().StartTime(); } @@ -84,7 +84,7 @@ Status ParallelExecutor::Execute(const SessionState& session_state, const std::v VLOGS(logger, 1) << "Done execution."; if (root_frame_->HasMemoryPatternPlanner()) { - std::vector input_shapes; + std::vector> input_shapes; bool all_tensors = true; for (const auto& feed : feeds) { if (!(feed.IsTensor())) { @@ -92,7 +92,7 @@ Status ParallelExecutor::Execute(const SessionState& session_state, const std::v break; } auto& tensor = feed.Get(); - input_shapes.push_back(tensor.Shape()); + input_shapes.push_back(std::cref(tensor.Shape())); } if (all_tensors) { @@ -102,7 +102,7 @@ Status ParallelExecutor::Execute(const SessionState& session_state, const std::v } } - if (f_profiler_enabled) { + if (is_profiler_enabled) { session_state.Profiler().EndTimeAndRecordEvent(profiling::SESSION_EVENT, "ParallelExecutor::Execute", tp); } @@ -121,7 +121,7 @@ Status ParallelExecutor::RunNodeAsync(size_t p_node_index, auto graph_viewer = session_state.GetGraphViewer(); TimePoint sync_time_begin; TimePoint kernel_begin_time; - bool f_profiler_enabled = session_state.Profiler().FEnabled(); + const bool f_profiler_enabled = session_state.Profiler().IsEnabled(); // Avoid context switching if possible. 
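Because kernels are now handed only the constant subset of initializers, TryGetConstantInput (above) fails for any initializer that a graph input could still override at run time. A toy standalone sketch of that lookup, using a plain std::map and a fake Tensor type purely for illustration:

```
#include <iostream>
#include <map>
#include <string>
#include <vector>

struct Tensor { std::vector<float> data; };  // minimal stand-in

// Maps an input index to its initializer, mirroring the constant-only map a
// kernel is given; inputs that can be fed at run time simply have no entry here.
using ConstantInitializers = std::map<int, Tensor>;

bool TryGetConstantInput(const ConstantInitializers& constants, int input_index,
                         const Tensor** out) {
  auto it = constants.find(input_index);
  if (it == constants.end()) return false;  // not constant: may be overridden at runtime
  *out = &it->second;
  return true;
}

int main() {
  ConstantInitializers constants;
  constants.emplace(1, Tensor{{0.5f, 0.25f}});  // e.g. a weight that is safe to pre-pack

  const Tensor* weight = nullptr;
  if (TryGetConstantInput(constants, 1, &weight))
    std::cout << "constant weight[0] = " << weight->data[0] << "\n";  // 0.5
  if (!TryGetConstantInput(constants, 0, &weight))
    std::cout << "input 0 is not a constant initializer\n";
  return 0;
}
```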
while (keep_running) { diff --git a/onnxruntime/core/framework/run_options.cc b/onnxruntime/core/framework/run_options.cc index add62db9daecb..079be56fc5ae4 100644 --- a/onnxruntime/core/framework/run_options.cc +++ b/onnxruntime/core/framework/run_options.cc @@ -12,7 +12,7 @@ ORT_API_STATUS_IMPL(OrtCreateRunOptions, OrtRunOptions** out) { API_IMPL_END } -ORT_API_STATUS_IMPL(OrtRunOptionsSetRunLogVerbosityLevel, _In_ OrtRunOptions* options, unsigned int value) { +ORT_API_STATUS_IMPL(OrtRunOptionsSetRunLogVerbosityLevel, _In_ OrtRunOptions* options, int value) { options->run_log_verbosity_level = value; return nullptr; } @@ -23,17 +23,22 @@ ORT_API_STATUS_IMPL(OrtRunOptionsSetRunTag, _In_ OrtRunOptions* options, _In_ co return nullptr; } -ORT_API_STATUS_IMPL(OrtRunOptionsGetRunLogVerbosityLevel, _In_ OrtRunOptions* options, unsigned int* out) { +ORT_API_STATUS_IMPL(OrtRunOptionsGetRunLogVerbosityLevel, _In_ const OrtRunOptions* options, int* out) { *out = options->run_log_verbosity_level; return nullptr; } -ORT_API_STATUS_IMPL(OrtRunOptionsGetRunTag, _In_ OrtRunOptions* options, const char** out) { +ORT_API_STATUS_IMPL(OrtRunOptionsGetRunTag, _In_ const OrtRunOptions* options, const char** out) { *out = options->run_tag.c_str(); return nullptr; } -ORT_API_STATUS_IMPL(OrtRunOptionsSetTerminate, _In_ OrtRunOptions* options, bool value) { - options->terminate = value; +ORT_API_STATUS_IMPL(OrtRunOptionsEnableTerminate, _Inout_ OrtRunOptions* options) { + options->terminate = true; + return nullptr; +} + +ORT_API_STATUS_IMPL(OrtRunOptionsDisableTerminate, _Inout_ OrtRunOptions* options) { + options->terminate = false; return nullptr; } diff --git a/onnxruntime/core/framework/sequential_executor.cc b/onnxruntime/core/framework/sequential_executor.cc index 504c843ebce2b..bd45bbfdc0b01 100644 --- a/onnxruntime/core/framework/sequential_executor.cc +++ b/onnxruntime/core/framework/sequential_executor.cc @@ -27,12 +27,12 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std: std::vector& fetches, const std::unordered_map& fetch_allocators, const logging::Logger& logger) { - bool f_profiler_enabled = session_state.Profiler().FEnabled(); + const bool is_profiler_enabled = session_state.Profiler().IsEnabled(); TimePoint tp; TimePoint sync_time_begin; TimePoint kernel_begin_time; - if (f_profiler_enabled) { + if (is_profiler_enabled) { tp = session_state.Profiler().StartTime(); } @@ -65,7 +65,7 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std: OpKernelContextInternal op_kernel_context(session_state, frame, *p_op_kernel, logger, p_op_kernel->Node().ImplicitInputDefs(), terminate_flag_); // TODO: log kernel outputs? 
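The run-options change above replaces the boolean setter with an explicit enable/disable pair for the terminate flag. A hedged usage sketch against the C API as it appears in this diff (error handling is elided; the onnxruntime_c_api.h include path and OrtReleaseRunOptions are assumed, and a real caller would check the returned OrtStatus* values):

```
// Illustrative only: requesting cancellation of an in-flight Run() by flipping
// the terminate flag from another thread, then clearing it for later reuse.
#include <onnxruntime_c_api.h>

#include <thread>

int main() {
  OrtRunOptions* run_options = nullptr;
  OrtCreateRunOptions(&run_options);                       // returns nullptr on success
  OrtRunOptionsSetRunTag(run_options, "cancellable-run");

  // Elsewhere, a long-running Run() call would be executing with run_options.
  std::thread watchdog([run_options]() {
    // Ask any Run() using run_options to stop as soon as possible.
    OrtRunOptionsEnableTerminate(run_options);
  });
  watchdog.join();

  // Clear the flag so the same options object can be reused for a later Run().
  OrtRunOptionsDisableTerminate(run_options);

  OrtReleaseRunOptions(run_options);  // assumed cleanup call, not shown in this diff
  return 0;
}
```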
- if (f_profiler_enabled) { + if (is_profiler_enabled) { sync_time_begin = session_state.Profiler().StartTime(); } @@ -104,7 +104,7 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std: utils::DumpNodeInputs(op_kernel_context, p_op_kernel->Node()); #endif - if (f_profiler_enabled) { + if (is_profiler_enabled) { session_state.Profiler().EndTimeAndRecordEvent(profiling::NODE_EVENT, p_op_kernel->Node().Name() + "_fence_before", sync_time_begin, @@ -128,7 +128,7 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std: return Status(compute_status.Category(), compute_status.Code(), msg_string); } - if (f_profiler_enabled) { + if (is_profiler_enabled) { session_state.Profiler().EndTimeAndRecordEvent(profiling::NODE_EVENT, p_op_kernel->Node().Name() + "_kernel_time", kernel_begin_time, @@ -159,7 +159,7 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std: } } - if (f_profiler_enabled) { + if (is_profiler_enabled) { session_state.Profiler().EndTimeAndRecordEvent(profiling::NODE_EVENT, p_op_kernel->Node().Name() + "_fence_after", sync_time_begin, @@ -181,7 +181,7 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std: VLOGS(logger, 1) << "Done with execution."; if (frame.HasMemoryPatternPlanner()) { - std::vector input_shapes; + std::vector> input_shapes; bool all_tensors = true; for (const auto& feed : feeds) { if (!(feed.IsTensor())) { @@ -189,7 +189,7 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std: break; } auto& tensor = feed.Get(); - input_shapes.push_back(tensor.Shape()); + input_shapes.push_back(std::cref(tensor.Shape())); } if (all_tensors) { @@ -199,7 +199,7 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std: } } - if (f_profiler_enabled) { + if (is_profiler_enabled) { session_state.Profiler().EndTimeAndRecordEvent(profiling::SESSION_EVENT, "SequentialExecutor::Execute", tp); } diff --git a/onnxruntime/core/framework/session_state.cc b/onnxruntime/core/framework/session_state.cc index 6669cdc449afd..a6fe46be955ed 100644 --- a/onnxruntime/core/framework/session_state.cc +++ b/onnxruntime/core/framework/session_state.cc @@ -36,17 +36,32 @@ void SessionState::SetExecutionPlan(std::unique_ptr p_s const SequentialExecutionPlan* SessionState::GetExecutionPlan() const { return p_seq_exec_plan_.get(); } -Status SessionState::AddInitializedTensor(int ort_value_index, const OrtValue& ort_value, const OrtCallback* d) { +Status SessionState::AddInitializedTensor(int ort_value_index, const OrtValue& ort_value, const OrtCallback* d, + bool constant) { ORT_ENFORCE(ort_value_index >= 0 && ort_value_index <= ort_value_name_idx_map_.MaxIdx()); auto p = initialized_tensors_.insert({ort_value_index, ort_value}); if (!p.second) return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "duplicated ort_value index:", ort_value_index, ". 
Do you have duplicated calls to SessionState::AddInitializedTensor function?"); - if (d != nullptr && d->f != nullptr) deleter_for_initialized_tensors_[ort_value_index] = *d; + + if (d != nullptr && d->f != nullptr) { + deleter_for_initialized_tensors_[ort_value_index] = *d; + } + + if (constant) { + constant_initialized_tensors_.insert({ort_value_index, ort_value}); + } + return Status::OK(); } -const std::unordered_map& SessionState::GetInitializedTensors() const { return initialized_tensors_; } +const std::unordered_map& SessionState::GetInitializedTensors() const { + return initialized_tensors_; +} + +const std::unordered_map& SessionState::GetConstantInitializedTensors() const { + return constant_initialized_tensors_; +} SessionState& SessionState::SetLogger(const logging::Logger& logger) { logger_ = &logger; @@ -63,26 +78,27 @@ void SessionState::SetProfiler(profiling::Profiler& profiler) { profiler_ = &pro ::onnxruntime::profiling::Profiler& SessionState::Profiler() const { return *profiler_; } -static int64_t CalculateMemoryPatternsKey(const std::vector& shapes) { +static int64_t CalculateMemoryPatternsKey(const std::vector>& shapes) { int64_t key = 0; - for (auto& shape : shapes) { - for (auto dim : shape.GetDims()) key ^= dim; + for (auto shape : shapes) { + for (auto dim : shape.get().GetDims()) key ^= dim; } return key; } -const MemoryPatternGroup* SessionState::GetMemoryPatternGroup(const std::vector& input_shapes) const { - std::lock_guard lock(mem_patterns_lock_); +const MemoryPatternGroup* SessionState::GetMemoryPatternGroup(const std::vector>& input_shapes) const { int64_t key = CalculateMemoryPatternsKey(input_shapes); + + std::lock_guard lock(mem_patterns_lock_); auto it = mem_patterns_.find(key); if (it == mem_patterns_.end()) return nullptr; return it->second.get(); } -Status SessionState::UpdateMemoryPatternGroupCache(const std::vector& input_shape, +Status SessionState::UpdateMemoryPatternGroupCache(const std::vector>& input_shapes, std::unique_ptr mem_patterns) const { - int64_t key = CalculateMemoryPatternsKey(input_shape); + int64_t key = CalculateMemoryPatternsKey(input_shapes); std::lock_guard lock(mem_patterns_lock_); auto it = mem_patterns_.find(key); diff --git a/onnxruntime/core/framework/session_state.h b/onnxruntime/core/framework/session_state.h index cd98401aa9005..dfec27108257a 100644 --- a/onnxruntime/core/framework/session_state.h +++ b/onnxruntime/core/framework/session_state.h @@ -14,6 +14,7 @@ #include "core/common/logging/logging.h" #include "core/common/profiler.h" #include "core/framework/allocation_planner.h" +#include "core/framework/data_transfer_manager.h" #include "core/framework/execution_providers.h" #include "core/framework/feeds_fetches_manager.h" #include "core/framework/kernel_registry_manager.h" @@ -71,17 +72,25 @@ class SessionState { /** * Adds an initialized tensor (weight) so that it can be used by the * execution frame to setup the appropriate OrtValue vectors. - * This function will take a shallow copy of d if d is not NULL + * This function will take a shallow copy of d if d is not NULL. + * If 'constant' is true the tensor value cannot be overridden by an input at runtime. 
*/ - Status AddInitializedTensor(int ort_value_index, const OrtValue& ort_value, const OrtCallback* d); + Status AddInitializedTensor(int ort_value_index, const OrtValue& ort_value, const OrtCallback* d, bool constant); /** - * Gets the list of all initialized tensors (weights) so that it can be used by the + * Gets the map of ort_value_index to initialized tensors (weights) so that it can be used by the * execution frame to setup the appropriate OrtValue vectors. * The lifetime of returned OrtValues are limited by this SessionState object. */ const std::unordered_map& GetInitializedTensors() const; + /** + * Gets the map of ort_value_index to initialized tensors (e.g. weights) that are constant + * and cannot be overridden at runtime. + * The lifetime of returned OrtValues are limited by this SessionState object. + */ + const std::unordered_map& GetConstantInitializedTensors() const; + // execution plan void SetExecutionPlan(std::unique_ptr p_seq_exec_plan); const SequentialExecutionPlan* GetExecutionPlan() const; @@ -111,13 +120,13 @@ class SessionState { /** Get cached memory pattern based on input shapes */ - const MemoryPatternGroup* GetMemoryPatternGroup(const std::vector& input_shapes) const; + const MemoryPatternGroup* GetMemoryPatternGroup(const std::vector>& input_shapes) const; /** Set generated memory pattern with a given input shapes. Const as it's an internal cache update only. */ - Status UpdateMemoryPatternGroupCache(const std::vector& input_shape, + Status UpdateMemoryPatternGroupCache(const std::vector>& input_shape, std::unique_ptr mem_patterns) const; /** @@ -174,8 +183,10 @@ class SessionState { const FuncManager& GetFuncMgr() const { return fused_funcs_mgr_; } FuncManager& GetMutableFuncMgr() { return fused_funcs_mgr_; } - std::vector& GetMutableWeightsBuffers() { return weights_buffers_; } + const DataTransferManager& GetDataTransferMgr() const { return *data_transfer_mgr_; } + void SetDataTransferMgr(const DataTransferManager* data_transfer_mgr) { data_transfer_mgr_ = data_transfer_mgr; } + std::vector& GetMutableWeightsBuffers() { return weights_buffers_; } void CalculateNodeIndexInfo(); const NodeIndexInfo& GetNodeIndexInfo() const; @@ -190,9 +201,12 @@ class SessionState { const ExecutionProviders& execution_providers_; // owned by InferenceSession OrtValueNameIdxMap ort_value_name_idx_map_; - // initialized tensorset + // initialized tensors std::unordered_map initialized_tensors_; // key is ort_value_index - // This data structure is for unintializing string tensors and + // subset of initialized_tensors_ that are constant and cannot be overridden at runtime + std::unordered_map constant_initialized_tensors_; + + // This data structure is for uninitializing string tensors and // munmap memory region and close file descriptor std::unordered_map deleter_for_initialized_tensors_; std::vector weights_buffers_; @@ -221,6 +235,7 @@ class SessionState { bool export_fused_dll_ = false; FuncManager fused_funcs_mgr_; + const DataTransferManager* data_transfer_mgr_; std::unique_ptr node_index_info_; std::multimap> cached_feeds_fetches_managers_; diff --git a/onnxruntime/core/framework/session_state_initializer.cc b/onnxruntime/core/framework/session_state_initializer.cc index 95b61b7f74747..3f4777d8608d0 100644 --- a/onnxruntime/core/framework/session_state_initializer.cc +++ b/onnxruntime/core/framework/session_state_initializer.cc @@ -12,6 +12,8 @@ #include "core/common/logging/logging.h" #include "core/graph/graph_viewer.h" +#include 
"core/framework/data_transfer_manager.h" +#include "core/graph/graph_utils.h" #include "core/framework/graph_partitioner.h" #include "core/framework/ml_value.h" #include "core/framework/ort_value_pattern_planner.h" @@ -35,7 +37,8 @@ static common::Status SaveInitializedTensors(const Env& env, const std::basic_st const onnxruntime::Graph& graph, const ExecutionProviders& exec_providers, const OrtValueNameIdxMap& ort_value_name_idx_map, ITensorAllocator* planner, const T& save_tensor_func, - const logging::Logger& logger); + const logging::Logger& logger, + const DataTransferManager& data_transfer_mgr); static common::Status SaveKernels(const ExecutionProviders& execution_providers, SessionState& session_state, @@ -107,10 +110,10 @@ common::Status SessionStateInitializer::InitializeAndSave( const Env& env = Env::Default(); ORT_RETURN_IF_ERROR(SaveInitializedTensors( env, graph_loc_, graph_, execution_providers_, ort_value_name_idx_map, tensor_allocator_.get(), - [this](int idx, const OrtValue& value, const OrtCallback& d) -> Status { - return session_state_.AddInitializedTensor(idx, value, &d); + [this](int idx, const OrtValue& value, const OrtCallback& d, bool constant) -> Status { + return session_state_.AddInitializedTensor(idx, value, &d, constant); }, - logger_)); + logger_, session_state_.GetDataTransferMgr())); // remove weights from the graph now to save memory but in many cases it won't save memory, if the tensor was // preallocated with the some other tensors in a single 'allocate' call, which is very common. // TODO: make it better @@ -178,7 +181,8 @@ common::Status SaveMLValueNameIndexMapping(const GraphViewer& graph_viewer, OrtV static common::Status DeserializeTensorProto(const Env& env, const std::basic_string& proto_path, const ONNX_NAMESPACE::TensorProto& tensor_proto, const MemBuffer& m, const ExecutionProviders& exec_providers, OrtValue& ort_value, - OrtCallback& deleter) { + OrtCallback& deleter, + const DataTransferManager& data_transfer_mgr) { const OrtAllocatorInfo& alloc_info = m.GetAllocInfo(); if (strcmp(alloc_info.name, CPU) == 0 || alloc_info.mem_type == OrtMemTypeCPUOutput) { // deserialize directly to CPU tensor @@ -217,7 +221,7 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st p_tensor = std::make_unique(p_deserialize_tensor.DataType(), p_deserialize_tensor.Shape(), m.GetBuffer(), m.GetAllocInfo()); // TODO: does this function work for string tensor? 
- Status copy_status = provider->CopyTensor(p_deserialize_tensor, *p_tensor); + Status copy_status = data_transfer_mgr.CopyTensor(p_deserialize_tensor, *p_tensor); if (d.f) d.f(d.param); if (!copy_status.IsOK()) { if (copy_status.ErrorMessage().empty()) { @@ -237,7 +241,8 @@ template common::Status SaveInitializedTensors(const Env& env, const std::basic_string& graph_loc, const Graph& graph, const ExecutionProviders& exec_providers, const OrtValueNameIdxMap& ort_value_name_idx_map, ITensorAllocator* planner, - const T& save_tensor_func, const logging::Logger& logger) { + const T& save_tensor_func, const logging::Logger& logger, + const DataTransferManager& data_transfer_mgr) { LOGS(logger, INFO) << "Saving initialized tensors."; ORT_ENFORCE(ort_value_name_idx_map.MaxIdx() > 0, "OrtValue indexes should have been populated."); @@ -270,14 +275,15 @@ common::Status SaveInitializedTensors(const Env& env, const std::basic_stringGetBuffer() != nullptr || m->GetLen() == 0); #endif OrtValue ort_value; - Status st = DeserializeTensorProto(env, graph_loc, tensor_proto, *m, exec_providers, ort_value, deleter); + Status st = DeserializeTensorProto(env, graph_loc, tensor_proto, *m, exec_providers, ort_value, deleter, data_transfer_mgr); if (!st.IsOK()) { std::ostringstream oss; oss << "Deserialize tensor " << name << " failed." << st.ErrorMessage(); return Status(st.Category(), st.Code(), oss.str()); } - ORT_RETURN_IF_ERROR(save_tensor_func(ort_value_index, ort_value, deleter)); + bool constant = graph_utils::IsConstantInitializer(graph, name, /* check_outer_scope */ false); + ORT_RETURN_IF_ERROR(save_tensor_func(ort_value_index, ort_value, deleter, constant)); VLOGS(logger, 1) << "Added weight with name : " << name << " with index: " << ort_value_index; } @@ -412,9 +418,10 @@ common::Status SaveInputOutputNamesToNodeMapping(const onnxruntime::Graph& graph for (const auto& graph_input : graph_inputs) { const auto& name = graph_input->Name(); if (input_map.find(name) == end_map) { - // dummy entry for an input that we didn't find a use of in the graph. warn about it in case that's a bug. + // dummy entry for an input that we didn't find a use of in the graph. log it in case that's a bug. // utils::CopyOneInputAcrossDevices will use the input OrtValue as is given we don't believe it's used anywhere. - LOGS(session_state.Logger(), WARNING) << "Graph input with name " << name << " is not associated with a node. "; + LOGS(session_state.Logger(), INFO) << (graph.IsSubgraph() ? 
"Subgraph" : "Graph") << " input with name " + << name << " is not used by any node."; ORT_RETURN_IF_ERROR(session_state.AddInputNameToNodeInfoMapping(name, empty_node_info)); } } diff --git a/onnxruntime/core/framework/tensor_shape.cc b/onnxruntime/core/framework/tensor_shape.cc index e252f64729e24..b37c2e9499c8a 100644 --- a/onnxruntime/core/framework/tensor_shape.cc +++ b/onnxruntime/core/framework/tensor_shape.cc @@ -11,6 +11,9 @@ namespace onnxruntime { TensorShape::TensorShape(const std::vector& dims) : std::vector(dims) { } +TensorShape::TensorShape(std::vector&& dims) : std::vector(std::move(dims)) { +} + TensorShape::TensorShape(const std::initializer_list& dims) : std::vector(dims) { } @@ -20,7 +23,6 @@ TensorShape::TensorShape(const int64_t* dimension_sizes, size_t dimension_count) } } - TensorShape::TensorShape(const std::vector& dims, size_t start, size_t end) { assign(dims.begin() + start, dims.begin() + end); } @@ -38,8 +40,8 @@ int64_t TensorShape::Size() const { int64_t TensorShape::SizeToDimension(size_t dimension) const { const size_t num_dims = size(); ORT_ENFORCE(dimension <= num_dims, - "Invalid dimension of ", dimension, " for SizeFromDimension. Tensor has ", - num_dims, " dimensions."); + "Invalid dimension of ", dimension, " for SizeFromDimension. Tensor has ", + num_dims, " dimensions."); int64_t size = SizeHelper(0, dimension); return size; @@ -48,8 +50,8 @@ int64_t TensorShape::SizeToDimension(size_t dimension) const { int64_t TensorShape::SizeFromDimension(size_t dimension) const { const size_t num_dims = size(); ORT_ENFORCE(dimension <= num_dims, - "Invalid dimension of ", dimension, " for SizeFromDimension. Tensor has ", - num_dims, " dimensions."); + "Invalid dimension of ", dimension, " for SizeFromDimension. Tensor has ", + num_dims, " dimensions."); int64_t size = SizeHelper(dimension, num_dims); return size; @@ -57,7 +59,7 @@ int64_t TensorShape::SizeFromDimension(size_t dimension) const { TensorShape TensorShape::Slice(size_t dimstart, size_t dimend) const { ORT_ENFORCE(dimstart <= dimend && dimend <= size(), - "Invalid tensor shape slice argument."); + "Invalid tensor shape slice argument."); return TensorShape(*this, dimstart, dimend); } diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc index 02c72b1de1cb0..ce7fc4e91d286 100644 --- a/onnxruntime/core/framework/tensorprotoutils.cc +++ b/onnxruntime/core/framework/tensorprotoutils.cc @@ -274,14 +274,14 @@ common::Status GetSizeInBytesFromTensorProto(const ONNX_NAMESPACE::TensorProto& return Status::OK(); } -std::vector GetTensorShapeFromTensorShapeProto(const ONNX_NAMESPACE::TensorShapeProto& tensor_shape_proto) { +TensorShape GetTensorShapeFromTensorShapeProto(const ONNX_NAMESPACE::TensorShapeProto& tensor_shape_proto) { const auto& dims = tensor_shape_proto.dim(); std::vector tensor_shape_vec(static_cast(dims.size())); for (int i = 0; i < dims.size(); ++i) { - tensor_shape_vec[i] = dims[i].has_dim_param() ? -1 /* symbolic dimensions are represented as -1 in onnxruntime*/ - : dims[i].dim_value(); + tensor_shape_vec[i] = dims[i].has_dim_value() ? 
dims[i].dim_value() + : -1; /* symbolic dimensions are represented as -1 in onnxruntime*/ } - return tensor_shape_vec; + return TensorShape(std::move(tensor_shape_vec)); } struct UnInitializeParam { @@ -538,7 +538,7 @@ TensorProto::DataType GetTensorProtoType(const Tensor& tensor) { } ONNX_NAMESPACE::TensorProto TensorToTensorProto(const Tensor& tensor, const std::string& tensor_proto_name, - const onnx::TypeProto& tensor_proto_type) { + const ONNX_NAMESPACE::TypeProto& tensor_proto_type) { // Given we are using the raw_data field in the protobuf, this will work only for little-endian format. ORT_ENFORCE(IsLittleEndianOrder()); @@ -557,7 +557,7 @@ ONNX_NAMESPACE::TensorProto TensorToTensorProto(const Tensor& tensor, const std: tensor_proto.set_data_type(tensor_proto_type.tensor_type().elem_type()); - tensor_proto.set_raw_data(tensor.DataRaw(), tensor.Size()); + tensor_proto.set_raw_data(tensor.DataRaw(), tensor.SizeInBytes()); return tensor_proto; } diff --git a/onnxruntime/core/framework/tensorprotoutils.h b/onnxruntime/core/framework/tensorprotoutils.h index cd11184b34912..15d70f35edbde 100644 --- a/onnxruntime/core/framework/tensorprotoutils.h +++ b/onnxruntime/core/framework/tensorprotoutils.h @@ -24,7 +24,7 @@ class TensorShapeProto; namespace onnxruntime { class Tensor; namespace utils { -std::vector GetTensorShapeFromTensorShapeProto(const ONNX_NAMESPACE::TensorShapeProto& tensor_shape_proto); +TensorShape GetTensorShapeFromTensorShapeProto(const ONNX_NAMESPACE::TensorShapeProto& tensor_shape_proto); /** * deserialize a TensorProto into a preallocated memory buffer. * \param tensor_proto_path A local file path of where the 'input' was loaded from. Can be NULL if the tensor proto doesn't @@ -47,7 +47,7 @@ ONNX_NAMESPACE::TensorProto::DataType GetTensorProtoType(const Tensor& tensor); TODO Once the GetTensorProtoType supports all data types, we can remove the tensor_proto_type parameter and instead get the type from the tensor. 
*/ ONNX_NAMESPACE::TensorProto TensorToTensorProto(const Tensor& tensor, const std::string& tensor_proto_name, - const onnx::TypeProto& tensor_proto_type); + const ONNX_NAMESPACE::TypeProto& tensor_proto_type); ONNXTensorElementDataType CApiElementTypeFromProtoType(int type); ONNXTensorElementDataType GetTensorElementType(const ONNX_NAMESPACE::TensorProto& tensor_proto); diff --git a/onnxruntime/core/framework/utils.cc b/onnxruntime/core/framework/utils.cc index 65a64a3c782eb..b0171f25be843 100644 --- a/onnxruntime/core/framework/utils.cc +++ b/onnxruntime/core/framework/utils.cc @@ -6,7 +6,7 @@ #include #include "core/graph/graph_viewer.h" - +#include "core/framework/data_transfer_manager.h" #include "core/framework/execution_frame.h" #include "core/framework/execution_providers.h" #include "core/framework/feeds_fetches_manager.h" @@ -58,7 +58,9 @@ const std::string& GetNodeInputProviderType(const SessionState::NodeInfo& info) return required_provider_type; } -static Status CopyMLValue(const FeedsFetchesManager::MLValueCopyInfo& copy_info, const OrtValue& source_mlvalue, +static Status CopyMLValue(const DataTransferManager& data_transfer_mgr, + const FeedsFetchesManager::MLValueCopyInfo& copy_info, + const OrtValue& source_mlvalue, OrtValue& target_mlvalue) { if (copy_info.copy_provider == nullptr) { target_mlvalue = source_mlvalue; @@ -72,7 +74,7 @@ static Status CopyMLValue(const FeedsFetchesManager::MLValueCopyInfo& copy_info, Tensor* p_output_tensor = target_mlvalue.GetMutable(); - ORT_RETURN_IF_ERROR(copy_info.copy_provider->CopyTensor(source_tensor, *p_output_tensor)); + ORT_RETURN_IF_ERROR(data_transfer_mgr.CopyTensor(source_tensor, *p_output_tensor)); } return Status::OK(); @@ -96,7 +98,6 @@ common::Status CopyOneInputAcrossDevices(const SessionState& session_state, cons // info on the logic to create the node_info_vec. // for (auto& node_info : node_info_vec) { auto& node_info = node_info_vec.front(); - if (node_info.p_node == nullptr) { // dummy entry for an input that we didn't find a use of in the graph. // use the input as is given we don't believe it's actually needed. @@ -150,7 +151,7 @@ common::Status CopyOneInputAcrossDevices(const SessionState& session_state, cons copy_info.allocation_provider = required_provider; copy_info.copy_provider = p_copy_provider; - ORT_RETURN_IF_ERROR(CopyMLValue(copy_info, orig_mlvalue, new_mlvalue)); + ORT_RETURN_IF_ERROR(CopyMLValue(session_state.GetDataTransferMgr(), copy_info, orig_mlvalue, new_mlvalue)); needed_copy = true; @@ -205,14 +206,15 @@ static common::Status CopyInputsAcrossDevices(const SessionState& session_state, // copies inputs across devices only if required using cached copy_info static common::Status CachedCopyInputsAcrossDevices( const std::vector& orig_feeds, std::vector& new_feeds, - const std::vector& copy_info) { + const std::vector& copy_info, + const DataTransferManager& data_transfer_mgr) { size_t num_feeds = orig_feeds.size(); ORT_ENFORCE(copy_info.size() == num_feeds); new_feeds.resize(num_feeds); for (size_t idx = 0; idx < num_feeds; ++idx) { - ORT_RETURN_IF_ERROR(CopyMLValue(copy_info[idx], orig_feeds[idx], new_feeds[idx])); + ORT_RETURN_IF_ERROR(CopyMLValue(data_transfer_mgr, copy_info[idx], orig_feeds[idx], new_feeds[idx])); } return Status::OK(); @@ -379,7 +381,7 @@ static common::Status CopyOutputsAcrossDevices(const SessionState& session_state const int device_id = 0; // TODO: As per comment in the copy input code, make this configurable. 
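Editor's note: the changes above thread a `DataTransferManager` through `DeserializeTensorProto` and `CopyMLValue` so cross-device tensor copies go through one registry instead of each execution provider's `CopyTensor`. A minimal standalone sketch of that dispatch pattern follows; `Device`, `Tensor`, `IDataTransfer`, and `DataTransferRegistry` here are simplified stand-ins for illustration, not ORT's actual classes.

```cpp
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

// Simplified stand-ins for ORT's device/tensor types; illustration only.
struct Device { std::string type; };             // e.g. "CPU", "CUDA"
struct Tensor { Device device; /* data ... */ };

// One registered copier knows whether it can handle a (src, dst) device pair.
struct IDataTransfer {
  virtual ~IDataTransfer() = default;
  virtual bool CanCopy(const Device& src, const Device& dst) const = 0;
  virtual void Copy(const Tensor& src, Tensor& dst) const = 0;
};

// Central registry: callers ask it to copy instead of asking a specific provider.
class DataTransferRegistry {
 public:
  void Register(std::unique_ptr<IDataTransfer> transfer) {
    transfers_.push_back(std::move(transfer));
  }

  void CopyTensor(const Tensor& src, Tensor& dst) const {
    for (const auto& transfer : transfers_) {
      if (transfer->CanCopy(src.device, dst.device)) {
        transfer->Copy(src, dst);
        return;
      }
    }
    throw std::runtime_error("no data transfer registered for this device pair");
  }

 private:
  std::vector<std::unique_ptr<IDataTransfer>> transfers_;
};
```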
FeedsFetchesManager::MLValueCopyInfo copy_info{device_id, p_output_provider, p_copy_provider}; - ORT_RETURN_IF_ERROR(CopyMLValue(copy_info, fetched_mlvalue, output_mlvalue)); + ORT_RETURN_IF_ERROR(CopyMLValue(session_state.GetDataTransferMgr(), copy_info, fetched_mlvalue, output_mlvalue)); if (copiers) { (*copiers)[idx] = copy_info; @@ -391,7 +393,8 @@ static common::Status CopyOutputsAcrossDevices(const SessionState& session_state static common::Status CachedCopyOutputsAcrossDevices( const std::vector& fetches, std::vector& user_fetches, - const std::vector& copy_info) { + const std::vector& copy_info, + const DataTransferManager& data_transfer_mgr) { auto num_outputs = fetches.size(); // internal logic error if these are mismatched @@ -399,31 +402,24 @@ static common::Status CachedCopyOutputsAcrossDevices( // used the cached copy logic if available for (size_t idx = 0; idx < num_outputs; ++idx) { - ORT_RETURN_IF_ERROR(CopyMLValue(copy_info[idx], fetches[idx], user_fetches[idx])); + ORT_RETURN_IF_ERROR(CopyMLValue(data_transfer_mgr, copy_info[idx], fetches[idx], user_fetches[idx])); } return Status::OK(); } -// check if all the execution providers use the same allocator. if so, no copies between devices should be required, -// and the overall status for DeviceCopyChecks can be set to NoCopy static DeviceCopyCheck CheckExecutionProviders(const ExecutionProviders& execution_providers) { - bool all_cpu = true; for (const auto& execution_provider : execution_providers) { - const auto& allocators = execution_provider->GetAllocators(); - // this won't work as desired until multiple providers can share the CPU Allocator and the logic here is updated - // to detect that.. - // it will currently handle the scenario when only the CPUExecutionProvider is registered though - if (!std::all_of(allocators.cbegin(), allocators.cend(), - [](const gsl::not_null& allocator) { - return strcmp(allocator->Info().name, CPU) == 0; - })) { - all_cpu = false; - break; + if (execution_provider->Type() != onnxruntime::kCpuExecutionProvider && + execution_provider->Type() != onnxruntime::kMklDnnExecutionProvider && + execution_provider->Type() != onnxruntime::kNGraphExecutionProvider && + execution_provider->Type() != onnxruntime::kNupharExecutionProvider && + execution_provider->Type() != onnxruntime::kOpenVINOExecutionProvider) { + return DeviceCopyCheck::Unknown; } } - return all_cpu ? DeviceCopyCheck::NoCopy : DeviceCopyCheck::Unknown; + return DeviceCopyCheck::NoCopy; } // execute graph with cached info from FeedsFetchesManager. 
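Editor's note: `CheckExecutionProviders` above was simplified from inspecting every provider's allocators to a fixed list of provider types that are known to execute on CPU memory; anything else makes the copy requirement `Unknown`. A standalone sketch of the same decision, with provider names written as plain strings standing in for ORT's `kCpuExecutionProvider`-style constants (the exact string values are not taken from this diff):

```cpp
#include <algorithm>
#include <string>
#include <unordered_set>
#include <vector>

enum class DeviceCopyCheck { NoCopy, Unknown };

// Returns NoCopy only if every registered provider is known to use CPU memory.
DeviceCopyCheck CheckProviders(const std::vector<std::string>& provider_types) {
  static const std::unordered_set<std::string> cpu_based = {
      "CPUExecutionProvider", "MklDnnExecutionProvider", "NGraphExecutionProvider",
      "NupharExecutionProvider", "OpenVINOExecutionProvider"};

  const bool all_cpu = std::all_of(provider_types.begin(), provider_types.end(),
                                   [](const std::string& type) { return cpu_based.count(type) > 0; });
  return all_cpu ? DeviceCopyCheck::NoCopy : DeviceCopyCheck::Unknown;
}
```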
@@ -456,7 +452,8 @@ common::Status ExecuteGraphWithCachedInfo( // Copy inputs if (device_copy_checks.input_copy_needed == DeviceCopyCheck::Copy) { ORT_RETURN_IF_ERROR(CachedCopyInputsAcrossDevices(feeds, device_feeds, - feeds_fetches_manager.GetFeedsDeviceCopiers())); + feeds_fetches_manager.GetFeedsDeviceCopiers(), + session_state.GetDataTransferMgr())); p_feeds = &device_feeds; } @@ -480,7 +477,8 @@ common::Status ExecuteGraphWithCachedInfo( if (device_copy_checks.output_copy_needed == DeviceCopyCheck::Copy) { ORT_RETURN_IF_ERROR(CachedCopyOutputsAcrossDevices(*p_fetches, fetches, - feeds_fetches_manager.GetFetchesDeviceCopiers())); + feeds_fetches_manager.GetFetchesDeviceCopiers(), + session_state.GetDataTransferMgr())); } } diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc index 637201efd24f1..49d8e3309a989 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc @@ -8,6 +8,7 @@ #include "core/graph/op.h" #include "onnx/defs/schema.h" #include "onnx/defs/shape_inference.h" +#include "core/mlas/inc/mlas.h" #ifdef MICROSOFT_INTERNAL #include "core/graph/contrib_ops/internal_schema_defs.h" @@ -19,13 +20,172 @@ void convPoolShapeInference( bool use_dilation, bool require_kernel_shape, int input1Idx, int input2Idx); -} +void globalPoolTypeShapeInference(ONNX_NAMESPACE::InferenceContext& ctx); +} // namespace ONNX_NAMESPACE + namespace onnxruntime { namespace contrib { using ONNX_NAMESPACE::AttributeProto; using ONNX_NAMESPACE::OpSchema; using ONNX_NAMESPACE::OPTIONAL; +void NchwcPoolOpSchemaGenerator(OpSchema& schema) { + schema.SetDomain(kMSNchwcDomain); + schema.SinceVersion(1); + schema.SetDoc(R"DOC(For internal use.)DOC"); + schema.Attr("auto_pad", "", AttributeProto::STRING, std::string("NOTSET")); + schema.Attr("kernel_shape", "", AttributeProto::INTS); + schema.Attr("dilations", "", AttributeProto::INTS, OPTIONAL); + schema.Attr("strides", "", AttributeProto::INTS, OPTIONAL); + schema.Attr("pads", "", AttributeProto::INTS, OPTIONAL); + schema.Attr("ceil_mode", "", AttributeProto::INT, static_cast(0)); + schema.Input(0, "X", "", "T"); + schema.Output(0, "Y", "", "T"); + schema.TypeConstraint("T", {"tensor(float)"}, "Constrain input and output types to float tensors"); + schema.TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { + ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 0, 0); + ONNX_NAMESPACE::convPoolShapeInference(ctx, true, true, 0, 1); + }); +} + +void NchwcGlobalPoolOpSchemaGenerator(OpSchema& schema) { + schema.SetDomain(kMSNchwcDomain); + schema.SinceVersion(1); + schema.SetDoc(R"DOC(For internal use.)DOC"); + schema.Input(0, "X", "", "T"); + schema.Output(0, "Y", "", "T"); + schema.TypeConstraint("T", {"tensor(float)"}, "Constrain input and output types to float tensors"); + schema.TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { + ONNX_NAMESPACE::globalPoolTypeShapeInference(ctx); + }); +} + +void RegisterNchwcSchemas() { + ONNX_CONTRIB_OPERATOR_SCHEMA(ReorderInput) + .SetDomain(kMSNchwcDomain) + .SinceVersion(1) + .SetDoc(R"DOC(For internal use.)DOC") + .Input(0, "X", "", "T") + .Output(0, "Y", "", "T") + .TypeConstraint( + "T", + {"tensor(float)", "tensor(int8)", "tensor(uint8)"}, + "Constrain input and output types to float/quantized tensors") + .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput); + + ONNX_CONTRIB_OPERATOR_SCHEMA(ReorderOutput) + 
.SetDomain(kMSNchwcDomain) + .SinceVersion(1) + .SetDoc(R"DOC(For internal use.)DOC") + .Attr( + "channels", + "", + AttributeProto::INT, + static_cast(0)) + .Input(0, "X", "", "T") + .Output(0, "Y", "", "T") + .TypeConstraint( + "T", + {"tensor(float)", "tensor(int8)", "tensor(uint8)"}, + "Constrain input and output types to float/quantized tensors") + .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { + propagateElemTypeFromInputToOutput(ctx, 0, 0); + if (!hasNInputShapes(ctx, 1)) { + return; + } + propagateShapeFromInputToOutput(ctx, 0, 0); + + // Update the output shape with the actual number of channels. + auto channels = getAttribute(ctx, "channels", 0); + if (channels <= 0) { + fail_shape_inference("invalid channel count"); + } + auto output_shape = ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape(); + if (output_shape->dim_size() < 2) { + fail_shape_inference("tensor rank too small"); + } + auto* channels_dim = output_shape->mutable_dim(1); + channels_dim->clear_dim_param(); + channels_dim->set_dim_value(channels); + }); + + ONNX_CONTRIB_OPERATOR_SCHEMA(Conv) + .SetDomain(kMSNchwcDomain) + .SinceVersion(1) + .SetDoc(R"DOC(For internal use.)DOC") + .Attr( + "auto_pad", + "", + AttributeProto::STRING, + std::string("NOTSET")) + .Attr( + "kernel_shape", + "", + AttributeProto::INTS, + OPTIONAL) + .Attr( + "dilations", + "", + AttributeProto::INTS, + OPTIONAL) + .Attr( + "strides", + "", + AttributeProto::INTS, + OPTIONAL) + .Attr( + "pads", + "", + AttributeProto::INTS, OPTIONAL) + .Attr( + "group", + "", + AttributeProto::INT, + static_cast(1)) + .Attr( + "activation", + "", + AttributeProto::STRING, + OPTIONAL) + .Attr( + "activation_params", + "", + AttributeProto::FLOATS, + OPTIONAL) + .Input(0, "X", "", "T") + .Input(1, "W", "", "T") + .Input(2, "B", "", "T", OpSchema::Optional) + .Input(3, "Sum", "", "T", OpSchema::Optional) + .Output(0, "Y", "", "T") + .TypeConstraint("T", {"tensor(float)"}, "Constrain input and output types to float tensors") + .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { + ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 0, 0); + ONNX_NAMESPACE::convPoolShapeInference(ctx, true, false, 0, 1); + }); + + ONNX_CONTRIB_OPERATOR_SCHEMA(MaxPool) + .FillUsing(NchwcPoolOpSchemaGenerator) + .Attr( + "storage_order", + "", + AttributeProto::INT, + static_cast(0)); + + ONNX_CONTRIB_OPERATOR_SCHEMA(AveragePool) + .FillUsing(NchwcPoolOpSchemaGenerator) + .Attr( + "count_include_pad", + "", + AttributeProto::INT, + static_cast(0)); + + ONNX_CONTRIB_OPERATOR_SCHEMA(GlobalMaxPool) + .FillUsing(NchwcGlobalPoolOpSchemaGenerator); + + ONNX_CONTRIB_OPERATOR_SCHEMA(GlobalAveragePool) + .FillUsing(NchwcGlobalPoolOpSchemaGenerator); +} + void RegisterContribSchemas() { // Register removed experimental ops for backward compatibility. // Experimental operators do not have version history. 
However, RS5 takes bunch of experimental operators @@ -497,12 +657,16 @@ and op)DOC"; bottom_border = border[3]; if (H < top_border + bottom_border) - fail_shape_inference("Input's height (", H, ") needs to be greater than or equal to " - "the top_border (", top_border, ") + bottom_border (", bottom_border, ")"); + fail_shape_inference("Input's height (", H, + ") needs to be greater than or equal to " + "the top_border (", + top_border, ") + bottom_border (", bottom_border, ")"); if (W < left_border + right_border) - fail_shape_inference("Input's width (", W, ") needs to be greater than or equal to " - "the left_border (", left_border, ") + right_border (", right_border, ")"); + fail_shape_inference("Input's width (", W, + ") needs to be greater than or equal to " + "the left_border (", + left_border, ") + right_border (", right_border, ")"); int64_t bottom_limit = H - bottom_border; int64_t right_limit = W - right_border; @@ -698,10 +862,15 @@ activation.)DOC") AttributeProto::INTS, OPTIONAL) .Attr( - "strides", "", AttributeProto::INTS, OPTIONAL) - .Attr("pads", - "", - AttributeProto::INTS, OPTIONAL) + "strides", + "", + AttributeProto::INTS, + OPTIONAL) + .Attr( + "pads", + "", + AttributeProto::INTS, + OPTIONAL) .Attr( "group", "", @@ -713,9 +882,9 @@ activation.)DOC") AttributeProto::STRING, OPTIONAL) .Attr( - "alpha", + "activation_params", "", - AttributeProto::FLOAT, + AttributeProto::FLOATS, OPTIONAL) .Input( 0, @@ -727,7 +896,12 @@ activation.)DOC") "W", "", "T") - .Input(2, "B", "", "T", OpSchema::Optional) + .Input( + 2, + "B", + "", + "T", + OpSchema::Optional) .Output( 0, "Y", @@ -1215,7 +1389,7 @@ Example 4: pads_initializer->int64_data().end()); // fill with zeros if needed to reach appropriate size - if (pads_data.size() != static_cast(2 * input_rank)) + if (pads_data.size() != 2 * static_cast(input_rank)) pads_data.resize(2 * input_rank, 0); const auto& output_shape = @@ -1309,11 +1483,11 @@ Example 4: return; }) .SetDoc(R"DOC( - Finds all the unique values (deduped list) present in the given input tensor. - This operator returns 3 outputs. - The first output tensor 'uniques' contains all of the unique elements of the input, + Finds all the unique values (deduped list) present in the given input tensor. + This operator returns 3 outputs. + The first output tensor 'uniques' contains all of the unique elements of the input, sorted in the same order that they occur in the input. - The second output tensor 'idx' is the same size as the input and it contains the index + The second output tensor 'idx' is the same size as the input and it contains the index of each value of the input in 'uniques'. The third output tensor 'counts' contains the count of each element of 'uniques' in the input. Example: @@ -1414,10 +1588,15 @@ Example 4: a fixed size = [crop_height, crop_width]. The result is a 4-D tensor [num_boxes, crop_height, crop_width, depth]. The resizing is corner aligned.)DOC"); + // Register the NCHWc schemas if supported by the platform. 
+ if (MlasNchwcGetBlockSize() > 1) { + RegisterNchwcSchemas(); + } + #ifdef MICROSOFT_INTERNAL // register internal ops RegisterInternalSchemas(); #endif +} } // namespace contrib -} // namespace contrib -} // namespace onnxruntime \ No newline at end of file +} // namespace onnxruntime diff --git a/onnxruntime/core/graph/function.cc b/onnxruntime/core/graph/function.cc index 091a58b58b224..1f5cc3d2ce948 100644 --- a/onnxruntime/core/graph/function.cc +++ b/onnxruntime/core/graph/function.cc @@ -106,6 +106,7 @@ FunctionImpl::FunctionImpl(const onnxruntime::Graph& graph, auto input_arg = parent_graph_->GetNodeArg(input); auto& sub_graph_input_arg = sub_graph.GetOrCreateNodeArg(input_arg->Name(), input_arg->TypeAsProto()); sub_graph_inputs[i] = &sub_graph_input_arg; + ORT_ENFORCE(input_arg->Type() != nullptr); op_schema_->Input(i, input, "", *input_arg->Type()); ++i; } diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index 5a701181625ef..e26f91117c509 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -51,13 +51,33 @@ static bool GraphLoadedFromModelFile(const GraphProto* graph_proto) { graph_proto->value_info_size() != 0); } -NodeArg::NodeArg(const std::string& name, - const TypeProto* p_node_arg_type) { +// there are some known invalid usages of dim_param and dim_value. remove them from the TypeProto so that +// they don't affect shape inferencing or the allocation planner +static void RemoveInvalidValues(ONNX_NAMESPACE::TypeProto& type) { + if (type.has_tensor_type() && type.tensor_type().has_shape()) { + auto* shape = type.mutable_tensor_type()->mutable_shape(); + for (int i = 0, end = shape->dim_size(); i < end; ++i) { + auto& dim = *shape->mutable_dim(i); + if (dim.has_dim_param()) { + if (dim.dim_param().empty()) { + dim.clear_dim_param(); + } + } else if (dim.has_dim_value()) { + if (dim.dim_value() < 0) { + dim.clear_dim_value(); + } + } + } + } +} + +NodeArg::NodeArg(const std::string& name, const TypeProto* p_node_arg_type) { node_arg_info_.set_name(name); // If the name is empty, it means the arg does not exist. 
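Editor's note: `RemoveInvalidValues` above strips shape entries that are known to be bad — an empty `dim_param` or a negative `dim_value` — so they cannot confuse shape inferencing or the allocation planner. A simplified standalone version of the same rule, using a tiny `Dim` struct instead of the ONNX `TensorShapeProto`:

```cpp
#include <string>
#include <vector>

// Simplified stand-in for one TensorShapeProto dimension: either a symbolic
// name (dim_param) or a concrete size (dim_value) may be set.
struct Dim {
  bool has_param = false;
  std::string param;        // symbolic dimension name
  bool has_value = false;
  long long value = 0;      // concrete dimension size
};

// Clear entries that carry no usable information, mirroring the rule above.
void RemoveInvalidValues(std::vector<Dim>& shape) {
  for (Dim& dim : shape) {
    if (dim.has_param && dim.param.empty()) {
      dim.has_param = false;            // empty symbolic name -> unknown dim
    } else if (dim.has_value && dim.value < 0) {
      dim.has_value = false;            // negative size is invalid -> unknown dim
    }
  }
}
```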
exists_ = !(name.empty()); if (nullptr != p_node_arg_type) { (*node_arg_info_.mutable_type()) = *p_node_arg_type; + RemoveInvalidValues(*node_arg_info_.mutable_type()); type_ = DataTypeUtils::ToType(node_arg_info_.type()); } else { type_ = nullptr; @@ -549,6 +569,15 @@ const Graph* Node::GetGraphAttribute(const std::string& attr_name) const { return const_cast(this)->GetMutableGraphAttribute(attr_name); } +std::vector> Node::GetSubgraphs() const { + std::vector> subgraphs; + subgraphs.reserve(attr_to_subgraph_map_.size()); + std::transform(attr_to_subgraph_map_.cbegin(), attr_to_subgraph_map_.cend(), std::back_inserter(subgraphs), + [](const auto& entry) { return entry.second; }); + + return subgraphs; +} + void Node::ForEachDef(std::function func, bool include_missing_optional_defs) const { for (const auto* arg : InputDefs()) { @@ -869,7 +898,7 @@ void Graph::RemoveEdge(NodeIndex src_node_index, NodeIndex dst_node_index, int s } GSL_SUPPRESS(es .84) // ignoring return value from unordered_map::insert causes noisy complaint -Status Graph::BuildConnections(std::vector& outer_scope_node_args_consumed) { +Status Graph::BuildConnections(std::unordered_set& outer_scope_node_args_consumed) { const std::unordered_set& outer_scope_node_args = resolve_context_.outer_scope_node_args; std::unordered_set inner_nodes; @@ -879,7 +908,7 @@ Status Graph::BuildConnections(std::vector& outer_scope_node_args_c for (auto* node : resolve_context_.nodes_with_subgraphs) { for (auto& subgraph : node->MutableSubgraphs()) { - std::vector node_args_consumed; + std::unordered_set node_args_consumed; ORT_RETURN_IF_ERROR(subgraph->BuildConnections(node_args_consumed)); for (auto& node_arg_name : node_args_consumed) { @@ -889,7 +918,7 @@ Status Graph::BuildConnections(std::vector& outer_scope_node_args_c // it's a node arg from outside this graph's scope, so add that to the list we return // so that we can add the dependency at the next level up. this happens if you have multiple // levels of subgraphs between the graph with the original NodeArg and the subgraph with implicit usage. - outer_scope_node_args_consumed.push_back(node_arg_name); + ORT_IGNORE_RETURN_VALUE(outer_scope_node_args_consumed.insert(node_arg_name)); if (!parent_graph_) { return ORT_MAKE_STATUS( @@ -958,25 +987,31 @@ Status Graph::BuildConnections(std::vector& outer_scope_node_args_c continue; } - auto output_arg_iter = resolve_context_.output_args.find(input_arg->Name()); - if (resolve_context_.output_args.end() == output_arg_iter) { - // No such output_arg matching this input_arg. - // This input arg should be fed when running evaluation. - // See if it's present in the outer scope. If so it will be 'fed' by the execution frame - // providing access to the OrtValue from the outer scope. Pass the name back up so nodes can - // be linked correctly at that level. - if (outer_scope_node_args.find(input_arg->Name()) != outer_scope_node_args.cend()) { - outer_scope_node_args_consumed.push_back(input_arg->Name()); - } + const auto& input_arg_name = input_arg->Name(); + auto output_arg_iter = resolve_context_.output_args.find(input_arg_name); + if (resolve_context_.output_args.end() != output_arg_iter) { + // The input to this node is an output from a previous node in this graph. + // Create relationship between this node (node), and the node providing the output (output_node). 
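Editor's note: `BuildConnections` now collects the outer-scope NodeArg names it consumes in a `std::unordered_set` rather than a `std::vector`, so a value read by several nodes in a subgraph is reported to the parent graph only once. A trivial illustration of that deduplication:

```cpp
#include <cassert>
#include <string>
#include <unordered_set>

int main() {
  std::unordered_set<std::string> consumed;
  // Two nodes in a subgraph both read the same outer-scope value.
  consumed.insert("outer_value");
  consumed.insert("outer_value");   // duplicate insert is a no-op
  assert(consumed.size() == 1);     // the parent graph sees the name exactly once
  return 0;
}
```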
+ Node& output_node = *output_arg_iter->second.first; + AddEdge(output_node.Index(), node.Index(), output_arg_iter->second.second, input_slot_index); - continue; + inner_nodes.insert(&output_node); + } else { + // the value is either an input, an initializer, or coming from outer scope. we only need to take action + // if coming from outer scope, so first check if this is a subgraph (otherwise there is no outer scope). + if (parent_graph_ != nullptr) { + // make sure it's not an input or initializer first as those override any outer scope values + if (resolve_context_.inputs_and_initializers.find(input_arg_name) == + resolve_context_.inputs_and_initializers.cend()) { + // If it is present in the outer scope it will be 'fed' by the execution frame + // providing access to the OrtValue from the outer scope. Pass the name back up so nodes can + // be linked correctly at that level. + if (outer_scope_node_args.find(input_arg_name) != outer_scope_node_args.cend()) { + ORT_IGNORE_RETURN_VALUE(outer_scope_node_args_consumed.insert(input_arg_name)); + } + } + } } - - // Create relationship between this node (node), and the node providing the output (output_node). - Node& output_node = *output_arg_iter->second.first; - AddEdge(output_node.Index(), node.Index(), output_arg_iter->second.second, input_slot_index); - - inner_nodes.insert(&output_node); } } else if (node.OutputDefs().empty()) { // This is a useless node. @@ -986,7 +1021,7 @@ Status Graph::BuildConnections(std::vector& outer_scope_node_args_c } return Status::OK(); -} +} // namespace onnxruntime void Graph::ReverseDFSFrom(const std::vector& from, const std::function& enter, @@ -1321,6 +1356,7 @@ Status Graph::InferAndVerifySubgraphTypes(const Node& node, Graph& subgraph, " inputs and requires ", num_required_subgraph_inputs, " inputs. Either provide all subgraph inputs, or just the required inputs."); } + subgraph_inputs = &required_subgraph_inputs; num_subgraph_inputs = num_required_subgraph_inputs; } @@ -1839,7 +1875,7 @@ Status Graph::Resolve(bool no_proto_sync_required) { // recursively set the outer scope node args. ORT_RETURN_IF_ERROR(SetOuterScopeNodeArgs(resolve_context_.outer_scope_node_args)); - std::vector outer_scope_node_args_consumed; + std::unordered_set outer_scope_node_args_consumed; // recursively build connections between nodes in this graph and all subgraphs ORT_RETURN_IF_ERROR(BuildConnections(outer_scope_node_args_consumed)); @@ -1870,6 +1906,8 @@ Status Graph::Resolve(bool no_proto_sync_required) { ORT_RETURN_IF_ERROR(ForThisAndAllSubgraphs(all_subgraphs, finalize_func)); + ++num_resolves_; + return Status::OK(); } @@ -2224,7 +2262,16 @@ void Graph::CleanUnusedInitializers() { for (const auto& pv : name_to_initial_tensor_) { const std::string& name = pv.first; if (used_args.find(name) == end) { - LOGS_DEFAULT(WARNING) << name << " exists in this graph's initializers but it is not used by any node"; + // on the first call to Graph::Resolve we are removing unnecessary initializers that should be removed + // from the model. + // on later calls we are removing initializers that optimizations have made redundant. + if (num_resolves_ == 0) { + LOGS_DEFAULT(WARNING) << "Removing initializer '" + << name << "'. It is not used by any node and should be removed from the model."; + } else { + LOGS_DEFAULT(INFO) << "Removing initializer '" << name << "'. It is no longer used by any node."; + } + erase_list.push_back(name); } } @@ -2286,7 +2333,7 @@ Status Graph::SetGraphInputsOutputs() { } // Set graph outputs. 
- // Graph outputs specified in the model must be nodes' outputs, initailizer or graph inputs. + // Graph outputs specified in the model must be nodes' outputs, initializer or graph inputs. for (auto& graph_output : graph_proto_->output()) { auto& graph_output_name = graph_output.name(); auto iter = nodes_outputs.find(graph_output_name); @@ -2355,31 +2402,39 @@ Status Graph::SetGraphInputsOutputs() { auto output_arg_iter = output_name_to_node_arg_index.find(input_arg->Name()); if (output_name_to_node_arg_index.end() == output_arg_iter) { - // This input arg should be fed when running evaluation. - // it should be a graph input. + // This input arg is not the output of another node so must come from either a graph input or an initializer. const std::string& name = input_arg->Name(); + if (added_input_names.end() == added_input_names.find(name)) { // This graph input has not been added into . + bool is_initializer = name_to_initial_tensor_.find(name) != name_to_initial_tensor_.end(); + if (!graph_inputs_manually_set_) { - graph_inputs_including_initializers_.push_back(input_arg); + // if IR version < 4 all initializers must have a matching graph input + // (even though the graph input is not allowed to override the initializer). + // if IR version >= 4 initializers are not required to have a matching graph input. + // any graph inputs that are to override initializers must be specified by calling SetInputs. + if (!is_initializer || ir_version_ < 4) { + graph_inputs_including_initializers_.push_back(input_arg); + } } else { + // graph_inputs_including_initializers_ has been manually populated by SetInputs. // Validation: the must be in graph inputs or initializers when it's manually set. - auto& inputs = GetInputsIncludingInitializers(); - auto iter = std::find(inputs.begin(), inputs.end(), input_arg); - if (inputs.end() == iter) { - // it's not in graph inputs. - auto initializers = GetAllInitializedTensors(); - if (initializers.end() == initializers.find(input_arg->Name())) { - // It's not in graph initializers. 
- return Status(ONNXRUNTIME, FAIL, input_arg->Name() + " must be either specified in graph inputs or graph initailizers."); + if (!is_initializer) { + const auto& inputs = graph_inputs_including_initializers_; + bool in_inputs = std::find(inputs.begin(), inputs.end(), input_arg) != inputs.end(); + if (!in_inputs) { + return Status(ONNXRUNTIME, FAIL, + name + " must be either specified in graph inputs or graph initializers."); } } } - if (name_to_initial_tensor_.find(name) == name_to_initial_tensor_.end()) { + + if (!is_initializer) { graph_inputs_excluding_initializers_.push_back(input_arg); } - added_input_names.insert(input_arg->Name()); + added_input_names.insert(name); } } else if (graph_output_args.erase(output_arg_iter->first) >= 1) { // Remove the output arg name from graph outputs since it's @@ -2410,7 +2465,7 @@ Status Graph::SetGraphInputsOutputs() { // calling private ctor GSL_SUPPRESS(r .11) gsl::not_null Graph::AllocateNode() { - ORT_ENFORCE(nodes_.size() < std::numeric_limits::max()); + ORT_ENFORCE(nodes_.size() < static_cast(std::numeric_limits::max())); std::unique_ptr new_node(new Node(nodes_.size(), *this)); Node* node{new_node.get()}; diff --git a/onnxruntime/core/graph/graph_utils.cc b/onnxruntime/core/graph/graph_utils.cc index 1a4ab6d8b5e75..2ac2a15303a11 100644 --- a/onnxruntime/core/graph/graph_utils.cc +++ b/onnxruntime/core/graph/graph_utils.cc @@ -54,18 +54,20 @@ static bool OutputEdgeProvidesImplicitInput(const Graph& graph, const GraphEdge& /** Checks if new_output_name can be used to replace removed_output_name in the subgraph input. If there is an existing NodeArg in a subgraph that implicitly consumes removed_output_name, it is not safe. */ -static bool CanUpdateImplicitInputNameInSubgraph(Node& node, +static bool CanUpdateImplicitInputNameInSubgraph(const Node& node, const std::string& removed_output_name, const std::string& new_output_name) { - for (auto& attr_subgraph_pair : node.GetAttributeNameToMutableSubgraphMap()) { - Graph& subgraph = *attr_subgraph_pair.second; + if (!node.ContainsSubgraph()) + return true; + + for (const gsl::not_null& subgraph : node.GetSubgraphs()) { // if we have an existing NodeArg in the subgraph with the new_output_name that would override an implicit input // with the same name - if (subgraph.GetNodeArg(new_output_name) != nullptr) { + if (subgraph->GetNodeArg(new_output_name) != nullptr) { return false; } - for (auto& subgraph_node : attr_subgraph_pair.second->Nodes()) { + for (auto& subgraph_node : subgraph->Nodes()) { // recurse if this node also consumes removed_output_name as an implicit input (i.e. there are multiple levels of nested // subgraphs, and at least one level lower uses removed_output_name as an implicit input const auto subgraph_node_implicit_inputs = subgraph_node.ImplicitInputDefs(); @@ -248,6 +250,26 @@ static bool RemoveNodeWithSingleInitializerIn(Graph& graph, Node& node) { return true; } +static bool ReplaceInitializerImpl(Graph& graph, const std::string& original_name, + const ONNX_NAMESPACE::TensorProto& initializer, bool check_outer_scope) { + bool replaced = false; + const ONNX_NAMESPACE::TensorProto* old_initializer = nullptr; + if (graph.GetInitializedTensor(original_name, old_initializer)) { + // Be conservative and only remove if the name matches. 
Graph::CleanupUnusedInitializers can take care + // of removing anything unused after optimization + if (original_name == initializer.name()) { + graph.RemoveInitializedTensor(original_name); + } + graph.AddInitializedTensor(initializer); + replaced = true; + + } else if (check_outer_scope && graph.IsSubgraph()) { + replaced = ReplaceInitializerImpl(*graph.MutableParentGraph(), original_name, initializer, check_outer_scope); + } + + return replaced; +} + //---------------------------- //--- end of local helpers --- //---------------------------- @@ -292,52 +314,6 @@ bool IsSupportedProvider(const Node& node, compatible_providers.find(node.GetExecutionProviderType()) == compatible_providers.end()); } -Status ForAllMutableSubgraphs(Graph& graph, std::function func) { - Status status = Status::OK(); - - for (auto& node : graph.Nodes()) { - for (auto& attr_name_to_subgraph_pair : node.GetAttributeNameToMutableSubgraphMap()) { - Graph* subgraph = attr_name_to_subgraph_pair.second; - ORT_ENFORCE(subgraph, "Main Graph instance should have populated all subgraphs when being resolved."); - - status = func(*subgraph); - ORT_RETURN_IF_ERROR(status); - - // recurse - status = ForAllMutableSubgraphs(*subgraph, func); - ORT_RETURN_IF_ERROR(status); - } - } - - return status; -} - -Status ForAllSubgraphs(const Graph& graph, std::function func) { - Status status = Status::OK(); - - for (auto& node : graph.Nodes()) { - for (auto& attribute : node.GetAttributes()) { - auto& name = attribute.first; - auto& proto = attribute.second; - - // check if it has a subgraph - if (proto.has_g()) { - const Graph* subgraph = node.GetGraphAttribute(name); - ORT_ENFORCE(subgraph, "Main Graph instance should have populated all subgraphs when being resolved."); - - status = func(*subgraph); - ORT_RETURN_IF_ERROR(status); - - // recurse - status = ForAllSubgraphs(*subgraph, func); - ORT_RETURN_IF_ERROR(status); - } - } - } - - return status; -} - bool IsSingleInSingleOutNode(const Node& node) { return node.InputDefs().size() == 1 && node.ImplicitInputDefs().empty() && node.OutputDefs().size() == 1; } @@ -399,26 +375,69 @@ bool IsGraphInput(const Graph& graph, const NodeArg* input) { return std::find(graph_inputs.begin(), graph_inputs.end(), input) != graph_inputs.end(); } +const ONNX_NAMESPACE::TensorProto* GetConstantInitializer(const Graph& graph, const std::string& initializer_name, + bool check_outer_scope) { + const ONNX_NAMESPACE::TensorProto* initializer = nullptr; + if (graph.GetInitializedTensor(initializer_name, initializer)) { + if (graph.CanOverrideInitializer()) { + const auto& graph_inputs = graph.GetInputsIncludingInitializers(); + bool is_constant = std::none_of(graph_inputs.cbegin(), graph_inputs.cend(), + [&initializer_name](const NodeArg* input) { + return input->Name() == initializer_name; + }); + + if (!is_constant) { + initializer = nullptr; + } + } + } else if (check_outer_scope && graph.IsSubgraph()) { + initializer = GetConstantInitializer(*graph.ParentGraph(), initializer_name); + } + + return initializer; +} + +bool IsConstantInitializer(const Graph& graph, const std::string& initializer_name, bool check_outer_scope) { + const ONNX_NAMESPACE::TensorProto* initializer = GetConstantInitializer(graph, initializer_name, check_outer_scope); + return initializer != nullptr; +} + bool NodeArgIsConstant(const Graph& graph, const NodeArg& node_arg) { - const onnx::TensorProto* initializer = nullptr; - return graph.GetInitializedTensor(node_arg.Name(), initializer) && !IsGraphInput(graph, &node_arg); + 
return IsConstantInitializer(graph, node_arg.Name(), true); } -bool AllNodeInputsAreConstant(const Graph& graph, const Node& node) { +bool AllNodeInputsAreConstant(const Graph& graph, const Node& node, InitializedTensorSet& constant_inputs) { + // clear so we have a known state. if we fail part way through we go back to this state. + constant_inputs.clear(); + + // only initializers can be constant, and there's no edge from a node to an initializer + // so the input edges count must be 0 if (node.GetInputEdgesCount() > 0) { return false; } + for (const auto* input_def : node.InputDefs()) { // Important note: when an initializer appears in the graph's input, this input will not be considered constant, - // because it can be overriden by the user at runtime. For constant folding to be applied, the initializer should + // because it can be overridden by the user at runtime. For constant folding to be applied, the initializer should // not appear in the graph's inputs (that is the only way to guarantee it will always be constant). - if (!NodeArgIsConstant(graph, *input_def)) { + const ONNX_NAMESPACE::TensorProto* initializer = GetConstantInitializer(graph, input_def->Name(), true); + if (initializer) { + constant_inputs.insert({input_def->Name(), initializer}); + } else { + constant_inputs.clear(); return false; } } + return true; } +void ReplaceInitializer(Graph& graph, const std::string& original_name, const ONNX_NAMESPACE::TensorProto& initializer, + bool check_outer_scope) { + ORT_ENFORCE(ReplaceInitializerImpl(graph, original_name, initializer, check_outer_scope), + "Failed to replace initializer. Original initializer was not found. Name:", original_name); +} + size_t RemoveNodeOutputEdges(Graph& graph, Node& node) { std::vector output_edges = GetNodeOutputEdges(node); RemoveGraphEdges(graph, output_edges); diff --git a/onnxruntime/core/graph/graph_utils.h b/onnxruntime/core/graph/graph_utils.h index c3d872df23ec4..9d60e70fbdc98 100644 --- a/onnxruntime/core/graph/graph_utils.h +++ b/onnxruntime/core/graph/graph_utils.h @@ -38,12 +38,32 @@ bool IsOutputUsed(const Node& node, int index); /** Returns true if the graph has the given input.*/ bool IsGraphInput(const Graph& graph, const NodeArg* input); -/** Checks if the given node has only constant inputs (initializers). */ -bool AllNodeInputsAreConstant(const Graph& graph, const Node& node); +/** returns true if 'name' is an initializer, and is constant and cannot be overridden at runtime. +@param check_outer_scope If true and the graph is a subgraph, check ancestor graph/s for 'name' if not found in 'graph'. +*/ +bool IsConstantInitializer(const Graph& graph, const std::string& name, bool check_outer_scope = true); + +/** returns the initializer's TensorProto if 'name' is an initializer, and is constant and +cannot be overridden at runtime. If the initializer is not found or is not constant a nullptr is returned. +@param check_outer_scope If true and the graph is a subgraph, check ancestor graph/s for 'name' if not found in 'graph'. +*/ +const ONNX_NAMESPACE::TensorProto* GetConstantInitializer(const Graph& graph, const std::string& name, + bool check_outer_scope = true); + +/** Find the initializer called 'original_name' in 'graph', or its ancestors if check_outer_scope is true, + and replace with 'initializer' in the current graph. + Does NOT look in any subgraphs. Requires original_name to match an initializer. 
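Editor's note: `GetConstantInitializer`/`IsConstantInitializer` above treat an initializer as constant only if it cannot be overridden at runtime (i.e. it is not also exposed as a graph input when the model allows overriding), and they optionally walk up to ancestor graphs for subgraphs. A simplified standalone sketch of that lookup; `MiniGraph` and `TensorProto` here are stand-ins, not ORT's classes:

```cpp
#include <algorithm>
#include <string>
#include <unordered_map>
#include <vector>

// Minimal stand-ins for the pieces of onnxruntime::Graph used by the lookup.
struct TensorProto { std::string name; /* data ... */ };
struct MiniGraph {
  std::unordered_map<std::string, const TensorProto*> initializers;
  std::vector<std::string> graph_inputs;   // inputs including initializers
  bool can_override_initializer = false;   // true when inputs may override initializers
  const MiniGraph* parent = nullptr;       // set if this is a subgraph
};

const TensorProto* GetConstantInitializer(const MiniGraph& graph, const std::string& name,
                                          bool check_outer_scope = true) {
  auto it = graph.initializers.find(name);
  if (it != graph.initializers.end()) {
    // If graph inputs may override initializers, an initializer that is also a
    // graph input is not constant.
    if (graph.can_override_initializer &&
        std::find(graph.graph_inputs.begin(), graph.graph_inputs.end(), name) !=
            graph.graph_inputs.end()) {
      return nullptr;
    }
    return it->second;
  }
  if (check_outer_scope && graph.parent != nullptr) {
    return GetConstantInitializer(*graph.parent, name, check_outer_scope);
  }
  return nullptr;
}
```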
+ */ +void ReplaceInitializer(Graph& graph, const std::string& original_name, const ONNX_NAMESPACE::TensorProto& initializer, + bool check_outer_scope = true); /** Checks if the given NodeArg is constant, i.e., it appears in the graph's initializers but not in its inputs. */ bool NodeArgIsConstant(const Graph& graph, const NodeArg& node_arg); +/** Checks if the given node has only constant inputs (initializers) and if so returns them in constant_inputs as they +may come from outer scope. */ +bool AllNodeInputsAreConstant(const Graph& graph, const Node& node, InitializedTensorSet& constant_inputs); + /** Gets the name of the incoming NodeArg with the specified index for the given node. */ const std::string& GetNodeInputName(const Node& node, int index); @@ -66,9 +86,6 @@ bool GetRepeatedNodeAttributeValues(const Node& node, return false; } -Status ForAllMutableSubgraphs(Graph& main_graph, std::function func); -Status ForAllSubgraphs(const Graph& main_graph, std::function func); - /** Removes the given Node from the Graph and keeps Graph consistent by rebuilding needed connections. We support the removal of the Node as long as the following conditions hold: - There should be no implicit inputs. diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h index d0bd869fc0a8a..b1e08f09b567c 100644 --- a/onnxruntime/core/mlas/inc/mlas.h +++ b/onnxruntime/core/mlas/inc/mlas.h @@ -46,7 +46,7 @@ typedef enum { CblasLeft=141, CblasRight=142} CBLAS_SIDE; // // Forward declare the thread pool implementation class. // -// N.B. Avoid including onnxruntime headers here to keep the dependencies for +// N.B. Avoid including ONNX Runtime headers here to keep the dependencies for // standalone MLAS test executables smaller. // @@ -58,6 +58,16 @@ namespace onnxruntime { using MLAS_THREADPOOL = onnxruntime::concurrency::ThreadPool; +// +// Platform routines. +// + +size_t +MLASCALL +MlasGetPreferredBufferAlignment( + void + ); + // // Activation routines. 
// @@ -68,21 +78,30 @@ enum MLAS_ACTIVATION_KIND { MlasLeakyReluActivation, MlasTanhActivation, MlasLogisticActivation, + MlasClipActivation, }; struct MLAS_ACTIVATION { MLAS_ACTIVATION_KIND ActivationKind; - float alpha; + union { + struct { + float alpha; + } LeakyRelu; + struct { + float minimum; + float maximum; + } Clip; + float Values[2]; + } Parameters; }; void MLASCALL MlasActivation( const MLAS_ACTIVATION* Activation, - const float* Input, + float* Buffer, const float* Bias, size_t M, - float* Output, size_t N, size_t ldc ); diff --git a/onnxruntime/core/mlas/lib/activate.cpp b/onnxruntime/core/mlas/lib/activate.cpp index eea0b74327d07..fe23c12d508e2 100644 --- a/onnxruntime/core/mlas/lib/activate.cpp +++ b/onnxruntime/core/mlas/lib/activate.cpp @@ -106,7 +106,11 @@ struct MLAS_ACTIVATION_FUNCTION float Activate(float Value) { - return (std::max)(0.0f, Value); +#if defined(MLAS_SSE2_INTRINSICS) + return _mm_cvtss_f32(Activate(_mm_set_ss(Value))); +#else + return (std::max)(Value, 0.0f); +#endif } }; @@ -119,7 +123,7 @@ struct MLAS_ACTIVATION_FUNCTION MLAS_ACTIVATION_FUNCTION(const MLAS_ACTIVATION* Activation) { - AlphaBroadcast = MlasBroadcastFloat32x4(&Activation->alpha); + AlphaBroadcast = MlasBroadcastFloat32x4(&Activation->Parameters.LeakyRelu.alpha); } MLAS_FLOAT32X4 Activate(MLAS_FLOAT32X4 Value) @@ -155,14 +159,46 @@ struct MLAS_ACTIVATION_FUNCTION } }; +template<> +struct MLAS_ACTIVATION_FUNCTION +{ + MLAS_FLOAT32X4 MinimumBroadcast; + MLAS_FLOAT32X4 MaximumBroadcast; + + MLAS_ACTIVATION_FUNCTION(const MLAS_ACTIVATION* Activation) + { + MinimumBroadcast = MlasBroadcastFloat32x4(&Activation->Parameters.Clip.minimum); + MaximumBroadcast = MlasBroadcastFloat32x4(&Activation->Parameters.Clip.maximum); + } + + MLAS_FLOAT32X4 Activate(MLAS_FLOAT32X4 Value) + { + Value = MlasMaximumFloat32x4(MinimumBroadcast, Value); + Value = MlasMinimumFloat32x4(MaximumBroadcast, Value); + + return Value; + } + + float Activate(float Value) + { +#if defined(MLAS_SSE2_INTRINSICS) + return _mm_cvtss_f32(Activate(_mm_set_ss(Value))); +#else + Value = (std::max)(Value, MlasExtractLaneFloat32x4<0>(MinimumBroadcast)); + Value = (std::min)(Value, MlasExtractLaneFloat32x4<0>(MaximumBroadcast)); + + return Value; +#endif + } +}; + template void MlasActivationKernel( const MLAS_ACTIVATION* Activation, - const float* Input, + float* Buffer, const float* Bias, size_t M, - float* Output, size_t N, size_t ldc ) @@ -177,15 +213,13 @@ Routine Description: Activation - Supplies the parameters for the activation. - Input - Supplies the input matrix. + Buffer - Supplies the output matrix. Bias - Supplies the optional bias vector. M - Supplies the number of elements of the bias vector and the number of rows in the output matrix. - Output - Supplies the output matrix. - N - Supplies the number of columns of the output matrix. ldc - Supplies the number of elements per row of the output matrix. 
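Editor's note: with the changes above, `MLAS_ACTIVATION` carries per-kind parameters in a union (`Parameters.LeakyRelu.alpha`, `Parameters.Clip.minimum`/`maximum`), a `MlasClipActivation` kind is added, and `MlasActivation` transforms a single `Buffer` in place instead of taking separate `Input`/`Output` pointers. A short usage sketch based only on the declarations in this diff; the clip bounds and buffer shape are arbitrary:

```cpp
#include <cstddef>
#include <vector>

#include "core/mlas/inc/mlas.h"

// Apply a clip activation (here 0..6) in place to an M x N row-major buffer.
void ClipInPlace(std::vector<float>& buffer, size_t M, size_t N) {
  MLAS_ACTIVATION activation;
  activation.ActivationKind = MlasClipActivation;
  activation.Parameters.Clip.minimum = 0.0f;
  activation.Parameters.Clip.maximum = 6.0f;

  // Buffer is updated in place; the optional bias vector is omitted here.
  MlasActivation(&activation, buffer.data(), /*Bias*/ nullptr,
                 /*M*/ M, /*N*/ N, /*ldc*/ N);
}
```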
@@ -205,8 +239,7 @@ Return Value: while (M-- > 0) { - const float* input = Input; - float* output = Output; + float* buffer = Buffer; size_t n = N; BiasAddition.LoadNext(Bias); @@ -215,10 +248,9 @@ Return Value: do { - MLAS_FLOAT32X4 Vector = BiasAddition.Add(MlasLoadFloat32x4(input)); - MlasStoreFloat32x4(output, ActivationFunction.Activate(Vector)); - input += 4; - output += 4; + MLAS_FLOAT32X4 Vector = BiasAddition.Add(MlasLoadFloat32x4(buffer)); + MlasStoreFloat32x4(buffer, ActivationFunction.Activate(Vector)); + buffer += 4; n -= 4; } while (n >= 4); @@ -226,13 +258,12 @@ Return Value: while (n > 0) { - float Scalar = BiasAddition.Add(*input++); - *output++ = ActivationFunction.Activate(Scalar); + float Scalar = BiasAddition.Add(*buffer); + *buffer++ = ActivationFunction.Activate(Scalar); n -= 1; } - Input += ldc; - Output += ldc; + Buffer += ldc; } } @@ -241,10 +272,9 @@ inline void MlasActivationKernel( const MLAS_ACTIVATION* Activation, - const float* Input, + float* Buffer, const float* Bias, size_t M, - float* Output, size_t N, size_t ldc ) @@ -259,15 +289,13 @@ Routine Description: Activation - Supplies the parameters for the activation. - Input - Supplies the input matrix. + Buffer - Supplies the output matrix. Bias - Supplies the optional bias vector. M - Supplies the number of elements of the bias vector and the number of rows in the output matrix. - Output - Supplies the output matrix. - N - Supplies the number of columns of the output matrix. ldc - Supplies the number of elements per row of the output matrix. @@ -283,10 +311,9 @@ Return Value: // MLAS_UNREFERENCED_PARAMETER(Activation); - MLAS_UNREFERENCED_PARAMETER(Input); + MLAS_UNREFERENCED_PARAMETER(Buffer); MLAS_UNREFERENCED_PARAMETER(Bias); MLAS_UNREFERENCED_PARAMETER(M); - MLAS_UNREFERENCED_PARAMETER(Output); MLAS_UNREFERENCED_PARAMETER(N); MLAS_UNREFERENCED_PARAMETER(ldc); } @@ -296,10 +323,9 @@ inline void MlasActivationKernel( const MLAS_ACTIVATION* Activation, - const float* Input, + float* Buffer, const float* Bias, size_t M, - float* Output, size_t N, size_t ldc ) @@ -314,15 +340,13 @@ Routine Description: Activation - Supplies the parameters for the activation. - Input - Supplies the input matrix. + Buffer - Supplies the output matrix. Bias - Supplies the optional bias vector. M - Supplies the number of elements of the bias vector and the number of rows in the output matrix. - Output - Supplies the output matrix. - N - Supplies the number of columns of the output matrix. ldc - Supplies the number of elements per row of the output matrix. @@ -334,9 +358,9 @@ Return Value: --*/ { if (Bias != nullptr) { - MlasActivationKernel(Activation, Input, Bias, M, Output, N, ldc); + MlasActivationKernel(Activation, Buffer, Bias, M, N, ldc); } else { - MlasActivationKernel(Activation, Input, Bias, M, Output, N, ldc); + MlasActivationKernel(Activation, Buffer, Bias, M, N, ldc); } } @@ -344,10 +368,9 @@ void MLASCALL MlasActivation( const MLAS_ACTIVATION* Activation, - const float* Input, + float* Buffer, const float* Bias, size_t M, - float* Output, size_t N, size_t ldc ) @@ -362,15 +385,13 @@ Routine Description: Activation - Supplies the parameters for the activation. - Input - Supplies the input matrix. + Buffer - Supplies the output matrix. Bias - Supplies the optional bias vector. M - Supplies the number of elements of the bias vector and the number of rows in the output matrix. - Output - Supplies the output matrix. - N - Supplies the number of columns of the output matrix. 
ldc - Supplies the number of elements per row of the output matrix. @@ -385,34 +406,34 @@ Return Value: case MlasIdentityActivation: { - MlasActivationKernel(Activation, Input, Bias, M, Output, N, ldc); + MlasActivationKernel(Activation, Buffer, Bias, M, N, ldc); break; } case MlasReluActivation: { - MlasActivationKernel(Activation, Input, Bias, M, Output, N, ldc); + MlasActivationKernel(Activation, Buffer, Bias, M, N, ldc); break; } case MlasLeakyReluActivation: { - MlasActivationKernel(Activation, Input, Bias, M, Output, N, ldc); + MlasActivationKernel(Activation, Buffer, Bias, M, N, ldc); break; } case MlasTanhActivation: { if (Bias != nullptr) { - MlasActivationKernel(Activation, Input, Bias, M, Output, N, ldc); + MlasActivationKernel(Activation, Buffer, Bias, M, N, ldc); } if (N == ldc) { - MlasComputeTanh(Output, Output, M * N); + MlasComputeTanh(Buffer, Buffer, M * N); } else { while (M-- > 0) { - MlasComputeTanh(Output, Output, N); - Output += ldc; + MlasComputeTanh(Buffer, Buffer, N); + Buffer += ldc; } } @@ -422,19 +443,25 @@ Return Value: case MlasLogisticActivation: { if (Bias != nullptr) { - MlasActivationKernel(Activation, Input, Bias, M, Output, N, ldc); + MlasActivationKernel(Activation, Buffer, Bias, M, N, ldc); } if (N == ldc) { - MlasComputeLogistic(Output, Output, M * N); + MlasComputeLogistic(Buffer, Buffer, M * N); } else { while (M-- > 0) { - MlasComputeLogistic(Output, Output, N); - Output += ldc; + MlasComputeLogistic(Buffer, Buffer, N); + Buffer += ldc; } } break; } + + case MlasClipActivation: + { + MlasActivationKernel(Activation, Buffer, Bias, M, N, ldc); + break; + } } } diff --git a/onnxruntime/core/mlas/lib/convolve.cpp b/onnxruntime/core/mlas/lib/convolve.cpp index 9caf11625912c..0acaece7728d6 100644 --- a/onnxruntime/core/mlas/lib/convolve.cpp +++ b/onnxruntime/core/mlas/lib/convolve.cpp @@ -602,7 +602,7 @@ Return Value: // MlasActivation(Parameters->Activation, SegmentOutput, Bias, FilterCount, - SegmentOutput, CountN, OutputSize); + CountN, OutputSize); } } @@ -731,7 +731,7 @@ Return Value: bias += group * FilterCount; } - MlasActivation(Parameters->Activation, output, bias, FilterCount, output, + MlasActivation(Parameters->Activation, output, bias, FilterCount, OutputSize, OutputSize); } } @@ -941,7 +941,7 @@ Return Value: // Apply the activation with optional bias. // - MlasActivation(Parameters->Activation, Output, bias, FilterCount, Output, + MlasActivation(Parameters->Activation, Output, bias, FilterCount, OutputSize, OutputSize); break; @@ -967,7 +967,7 @@ Return Value: // Apply the activation with optional bias. 
// - MlasActivation(Parameters->Activation, Output, bias, FilterCount, Output, + MlasActivation(Parameters->Activation, Output, bias, FilterCount, OutputSize, OutputSize); break; diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h index 65d404348e5bc..b191c155928d9 100644 --- a/onnxruntime/core/mlas/lib/mlasi.h +++ b/onnxruntime/core/mlas/lib/mlasi.h @@ -255,33 +255,13 @@ typedef MLAS_POOL_FLOAT_KERNEL* PMLAS_POOL_FLOAT_KERNEL; typedef void -(MLASCALL MLAS_LOGISTIC_KERNEL_ROUTINE)( +(MLASCALL MLAS_ELEMENTWISE_KERNEL_ROUTINE)( const float* Input, float* Output, size_t N ); -typedef MLAS_LOGISTIC_KERNEL_ROUTINE* PMLAS_LOGISTIC_KERNEL_ROUTINE; - -typedef -void -(MLASCALL MLAS_TANH_KERNEL_ROUTINE)( - const float* Input, - float* Output, - size_t N - ); - -typedef MLAS_TANH_KERNEL_ROUTINE* PMLAS_TANH_KERNEL_ROUTINE; - -typedef -void -(MLASCALL MLAS_ERF_KERNEL_ROUTINE)( - const float* Input, - float* Output, - size_t N - ); - -typedef MLAS_ERF_KERNEL_ROUTINE* PMLAS_ERF_KERNEL_ROUTINE; +typedef MLAS_ELEMENTWISE_KERNEL_ROUTINE* PMLAS_ELEMENTWISE_KERNEL_ROUTINE; extern "C" { @@ -347,24 +327,41 @@ extern "C" { MLAS_POOL_FLOAT_KERNEL MlasPoolAverageIncludePadFloatKernel; #endif - MLAS_TANH_KERNEL_ROUTINE MlasLogisticKernel; - MLAS_TANH_KERNEL_ROUTINE MlasTanhKernel; - MLAS_ERF_KERNEL_ROUTINE MlasErfKernel; + MLAS_ELEMENTWISE_KERNEL_ROUTINE MlasLogisticKernel; + MLAS_ELEMENTWISE_KERNEL_ROUTINE MlasTanhKernel; + MLAS_ELEMENTWISE_KERNEL_ROUTINE MlasErfKernel; #if defined(MLAS_TARGET_AMD64) - MLAS_TANH_KERNEL_ROUTINE MlasLogisticKernelFma3; - MLAS_TANH_KERNEL_ROUTINE MlasTanhKernelFma3; - MLAS_ERF_KERNEL_ROUTINE MlasErfKernelFma3; + MLAS_ELEMENTWISE_KERNEL_ROUTINE MlasLogisticKernelFma3; + MLAS_ELEMENTWISE_KERNEL_ROUTINE MlasTanhKernelFma3; + MLAS_ELEMENTWISE_KERNEL_ROUTINE MlasErfKernelFma3; #endif } +// +// Define the default preferred byte alignment for buffers. +// +// MLAS_TARGET_AMD64_IX86: The typical architecture uses AVX instructions +// accessing 256-bit vectors. MLAS_TARGET_AMD64 returns a larger value if the +// platform supports 512-bit vectors to ensure that vectors are not split. +// +// MLAS_TARGET_ARM64: The kernels use "load pair" instructions to access 128-bit +// vectors, so this value keeps both vectors in the same cache line. +// +// MLAS_TARGET_ARM: Using 16 for a single 128-bit vector may be sufficient for +// this architecture, but the ONNX Runtime has historically used this larger +// value. +// + +#define MLAS_DEFAULT_PREFERRED_BUFFER_ALIGNMENT 32 + // // Define the target number of per-thread multiplies before using another // thread to perform additional work. // // The number is derived from performance results running SGEMM across a // range of workloads and observing the ideal number of threads to complete -// that workload. See EvaluateThreadingPerformance() in the unit test. +// that workload. 
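Editor's note: `MlasGetPreferredBufferAlignment` (declared in mlas.h earlier in this diff and implemented in platform.cpp below) exposes the platform's preferred alignment — 32 bytes by default, 64 when the AVX-512 kernels are selected — so callers can allocate intermediate buffers that do not split vector loads. A sketch using C++17 aligned `operator new`; the element count and float element type are arbitrary choices for the example:

```cpp
#include <cstddef>
#include <new>

#include "core/mlas/inc/mlas.h"

// Allocate a float buffer aligned to the alignment MLAS prefers on this platform.
float* AllocateAlignedBuffer(size_t element_count) {
  const std::align_val_t alignment{MlasGetPreferredBufferAlignment()};
  return static_cast<float*>(::operator new(element_count * sizeof(float), alignment));
}

// Release a buffer obtained from AllocateAlignedBuffer.
void FreeAlignedBuffer(float* buffer) {
  const std::align_val_t alignment{MlasGetPreferredBufferAlignment()};
  ::operator delete(buffer, alignment);
}
```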
// #if defined(_OPENMP) @@ -420,10 +417,11 @@ struct MLAS_PLATFORM { PMLAS_CONV_DEPTHWISE_FLOAT_KERNEL ConvDepthwiseFloatKernel; PMLAS_CONV_POINTWISE_FLOAT_KERNEL ConvPointwiseFloatKernel; PMLAS_POOL_FLOAT_KERNEL PoolFloatKernel[MlasPoolingKindCount]; - PMLAS_LOGISTIC_KERNEL_ROUTINE LogisticKernelRoutine; - PMLAS_TANH_KERNEL_ROUTINE TanhKernelRoutine; - PMLAS_ERF_KERNEL_ROUTINE ErfKernelRoutine; + PMLAS_ELEMENTWISE_KERNEL_ROUTINE LogisticKernelRoutine; + PMLAS_ELEMENTWISE_KERNEL_ROUTINE TanhKernelRoutine; + PMLAS_ELEMENTWISE_KERNEL_ROUTINE ErfKernelRoutine; uint32_t NchwcBlockSize; + uint32_t PreferredBufferAlignment; #endif #if defined(MLAS_USE_WIN32_THREADPOOL) diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp index afcfcfd1cdd3a..4f99d50fb27b0 100644 --- a/onnxruntime/core/mlas/lib/platform.cpp +++ b/onnxruntime/core/mlas/lib/platform.cpp @@ -99,6 +99,7 @@ Return Value: this->TanhKernelRoutine = MlasTanhKernel; this->ErfKernelRoutine = MlasErfKernel; this->NchwcBlockSize = 8; + this->PreferredBufferAlignment = MLAS_DEFAULT_PREFERRED_BUFFER_ALIGNMENT; #endif // @@ -168,6 +169,7 @@ Return Value: this->PoolFloatKernel[MlasAveragePoolingExcludePad] = MlasPoolAverageExcludePadFloatKernelAvx512F; this->PoolFloatKernel[MlasAveragePoolingIncludePad] = MlasPoolAverageIncludePadFloatKernelAvx512F; this->NchwcBlockSize = 16; + this->PreferredBufferAlignment = 64; } else { @@ -210,3 +212,33 @@ Return Value: #endif } + +size_t +MLASCALL +MlasGetPreferredBufferAlignment( + void + ) +/*++ + +Routine Description: + + This routine returns the preferred byte alignment for buffers that are used + with this library. Buffers that are not bytes aligned to this value will + function, but will not achieve best performance. + +Arguments: + + None. + +Return Value: + + Returns the preferred byte alignment for buffers. + +--*/ +{ +#if defined(MLAS_TARGET_AMD64) + return MlasPlatform.PreferredBufferAlignment; +#else + return MLAS_DEFAULT_PREFERRED_BUFFER_ALIGNMENT; +#endif +} diff --git a/onnxruntime/core/mlas/lib/snchwc.cpp b/onnxruntime/core/mlas/lib/snchwc.cpp index 76d0f5938a376..843f919f44c40 100644 --- a/onnxruntime/core/mlas/lib/snchwc.cpp +++ b/onnxruntime/core/mlas/lib/snchwc.cpp @@ -169,6 +169,7 @@ Return Value: size_t InputSize = 1; size_t OutputSize = 1; + bool CanFlattenShape = (Dimensions == 2); for (size_t dim = 0; dim < Dimensions; dim++) { @@ -193,6 +194,8 @@ Return Value: WorkBlock->DilationShape[dim] = 1; } + CanFlattenShape &= (WorkBlock->DilationShape[dim] == 1); + if (Padding != nullptr) { WorkBlock->Padding[dim] = size_t(Padding[dim]); WorkBlock->Padding[dim + Dimensions] = size_t(Padding[dim + Dimensions]); @@ -201,21 +204,58 @@ Return Value: WorkBlock->Padding[dim + Dimensions] = 0; } + CanFlattenShape &= (WorkBlock->Padding[dim] == 0 && WorkBlock->Padding[dim + Dimensions] == 0); + if (StrideShape != nullptr) { WorkBlock->StrideShape[dim] = size_t(StrideShape[dim]); } else { WorkBlock->StrideShape[dim] = 1; } - // - // Compute the number of output elements affected by left and right - // padding. - // + CanFlattenShape &= (WorkBlock->StrideShape[dim] == 1); + } + + WorkBlock->InputSize = InputSize; + WorkBlock->OutputSize = OutputSize; + + // + // Detect operations where the kernel is using the entire input width, + // has strides and dilations set to one, and no padding. These operations + // are transformed from outputting [N][1] to [1][N] by flattening the + // operation to a single line using striding equal to the original width. 
+ // + // With the originally shape, the NCHWc kernels would process a single + // output per output line. After reshaping, the NCHWc kernels are able to + // process multiple outputs per output line which typically performs better, + // despite potentially using fewer threads due to the decreased output + // height. + // + + if (CanFlattenShape && (WorkBlock->InputShape[1] == WorkBlock->KernelShape[1])) { + + WorkBlock->StrideShape[1] = WorkBlock->InputShape[1]; + + WorkBlock->InputShape[1] *= WorkBlock->InputShape[0]; + WorkBlock->InputShape[0] = 1; + + WorkBlock->OutputShape[1] *= WorkBlock->OutputShape[0]; + WorkBlock->OutputShape[0] = 1; + + WorkBlock->KernelShape[1] *= WorkBlock->KernelShape[0]; + WorkBlock->KernelShape[0] = 1; + } + + // + // Compute the number of output elements affected by left and right padding. + // + + for (size_t dim = 0; dim < Dimensions; dim++) { const size_t SpanValue = WorkBlock->DilationShape[dim] * (WorkBlock->KernelShape[dim] - 1) + 1; const size_t StrideValue = WorkBlock->StrideShape[dim]; const size_t PaddingLeftValue = WorkBlock->Padding[dim]; + const size_t InputValue = WorkBlock->InputShape[dim]; size_t OutputCountWithLeftPad; @@ -231,13 +271,12 @@ Return Value: OutputCountLeftPad = OutputCountWithLeftPad; } + const size_t OutputValue = WorkBlock->OutputShape[dim]; + WorkBlock->OutputCountLeftPad[dim] = OutputCountLeftPad; WorkBlock->OutputCount[dim] = OutputCountWithLeftPad - OutputCountLeftPad; WorkBlock->OutputCountRightPad[dim] = OutputValue - OutputCountWithLeftPad; } - - WorkBlock->InputSize = InputSize; - WorkBlock->OutputSize = OutputSize; } // @@ -486,7 +525,7 @@ struct MLAS_NCHWC_CONV_ALGORITHM : MLAS_NCHWC_NN_ALGORITHM // output size instead of the output width as done in NCHW convolution. // - MlasActivation(Activation, output, nullptr, FilterCount, output, + MlasActivation(Activation, output, nullptr, FilterCount, BlockedOutputWidth, BlockSize * OutputSize); } }; diff --git a/onnxruntime/core/mlas/lib/x86_64/xgetbv.h b/onnxruntime/core/mlas/lib/x86_64/xgetbv.h deleted file mode 100644 index 007a8e397aaff..0000000000000 --- a/onnxruntime/core/mlas/lib/x86_64/xgetbv.h +++ /dev/null @@ -1,41 +0,0 @@ -/*++ - -Copyright (c) Microsoft Corporation. All rights reserved. - -Licensed under the MIT License. - -Module Name: - - xgetbv.h - -Abstract: - - This module contains a wrapper for the XGETBV instruction for compilers - lacking an intrinsic alternative. - ---*/ - -#pragma once -// clang-format off - -#if !defined(_XCR_XFEATURE_ENABLED_MASK) -#define _XCR_XFEATURE_ENABLED_MASK 0 -#endif - -inline -uint64_t -xgetbv( - unsigned int ext_ctrl_reg - ) -{ - uint32_t eax, edx; - - __asm__ - ( - "xgetbv" - : "=a" (eax), "=d" (edx) - : "c" (ext_ctrl_reg) - ); - - return ((uint64_t)edx << 32) | eax; -} diff --git a/onnxruntime/core/optimizer/constant_folding.cc b/onnxruntime/core/optimizer/constant_folding.cc index b6273a870b6c7..ff7882e974f33 100644 --- a/onnxruntime/core/optimizer/constant_folding.cc +++ b/onnxruntime/core/optimizer/constant_folding.cc @@ -23,21 +23,24 @@ Status ConstantFolding::ApplyImpl(Graph& graph, bool& modified, int graph_level) ORT_RETURN_IF_ERROR(Recurse(*node, modified, graph_level)); + InitializedTensorSet constant_inputs; + // Check if constant folding can be applied on this node. 
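To make the snchwc.cpp flattening above concrete, here is a small self-contained sketch of the reshape it performs for the 2-D case; the names are illustrative and not part of the patch:

```
// Illustrative only: a kernel that spans the full input width, with unit strides,
// unit dilations and no padding, is rewritten so the NCHWc kernels see one long row.
#include <cstddef>

struct Shape2D { size_t h, w; };

void FlattenFullWidthKernel(Shape2D& input, Shape2D& output, Shape2D& kernel, Shape2D& stride) {
  if (kernel.w == input.w && stride.h == 1 && stride.w == 1) {
    stride.w = input.w;                         // step one original row per output element
    input  = {1, input.h * input.w};
    output = {1, output.h * output.w};
    kernel = {1, kernel.h * kernel.w};
  }
}

// Example: Input [56, 56], Kernel [1, 56], Output [56, 1]
// becomes  Input [1, 3136], Kernel [1, 56], Output [1, 56] with Stride [1, 56].
```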
if (!graph_utils::IsSupportedProvider(*node, GetCompatibleExecutionProviders()) || excluded_op_types_.find(node->OpType()) != excluded_op_types_.end() || - // constant folding is not currently supported for nodes that include subgraphs (control flow operators, - // such as If/Loop/Scan, fall into this category). + // constant folding does not support executing a node that includes subgraphs (control flow operators, + // such as If/Loop/Scan, fall into this category). individual nodes in the subgraph will be processed + // by the Recurse call above node->ContainsSubgraph() || // if the node output is in the graph output, we will get a graph with no nodes. // TODO check if this is allowed in ONNX and ORT. graph.IsNodeOutputsInGraphOutputs(*node) || - !graph_utils::AllNodeInputsAreConstant(graph, *node)) { + !graph_utils::AllNodeInputsAreConstant(graph, *node, constant_inputs)) { continue; } // Create execution frame for executing constant nodes. - OptimizerExecutionFrame::Info info({node}, graph.GetAllInitializedTensors()); + OptimizerExecutionFrame::Info info({node}, constant_inputs); std::vector fetch_mlvalue_idxs; for (const auto* node_out : node->OutputDefs()) { diff --git a/onnxruntime/core/optimizer/conv_activation_fusion.cc b/onnxruntime/core/optimizer/conv_activation_fusion.cc index c0c8e1ea45541..a3e1f400089fa 100644 --- a/onnxruntime/core/optimizer/conv_activation_fusion.cc +++ b/onnxruntime/core/optimizer/conv_activation_fusion.cc @@ -3,7 +3,6 @@ #include #include "core/graph/graph_utils.h" -#include "core/optimizer/initializer.h" #include "core/optimizer/conv_activation_fusion.h" using namespace ONNX_NAMESPACE; @@ -11,13 +10,6 @@ using namespace ::onnxruntime::common; namespace onnxruntime { namespace { -bool IsFusableActivation(const Node& node) { - return graph_utils::IsSupportedOptypeVersionAndDomain(node, "LeakyRelu", {6}) || - graph_utils::IsSupportedOptypeVersionAndDomain(node, "Relu", {6}) || - graph_utils::IsSupportedOptypeVersionAndDomain(node, "Sigmoid", {6}) || - graph_utils::IsSupportedOptypeVersionAndDomain(node, "Tanh", {6}); -} - void HandleActivationNodeEdges(Graph& g, const Node& act, Node& fused_conv) { Node::EdgeSet output_edges; for (auto it = act.OutputEdgesBegin(); it != act.OutputEdgesEnd(); ++it) { @@ -43,54 +35,62 @@ Status ConvActivationFusion::ApplyImpl(Graph& graph, bool& modified, int graph_l std::deque removed_nodes; for (auto index : order) { - auto node = graph.GetNode(index); + auto* node = graph.GetNode(index); ORT_RETURN_IF_ERROR(Recurse(*node, modified, graph_level)); if (!graph_utils::IsSupportedOptypeVersionAndDomain(*node, "Conv", {1}) || - !graph_utils::IsSupportedProvider(*node, GetCompatibleExecutionProviders()) || + !graph_utils::IsSupportedProvider(*node, GetCompatibleExecutionProviders()) || node->GetOutputEdgesCount() != 1) { continue; } - const Node& next_node = *(node->OutputNodesBegin()); - if (!IsFusableActivation(next_node) || - next_node.GetExecutionProviderType() != node->GetExecutionProviderType()) { + + const auto& next_node = *(node->OutputNodesBegin()); + if (next_node.GetExecutionProviderType() != node->GetExecutionProviderType()) { continue; } - Node* conv_node = node; - const Node& act_node = next_node; + // Test if this is an activation that can be fused and also extract the + // activation's parameters. 
+ std::vector activation_params; + if (!graph_utils::IsSupportedOptypeVersionAndDomain(next_node, "Relu", {6}) && + !graph_utils::IsSupportedOptypeVersionAndDomain(next_node, "Sigmoid", {6}) && + !graph_utils::IsSupportedOptypeVersionAndDomain(next_node, "Tanh", {6})) { + if (graph_utils::IsSupportedOptypeVersionAndDomain(next_node, "LeakyRelu", {6})) { + activation_params.push_back(graph_utils::GetNodeAttribute(next_node, "alpha")->f()); + } else if (graph_utils::IsSupportedOptypeVersionAndDomain(next_node, "Clip", {6})) { + activation_params.push_back(graph_utils::GetNodeAttribute(next_node, "min")->f()); + activation_params.push_back(graph_utils::GetNodeAttribute(next_node, "max")->f()); + } else { + continue; + } + } - Node& fused_conv = graph.AddNode(graph.GenerateNodeName("fused " + conv_node->Name()), "FusedConv", - "fused Conv " + conv_node->Name() + "with activation " + act_node.OpType(), - conv_node->MutableInputDefs(), + Node& fused_conv = graph.AddNode(graph.GenerateNodeName("fused " + node->Name()), "FusedConv", + "fused Conv " + node->Name() + "with activation " + next_node.OpType(), + node->MutableInputDefs(), graph.IsNodeOutputsInGraphOutputs(next_node) - ? const_cast(act_node).MutableOutputDefs() - : conv_node->MutableOutputDefs(), - &conv_node->GetAttributes(), + ? const_cast(next_node).MutableOutputDefs() + : node->MutableOutputDefs(), + &node->GetAttributes(), "com.microsoft"); - //Add a new attribute to specify the activation type - fused_conv.AddAttribute("activation", act_node.OpType()); - // Assign provider to this new node. Provider should be same as the provider for old node. - fused_conv.SetExecutionProviderType(conv_node->GetExecutionProviderType()); + fused_conv.SetExecutionProviderType(node->GetExecutionProviderType()); - //Add optional attributes for activations - if (act_node.OpType() == "LeakyRelu") { - const NodeAttributes& attrs = act_node.GetAttributes(); - for (const auto& attr : attrs) { - fused_conv.AddAttribute(attr.first, attr.second); - } + // Add attributes to specify the activation type and parameters. 
+ fused_conv.AddAttribute("activation", next_node.OpType()); + if (activation_params.size() > 0) { + fused_conv.AddAttribute("activation_params", activation_params); } if (!graph.IsNodeOutputsInGraphOutputs(next_node)) { - - HandleActivationNodeEdges(graph, act_node, fused_conv); + + HandleActivationNodeEdges(graph, next_node, fused_conv); // Replace the input of the node following activation node - const NodeArg* act_output_def = act_node.OutputDefs()[0]; + const NodeArg* act_output_def = next_node.OutputDefs()[0]; NodeArg* fused_conv_output_def = fused_conv.MutableOutputDefs()[0]; - for (auto it = act_node.OutputNodesBegin(); it != act_node.OutputNodesEnd(); ++it) { + for (auto it = next_node.OutputNodesBegin(); it != next_node.OutputNodesEnd(); ++it) { auto output_node = graph.GetNode((*it).Index()); if (!output_node) { return Status(ONNXRUNTIME, INVALID_ARGUMENT); @@ -105,8 +105,8 @@ Status ConvActivationFusion::ApplyImpl(Graph& graph, bool& modified, int graph_l } } - removed_nodes.push_front(conv_node->Index()); - removed_nodes.push_front(act_node.Index()); + removed_nodes.push_front(node->Index()); + removed_nodes.push_front(next_node.Index()); } for (auto node : removed_nodes) { diff --git a/onnxruntime/core/optimizer/conv_add_fusion.cc b/onnxruntime/core/optimizer/conv_add_fusion.cc index 94296e908fe2d..64c468e98fd70 100644 --- a/onnxruntime/core/optimizer/conv_add_fusion.cc +++ b/onnxruntime/core/optimizer/conv_add_fusion.cc @@ -15,19 +15,14 @@ Status ConvAddFusion::Apply(Graph& graph, Node& node, RewriteRuleEffect& modifie const auto& conv_inputs = conv_node.InputDefs(); const auto& add_inputs = add_node.InputDefs(); - const ONNX_NAMESPACE::TensorProto* conv_W_tensor_proto = nullptr; - if (!graph.GetInitializedTensor(conv_inputs[1]->Name(), conv_W_tensor_proto)) { - return Status::OK(); - } + const auto* conv_W_tensor_proto = graph_utils::GetConstantInitializer(graph, conv_inputs[1]->Name()); + ORT_ENFORCE(conv_W_tensor_proto); - const ONNX_NAMESPACE::TensorProto* add_B_tensor_proto = nullptr; - if (!graph.GetInitializedTensor(add_inputs[1]->Name(), add_B_tensor_proto)) { - return Status::OK(); - } + const auto* add_B_tensor_proto = graph_utils::GetConstantInitializer(graph, add_inputs[1]->Name()); + ORT_ENFORCE(add_B_tensor_proto); // Currently, fusion is only supported for float or double data type. 
- if (!Initializer::IsSupportedDataType(add_B_tensor_proto) || - conv_W_tensor_proto->dims_size() < 4) { + if (!Initializer::IsSupportedDataType(add_B_tensor_proto) || conv_W_tensor_proto->dims_size() < 4) { return Status::OK(); } @@ -51,11 +46,9 @@ Status ConvAddFusion::Apply(Graph& graph, Node& node, RewriteRuleEffect& modifie } } - const ONNX_NAMESPACE::TensorProto* conv_B_tensor_proto = nullptr; if (conv_inputs.size() == 3) { - if (!graph.GetInitializedTensor(conv_inputs[2]->Name(), conv_B_tensor_proto)) { - return Status::OK(); - } + const auto* conv_B_tensor_proto = graph_utils::GetConstantInitializer(graph, conv_inputs[2]->Name()); + ORT_ENFORCE(conv_B_tensor_proto); if (!Initializer::IsSupportedDataType(conv_B_tensor_proto) || conv_B_tensor_proto->data_type() != add_B_tensor_proto->data_type() || @@ -78,8 +71,7 @@ Status ConvAddFusion::Apply(Graph& graph, Node& node, RewriteRuleEffect& modifie conv_B->ToProto(&new_conv_B_tensor_proto); // Replace initializers of conv node - graph.RemoveInitializedTensor(conv_inputs[2]->Name()); - graph.AddInitializedTensor(new_conv_B_tensor_proto); + graph_utils::ReplaceInitializer(graph, conv_inputs[2]->Name(), new_conv_B_tensor_proto); } else { NodeArg* add_B_node_arg = graph.GetNodeArg(add_B_tensor_proto->name()); if (add_B_node_arg == nullptr) { @@ -92,8 +84,7 @@ Status ConvAddFusion::Apply(Graph& graph, Node& node, RewriteRuleEffect& modifie new_conv_B_tensor_proto.clear_dims(); new_conv_B_tensor_proto.add_dims(dim); - graph.RemoveInitializedTensor(add_B_tensor_proto->name()); - graph.AddInitializedTensor(new_conv_B_tensor_proto); + graph_utils::ReplaceInitializer(graph, add_B_tensor_proto->name(), new_conv_B_tensor_proto); // Update shape of NodeArg TensorShapeProto shape; diff --git a/onnxruntime/core/optimizer/conv_bn_fusion.cc b/onnxruntime/core/optimizer/conv_bn_fusion.cc index 65469b365d585..9dea7bed866de 100644 --- a/onnxruntime/core/optimizer/conv_bn_fusion.cc +++ b/onnxruntime/core/optimizer/conv_bn_fusion.cc @@ -23,31 +23,21 @@ Status ConvBNFusion::Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_eff // Get initializers of BatchNormalization const auto& bn_inputs = bn_node.InputDefs(); - const ONNX_NAMESPACE::TensorProto* bn_scale_tensor_proto = nullptr; - if (!graph.GetInitializedTensor(bn_inputs[1]->Name(), bn_scale_tensor_proto)) { - return Status::OK(); - } + const auto* bn_scale_tensor_proto = graph_utils::GetConstantInitializer(graph, bn_inputs[1]->Name()); + ORT_ENFORCE(bn_scale_tensor_proto); - const ONNX_NAMESPACE::TensorProto* bn_B_tensor_proto = nullptr; - if (!graph.GetInitializedTensor(bn_inputs[2]->Name(), bn_B_tensor_proto)) { - return Status::OK(); - } + const auto* bn_B_tensor_proto = graph_utils::GetConstantInitializer(graph, bn_inputs[2]->Name()); + ORT_ENFORCE(bn_B_tensor_proto); - const ONNX_NAMESPACE::TensorProto* bn_mean_tensor_proto = nullptr; - if (!graph.GetInitializedTensor(bn_inputs[3]->Name(), bn_mean_tensor_proto)) { - return Status::OK(); - } + const auto* bn_mean_tensor_proto = graph_utils::GetConstantInitializer(graph, bn_inputs[3]->Name()); + ORT_ENFORCE(bn_mean_tensor_proto); - const ONNX_NAMESPACE::TensorProto* bn_var_tensor_proto = nullptr; - if (!graph.GetInitializedTensor(bn_inputs[4]->Name(), bn_var_tensor_proto)) { - return Status::OK(); - } + const auto* bn_var_tensor_proto = graph_utils::GetConstantInitializer(graph, bn_inputs[4]->Name()); + ORT_ENFORCE(bn_var_tensor_proto); const auto& conv_inputs = conv_node.InputDefs(); - const ONNX_NAMESPACE::TensorProto* conv_W_tensor_proto = 
nullptr; - if (!graph.GetInitializedTensor(conv_inputs[1]->Name(), conv_W_tensor_proto)) { - return Status::OK(); - } + const auto* conv_W_tensor_proto = graph_utils::GetConstantInitializer(graph, conv_inputs[1]->Name()); + ORT_ENFORCE(conv_W_tensor_proto); // Currently, fusion is only supported for float or double data type. if (!Initializer::IsSupportedDataType(bn_scale_tensor_proto) || @@ -76,12 +66,11 @@ Status ConvBNFusion::Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_eff auto bn_var = std::make_unique(bn_var_tensor_proto); auto conv_W = std::make_unique(conv_W_tensor_proto); - const ONNX_NAMESPACE::TensorProto* conv_B_tensor_proto = nullptr; std::unique_ptr conv_B = nullptr; + const ONNX_NAMESPACE::TensorProto* conv_B_tensor_proto = nullptr; if (conv_inputs.size() == 3) { - if (!graph.GetInitializedTensor(conv_inputs[2]->Name(), conv_B_tensor_proto)) { - return Status::OK(); - } + conv_B_tensor_proto = graph_utils::GetConstantInitializer(graph, conv_inputs[2]->Name()); + ORT_ENFORCE(conv_B_tensor_proto); if (!Initializer::IsSupportedDataType(conv_B_tensor_proto) || conv_B_tensor_proto->dims_size() != 1 || @@ -124,24 +113,23 @@ Status ConvBNFusion::Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_eff } // Replace initializers of conv node - graph.RemoveInitializedTensor(conv_W_tensor_proto->name()); + graph_utils::ReplaceInitializer(graph, conv_W_tensor_proto->name(), new_conv_W_tensor_proto); + if (conv_inputs.size() == 3) { #ifdef _MSC_VER #pragma warning(push) #pragma warning(disable : 6011) // Not deferencing null pointer. conv_B_tensor_proto is set on line 93 #endif - graph.RemoveInitializedTensor(conv_B_tensor_proto->name()); + graph_utils::ReplaceInitializer(graph, conv_B_tensor_proto->name(), new_conv_B_tensor_proto); #ifdef _MSC_VER #pragma warning(pop) #endif } else { - graph.RemoveInitializedTensor(bn_B_tensor_proto->name()); + graph_utils::ReplaceInitializer(graph, bn_B_tensor_proto->name(), new_conv_B_tensor_proto); conv_node.MutableInputDefs().push_back(bn_B_node_arg); conv_node.MutableInputArgsCount()[2] = 1; } - graph.AddInitializedTensor(new_conv_W_tensor_proto); - graph.AddInitializedTensor(new_conv_B_tensor_proto); // Remove BN node. 
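For context, the per-output-channel arithmetic that ConvBNFusion relies on is the standard Conv + BatchNormalization folding; the hunk above only shows the initializer bookkeeping, and the Initializer-based implementation may differ in layout. A sketch:

```
// Standard Conv + BatchNormalization folding, per output channel m:
//   a     = scale[m] / sqrt(var[m] + epsilon)
//   W'[m] = W[m] * a
//   B'[m] = (B[m] - mean[m]) * a + bn_bias[m]
#include <cmath>
#include <cstddef>
#include <vector>

void FoldBatchNormIntoConv(std::vector<float>& conv_w,   // [M, C/group * kH * kW], flattened
                           std::vector<float>& conv_b,   // [M], zero if Conv had no bias input
                           const std::vector<float>& scale,
                           const std::vector<float>& bn_bias,
                           const std::vector<float>& mean,
                           const std::vector<float>& var,
                           float epsilon) {
  const size_t M = conv_b.size();
  const size_t filter_size = conv_w.size() / M;
  for (size_t m = 0; m < M; ++m) {
    const float a = scale[m] / std::sqrt(var[m] + epsilon);
    for (size_t k = 0; k < filter_size; ++k) {
      conv_w[m * filter_size + k] *= a;
    }
    conv_b[m] = (conv_b[m] - mean[m]) * a + bn_bias[m];
  }
}
```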
auto* bn_node_to_remove = graph.GetNode(bn_node.Index()); diff --git a/onnxruntime/core/optimizer/conv_mul_fusion.cc b/onnxruntime/core/optimizer/conv_mul_fusion.cc index 54408f31857ca..32b4c0bbd4fb8 100644 --- a/onnxruntime/core/optimizer/conv_mul_fusion.cc +++ b/onnxruntime/core/optimizer/conv_mul_fusion.cc @@ -15,15 +15,11 @@ Status ConvMulFusion::Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_ef const auto& conv_inputs = conv_node.InputDefs(); const auto& mul_inputs = mul_node.InputDefs(); - const ONNX_NAMESPACE::TensorProto* conv_W_tensor_proto = nullptr; - if (!graph.GetInitializedTensor(conv_inputs[1]->Name(), conv_W_tensor_proto)) { - return Status::OK(); - } + const auto* conv_W_tensor_proto = graph_utils::GetConstantInitializer(graph, conv_inputs[1]->Name()); + ORT_ENFORCE(conv_W_tensor_proto); - const ONNX_NAMESPACE::TensorProto* mul_B_tensor_proto = nullptr; - if (!graph.GetInitializedTensor(mul_inputs[1]->Name(), mul_B_tensor_proto)) { - return Status::OK(); - } + const auto* mul_B_tensor_proto = graph_utils::GetConstantInitializer(graph, mul_inputs[1]->Name()); + ORT_ENFORCE(mul_B_tensor_proto); if (!Initializer::IsSupportedDataType(conv_W_tensor_proto) || !Initializer::IsSupportedDataType(mul_B_tensor_proto) || @@ -61,10 +57,9 @@ Status ConvMulFusion::Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_ef std::unique_ptr conv_B = nullptr; const bool is_3d = conv_inputs.size() == 3; if (is_3d) { - if (!graph.GetInitializedTensor(conv_inputs[2]->Name(), conv_B_tensor_proto)) - return Status::OK(); - if (conv_B_tensor_proto == nullptr) - return Status(ONNXRUNTIME, FAIL, "Internal error in ConvMulFusion. conv_B_tensor_proto is NULL"); + conv_B_tensor_proto = graph_utils::GetConstantInitializer(graph, conv_inputs[2]->Name()); + ORT_ENFORCE(conv_B_tensor_proto); + if (!Initializer::IsSupportedDataType(conv_B_tensor_proto) || conv_B_tensor_proto->data_type() != mul_B_tensor_proto->data_type() || conv_B_tensor_proto->dims_size() != 1 || @@ -90,14 +85,12 @@ Status ConvMulFusion::Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_ef conv_W->ToProto(&new_conv_W_tensor_proto); // Replace initializers of conv node - graph.RemoveInitializedTensor(conv_inputs[1]->Name()); - graph.AddInitializedTensor(new_conv_W_tensor_proto); + graph_utils::ReplaceInitializer(graph, conv_inputs[1]->Name(), new_conv_W_tensor_proto); if (is_3d) { ONNX_NAMESPACE::TensorProto new_conv_B_tensor_proto(*conv_B_tensor_proto); conv_B->ToProto(&new_conv_B_tensor_proto); - graph.RemoveInitializedTensor(conv_inputs[2]->Name()); - graph.AddInitializedTensor(new_conv_B_tensor_proto); + graph_utils::ReplaceInitializer(graph, conv_inputs[2]->Name(), new_conv_B_tensor_proto); } // Remove Mul node. diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc index 262241c599780..173bd6fc6e7e7 100644 --- a/onnxruntime/core/optimizer/graph_transformer_utils.cc +++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc @@ -1,3 +1,5 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
#include "core/optimizer/graph_transformer_utils.h" #include "core/optimizer/identity_elimination.h" @@ -14,6 +16,8 @@ #include "core/optimizer/dropout_elimination.h" #include "core/optimizer/relu_clip_fusion.h" #include "core/optimizer/shape_to_initializer.h" +#include "core/optimizer/nchwc_transformer.h" +#include "core/mlas/inc/mlas.h" namespace onnxruntime { @@ -41,6 +45,10 @@ std::vector> GenerateRewriteRules(TransformerLevel rules.push_back(std::make_unique()); rules.push_back(std::make_unique()); break; + + case TransformerLevel::Level3: + break; + default: ORT_ENFORCE(false, "Unsupported level" + std::to_string(static_cast(level))); } @@ -105,6 +113,16 @@ std::vector> GenerateTransformers(TransformerL #endif } break; + case TransformerLevel::Level3: { +#ifndef DISABLE_CONTRIB_OPS + // Register the NCHWc layout transformer if supported by the platform. + if (MlasNchwcGetBlockSize() > 1) { + transformers.emplace_back(std::make_unique()); + } +#endif + + } break; + default: ORT_ENFORCE(false, "Unsupported level " + std::to_string(static_cast(level))); break; diff --git a/onnxruntime/core/optimizer/matmul_add_fusion.cc b/onnxruntime/core/optimizer/matmul_add_fusion.cc index a0c392ef75354..33bc507990a40 100644 --- a/onnxruntime/core/optimizer/matmul_add_fusion.cc +++ b/onnxruntime/core/optimizer/matmul_add_fusion.cc @@ -43,10 +43,13 @@ Status MatMulAddFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level) auto matmul_input_defs = matmul_node.MutableInputDefs(); auto add_input_defs = add_node.MutableInputDefs(); - // Gemm only support float, so the inputs of MatMul + // Gemm requires that inputs be the same data type and both floating point (float32/float16). auto matmul_type = matmul_input_defs[0]->Type(); auto add_type = add_input_defs[0]->Type(); - if ((*matmul_type) != "tensor(float)" || (*add_type) != "tensor(float)") { + if ((*matmul_type) != (*add_type)) { + continue; + } + if ((*matmul_type) != "tensor(float)" && (*matmul_type) != "tensor(float16)") { continue; } diff --git a/onnxruntime/core/optimizer/nchwc_transformer.cc b/onnxruntime/core/optimizer/nchwc_transformer.cc new file mode 100644 index 0000000000000..1a2003aca9b2d --- /dev/null +++ b/onnxruntime/core/optimizer/nchwc_transformer.cc @@ -0,0 +1,739 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include +#include "core/graph/graph_utils.h" +#include "core/optimizer/initializer.h" +#include "core/optimizer/nchwc_transformer.h" +#include "core/mlas/inc/mlas.h" + +using namespace ONNX_NAMESPACE; +using namespace ::onnxruntime::common; +namespace onnxruntime { + +class NchwcTransformerImpl { + public: + NchwcTransformerImpl(Graph& graph) noexcept : graph_(graph) {} + + void Transform(Node& node); + void Finalize(bool& modified); + + static constexpr int kNchwcBatchChannelDims = 2; + static constexpr int kNchwcSpatialDims = 2; + static constexpr int kNchwcDims = kNchwcBatchChannelDims + kNchwcSpatialDims; + + private: + // Associate the following state with each created NCHWc output keyed off the + // original NodeArg. + struct NchwcArgument { + // Symbolic shape information for this NCHWc output. Each dimension stores + // the original NodeArg* that sourced the value. Spatial dimensions also + // track the number of times the original value has been shifted down due + // to a stride count of 2. + // + // For example, the first Conv node that takes NCHW input will create a + // NchwcArgument with the shape referencing itself. 
Other NCHWc nodes that + // use this first Conv node then do a limited shape inference. The shape + // inference carries forward any of the first Conv node's dimensions that + // are unchanged or resets to the NodeArg* of the updated output node. + // + // The benefit of doing this is for models where the model inputs are not + // fixed. For example, The YoloV3 model has the image height and width as + // parameters. The model has branches that are candidates for Conv/Add + // fusion that can be detected using this additional shape hint. + struct Shape { + const NodeArg* dims_[kNchwcDims]; + size_t shifts_[kNchwcSpatialDims]; + + Shape(const NodeArg* initial_dim) { + std::fill_n(dims_, kNchwcDims, initial_dim); + std::fill_n(shifts_, kNchwcSpatialDims, 0); + } + + bool IsDimEqual(const Shape& other, int dim) const { + bool is_dim_equal = false; + // Test if this dimension is derived from the same NodeArg. + if (dims_[dim] == other.dims_[dim]) { + if (dim >= kNchwcBatchChannelDims) { + // Test if the NodeArg has been shifted down the same number of + // times due to striding. + int spatial_dim = dim - kNchwcBatchChannelDims; + if (shifts_[spatial_dim] == other.shifts_[spatial_dim]) { + is_dim_equal = true; + } + } else { + is_dim_equal = true; + } + } + return is_dim_equal; + } + }; + + // Stores the node that generated the NCHWc output. + Node& output_node_; + + // Stores the NodeArg that represents the NCHWc output. + NodeArg* nchwc_arg_; + + // Stores the original number of uses for the original NodeArg. Edges are + // removed from the graph as nodes are converted to NCHWc form. + const size_t starting_original_uses_; + + // Stores the remaining number of uses for the original NodeArg. The count + // is decremented as uses are converted to NCHWc format. Nodes are inserted + // to reorder the output if this count is non-zero. + size_t remaining_original_uses_; + + // Stores the logical number of channels for this NCHWc output. The NCHWc + // NodeArg is zero padded to the NCHWc block size. If the output needs to + // be reordered back to a standard tensor format, this channel count is + // used to generate the expected number of channels. + const int64_t channels_; + + // Stores the proto shape for the NCHWc output. + NchwcArgument::Shape shape_; + + NchwcArgument(Node& output_node, NodeArg* output_nchwc_arg, size_t original_uses, size_t channels, const NchwcArgument::Shape& shape) + : output_node_(output_node), + nchwc_arg_(output_nchwc_arg), + starting_original_uses_(original_uses), + remaining_original_uses_(original_uses), + channels_(channels), + shape_(shape) { + } + }; + + size_t RemoveOutputEdges(Node& node); + void CreateNchwcArgument(Node& node, Node& nchwc_node, int64_t channels, const NchwcArgument::Shape& shape); + void FuseNchwcArgument(Node& node, const NchwcArgument& nchwc_arg); + void InsertReorderInput(Node& node); + + void ConvPoolShapeInference(const Node& node, + const NchwcArgument::Shape& input_shape, + NchwcArgument::Shape& output_shape, + const ONNX_NAMESPACE::TensorProto* filter_shape); + + void TransformConv(Node& node); + void TransformPool(Node& node); + void TransformAdd(Node& node); + void TransformConcat(Node& node); + void TransformActivation(Node& node); + + Graph& graph_; + + // Stores a queue of nodes to be removed after walking through the graph. + std::deque removed_nodes_; + + // Stores a mapping from the original NodeArg outputs to the NCHWc variants + // created inside this graph transform. 
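The symbolic shape bookkeeping described above boils down to comparing (source NodeArg, stride-2 shift count) pairs. A self-contained toy version, purely to illustrate the idea; none of these names exist in the patch:

```
// Toy model of the NchwcArgument::Shape dimension comparison: a dimension is
// "equal" when it comes from the same source NodeArg and has been halved by
// stride-2 operations the same number of times, even if its value is unknown.
#include <cassert>

struct SymbolicDim {
  const void* source;  // stands in for the NodeArg* that produced the dimension
  int shift;           // times the dimension was halved by a stride-2 Conv/Pool
  bool operator==(const SymbolicDim& o) const { return source == o.source && shift == o.shift; }
};

int main() {
  int image_h;                        // stands in for the (unknown) input height NodeArg
  SymbolicDim branch_a{&image_h, 1};  // path A: one stride-2 convolution
  SymbolicDim branch_b{&image_h, 1};  // path B: one stride-2 convolution
  assert(branch_a == branch_b);       // so an Add of A and B can stay in NCHWc form
  return 0;
}
```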
+ std::unordered_map> nchwc_args_; + + // Stores a mapping of NodeArg inputs that have already been reordered, so + // multiple nodes can share the NCHWc input. + std::unordered_map reorder_inputs_; + + // Stores a mapping of NodeArg filters that have already been reordered, so + // multiple nodes can share the NCHWc filter. + std::unordered_map filters_OIHWBo_; + std::unordered_map filters_OIHWBiBo_; + + // Stores a mapping of NodeArg biases that have already been aligned to the + // NCHWc block size, so multiple nodes can share the NCHWc biases. + std::unordered_map aligned_biases_; +}; + +size_t NchwcTransformerImpl::RemoveOutputEdges(Node& node) { + size_t output_edges_count = node.GetOutputEdgesCount(); + if (output_edges_count > 0) { + graph_utils::RemoveNodeOutputEdges(graph_, node); + } else { + // Bias the edge count to handle the case of a node that produces a graph + // output. + output_edges_count = 1; + } + return output_edges_count; +} + +void NchwcTransformerImpl::CreateNchwcArgument(Node& node, + Node& nchwc_node, + int64_t channels, + const NchwcArgument::Shape& shape) { + size_t original_uses = RemoveOutputEdges(node); + + // Create a new NodeArg to track the output from the NCHWc node. + auto& output_defs = nchwc_node.MutableOutputDefs(); + auto* output_original_arg = output_defs[0]; + std::string output_reorder_def_name = graph_.GenerateNodeArgName("reorder"); + auto* output_nchwc_arg = &graph_.GetOrCreateNodeArg(output_reorder_def_name, nullptr); + nchwc_args_[output_original_arg] = + std::make_unique(nchwc_node, output_nchwc_arg, original_uses, channels, shape); + output_defs[0] = output_nchwc_arg; +} + +void NchwcTransformerImpl::FuseNchwcArgument(Node& node, const NchwcArgument& nchwc_arg) { + size_t original_uses = RemoveOutputEdges(node); + + // Associate the existing NCHWc NodeArg with the output from this node. + auto* output_original_arg = node.MutableOutputDefs()[0]; + auto& nchwc_node = nchwc_arg.output_node_; + auto* output_nchwc_arg = nchwc_node.MutableOutputDefs()[0]; + nchwc_args_[output_original_arg] = + std::make_unique(nchwc_node, output_nchwc_arg, original_uses, nchwc_arg.channels_, nchwc_arg.shape_); +} + +void NchwcTransformerImpl::InsertReorderInput(Node& node) { + auto& input_defs = node.MutableInputDefs(); + auto* input_original_arg = input_defs[0]; + + auto it = reorder_inputs_.find(input_original_arg); + if (it == reorder_inputs_.end()) { + std::string input_reorder_def_name = graph_.GenerateNodeArgName("reorder"); + auto* input_nchwc_arg = &graph_.GetOrCreateNodeArg(input_reorder_def_name, nullptr); + reorder_inputs_[input_original_arg] = input_nchwc_arg; + Node& reorder_input_node = graph_.AddNode(graph_.GenerateNodeName("ReorderInput"), + "ReorderInput", + "ReorderInput", + {input_original_arg}, + {input_nchwc_arg}, + nullptr, + kMSNchwcDomain); + reorder_input_node.SetExecutionProviderType(node.GetExecutionProviderType()); + input_defs[0] = input_nchwc_arg; + } else { + input_defs[0] = it->second; + } +} + +void NchwcTransformerImpl::ConvPoolShapeInference(const Node& node, + const NchwcArgument::Shape& input_shape, + NchwcArgument::Shape& output_shape, + const ONNX_NAMESPACE::TensorProto* filter_shape) { + // Skip the leading batch and channel counts. + const int kernel_size = kNchwcSpatialDims; + + // Maintain the batch count dimension from the NCHWc input. 
+ output_shape.dims_[0] = input_shape.dims_[0]; + + const ONNX_NAMESPACE::AttributeProto* pads_attr = graph_utils::GetNodeAttribute(node, "pads"); + const ONNX_NAMESPACE::AttributeProto* strides_attr = graph_utils::GetNodeAttribute(node, "strides"); + const ONNX_NAMESPACE::AttributeProto* dilations_attr = graph_utils::GetNodeAttribute(node, "dilations"); + + if ((pads_attr != nullptr && pads_attr->ints_size() != kernel_size * 2) || + (strides_attr != nullptr && strides_attr->ints_size() != kernel_size) || + (dilations_attr != nullptr && dilations_attr->ints_size() != kernel_size)) { + return; + } + + // Require the kernel_shape attribute for pooling operators. Convolution + // uses the weight tensor shape to derive the kernel shape. + const ONNX_NAMESPACE::AttributeProto* kernel_shape_attr = nullptr; + if (filter_shape == nullptr) { + kernel_shape_attr = graph_utils::GetNodeAttribute(node, "kernel_shape"); + if (kernel_shape_attr == nullptr || kernel_shape_attr->ints_size() != kernel_size) { + return; + } + } + + auto* auto_pad_attr = graph_utils::GetNodeAttribute(node, "auto_pad"); + bool auto_pad_same_shape = false; + if (auto_pad_attr != nullptr && auto_pad_attr->has_s()) { + auto& auto_pad = auto_pad_attr->s(); + if (auto_pad != "NOTSET") { + if (auto_pad == "SAME_UPPER" || auto_pad == "SAME_LOWER") { + auto_pad_same_shape = true; + } else if (auto_pad != "VALID") { + return; + } + pads_attr = nullptr; + } + } + + for (int i = 0; i < kernel_size; i++) { + if (dilations_attr != nullptr && dilations_attr->ints(i) != 1) { + continue; + } + + int64_t stride = 1; + if (strides_attr != nullptr) { + stride = strides_attr->ints(i); + if (stride != 1 && stride != 2) { + continue; + } + } + + int64_t padding = 0; + if (pads_attr != nullptr) { + padding = pads_attr->ints(i) + pads_attr->ints(i + kernel_size); + } + + int64_t kernel; + if (kernel_shape_attr != nullptr) { + kernel = kernel_shape_attr->ints(i); + } else { + kernel = filter_shape->dims(2 + i); + } + + // Maintain the spatial dimension from the NCHWc input if the implicit or + // explicit padding results in the same symbolic dimension before applying + // the stride. When the stride is 2, then the actual output dimensions is + // half the original value. Track the number of times the symbolic dimension + // has been halved in the shifts field. + if (padding + 1 == kernel || auto_pad_same_shape) { + output_shape.dims_[kNchwcBatchChannelDims + i] = input_shape.dims_[kNchwcBatchChannelDims + i]; + output_shape.shifts_[i] = input_shape.shifts_[i] + static_cast(stride) - 1; + } + } +} + +void NchwcTransformerImpl::TransformConv(Node& node) { + auto& input_defs = node.MutableInputDefs(); + auto& output_defs = node.MutableOutputDefs(); + + // Require that the weights tensor be static. 
+ const ONNX_NAMESPACE::TensorProto* conv_W_tensor_proto = nullptr; + if (!graph_utils::NodeArgIsConstant(graph_, *input_defs[1]) || + !graph_.GetInitializedTensor(input_defs[1]->Name(), conv_W_tensor_proto) || + (conv_W_tensor_proto->data_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT) || + (conv_W_tensor_proto->dims_size() != 4)) { + return; + } + + const int64_t output_channels = conv_W_tensor_proto->dims(0); + const int64_t input_channels = conv_W_tensor_proto->dims(1); + + int64_t group_count; + auto* group_attr = graph_utils::GetNodeAttribute(node, "group"); + if (group_attr != nullptr && group_attr->has_i()) { + group_count = group_attr->i(); + } else { + group_count = 1; + } + + const size_t nchwc_block_size = MlasNchwcGetBlockSize(); + + const int64_t nchwc_output_channels = (output_channels + nchwc_block_size - 1) & ~(nchwc_block_size - 1); + + bool do_reorder_input = true; + bool reorder_filter_OIHWBo = false; + + if (group_count > 1) { + if ((output_channels % nchwc_block_size) != 0) { + return; + } + if (input_channels == 1 && output_channels == group_count) { + // Depthwise convolution. + reorder_filter_OIHWBo = true; + } else if (((input_channels % nchwc_block_size) != 0) || + ((output_channels % group_count) != 0) || + (((output_channels / group_count) % nchwc_block_size) != 0)) { + return; + } + } else { + if (static_cast(input_channels) < nchwc_block_size) { + // Use NCHW input buffer directly. + reorder_filter_OIHWBo = true; + do_reorder_input = false; + } else if ((input_channels % nchwc_block_size) != 0) { + return; + } + } + + // Also require that the optional bias tensor be static. + const ONNX_NAMESPACE::TensorProto* conv_B_tensor_proto = nullptr; + if (input_defs.size() >= 3) { + if (!graph_utils::NodeArgIsConstant(graph_, *input_defs[2]) || + !graph_.GetInitializedTensor(input_defs[2]->Name(), conv_B_tensor_proto) || + (conv_B_tensor_proto->data_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT) || + (conv_B_tensor_proto->dims_size() != 1) || + (conv_B_tensor_proto->dims(0) != output_channels)) { + return; + } + } + + // Check if the filter has already been converted to the target format. + std::unordered_map* filters_map; + if (reorder_filter_OIHWBo) { + filters_map = &filters_OIHWBo_; + } else { + filters_map = &filters_OIHWBiBo_; + } + + NodeArg* nchwc_conv_W_arg; + auto filters_it = filters_map->find(input_defs[1]); + if (filters_it != filters_map->end()) { + // Reuse the existing NodeArg. + nchwc_conv_W_arg = filters_it->second; + } else { + auto conv_W = std::make_unique(conv_W_tensor_proto); + + std::vector reordered_filter(conv_W->size() / output_channels * nchwc_output_channels); + + // Reorder the weights tensor statically. 
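As an aside on the channel alignment used above: output channels are rounded up to the NCHWc block size, and the extra filter and bias entries are left zero filled. A tiny sketch of the rounding expression:

```
// Illustrative only: round a channel count up to the NCHWc block size
// (a power of two, e.g. 8 for AVX/AVX2 or 16 for AVX-512).
#include <cstdint>

int64_t RoundUpToNchwcBlock(int64_t channels, int64_t nchwc_block_size) {
  return (channels + nchwc_block_size - 1) & ~(nchwc_block_size - 1);
}

// RoundUpToNchwcBlock(100, 16) == 112; the 12 padded filters and bias entries stay zero.
```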
+ if (reorder_filter_OIHWBo) { + MlasReorderFilterOIHWBo(conv_W->dims().data(), conv_W->data(), reordered_filter.data()); + } else { + MlasReorderFilterOIHWBiBo(conv_W->dims().data(), conv_W->data(), reordered_filter.data()); + } + + ONNX_NAMESPACE::TensorProto nchwc_conv_W_tensor_proto; + + nchwc_conv_W_tensor_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + nchwc_conv_W_tensor_proto.set_name(graph_.GenerateNodeArgName("reorder")); + nchwc_conv_W_tensor_proto.set_raw_data(reordered_filter.data(), reordered_filter.size() * sizeof(float)); + + nchwc_conv_W_tensor_proto.add_dims(nchwc_output_channels); + for (size_t i = 1; i < 4; i++) { + nchwc_conv_W_tensor_proto.add_dims(conv_W->dims()[i]); + } + + graph_.AddInitializedTensor(nchwc_conv_W_tensor_proto); + + nchwc_conv_W_arg = &graph_.GetOrCreateNodeArg(nchwc_conv_W_tensor_proto.name(), nullptr); + filters_map->emplace(input_defs[1], nchwc_conv_W_arg); + } + + // Align the optional bias tensor up to the number of NCHWc output channels. + NodeArg* nchwc_conv_B_arg = nullptr; + if ((conv_B_tensor_proto != nullptr) && (output_channels != nchwc_output_channels)) { + auto biases_it = aligned_biases_.find(input_defs[2]); + if (biases_it != aligned_biases_.end()) { + // Reuse the existing NodeArg. + nchwc_conv_B_arg = biases_it->second; + } else { + auto conv_B = std::make_unique(conv_B_tensor_proto); + + std::vector aligned_bias(nchwc_output_channels); + std::copy_n(conv_B->data(), output_channels, aligned_bias.data()); + + ONNX_NAMESPACE::TensorProto nchwc_conv_B_tensor_proto; + + nchwc_conv_B_tensor_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + nchwc_conv_B_tensor_proto.set_name(graph_.GenerateNodeArgName("reorder")); + nchwc_conv_B_tensor_proto.set_raw_data(aligned_bias.data(), aligned_bias.size() * sizeof(float)); + + nchwc_conv_B_tensor_proto.add_dims(nchwc_output_channels); + + graph_.AddInitializedTensor(nchwc_conv_B_tensor_proto); + + nchwc_conv_B_arg = &graph_.GetOrCreateNodeArg(nchwc_conv_B_tensor_proto.name(), nullptr); + aligned_biases_.emplace(input_defs[2], nchwc_conv_B_arg); + } + } + + // Create the replacement node. + std::string nchwc_node_name = graph_.GenerateNodeName(output_defs[0]->Name() + "_nchwc"); + Node& nchwc_node = graph_.AddNode(nchwc_node_name, + "Conv", + nchwc_node_name, + input_defs, + output_defs, + &node.GetAttributes(), + kMSNchwcDomain); + nchwc_node.SetExecutionProviderType(node.GetExecutionProviderType()); + + nchwc_node.MutableInputDefs()[1] = nchwc_conv_W_arg; + + if (nchwc_conv_B_arg != nullptr) { + nchwc_node.MutableInputDefs()[2] = nchwc_conv_B_arg; + } + + NchwcArgument::Shape output_shape(output_defs[0]); + + if (do_reorder_input) { + auto it = nchwc_args_.find(input_defs[0]); + if (it == nchwc_args_.end()) { + InsertReorderInput(nchwc_node); + } else { + auto* nchwc_input = it->second.get(); + nchwc_node.MutableInputDefs()[0] = nchwc_input->nchwc_arg_; + nchwc_input->remaining_original_uses_--; + ConvPoolShapeInference(node, nchwc_input->shape_, output_shape, conv_W_tensor_proto); + } + } + + CreateNchwcArgument(node, nchwc_node, output_channels, output_shape); + removed_nodes_.push_front(node.Index()); +} + +void NchwcTransformerImpl::TransformPool(Node& node) { + auto& input_defs = node.MutableInputDefs(); + auto& output_defs = node.MutableOutputDefs(); + + // Bail out if MaxPool has the optional index tensor specified. 
+ if (output_defs.size() > 1) { + return; + } + + const size_t nchwc_block_size = MlasNchwcGetBlockSize(); + + auto* input_shape = input_defs[0]->Shape(); + if ((input_shape == nullptr) || (input_shape->dim_size() != 4)) { + return; + } + auto& channels_dim = input_shape->dim(1); + if (!channels_dim.has_dim_value()) { + return; + } + const int64_t channels = channels_dim.dim_value(); + if ((channels % nchwc_block_size) != 0) { + return; + } + + // Create the replacement node. + std::string nchwc_node_name = graph_.GenerateNodeName(output_defs[0]->Name() + "_nchwc"); + Node& nchwc_node = graph_.AddNode(nchwc_node_name, + node.OpType(), + nchwc_node_name, + input_defs, + output_defs, + &node.GetAttributes(), + kMSNchwcDomain); + nchwc_node.SetExecutionProviderType(node.GetExecutionProviderType()); + + NchwcArgument::Shape output_shape(output_defs[0]); + + auto it = nchwc_args_.find(input_defs[0]); + if (it == nchwc_args_.end()) { + InsertReorderInput(nchwc_node); + } else { + auto* nchwc_input = it->second.get(); + nchwc_node.MutableInputDefs()[0] = nchwc_input->nchwc_arg_; + nchwc_input->remaining_original_uses_--; + ConvPoolShapeInference(node, nchwc_input->shape_, output_shape, nullptr); + } + + CreateNchwcArgument(node, nchwc_node, channels, output_shape); + removed_nodes_.push_front(node.Index()); +} + +// The existing Add/Sum operator implementations can be used with tensors +// in NCHWc format if the tensor shapes are exactly the same (elementwise +// add). +void NchwcTransformerImpl::TransformAdd(Node& node) { + auto& input_defs = node.MutableInputDefs(); + + // Verify that all of the inputs to this operator are from NCHWc outputs. + std::vector nchwc_inputs; + size_t input_defs_count = input_defs.size(); + nchwc_inputs.reserve(input_defs_count); + for (size_t i = 0; i < input_defs_count; i++) { + auto it = nchwc_args_.find(input_defs[i]); + if (it == nchwc_args_.end()) { + return; + } + nchwc_inputs.push_back(it->second.get()); + } + + // Test if all of the NCHWc inputs have a compatible shape. + auto* nchwc_input_0 = nchwc_inputs[0]; + auto* nchwc_input_0_shape = input_defs[0]->Shape(); + for (size_t n = 1; n < input_defs_count; n++) { + auto* nchwc_input_n = nchwc_inputs[n]; + for (int i = 0; i < kNchwcDims; i++) { + // Test if this dimension is derived from the same NodeArg. + if (!nchwc_input_0->shape_.IsDimEqual(nchwc_input_n->shape_, i)) { + // Check if ONNX shape inferencing has computed a precise dimension value. + auto* nchwc_input_n_shape = input_defs[n]->Shape(); + if ((nchwc_input_0_shape == nullptr) || (nchwc_input_n_shape == nullptr)) { + return; + } + auto& nchwc_input_0_dim = nchwc_input_0_shape->dim(i); + auto& nchwc_input_n_dim = nchwc_input_n_shape->dim(i); + if (!nchwc_input_0_dim.has_dim_value() || + !nchwc_input_n_dim.has_dim_value() || + (nchwc_input_0_dim.dim_value() <= 0) || + (nchwc_input_0_dim.dim_value() != nchwc_input_n_dim.dim_value())) { + return; + } + } + } + } + + // Update the node to directly use the NCHWc inputs directly and decrement + // the original use counts of the NCHWc inputs. + for (size_t n = 0; n < input_defs_count; n++) { + input_defs[n] = nchwc_inputs[n]->nchwc_arg_; + nchwc_inputs[n]->remaining_original_uses_--; + } + + // If one of the inputs to the Add/Sum node is a NCHWc convolution, then + // attempt to fuse the addition into the convolution itself. 
+ if (input_defs_count == 2) { + for (size_t n = 0; n < 2; n++) { + auto* nchwc_input_n = nchwc_inputs[n]; + auto& nchwc_node = nchwc_input_n->output_node_; + auto& nchwc_input_defs = nchwc_node.MutableInputDefs(); + auto& nchwc_input_args_count = nchwc_node.MutableInputArgsCount(); + // Check if this is a single use NCHWc convolution that hasn't already + // been fused with another Add/Sum node. The Add/Sum can also only be + // fused if the convolution isn't itself fused with an activation. + if ((nchwc_node.OpType() == "Conv") && (nchwc_node.Domain() == kMSNchwcDomain) && + (nchwc_input_defs.size() < 4) && (nchwc_input_args_count.size() < 4) && + (nchwc_input_n->starting_original_uses_ == 1) && + (graph_utils::GetNodeAttribute(nchwc_node, "activation") == nullptr)) { + // Feed the output of the other NCHWc node into the selected convolution + // node. + nchwc_input_defs.resize(4); + nchwc_input_defs[3] = nchwc_inputs[n ^ 1]->output_node_.MutableOutputDefs()[0]; + nchwc_input_args_count.resize(4); + nchwc_input_args_count[3] = 1; + + FuseNchwcArgument(node, *nchwc_input_n); + removed_nodes_.push_front(node.Index()); + return; + } + } + } + + CreateNchwcArgument(node, node, nchwc_input_0->channels_, nchwc_input_0->shape_); +} + +void NchwcTransformerImpl::TransformConcat(Node& node) { + auto& input_defs = node.MutableInputDefs(); + auto& output_defs = node.MutableOutputDefs(); + + // Verify that this is a concatenation along the channel axis. + auto* axis_attr = graph_utils::GetNodeAttribute(node, "axis"); + if (axis_attr == nullptr || !axis_attr->has_i() || axis_attr->i() != 1) { + return; + } + + const size_t nchwc_block_size = MlasNchwcGetBlockSize(); + + // Verify that all of the inputs to this operator are from NCHWc outputs. + std::vector nchwc_inputs; + size_t input_defs_count = input_defs.size(); + nchwc_inputs.reserve(input_defs_count); + int64_t total_channels = 0; + for (size_t i = 0; i < input_defs_count; i++) { + auto it = nchwc_args_.find(input_defs[i]); + if (it == nchwc_args_.end()) { + return; + } + // Verify that the logical number of channels is block aligned. + int64_t input_channels = it->second->channels_; + if ((input_channels % nchwc_block_size) != 0) { + return; + } + total_channels += input_channels; + nchwc_inputs.push_back(it->second.get()); + } + + // Update the node to directly use the NCHWc inputs directly and decrement + // the original use counts of the NCHWc inputs. + for (size_t n = 0; n < input_defs_count; n++) { + input_defs[n] = nchwc_inputs[n]->nchwc_arg_; + nchwc_inputs[n]->remaining_original_uses_--; + } + + // Copy the shape from any of the NCHWc inputs, but use the current node for + // the channel dimension. + NchwcArgument::Shape output_shape = nchwc_inputs[0]->shape_; + output_shape.dims_[1] = output_defs[0]; + + CreateNchwcArgument(node, node, total_channels, output_shape); +} + +// After doing a Conv/Add fusion, there may be an activation node that could now +// be fused into the Conv node as well. +void NchwcTransformerImpl::TransformActivation(Node& node) { + auto& input_defs = node.MutableInputDefs(); + + auto it = nchwc_args_.find(input_defs[0]); + if (it != nchwc_args_.end()) { + auto& nchwc_input = it->second; + input_defs[0] = nchwc_input->nchwc_arg_; + nchwc_input->remaining_original_uses_--; + + // Check if this is a single use NCHWc convolution that hasn't already + // been fused with another activation. 
+ auto& nchwc_node = nchwc_input->output_node_; + if ((nchwc_node.OpType() == "Conv") && (nchwc_node.Domain() == kMSNchwcDomain) && + (nchwc_input->starting_original_uses_ == 1) && + (graph_utils::GetNodeAttribute(nchwc_node, "activation") == nullptr)) { + nchwc_node.AddAttribute("activation", node.OpType()); + FuseNchwcArgument(node, *nchwc_input); + removed_nodes_.push_front(node.Index()); + } else { + CreateNchwcArgument(node, node, nchwc_input->channels_, nchwc_input->shape_); + } + } +} + +void NchwcTransformerImpl::Transform(Node& node) { + if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "Conv", {1}) || + graph_utils::IsSupportedOptypeVersionAndDomain(node, "FusedConv", {1}, kMSDomain)) { + TransformConv(node); + } else if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "MaxPool", {1, 8, 10}) || + graph_utils::IsSupportedOptypeVersionAndDomain(node, "AveragePool", {1, 7, 10}) || + graph_utils::IsSupportedOptypeVersionAndDomain(node, "GlobalMaxPool", {1}) || + graph_utils::IsSupportedOptypeVersionAndDomain(node, "GlobalAveragePool", {1})) { + TransformPool(node); + } else if (node.GetInputEdgesCount() == 0 && node.InputDefs().size() != 0) { + // The following transforms only run when the input edge count has already + // been decremented to zero by earlier transforms. This is a hint that the + // node may already have all inputs converted to NCHWc format and is not + // needed for correct operation. This avoids doing extra string checks for + // nodes unrelated to this transformer. + if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "Add", {7}) || + graph_utils::IsSupportedOptypeVersionAndDomain(node, "Sum", {6, 8})) { + TransformAdd(node); + } else if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "Concat", {4})) { + TransformConcat(node); + } else if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "Relu", {6})) { + TransformActivation(node); + } + } + + // The node may not match any of the checks above or may not have been + // transformed for other reasons such as unsupported attributes or alignment. + // However, the node may still use an input that has been produced by a NCHWc + // node. Finalize() walks through the list of NCHWc outputs and inserts the + // needed reorder operations to ensure that these inputs remain in NCHW + // format. +} + +void NchwcTransformerImpl::Finalize(bool& modified) { + // Create ReorderOutput nodes for any NCHWc outputs that still have uses with + // the original tensor format. 
+ for (auto& nchwc_output : nchwc_args_) { + if (nchwc_output.second->remaining_original_uses_ > 0) { + auto* output_original_arg = nchwc_output.first; + auto* output_nchwc_arg = nchwc_output.second->nchwc_arg_; + Node& reorder_output_node = graph_.AddNode(graph_.GenerateNodeName("ReorderOutput"), + "ReorderOutput", + "ReorderOutput", + {output_nchwc_arg}, + {output_original_arg}, + nullptr, + kMSNchwcDomain); + reorder_output_node.AddAttribute("channels", nchwc_output.second->channels_); + reorder_output_node.SetExecutionProviderType(kCpuExecutionProvider); + } + } + + for (auto index : removed_nodes_) { + graph_.RemoveNode(index); + } + + if (!removed_nodes_.empty()) { + modified = true; + } +} + +Status NchwcTransformer::ApplyImpl(Graph& graph, bool& modified, int graph_level) const { + NchwcTransformerImpl impl(graph); + GraphViewer graph_viewer(graph); + + for (auto index : graph_viewer.GetNodesInTopologicalOrder()) { + auto& node = *graph.GetNode(index); + ORT_RETURN_IF_ERROR(Recurse(node, modified, graph_level)); + if (node.GetExecutionProviderType() == kCpuExecutionProvider) { + impl.Transform(node); + } + } + impl.Finalize(modified); + return Status::OK(); +} + +} // namespace onnxruntime diff --git a/onnxruntime/core/optimizer/nchwc_transformer.h b/onnxruntime/core/optimizer/nchwc_transformer.h new file mode 100644 index 0000000000000..68b7ddd6e22bc --- /dev/null +++ b/onnxruntime/core/optimizer/nchwc_transformer.h @@ -0,0 +1,25 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/common/common.h" +#include "core/optimizer/graph_transformer.h" + +namespace onnxruntime { + +/** +@Class NchwcTransformer + +Transformer that optimizes the graph by using NCHWc nodes instead of NCHW nodes +and inserts nodes to reorder tensors as needed. +*/ +class NchwcTransformer : public GraphTransformer { + public: + NchwcTransformer() noexcept : GraphTransformer("NchwcTransformer") {} + + private: + Status ApplyImpl(Graph& graph, bool& modified, int graph_level) const override; +}; + +} // namespace onnxruntime diff --git a/onnxruntime/core/optimizer/optimizer_execution_frame.cc b/onnxruntime/core/optimizer/optimizer_execution_frame.cc index 5b6e660f1a5ae..cd64d2398228b 100644 --- a/onnxruntime/core/optimizer/optimizer_execution_frame.cc +++ b/onnxruntime/core/optimizer/optimizer_execution_frame.cc @@ -3,6 +3,7 @@ #include "core/common/status.h" #include "core/common/logging/logging.h" #include "core/common/logging/macros.h" +#include "core/framework/data_transfer_manager.h" #include "core/framework/tensorprotoutils.h" #include "core/framework/data_types.h" #include "core/framework/mldata_type_utils.h" @@ -16,12 +17,14 @@ namespace onnxruntime { OptimizerExecutionFrame::Info::Info(const std::vector& nodes, const InitializedTensorSet& initialized_tensor_set) { // Create CPU execution provider - // For now, CPU execution provider will be created every time when initilizing Info. + // For now, CPU execution provider will be created every time when initializing Info. // Later, it will be changed to pass by Info ctor. 
cpu_execution_provider_ = std::make_unique(CPUExecutionProviderInfo()); allocator_ptr_ = cpu_execution_provider_->GetAllocator(device_id_, mem_type_); ORT_ENFORCE(allocator_ptr_ != nullptr, "Failed to get allocator for optimizer"); + data_transfer_mgr_.RegisterDataTransfer(std::make_unique()); + // Create MLValues related maps auto initialize_maps = [this, &initialized_tensor_set](const NodeArg& arg, size_t /*index*/) -> Status { int idx = ort_value_name_idx_map_.Add(arg.Name()); @@ -63,7 +66,7 @@ OptimizerExecutionFrame::Info::Info(const std::vector& nodes, std::unique_ptr op_kernel; std::shared_ptr kernel_registry = cpu_execution_provider_->GetKernelRegistry(); auto status = kernel_registry->TryCreateKernel(*node, *cpu_execution_provider_, initializers_, - ort_value_name_idx_map_, FuncManager(), op_kernel); + ort_value_name_idx_map_, FuncManager(), data_transfer_mgr_, op_kernel); kernels_[node->Index()] = std::move(op_kernel); } } diff --git a/onnxruntime/core/optimizer/optimizer_execution_frame.h b/onnxruntime/core/optimizer/optimizer_execution_frame.h index 531fa67515386..41f5e85215699 100644 --- a/onnxruntime/core/optimizer/optimizer_execution_frame.h +++ b/onnxruntime/core/optimizer/optimizer_execution_frame.h @@ -7,19 +7,20 @@ #include "core/graph/graph.h" #include "core/providers/cpu/cpu_execution_provider.h" +#include "core/framework/data_transfer_manager.h" #include "core/framework/execution_frame.h" #include "core/framework/ort_value_name_idx_map.h" #include "core/framework/ml_value.h" #include "core/common/callback.h" namespace onnxruntime { +class DataTransferManager; class OptimizerExecutionFrame final : public IExecutionFrame { public: class Info { public: - Info(const std::vector& nodes, - const InitializedTensorSet& initialized_tensor_set); + Info(const std::vector& nodes, const InitializedTensorSet& initialized_tensor_set); ~Info() { for (auto& kvp : deleter_for_initialized_tensors_) { kvp.second.f(kvp.second.param); @@ -55,13 +56,13 @@ class OptimizerExecutionFrame final : public IExecutionFrame { const int device_id_{0}; const OrtMemType mem_type_{OrtMemTypeDefault}; AllocatorPtr allocator_ptr_; - + DataTransferManager data_transfer_mgr_; // MLValues for optimizer OrtValueNameIdxMap ort_value_name_idx_map_; std::unordered_map ort_value_idx_nodearg_map_; std::unordered_map initializers_; std::unordered_map> buffer_for_initialized_tensors_; - // This data structure is for unintializing string tensors and + // This data structure is for uninitializing string tensors and // munmap memory region and close file descriptor std::unordered_map deleter_for_initialized_tensors_; std::unique_ptr node_index_info_; diff --git a/onnxruntime/core/optimizer/transformer_memcpy.cc b/onnxruntime/core/optimizer/transformer_memcpy.cc index b52d7f41afb0a..c6f57900b9881 100644 --- a/onnxruntime/core/optimizer/transformer_memcpy.cc +++ b/onnxruntime/core/optimizer/transformer_memcpy.cc @@ -18,10 +18,10 @@ class TransformerMemcpyImpl { bool ModifyGraph(const KernelRegistryManager& schema_registries); private: - void ProcessDefs(onnxruntime::Node& node, const KernelRegistryManager& kernel_registries); + void ProcessDefs(onnxruntime::Node& node, const KernelRegistryManager& kernel_registries, InitializedTensorSet& initializers_consumed); void BuildDefsMapping(const onnxruntime::NodeArg* arg, const KernelRegistryManager& kernel_registries); void AddCopyNode(onnxruntime::NodeArg* arg, bool is_input); - void ProcessInitializers(const KernelRegistryManager& kernel_registries); + bool 
ProcessInitializers(const KernelRegistryManager& kernel_registries, const InitializedTensorSet& initializers_consumed); private: ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(TransformerMemcpyImpl); @@ -52,6 +52,19 @@ class TransformerMemcpyImpl { std::string provider_; }; +/** Helper that returns a pointer to the corresponding TensorProto for a name if it is an initializer. +@param check_outer_scope If true and the graph is a subgraph, check parent graph/s for 'name' if not found in 'graph'. +*/ +static const onnx::TensorProto* GetInitializer(const Graph& graph, const std::string& name, bool check_outer_scope) { + const onnx::TensorProto* initializer = nullptr; + if (graph.GetInitializedTensor(name, initializer)) { + return initializer; + } else if (check_outer_scope && graph.IsSubgraph()) { + return GetInitializer(*graph.ParentGraph(), name, check_outer_scope); + } + return initializer; +} + // very simple GraphTransformer that uses TransformerMemcpyImpl for each graph // and mainly provides the subgraph recursion functionality common::Status MemcpyTransformer::ApplyImpl(Graph& graph, bool& modified, int graph_level) const { @@ -63,7 +76,8 @@ common::Status MemcpyTransformer::ApplyImpl(Graph& graph, bool& modified, int gr provider != onnxruntime::kTensorrtExecutionProvider && provider != onnxruntime::kOpenVINOExecutionProvider) { TransformerMemcpyImpl copy_impl(graph, provider); - modified = copy_impl.ModifyGraph(registry_manager_); + auto current_modified = copy_impl.ModifyGraph(registry_manager_); + modified = modified || current_modified; } } @@ -109,14 +123,16 @@ This transformer does not currently optimize copies between, e.g., two different bool TransformerMemcpyImpl::ModifyGraph(const KernelRegistryManager& kernel_registries) { bool modified = false; + InitializedTensorSet initializers_consumed; // find defs that require copy for (auto& node : graph_.Nodes()) { - //don't need to do node placement here now, onnxruntime will do it according to registered kernels. 
- ProcessDefs(node, kernel_registries); + //as we process the defs, collect all the initializers consumed at the current graph level + ProcessDefs(node, kernel_registries, initializers_consumed); } // for initializers shared by different providers, create dups - ProcessInitializers(kernel_registries); + if (ProcessInitializers(kernel_registries, initializers_consumed)) + modified = true; for (auto arg : graph_.GetInputs()) BuildDefsMapping(arg, kernel_registries); @@ -150,21 +166,27 @@ bool TransformerMemcpyImpl::ModifyGraph(const KernelRegistryManager& kernel_regi return modified; } -void TransformerMemcpyImpl::ProcessDefs(onnxruntime::Node& node, const KernelRegistryManager& kernel_registries) { +void TransformerMemcpyImpl::ProcessDefs(onnxruntime::Node& node, const KernelRegistryManager& kernel_registries, InitializedTensorSet& initializers_consumed) { if (node.GetExecutionProviderType() == provider_) { provider_nodes_.insert(&node); // note KernelCreateInfo might be nullptr for custom kernel const KernelCreateInfo* kci = nullptr; kernel_registries.SearchKernelRegistry(node, &kci); - auto status = onnxruntime::Node::ForEachWithIndex(node.InputDefs(), - [this, &kci](const onnxruntime::NodeArg& arg, size_t index) { - if (kci && kci->kernel_def->IsInputOnCpu(index)) - non_provider_input_defs_.insert(&arg); - else - provider_input_defs_.insert(&arg); - return Status::OK(); - }); + auto status = onnxruntime::Node::ForEachWithIndex( + node.InputDefs(), + [this, &kci, &initializers_consumed](const onnxruntime::NodeArg& arg, size_t index) { + // check if this NodeArg is an initializer defined in current outer graph level + const auto* initializer_tensor_proto = + GetInitializer(graph_, arg.Name(), true); + if (initializer_tensor_proto != nullptr) + initializers_consumed[arg.Name()] = initializer_tensor_proto; + if (kci && kci->kernel_def->IsInputOnCpu(index)) + non_provider_input_defs_.insert(&arg); + else + provider_input_defs_.insert(&arg); + return Status::OK(); + }); ORT_ENFORCE(status.IsOK(), status.ErrorMessage()); @@ -274,9 +296,9 @@ static const onnxruntime::NodeArg* FindNodeArg(const NodeArgSetType& def_set, co // We duplicate any initializer that is used by both provider nodes and non-provider nodes // to ensure that provider nodes and non-provider nodes don't share initializers, as they // need to stay in different memory locations. 
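The change above collects every initializer consumed at the current graph level (walking outer scopes for implicit subgraph inputs) so that ProcessInitializers only has to consider initializers that are actually used. The duplication it then performs can be pictured in isolation: when the same initializer feeds both provider and non-provider inputs, a renamed copy is added to the graph and one consumer group is repointed at it. A minimal, self-contained sketch of that idea follows; ToyGraph, DuplicateIfShared and the string payload are illustrative stand-ins, not onnxruntime types.

```
// Toy model of the initializer-duplication step (not onnxruntime code).
#include <map>
#include <set>
#include <string>

struct ToyGraph {
  // name -> payload; the payload stands in for an onnx TensorProto.
  std::map<std::string, std::string> initializers;

  // If 'name' is consumed by both groups, add a renamed copy and return the
  // new name so the caller can repoint one group's defs at it.
  std::string DuplicateIfShared(const std::string& name,
                                const std::set<std::string>& provider_inputs,
                                const std::set<std::string>& non_provider_inputs) {
    if (provider_inputs.count(name) == 0 || non_provider_inputs.count(name) == 0)
      return {};                                  // not shared: nothing to do
    std::string new_name = name + "_copy";        // models Graph::GenerateNodeArgName
    initializers[new_name] = initializers[name];  // models Graph::AddInitializedTensor
    return new_name;
  }
};
```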
-void TransformerMemcpyImpl::ProcessInitializers(const KernelRegistryManager& kernel_registries) { +bool TransformerMemcpyImpl::ProcessInitializers(const KernelRegistryManager& kernel_registries, const InitializedTensorSet& initializers_consumed) { std::map replacements; - for (const auto& pair : graph_.GetAllInitializedTensors()) { + for (const auto& pair : initializers_consumed) { const auto& name = pair.first; const onnxruntime::NodeArg* provider_def = FindNodeArg(provider_input_defs_, name); const onnxruntime::NodeArg* non_provider_def = FindNodeArg(non_provider_input_defs_, name); @@ -284,10 +306,15 @@ void TransformerMemcpyImpl::ProcessInitializers(const KernelRegistryManager& ker std::string new_def_name = graph_.GenerateNodeArgName(name); auto& new_def = graph_.GetOrCreateNodeArg(new_def_name, provider_def->TypeAsProto()); - const TensorProto* tensor_proto = nullptr; - bool found = graph_.GetInitializedTensor(name, tensor_proto); - ORT_ENFORCE(found, "Failed to get initialized tensor ", name); - + // We make a copy of the initializer that is to be consumed by the provider Node so that + // session state initializer can copy it over to the provider device during its operation + // TODO: The copy being made is possibly redundant if this occurs in a subgraph + // When multiple subgraphs consume the same initializer as an implicit input, + // multiple copies of the initializer will be made into the provider device + // This should not directly affect runtime performance as the copies occur during initialization + // but overuse of the provider device's memory is definitely inefficient + // In future, we need to "statefully" make the copy only once and use it in all subgraphs referencing the initializer + const TensorProto* tensor_proto = pair.second; TensorProto new_tensor_proto = *tensor_proto; *(new_tensor_proto.mutable_name()) = new_def_name; graph_.AddInitializedTensor(new_tensor_proto); @@ -322,6 +349,9 @@ void TransformerMemcpyImpl::ProcessInitializers(const KernelRegistryManager& ker p_node->ReplaceDefs(dup_replacements); } + + // This denotes a modification to the graph + return !replacements.empty(); } } // namespace onnxruntime diff --git a/onnxruntime/core/optimizer/unsqueeze_elimination.cc b/onnxruntime/core/optimizer/unsqueeze_elimination.cc index c35d078546fa6..109be80ad7d85 100644 --- a/onnxruntime/core/optimizer/unsqueeze_elimination.cc +++ b/onnxruntime/core/optimizer/unsqueeze_elimination.cc @@ -25,13 +25,11 @@ Status UnsqueezeElimination::Apply(Graph& graph, Node& node, RewriteRuleEffect& // Generate new dims. NodeArg* input_def = node.MutableInputDefs()[0]; - const ONNX_NAMESPACE::TensorProto* tensor_proto = nullptr; - graph.GetInitializedTensor(input_def->Name(), tensor_proto); - if (tensor_proto == nullptr) { - return Status::OK(); - } + const auto* tensor_proto = graph_utils::GetConstantInitializer(graph, input_def->Name()); + ORT_ENFORCE(tensor_proto); + std::vector new_dims(axes.size() + tensor_proto->dims().size(), 0); - if (new_dims.size() >= std::numeric_limits::max()) { + if (new_dims.size() >= static_cast(std::numeric_limits::max())) { return Status(ONNXRUNTIME, FAIL, "index out of range"); } @@ -56,8 +54,13 @@ Status UnsqueezeElimination::Apply(Graph& graph, Node& node, RewriteRuleEffect& new_tensor_proto.add_dims(new_dims[i]); } } - graph.RemoveInitializedTensor(input_def->Name()); - graph.AddInitializedTensor(new_tensor_proto); + + // TODO: This seems wrong as there's no check whether another node is using the initializer before replacing it. 
+ // Shouldn't we check that or alternatively create an initializer with a different name and let + // Graph::CleanUnusedInitializers remove the original one if nothing else consumes it? + // graph.RemoveInitializedTensor(input_def->Name()); + // graph.AddInitializedTensor(new_tensor_proto); + graph_utils::ReplaceInitializer(graph, input_def->Name(), new_tensor_proto); // Update shape of NodeArg. TensorShapeProto shape; @@ -75,8 +78,8 @@ Status UnsqueezeElimination::Apply(Graph& graph, Node& node, RewriteRuleEffect& } // namespace onnxruntime bool UnsqueezeElimination::SatisfyCondition(const Graph& graph, const Node& node) const { - // Attempt to remove an Unsqueeze operator only if it gets an initializer as input. - return node.GetInputEdgesCount() == 0 && + // Attempt to remove an Unsqueeze operator only if it gets a constant initializer as input. + return graph_utils::IsConstantInitializer(graph, node.InputDefs()[0]->Name()) && !graph.IsNodeOutputsInGraphOutputs(node); } diff --git a/onnxruntime/core/providers/cpu/controlflow/if.cc b/onnxruntime/core/providers/cpu/controlflow/if.cc index ec0750852a994..b6dc52d3759db 100644 --- a/onnxruntime/core/providers/cpu/controlflow/if.cc +++ b/onnxruntime/core/providers/cpu/controlflow/if.cc @@ -156,7 +156,7 @@ Status IfImpl::AllocateOutputTensors() { graph_output->Name(), " did not."); } - TensorShape output_shape{onnxruntime::utils::GetTensorShapeFromTensorShapeProto(*graph_output_shape)}; + TensorShape output_shape = onnxruntime::utils::GetTensorShapeFromTensorShapeProto(*graph_output_shape); // if size < 0 we have a symbolic dimension and need to use a temporary OrtValue in the subgraph execution if (output_shape.Size() < 0) { diff --git a/onnxruntime/core/providers/cpu/controlflow/loop.cc b/onnxruntime/core/providers/cpu/controlflow/loop.cc index 84d911fb0ef47..3937ab4ed53f3 100644 --- a/onnxruntime/core/providers/cpu/controlflow/loop.cc +++ b/onnxruntime/core/providers/cpu/controlflow/loop.cc @@ -302,7 +302,7 @@ void LoopImpl::SaveOutputsAndUpdateFeeds(const std::vector& last_outpu Status LoopImpl::ConcatenateLoopOutput(std::vector& per_iteration_output, int output_index) { const auto& first_output = per_iteration_output.front().Get(); - size_t bytes_per_iteration = first_output.Size(); + size_t bytes_per_iteration = first_output.SizeInBytes(); const auto& per_iteration_shape = first_output.Shape(); const auto& per_iteration_dims = per_iteration_shape.GetDims(); @@ -317,19 +317,19 @@ Status LoopImpl::ConcatenateLoopOutput(std::vector& per_iteration_outp // we can't easily use a C++ template for the tensor element type, // so use a span for some protection but work in bytes gsl::span output_span = gsl::make_span(static_cast(output->MutableDataRaw()), - output->Size()); + output->SizeInBytes()); for (int64_t i = 0; i < num_iterations; ++i) { auto& ort_value = per_iteration_output[i]; auto& iteration_data = ort_value.Get(); // sanity check - if (bytes_per_iteration != iteration_data.Size()) { + if (bytes_per_iteration != iteration_data.SizeInBytes()) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Inconsistent shape in loop output for output ", output_index, " Expected:", per_iteration_shape, " Got:", iteration_data.Shape()); } - auto num_bytes = iteration_data.Size(); + auto num_bytes = iteration_data.SizeInBytes(); auto src = gsl::make_span(static_cast(iteration_data.DataRaw()), num_bytes); auto dst = output_span.subspan(i * bytes_per_iteration, bytes_per_iteration); gsl::copy(src, dst); @@ -382,8 +382,8 @@ Status 
LoopImpl::Execute(FeedsFetchesManager* ffm, const FeedsFetchesManager* ca auto copy_tensor_from_mlvalue_to_output = [this](const OrtValue& input, int output_idx) { auto& data = input.Get(); Tensor* output = context_.Output(output_idx, data.Shape()); - auto src = gsl::make_span(static_cast(data.DataRaw()), data.Size()); - auto dst = gsl::make_span(static_cast(output->MutableDataRaw()), output->Size()); + auto src = gsl::make_span(static_cast(data.DataRaw()), data.SizeInBytes()); + auto dst = gsl::make_span(static_cast(output->MutableDataRaw()), output->SizeInBytes()); gsl::copy(src, dst); }; @@ -412,18 +412,21 @@ Status LoopImpl::Execute(FeedsFetchesManager* ffm, const FeedsFetchesManager* ca auto& graph_outputs = subgraph_.GetOutputs(); for (int i = num_loop_carried_vars_; i < num_outputs_; ++i) { - std::vector output_dims; - output_dims.push_back(0); // num iterations is first dim - // get shape from subgraph output if possible to attempt to have the correct rank auto* graph_output = graph_outputs.at(i + 1); // + 1 as first subgraph output is condition value auto* graph_output_shape = graph_output->Shape(); + std::vector output_dims; + output_dims.reserve((graph_output_shape ? graph_output_shape->dim_size() : 0) + 1); + output_dims.push_back(0); // num iterations is first dim + if (graph_output_shape) { - output_dims.reserve(graph_output_shape->dim_size() + 1); + const auto& tensor_shape = onnxruntime::utils::GetTensorShapeFromTensorShapeProto(*graph_output_shape); + const auto& dims = tensor_shape.GetDims(); - auto dims = onnxruntime::utils::GetTensorShapeFromTensorShapeProto(*graph_output_shape); - std::copy(dims.cbegin(), dims.cend(), std::back_inserter(output_dims)); + // copy to output dims and use 0 for any symbolic dim + std::for_each(dims.cbegin(), dims.cend(), + [&output_dims](const int64_t dim) { output_dims.push_back(dim < 0 ? 0 : dim); }); } else { // TODO: We could try and call ExecuteGraph to get the output shape from fetches so the rank is correct, // however that could still fail as we would potentially be passing in invalid data. diff --git a/onnxruntime/core/providers/cpu/controlflow/scan_utils.cc b/onnxruntime/core/providers/cpu/controlflow/scan_utils.cc index 894c40842e997..821c84a78c723 100644 --- a/onnxruntime/core/providers/cpu/controlflow/scan_utils.cc +++ b/onnxruntime/core/providers/cpu/controlflow/scan_utils.cc @@ -61,7 +61,7 @@ Status AllocateOutput(OpKernelContextInternal& context, const GraphViewer& subgr graph_output->Name(), " did not."); } - TensorShape output_shape{onnxruntime::utils::GetTensorShapeFromTensorShapeProto(*graph_output_shape)}; + TensorShape output_shape = onnxruntime::utils::GetTensorShapeFromTensorShapeProto(*graph_output_shape); auto& graph_output_dims{output_shape.GetDims()}; std::vector scan_output_dims; diff --git a/onnxruntime/core/providers/cpu/controlflow/scan_utils.h b/onnxruntime/core/providers/cpu/controlflow/scan_utils.h index c3006024277d3..7840463c02483 100644 --- a/onnxruntime/core/providers/cpu/controlflow/scan_utils.h +++ b/onnxruntime/core/providers/cpu/controlflow/scan_utils.h @@ -103,7 +103,7 @@ class OutputIterator { // set the output for the current iteration to zeros. 
used for short sequence lengths void ZeroOutCurrent() { auto* tensor = (**this).GetMutable(); - memset(tensor->MutableDataRaw(), 0, tensor->Size()); + memset(tensor->MutableDataRaw(), 0, tensor->SizeInBytes()); } const OrtValue& GetOutput() const { diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc index 6f52311dfcfde..08b6a31938111 100644 --- a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc +++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc @@ -31,6 +31,7 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, Ran class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, RandomUniformLike); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 7, Multinomial); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 7, float, Add); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 7, double, Add); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 7, int32_t, Add); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 7, int64_t, Add); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 7, float, Sub); @@ -58,19 +59,23 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 6, float, Neg); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 6, int8_t, Neg); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 6, int32_t, Neg); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 6, Floor); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 6, Ceil); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 6, Reciprocal); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 6, Sqrt); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 7, Pow); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 6, Exp); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 6, Log); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 6, 7, Sum); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 8, Sum); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 6, 7, Min); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 8, Min); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 6, 7, Max); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 8, Max); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 6, float, Floor); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 6, float, Ceil); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 6, float, Reciprocal); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 6, float, Sqrt); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 6, double, Sqrt); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 7, float, Pow); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 7, double, Pow); +class 
ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 6, float, Exp); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 6, double, Exp); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 6, float, Log); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 6, 7, float, Sum); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 8, float, Sum); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 6, 7, float, Min); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 8, float, Min); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 6, 7, float, Max); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 8, float, Max); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 8, double, Max); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, Not); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 7, And); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 7, Or); @@ -80,9 +85,11 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOn class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 7, bool, Equal); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 7, int32_t, Equal); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 7, int64_t, Equal); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 6, 7, Mean); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 8, Mean); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 7, Sin); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, float, Equal); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 6, 7, float, Mean); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 8, float, Mean); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 7, float, Sin); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 7, double, Sin); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 7, Cos); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 7, Tan); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 7, Asin); @@ -132,8 +139,10 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, float, ReduceProd); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, int32_t, ReduceProd); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, float, ReduceSum); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, double, ReduceSum); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, int32_t, ReduceSum); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, float, ReduceSumSquare); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, double, ReduceSumSquare); class 
ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, int32_t, ReduceSumSquare); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, float, ArgMax); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, int32_t, ArgMax); @@ -207,6 +216,7 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, Com class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, ConstantOfShape); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, MeanVarianceNormalization); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, int32_t, Greater); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, int64_t, Greater); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, int32_t, Less); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, string, Cast); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, EyeLike); @@ -214,13 +224,14 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, MLFloat16, IsNaN); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, Sign); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, Shrink); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, Erf); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, float, Erf); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, int64_t_int64_t_int64_t, OneHot); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, float_int64_t_int64_t, OneHot); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, int64_t_string_int64_t, OneHot); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, float_string_int64_t, OneHot); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, float_float_float, OneHot); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, int64_t_int32_t_float, OneHot); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, int64_t_float_int64_t, OneHot); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, MaxUnpool); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, Sinh); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, Cosh); @@ -296,6 +307,7 @@ void RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -323,19 +335,23 @@ void RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + 
BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -345,9 +361,11 @@ void RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -398,8 +416,10 @@ void RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -472,6 +492,7 @@ void RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -479,13 +500,14 @@ void RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.h b/onnxruntime/core/providers/cpu/cpu_execution_provider.h index b72244510e8c6..5dfdeb0c2c0d1 100644 --- a/onnxruntime/core/providers/cpu/cpu_execution_provider.h +++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.h @@ -46,10 +46,6 @@ class CPUExecutionProvider : public IExecutionProvider { #endif } - Status CopyTensor(const Tensor&, Tensor&) const override { - return Status(common::ONNXRUNTIME, common::FAIL, "Shouldn't reach here. 
CPUExecutionProvider doesn't support CopyTensor"); - } - std::shared_ptr GetKernelRegistry() const override; diff --git a/onnxruntime/core/providers/cpu/cpu_provider_factory.cc b/onnxruntime/core/providers/cpu/cpu_provider_factory.cc index 297f188ae92f8..ee1f437f264fa 100644 --- a/onnxruntime/core/providers/cpu/cpu_provider_factory.cc +++ b/onnxruntime/core/providers/cpu/cpu_provider_factory.cc @@ -35,5 +35,6 @@ ORT_API_STATUS_IMPL(OrtSessionOptionsAppendExecutionProvider_CPU, _In_ OrtSessio } ORT_API_STATUS_IMPL(OrtCreateCpuAllocatorInfo, enum OrtAllocatorType type, enum OrtMemType mem_type, _Out_ OrtAllocatorInfo** out) { - return OrtCreateAllocatorInfo(onnxruntime::CPU, type, 0, mem_type, out); + *out = new OrtAllocatorInfo(onnxruntime::CPU, type, OrtDevice(), 0, mem_type); + return nullptr; } diff --git a/onnxruntime/core/providers/cpu/math/element_wise_ops.cc b/onnxruntime/core/providers/cpu/math/element_wise_ops.cc index 0aa1427db8637..ece68834e7525 100644 --- a/onnxruntime/core/providers/cpu/math/element_wise_ops.cc +++ b/onnxruntime/core/providers/cpu/math/element_wise_ops.cc @@ -10,222 +10,100 @@ namespace onnxruntime { -ONNX_CPU_OPERATOR_TYPED_KERNEL( - Add, - 7, - float, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Add); - -ONNX_CPU_OPERATOR_TYPED_KERNEL( - Add, - 7, - int32_t, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Add); - -ONNX_CPU_OPERATOR_TYPED_KERNEL( - Add, - 7, - int64_t, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Add); - -ONNX_CPU_OPERATOR_TYPED_KERNEL( - Sub, - 7, - float, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Sub); - -ONNX_CPU_OPERATOR_TYPED_KERNEL(Sub, 7, double, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Sub); +#define REG_ELEMENTWISE_TYPED_KERNEL(OP_TYPE, VERSION, TYPE, KERNEL_CLASS) \ + ONNX_CPU_OPERATOR_TYPED_KERNEL( \ + OP_TYPE, \ + VERSION, \ + TYPE, \ + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + KERNEL_CLASS); -ONNX_CPU_OPERATOR_TYPED_KERNEL( - Sub, - 7, - int32_t, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Sub); +#define REG_ELEMENTWISE_VERSIONED_TYPED_KERNEL(OP_TYPE, VERSION_FROM, VERSION_TO, TYPE, KERNEL_CLASS) \ + ONNX_CPU_OPERATOR_VERSIONED_TYPED_KERNEL( \ + OP_TYPE, \ + VERSION_FROM, VERSION_TO, \ + TYPE, \ + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + KERNEL_CLASS); -ONNX_CPU_OPERATOR_TYPED_KERNEL( - Sub, - 7, - int64_t, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Sub); +REG_ELEMENTWISE_TYPED_KERNEL(Add, 7, float, Add); +REG_ELEMENTWISE_TYPED_KERNEL(Add, 7, double, Add); +REG_ELEMENTWISE_TYPED_KERNEL(Add, 7, int32_t, Add); +REG_ELEMENTWISE_TYPED_KERNEL(Add, 7, int64_t, Add); -ONNX_CPU_OPERATOR_TYPED_KERNEL( - Mul, - 7, - float, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Mul); +REG_ELEMENTWISE_TYPED_KERNEL(Sub, 7, float, Sub); +REG_ELEMENTWISE_TYPED_KERNEL(Sub, 7, double, Sub); +REG_ELEMENTWISE_TYPED_KERNEL(Sub, 7, int32_t, Sub); +REG_ELEMENTWISE_TYPED_KERNEL(Sub, 7, int64_t, Sub); -ONNX_CPU_OPERATOR_TYPED_KERNEL( - Mul, - 7, - double, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Mul); +REG_ELEMENTWISE_TYPED_KERNEL(Mul, 7, float, Mul); +REG_ELEMENTWISE_TYPED_KERNEL(Mul, 7, double, Mul); +REG_ELEMENTWISE_TYPED_KERNEL(Mul, 7, int32_t, Mul); +REG_ELEMENTWISE_TYPED_KERNEL(Mul, 7, int64_t, Mul); 
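The macros defined above replace the long hand-written ONNX_CPU_OPERATOR_TYPED_KERNEL blocks with one line per (op, version, type) combination; the set of registered kernels is meant to stay the same. A toy, compilable model of that consolidation pattern is sketched below; RegisterKernel and REG_ELEMENTWISE are invented for illustration and are not onnxruntime APIs.

```
// Toy model of collapsing repetitive typed-kernel registrations behind a macro.
#include <iostream>
#include <string>

template <typename T>
void RegisterKernel(const std::string& op, int since_version) {
  std::cout << "registered " << op << " since opset " << since_version << "\n";
}

// One macro call per (op, version, type), mirroring REG_ELEMENTWISE_TYPED_KERNEL.
#define REG_ELEMENTWISE(OP, VERSION, TYPE) RegisterKernel<TYPE>(#OP, VERSION)

int main() {
  REG_ELEMENTWISE(Add, 7, float);
  REG_ELEMENTWISE(Add, 7, double);
  REG_ELEMENTWISE(Sqrt, 6, double);
  return 0;
}
```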
-ONNX_CPU_OPERATOR_TYPED_KERNEL( - Mul, - 7, - int32_t, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Mul); +REG_ELEMENTWISE_TYPED_KERNEL(Div, 7, float, Div); +REG_ELEMENTWISE_TYPED_KERNEL(Div, 7, double, Div); +REG_ELEMENTWISE_TYPED_KERNEL(Div, 7, int32_t, Div); +REG_ELEMENTWISE_TYPED_KERNEL(Div, 7, int64_t, Div); -ONNX_CPU_OPERATOR_TYPED_KERNEL( - Mul, - 7, - int64_t, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Mul); +REG_ELEMENTWISE_TYPED_KERNEL(Abs, 6, float, Abs); +REG_ELEMENTWISE_TYPED_KERNEL(Abs, 6, double, Abs); +REG_ELEMENTWISE_TYPED_KERNEL(Abs, 6, int8_t, Abs); +REG_ELEMENTWISE_TYPED_KERNEL(Abs, 6, int16_t, Abs); +REG_ELEMENTWISE_TYPED_KERNEL(Abs, 6, int32_t, Abs); +REG_ELEMENTWISE_TYPED_KERNEL(Abs, 6, int64_t, Abs); +REG_ELEMENTWISE_TYPED_KERNEL(Abs, 6, uint8_t, Abs); +REG_ELEMENTWISE_TYPED_KERNEL(Abs, 6, uint16_t, Abs); +REG_ELEMENTWISE_TYPED_KERNEL(Abs, 6, uint32_t, Abs); +REG_ELEMENTWISE_TYPED_KERNEL(Abs, 6, uint64_t, Abs); -ONNX_CPU_OPERATOR_TYPED_KERNEL( - Div, - 7, - float, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Div); +REG_ELEMENTWISE_TYPED_KERNEL(Neg, 6, float, Neg); +REG_ELEMENTWISE_TYPED_KERNEL(Neg, 6, int8_t, Neg); +REG_ELEMENTWISE_TYPED_KERNEL(Neg, 6, int32_t, Neg); -ONNX_CPU_OPERATOR_TYPED_KERNEL(Div, 7, double, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Div); +REG_ELEMENTWISE_TYPED_KERNEL(Floor, 6, float, Floor); -ONNX_CPU_OPERATOR_TYPED_KERNEL( - Div, - 7, - int32_t, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Div); +REG_ELEMENTWISE_TYPED_KERNEL(Ceil, 6, float, Ceil); -ONNX_CPU_OPERATOR_TYPED_KERNEL( - Div, - 7, - int64_t, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Div); +REG_ELEMENTWISE_TYPED_KERNEL(Reciprocal, 6, float, Reciprocal); -#define REG_ABS_KERNEL(TYPE) \ - ONNX_CPU_OPERATOR_TYPED_KERNEL( \ - Abs, \ - 6, \ - TYPE, \ - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - Abs); - -REG_ABS_KERNEL(float) -REG_ABS_KERNEL(double) -REG_ABS_KERNEL(int8_t) -REG_ABS_KERNEL(int16_t) -REG_ABS_KERNEL(int32_t) -REG_ABS_KERNEL(int64_t) -REG_ABS_KERNEL(uint8_t) -REG_ABS_KERNEL(uint16_t) -REG_ABS_KERNEL(uint32_t) -REG_ABS_KERNEL(uint64_t) +REG_ELEMENTWISE_TYPED_KERNEL(Sqrt, 6, float, Sqrt); +REG_ELEMENTWISE_TYPED_KERNEL(Sqrt, 6, double, Sqrt); -ONNX_CPU_OPERATOR_TYPED_KERNEL( - Neg, - 6, - float, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Neg); +REG_ELEMENTWISE_TYPED_KERNEL(Pow, 7, float, Pow); +REG_ELEMENTWISE_TYPED_KERNEL(Pow, 7, double, Pow); -ONNX_CPU_OPERATOR_TYPED_KERNEL( - Neg, - 6, - int8_t, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Neg); +REG_ELEMENTWISE_TYPED_KERNEL(Exp, 6, float, Exp); +REG_ELEMENTWISE_TYPED_KERNEL(Exp, 6, double, Exp); -ONNX_CPU_OPERATOR_TYPED_KERNEL( - Neg, - 6, - int32_t, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Neg); +REG_ELEMENTWISE_TYPED_KERNEL(Log, 6, float, Log); -ONNX_CPU_OPERATOR_KERNEL( - Floor, - 6, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Floor); +REG_ELEMENTWISE_VERSIONED_TYPED_KERNEL(Sum, 6, 7, float, Sum_6); +REG_ELEMENTWISE_TYPED_KERNEL(Sum, 8, float, Sum_8); -ONNX_CPU_OPERATOR_KERNEL( - Ceil, - 6, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Ceil); +REG_ELEMENTWISE_VERSIONED_TYPED_KERNEL(Min, 6, 7, float, Min_6); +REG_ELEMENTWISE_TYPED_KERNEL(Min, 8, float, 
Min_8); -ONNX_CPU_OPERATOR_KERNEL( - Reciprocal, - 6, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Reciprocal); +REG_ELEMENTWISE_VERSIONED_TYPED_KERNEL(Max, 6, 7, float, Max_6); +REG_ELEMENTWISE_TYPED_KERNEL(Max, 8, float, Max_8); +REG_ELEMENTWISE_TYPED_KERNEL(Max, 8, double, Max_8); -ONNX_CPU_OPERATOR_KERNEL( - Sqrt, - 6, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Sqrt); +REG_ELEMENTWISE_VERSIONED_TYPED_KERNEL(Less, 7, 9, float, Less); +REG_ELEMENTWISE_TYPED_KERNEL(Less, 9, int32_t, Less); -ONNX_CPU_OPERATOR_KERNEL( - Pow, - 7, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Pow); +REG_ELEMENTWISE_VERSIONED_TYPED_KERNEL(Greater, 7, 9, float, Greater) +REG_ELEMENTWISE_TYPED_KERNEL(Greater, 9, int32_t, Greater); +REG_ELEMENTWISE_TYPED_KERNEL(Greater, 9, int64_t, Greater); -ONNX_CPU_OPERATOR_KERNEL( - Exp, - 6, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Exp); +REG_ELEMENTWISE_TYPED_KERNEL(Equal, 7, bool, Equal); +REG_ELEMENTWISE_TYPED_KERNEL(Equal, 7, int32_t, Equal); +REG_ELEMENTWISE_TYPED_KERNEL(Equal, 7, int64_t, Equal); +REG_ELEMENTWISE_TYPED_KERNEL(Equal, 11, float, Equal); -ONNX_CPU_OPERATOR_KERNEL( - Log, - 6, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Log); +REG_ELEMENTWISE_VERSIONED_TYPED_KERNEL(Mean, 6, 7, float, Mean_6); +REG_ELEMENTWISE_TYPED_KERNEL(Mean, 8, float, Mean_8); -ONNX_CPU_OPERATOR_VERSIONED_KERNEL( - Sum, - 6, 7, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Sum_6); - -ONNX_CPU_OPERATOR_KERNEL( - Sum, - 8, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Sum_8); - -ONNX_CPU_OPERATOR_VERSIONED_KERNEL( - Min, - 6, 7, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Min_6); - -ONNX_CPU_OPERATOR_KERNEL( - Min, - 8, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Min_8); - -ONNX_CPU_OPERATOR_VERSIONED_KERNEL( - Max, - 6, 7, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Max_6); - -ONNX_CPU_OPERATOR_KERNEL( - Max, - 8, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Max_8); +REG_ELEMENTWISE_TYPED_KERNEL(Erf, 9, float, Erf); ONNX_CPU_OPERATOR_KERNEL( Not, @@ -251,73 +129,6 @@ ONNX_CPU_OPERATOR_KERNEL( KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), Xor); -ONNX_CPU_OPERATOR_VERSIONED_TYPED_KERNEL( - Less, - 7, 9, - float, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Less); - -ONNX_CPU_OPERATOR_TYPED_KERNEL( - Less, - 9, - int32_t, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Less); - -ONNX_CPU_OPERATOR_VERSIONED_TYPED_KERNEL( - Greater, - 7, 9, - float, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Greater); - -ONNX_CPU_OPERATOR_TYPED_KERNEL( - Greater, - 9, - int32_t, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Greater); - -ONNX_CPU_OPERATOR_TYPED_KERNEL( - Equal, - 7, - bool, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Equal); - -ONNX_CPU_OPERATOR_TYPED_KERNEL( - Equal, - 7, - int32_t, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Equal); - -ONNX_CPU_OPERATOR_TYPED_KERNEL( - Equal, - 7, - int64_t, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Equal); - -ONNX_CPU_OPERATOR_VERSIONED_KERNEL( - Mean, - 6, 7, - 
KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Mean_6); - -ONNX_CPU_OPERATOR_KERNEL( - Mean, - 8, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Mean_8); - -ONNX_CPU_OPERATOR_KERNEL( - Erf, - 9, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Erf); - template Status Add::Compute(OpKernelContext* context) const { return BroadcastTwo( @@ -384,43 +195,43 @@ Status Reciprocal::Compute(OpKernelContext* ctx) const { return Status::OK(); } -template <> -Status Sqrt::Compute(OpKernelContext* ctx) const { +template +Status Sqrt::Compute(OpKernelContext* ctx) const { auto& X = *ctx->Input(0); auto& Y = *ctx->Output(0, X.Shape()); - EigenMap(Y) = EigenMap(X).cwiseSqrt(); + EigenMap(Y) = EigenMap(X).cwiseSqrt(); return Status::OK(); } -template <> -Status Pow::Compute(OpKernelContext* context) const { +template +Status Pow::Compute(OpKernelContext* context) const { const Tensor& Y = *context->Input(1); - std::function, ConstEigenVectorMap, float)> input1scalar = - [](EigenVectorMap output, ConstEigenVectorMap input0, float input1) { output = Eigen::pow(input0.array(), input1); }; + std::function, ConstEigenVectorMap, T)> input1scalar = + [](EigenVectorMap output, ConstEigenVectorMap input0, T input1) { output = Eigen::pow(input0.array(), input1); }; if (Y.Shape().Size() == 1) { - float value = *Y.Data(); + T value = *Y.Data(); if (value == 2.0) { - input1scalar = [](EigenVectorMap output, ConstEigenVectorMap input0, float) { output = Eigen::square(input0.array()); }; + input1scalar = [](EigenVectorMap output, ConstEigenVectorMap input0, T) { output = Eigen::square(input0.array()); }; } else if (value == 3.0) { - input1scalar = [](EigenVectorMap output, ConstEigenVectorMap input0, float) { output = Eigen::cube(input0.array()); }; + input1scalar = [](EigenVectorMap output, ConstEigenVectorMap input0, T) { output = Eigen::cube(input0.array()); }; } } - return BroadcastTwo( + return BroadcastTwo( *context, - [](EigenVectorMap output, float input0, ConstEigenVectorMap input1) { output = Eigen::pow(input0, input1.array()); }, + [](EigenVectorMap output, T input0, ConstEigenVectorMap input1) { output = Eigen::pow(input0, input1.array()); }, input1scalar, - [](EigenVectorMap output, ConstEigenVectorMap input0, ConstEigenVectorMap input1) { output = Eigen::pow(input0.array(), input1.array()); }); + [](EigenVectorMap output, ConstEigenVectorMap input0, ConstEigenVectorMap input1) { output = Eigen::pow(input0.array(), input1.array()); }); } -template <> -Status Exp::Compute(OpKernelContext* ctx) const { +template +Status Exp::Compute(OpKernelContext* ctx) const { auto& X = *ctx->Input(0); auto& Y = *ctx->Output(0, X.Shape()); - EigenMap(Y) = EigenMap(X).array().exp(); + EigenMap(Y) = EigenMap(X).array().exp(); return Status::OK(); } @@ -514,13 +325,13 @@ Status Max_6::Compute(OpKernelContext* ctx) const { return Status::OK(); } -template <> -Status Max_8::Compute(OpKernelContext* context) const { - return BroadcastVariadic( +template +Status Max_8::Compute(OpKernelContext* context) const { + return BroadcastVariadic( Node(), *context, - [](EigenVectorMap output, float input0, ConstEigenVectorMap input1) { output = input1.array().max(input0); }, - [](EigenVectorMap output, ConstEigenVectorMap input0, float input1) { output = input0.array().max(input1); }, - [](EigenVectorMap output, ConstEigenVectorMap input0, ConstEigenVectorMap input1) { output = input0.array().max(input1.array()); }); + [](EigenVectorMap output, T input0, 
ConstEigenVectorMap input1) { output = input1.array().max(input0); }, + [](EigenVectorMap output, ConstEigenVectorMap input0, T input1) { output = input0.array().max(input1); }, + [](EigenVectorMap output, ConstEigenVectorMap input0, ConstEigenVectorMap input1) { output = input0.array().max(input1.array()); }); } Status Not::Compute(OpKernelContext* context) const { @@ -669,17 +480,25 @@ class Sin final : public OpKernel { Status Compute(OpKernelContext* context) const override { auto& X = *context->Input(0); auto& Y = *context->Output(0, X.Shape()); - MakeEigenArrayMap(Y) = MakeEigenArrayMap(X).sin(); + MakeEigenArrayMap(Y) = MakeEigenArrayMap(X).sin(); return Status::OK(); } }; -ONNX_CPU_OPERATOR_KERNEL( +ONNX_CPU_OPERATOR_TYPED_KERNEL( Sin, 7, + float, KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), Sin); +ONNX_CPU_OPERATOR_TYPED_KERNEL( + Sin, + 7, + double, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + Sin); + template class Cos final : public OpKernel { public: diff --git a/onnxruntime/core/providers/cpu/math/gemm.cc b/onnxruntime/core/providers/cpu/math/gemm.cc index 6554e3c60145f..c54e910b071ed 100644 --- a/onnxruntime/core/providers/cpu/math/gemm.cc +++ b/onnxruntime/core/providers/cpu/math/gemm.cc @@ -10,5 +10,5 @@ ONNX_CPU_OPERATOR_VERSIONED_KERNEL( 7, 9, KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Gemm); -} \ No newline at end of file + Gemm); +} diff --git a/onnxruntime/core/providers/cpu/math/gemm.h b/onnxruntime/core/providers/cpu/math/gemm.h index c72c5bc1e02f3..a3aa724ab410d 100644 --- a/onnxruntime/core/providers/cpu/math/gemm.h +++ b/onnxruntime/core/providers/cpu/math/gemm.h @@ -11,10 +11,7 @@ namespace onnxruntime { -template +template class Gemm : public OpKernel { public: Gemm(const OpKernelInfo& info) : OpKernel(info) { @@ -40,75 +37,47 @@ class Gemm : public OpKernel { int64_t M = helper.M(); int64_t N = helper.N(); - int64_t K = helper.K(); - auto Y = context->Output(0, TensorShape({M, N})); + auto Y = context->Output(0, {M, N}); // if input is emtpy tensor, return directly as nothing need to be calculated. if (M == 0 || N == 0) return Status::OK(); - T_Y* y_data = Y->template MutableData(); + T* y_data = Y->template MutableData(); - //bias - // Todo: we might should move this part into math::gemm to let eigen - // have better chance to further optimize it. + // Broadcast the bias as needed. 
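The rewritten bias handling that follows distinguishes the broadcast shapes ONNX allows for Gemm's C input: scalar, (N,) / (1, N), (M, 1) and (M, N). A standalone Eigen sketch of the same shape dispatch, under simplifying assumptions (plain Eigen::MatrixXf, no beta scaling, function name invented for illustration):

```
#include <Eigen/Dense>

// Illustrative only: B is assumed to already be laid out as 1x1, 1xN, Mx1 or MxN.
void BroadcastBias(Eigen::MatrixXf& Y, const Eigen::MatrixXf& B) {
  const Eigen::Index M = Y.rows();
  const Eigen::Index N = Y.cols();
  if (B.size() == 1) {
    Y.setConstant(B(0, 0));   // scalar bias: fill the whole output
  } else if (B.rows() == 1) {
    Y = B.replicate(M, 1);    // (1, N): copy the row to every row
  } else if (B.cols() == 1) {
    Y = B.replicate(1, N);    // (M, 1): copy the column to every column
  } else {
    Y = B;                    // (M, N): already the right shape
  }
}
```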
if (beta_ != 0) { - auto output_mat = EigenMatrixMapRowMajor( - Y->template MutableData(), - M, - N); - output_mat.setZero(); - - auto& b_shape = B->Shape(); - // if B is (), (1,) or (1, 1), add the scalar + auto output_mat = EigenMatrixMapRowMajor(y_data, M, N); + const auto& b_shape = B->Shape(); + const T* b_data = B->template Data(); if (b_shape.Size() == 1) { - output_mat.array() += *(B->template Data()); - } - // B is (N,) - else if (b_shape.NumDimensions() == 1) { - auto bias_vec = ConstEigenVectorMap( - B->template Data(), - N); - output_mat.rowwise() += bias_vec.transpose(); - } else if (b_shape.NumDimensions() == 2) { + // B is (), (1,) or (1, 1), set the scalar + output_mat.setConstant(*b_data); + } else if (b_shape.NumDimensions() == 1 || b_shape[0] == 1) { + // B is (N,) or (1, N) + output_mat.rowwise() = ConstEigenVectorMap(b_data, N).transpose(); + } else if (b_shape[1] == 1) { // B is (M, 1) - if (b_shape[1] == 1) { - auto bias_vec = ConstEigenVectorMap( - B->template Data(), - M); - output_mat.colwise() += bias_vec; - } - // B is (1, N) - else if (b_shape[0] == 1) { - auto bias_vec = ConstEigenVectorMap( - B->template Data(), - N); - output_mat.rowwise() += bias_vec.transpose(); - } + output_mat.colwise() = ConstEigenVectorMap(b_data, M); + } else { // B is (M, N), no broadcast needed. - else { - auto bias_mat = ConstEigenMatrixMapRowMajor( - B->template Data(), - M, - N); - output_mat += bias_mat; - } + output_mat = ConstEigenMatrixMapRowMajor(b_data, M, N); } } // W * x - math::Gemm( + math::Gemm( trans_A_, trans_B_, M, N, - K, + helper.K(), alpha_, - X->template Data(), - W->template Data(), + X->template Data(), + W->template Data(), beta_, y_data, &CPUMathUtil::Instance()); - FuseActivation(activation_, y_data, M * N, leaky_relu_alpha_); + FuseActivation(activation_, y_data, M * N, leaky_relu_alpha_); return Status::OK(); } @@ -119,7 +88,7 @@ class Gemm : public OpKernel { float alpha_; float beta_; -protected: + protected: // For fused gemm + activation std::string activation_; float leaky_relu_alpha_; diff --git a/onnxruntime/core/providers/cpu/math/matmul.cc b/onnxruntime/core/providers/cpu/math/matmul.cc index 19a0f28f7e01f..539157e92bd95 100644 --- a/onnxruntime/core/providers/cpu/math/matmul.cc +++ b/onnxruntime/core/providers/cpu/math/matmul.cc @@ -61,21 +61,15 @@ Status MatMul::Compute(OpKernelContext* ctx) const { Tensor* Y = ctx->Output(0, helper.OutputShape()); - // TODO: replace it with GemmBatch for performance, it's OK for now as GemmBatch unrolls as well size_t max_len = helper.OutputOffsets().size(); for (size_t i = 0; i < max_len; i++) { - math::Gemm( - CblasNoTrans, - CblasNoTrans, + math::MatMul( static_cast(helper.M()), static_cast(helper.N()), static_cast(helper.K()), - /* alpha */ 1.0f, left_X->template Data() + helper.LeftOffsets()[i], right_X->template Data() + helper.RightOffsets()[i], - /* beta */ 0.0f, - Y->template MutableData() + helper.OutputOffsets()[i], - &CPUMathUtil::Instance()); + Y->template MutableData() + helper.OutputOffsets()[i]); } return Status::OK(); diff --git a/onnxruntime/core/providers/cpu/ml/onehotencoder.cc b/onnxruntime/core/providers/cpu/ml/onehotencoder.cc index 1392e660722f9..4030bb77a8dbc 100644 --- a/onnxruntime/core/providers/cpu/ml/onehotencoder.cc +++ b/onnxruntime/core/providers/cpu/ml/onehotencoder.cc @@ -52,7 +52,8 @@ template OneHotEncoderOp::OneHotEncoderOp(const OpKernelInfo& info) : OpKernel(info), zeros_(info.GetAttrOrDefault("zeros", 1)), num_categories_(0) { std::vector tmp_cats_int64s = 
info.GetAttrsOrDefault("cats_int64s"); std::vector tmp_cats_strings = info.GetAttrsOrDefault("cats_strings"); - ORT_ENFORCE(tmp_cats_int64s.empty() || tmp_cats_strings.empty()); + ORT_ENFORCE(tmp_cats_int64s.empty() || tmp_cats_strings.empty(), + "One and only one of the 'cats_*' attributes must be defined"); if (!tmp_cats_int64s.empty()) { num_categories_ = tmp_cats_int64s.size(); for (size_t idx = 0, end = tmp_cats_int64s.size(); idx < end; ++idx) { @@ -71,18 +72,18 @@ template common::Status OneHotEncoderOp::Compute(OpKernelContext* context) const { const auto* X = context->Input(0); const TensorShape& input_shape = X->Shape(); - ORT_ENFORCE(input_shape.NumDimensions() <= 2); std::vector output_shape(input_shape.GetDims()); output_shape.push_back(num_categories_); Tensor* Y = context->Output(0, TensorShape(output_shape)); - auto y_data = Y->template MutableData(); + auto* y_data = Y->template MutableData(); std::fill_n(y_data, Y->Shape().Size(), 0.0f); - auto x_data = X->template Data(); + const auto* x_data = X->template Data(); + const auto x_size = input_shape.Size(); std::unordered_map::const_iterator idx; - for (int64_t i = 0; i < input_shape.Size(); ++i) { + for (int64_t i = 0; i < x_size; ++i) { auto int_idx = cats_int64s_.find(static_cast(x_data[i])); if (int_idx != cats_int64s_.cend()) y_data[i * num_categories_ + int_idx->second] = 1.0f; @@ -96,17 +97,17 @@ template <> common::Status OneHotEncoderOp::Compute(OpKernelContext* context) const { const auto* X = context->Input(0); const TensorShape& input_shape = X->Shape(); - ORT_ENFORCE(input_shape.NumDimensions() <= 2); std::vector output_shape(input_shape.GetDims()); output_shape.push_back(num_categories_); Tensor* Y = context->Output(0, TensorShape(output_shape)); - auto y_data = Y->template MutableData(); + auto* y_data = Y->template MutableData(); std::fill_n(y_data, Y->Shape().Size(), 0.0f); - auto x_data = X->template Data(); - for (int64_t i = 0; i < input_shape.Size(); ++i) { + const auto* x_data = X->template Data(); + const auto x_size = input_shape.Size(); + for (int64_t i = 0; i < x_size; ++i) { auto str_idx = cats_strings_.find(x_data[i]); if (str_idx != cats_strings_.cend()) y_data[i * num_categories_ + str_idx->second] = 1.0f; diff --git a/onnxruntime/core/providers/cpu/ml/tree_ensemble_classifier.cc b/onnxruntime/core/providers/cpu/ml/tree_ensemble_classifier.cc index 13a62400e72b4..4c7ac2b327022 100644 --- a/onnxruntime/core/providers/cpu/ml/tree_ensemble_classifier.cc +++ b/onnxruntime/core/providers/cpu/ml/tree_ensemble_classifier.cc @@ -293,58 +293,6 @@ void TreeEnsembleClassifier::Initialize() { base_values_.size() == weights_classes_.size()); } -void get_max_weight(const std::map& classes, int64_t& maxclass, float& maxweight) { - maxclass = -1; - maxweight = 0.f; - for (auto& classe : classes) { - if (maxclass == -1 || classe.second > maxweight) { - maxclass = classe.first; - maxweight = classe.second; - } - } -} - -void get_weight_class_positive(std::map& classes, float& pos_weight) { - auto it_classes = classes.find(1); - pos_weight = it_classes == classes.end() - ? (classes.size() > 0 ? 
classes[0] : 0.f) // only 1 class - : it_classes->second; -} - -template -void _set_score_binary(int64_t i, LabelType* y_data, int& write_additional_scores, - bool weights_are_all_positive_, - std::map& classes, - const std::vector& classes_labels_, - const std::set& weights_classes_, - LabelType positive_label, LabelType negative_label) { - float pos_weight; - get_weight_class_positive(classes, pos_weight); - if (classes_labels_.size() == 2 && weights_classes_.size() == 1) { - if (weights_are_all_positive_) { - if (pos_weight > 0.5) { - y_data[i] = classes_labels_[1]; // positive label - write_additional_scores = 0; - } else { - y_data[i] = classes_labels_[0]; // negative label - write_additional_scores = 1; - } - } else { - if (pos_weight > 0) { - y_data[i] = classes_labels_[1]; // positive label - write_additional_scores = 2; - } else { - y_data[i] = classes_labels_[0]; // negative label - write_additional_scores = 3; - } - } - } else if (pos_weight > 0) { - y_data[i] = positive_label; // positive label - } else { - y_data[i] = negative_label; // negative label - } -} - template common::Status TreeEnsembleClassifier::Compute(OpKernelContext* context) const { const Tensor& X = *context->Input(0); @@ -358,41 +306,37 @@ common::Status TreeEnsembleClassifier::Compute(OpKernelContext* context) cons int64_t N = x_dims.size() == 1 ? 1 : x_dims[0]; Tensor* Y = context->Output(0, TensorShape({N})); auto* Z = context->Output(1, TensorShape({N, class_count_})); + + int64_t zindex = 0; const T* x_data = X.template Data(); - common::Status status; -#ifdef USE_OPENMP -#pragma omp parallel for -#endif + // for each class + std::vector scores; + scores.reserve(class_count_); for (int64_t i = 0; i < N; ++i) { - int64_t zindex = i * class_count_; - std::vector scores; + scores.clear(); int64_t current_weight_0 = i * stride; std::map classes; + // fill in base values, this might be empty but that is ok + for (int64_t k = 0, end = static_cast(base_values_.size()); k < end; ++k) { + auto p1 = std::make_pair(k, base_values_[k]); + classes.insert(p1); + } // walk each tree from its root for (size_t j = 0, end = roots_.size(); j < end; ++j) { - auto process_status = ProcessTreeNode(classes, roots_[j], x_data, current_weight_0); - if (!process_status.IsOK()) { - status = process_status; - } + ORT_RETURN_IF_ERROR(ProcessTreeNode(classes, roots_[j], x_data, current_weight_0)); } float maxweight = 0.f; int64_t maxclass = -1; // write top class int write_additional_scores = -1; if (class_count_ > 2) { - // add base values - std::map::iterator it_classes; - for (int64_t k = 0, end = static_cast(base_values_.size()); k < end; ++k) { - it_classes = classes.find(k); - if (it_classes == classes.end()) { - auto p1 = std::make_pair(k, base_values_[k]); - classes.insert(p1); - } else { - it_classes->second += base_values_[k]; + for (auto& classe : classes) { + if (maxclass == -1 || classe.second > maxweight) { + maxclass = classe.first; + maxweight = classe.second; } } - get_max_weight(classes, maxclass, maxweight); if (using_strings_) { Y->template MutableData()[i] = classlabels_strings_[maxclass]; } else { @@ -400,32 +344,68 @@ common::Status TreeEnsembleClassifier::Compute(OpKernelContext* context) cons } } else // binary case { - if (base_values_.size() == 2) { - // add base values - std::map::iterator it_classes; - it_classes = classes.find(1); - if (it_classes == classes.end()) { - // base_value_[0] is not used. It assumes base_value[0] == base_value[1] in this case. 
- // The specification does not forbid it but does not say what the output should be in that case. - std::map::iterator it_classes0 = classes.find(0); - classes[1] = base_values_[1] + it_classes0->second; - it_classes0->second = -classes[1]; + maxweight = !classes.empty() ? classes[0] : 0.f; // only 1 class + if (using_strings_) { + auto* y_data = Y->template MutableData(); + if (classlabels_strings_.size() == 2 && + weights_are_all_positive_ && + maxweight > 0.5 && + weights_classes_.size() == 1) { + y_data[i] = classlabels_strings_[1]; // positive label + write_additional_scores = 0; + } else if (classlabels_strings_.size() == 2 && + weights_are_all_positive_ && + maxweight <= 0.5 && + weights_classes_.size() == 1) { + y_data[i] = classlabels_strings_[0]; // negative label + write_additional_scores = 1; + } else if (classlabels_strings_.size() == 2 && + maxweight > 0 && + !weights_are_all_positive_ && weights_classes_.size() == 1) { + y_data[i] = classlabels_strings_[1]; // pos label + write_additional_scores = 2; + } else if (classlabels_strings_.size() == 2 && + maxweight <= 0 && + !weights_are_all_positive_ && + weights_classes_.size() == 1) { + y_data[i] = classlabels_strings_[0]; // neg label + write_additional_scores = 3; + } else if (maxweight > 0) { + y_data[i] = "1"; // positive label } else { - // binary as multiclass - it_classes->second += base_values_[1]; - classes[0] += base_values_[0]; + y_data[i] = "0"; // negative label } - } - if (using_strings_) { - _set_score_binary(i, Y->template MutableData(), - write_additional_scores, weights_are_all_positive_, - classes, classlabels_strings_, - weights_classes_, "1", "0"); } else { - _set_score_binary(i, Y->template MutableData(), - write_additional_scores, weights_are_all_positive_, - classes, classlabels_int64s_, - weights_classes_, 1, 0); + auto* y_data = Y->template MutableData(); + if (classlabels_int64s_.size() == 2 && + weights_are_all_positive_ && + maxweight > 0.5 && + weights_classes_.size() == 1) { + y_data[i] = classlabels_int64s_[1]; // positive label + write_additional_scores = 0; + } else if (classlabels_int64s_.size() == 2 && + weights_are_all_positive_ && + maxweight <= 0.5 && + weights_classes_.size() == 1) { + y_data[i] = classlabels_int64s_[0]; // negative label + write_additional_scores = 1; + } else if (classlabels_int64s_.size() == 2 && + maxweight > 0 && + !weights_are_all_positive_ && + weights_classes_.size() == 1) { + y_data[i] = classlabels_int64s_[1]; // pos label + write_additional_scores = 2; + } else if (classlabels_int64s_.size() == 2 && + maxweight <= 0 && + !weights_are_all_positive_ && + weights_classes_.size() == 1) { + y_data[i] = classlabels_int64s_[0]; // neg label + write_additional_scores = 3; + } else if (maxweight > 0) { + y_data[i] = 1; // positive label + } else { + y_data[i] = 0; // negative label + } } } // write float values, might not have all the classes in the output yet @@ -445,9 +425,10 @@ common::Status TreeEnsembleClassifier::Compute(OpKernelContext* context) cons } } write_scores(scores, post_transform_, zindex, Z, write_additional_scores); - } // namespace ml - return status; -} // namespace ml + zindex += scores.size(); + } // for every batch + return Status::OK(); +} template common::Status TreeEnsembleClassifier::ProcessTreeNode(std::map& classes, diff --git a/onnxruntime/core/providers/cpu/nn/conv.cc b/onnxruntime/core/providers/cpu/nn/conv.cc index 7505aca264433..c3acbd02a62c5 100644 --- a/onnxruntime/core/providers/cpu/nn/conv.cc +++ 
b/onnxruntime/core/providers/cpu/nn/conv.cc @@ -1,13 +1,148 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. +/** +* Copyright (c) 2016-present, Facebook, Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ +/* Modifications Copyright (c) Microsoft. */ +#include "core/providers/cpu/nn/conv.h" #include "core/framework/op_kernel_context_internal.h" -#include "core/providers/cpu/nn/conv_impl.h" #include "core/util/math_cpuonly.h" namespace onnxruntime { -template <> +template +Status Conv::Compute(OpKernelContext* context) const { + size_t num_inputs = OpKernel::Node().InputDefs().size(); + + const auto* X = context->Input(0); + const auto* W = context->Input(1); + const Tensor* B = num_inputs == 3 ? context->Input(2) : nullptr; + const int64_t N = X->Shape()[0]; + const int64_t C = X->Shape()[1]; + const int64_t M = W->Shape()[0]; + ORT_RETURN_IF_ERROR(ValidateInputShape(X, W)); + + std::vector kernel_shape; + ORT_RETURN_IF_ERROR(ComputeKernelShape(W->Shape(), kernel_shape)); + + bool Is2DKernel = kernel_shape.size() == 2; + std::vector pads(pads_); + if (pads.empty()) { + pads.resize(kernel_shape.size() * 2, 0); + } + std::vector dilations(dilations_); + if (dilations.empty()) { + dilations.resize(kernel_shape.size(), 1); + } + std::vector strides(strides_); + if (strides.empty()) { + strides.resize(kernel_shape.size(), 1); + } + + std::vector Y_dims; + Y_dims.insert(Y_dims.begin(), {N, M}); + TensorShape input_shape = X->Shape().Slice(2); + ORT_RETURN_IF_ERROR(InferOutputShape(input_shape, kernel_shape, strides, dilations, &pads, &Y_dims)); + Tensor* Y = context->Output(0, TensorShape(Y_dims)); + TensorShape output_shape = Y->Shape().Slice(2); + + const int64_t input_image_size = input_shape.Size(); + const int64_t output_image_size = output_shape.Size(); + const int64_t kernel_size = TensorShape(kernel_shape).Size(); + const int64_t X_offset = C / group_ * input_image_size; + const int64_t Y_offset = Y->Shape().Size() / Y->Shape()[0] / group_; + const int64_t W_offset = W->Shape().Size() / group_; + const int64_t kernel_dim = C / group_ * kernel_size; + const int64_t col_buffer_size = kernel_dim * output_image_size; + + AllocatorPtr alloc; + ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc)); + + auto col_data = alloc->Alloc(sizeof(T) * col_buffer_size); + BufferUniquePtr col_buffer(col_data, BufferDeleter(alloc)); + T* col_buffer_data = static_cast(col_buffer.get()); + + const T* Xdata = X->template Data(); + T* Ydata = Y->template MutableData(); + + TensorShape image_shape = X->Shape().Slice(1); + std::vector col_buffer_shape{kernel_dim}; + col_buffer_shape.insert(col_buffer_shape.end(), output_shape.GetDims().begin(), + output_shape.GetDims().end()); + + for (int image_id = 0; image_id < N; ++image_id) { + for (int group_id = 0; group_id < group_; ++group_id) { + if (Is2DKernel) { + math::Im2col( + Xdata + group_id * X_offset, + C / group_, + input_shape[0], + input_shape[1], + kernel_shape[0], + kernel_shape[1], + 
dilations[0], + dilations[1], + pads[0], + pads[1], + pads[2], + pads[3], + strides[0], + strides[1], + col_buffer_data, + &CPUMathUtil::Instance()); + } else { + math::Im2colNd()( + Xdata + group_id * X_offset, + image_shape.GetDims().data(), + col_buffer_shape.data(), + C * input_image_size, + col_buffer_size, + kernel_shape.data(), + strides.data(), + dilations.data(), + pads.data(), + static_cast(kernel_shape.size()), + col_buffer_data, + &CPUMathUtil::Instance()); + } + math::Gemm( + CblasNoTrans, + CblasNoTrans, + M / group_, + output_image_size, + kernel_dim, + 1, + W->template Data() + group_id * W_offset, + col_buffer_data, + 0, + Ydata + group_id * Y_offset, + &CPUMathUtil::Instance()); + } + + if (B != nullptr) { + auto Ymatrix = EigenMatrixMap(Ydata, output_image_size, M); + auto Bvec = ConstEigenVectorMap(B->template Data(), M); + Ymatrix.rowwise() += Bvec.transpose(); + } + + Xdata += X_offset * group_; + Ydata += Y_offset * group_; + } + + return Status::OK(); +} + Status Conv::Compute(OpKernelContext* context) const { size_t num_inputs = OpKernel::Node().InputDefs().size(); const auto* X = context->Input(0); @@ -45,27 +180,12 @@ Status Conv::Compute(OpKernelContext* context) const { ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc)); const auto* Xdata = X->template Data(); + const auto* Bdata = B != nullptr ? B->template Data() : nullptr; auto* Ydata = Y->template MutableData(); const size_t kernel_rank = kernel_shape.size(); if (kernel_rank == 2 || kernel_rank == 3) { - MLAS_ACTIVATION Activation; - if (activation_.empty()) { - Activation.ActivationKind = MlasIdentityActivation; - } else if (activation_ == "Relu") { - Activation.ActivationKind = MlasReluActivation; - } else if (activation_ == "LeakyRelu") { - Activation.ActivationKind = MlasLeakyReluActivation; - Activation.alpha = alpha_; - } else if (activation_ == "Tanh") { - Activation.ActivationKind = MlasTanhActivation; - } else if (activation_ == "Sigmoid") { - Activation.ActivationKind = MlasLogisticActivation; - } else { - ORT_NOT_IMPLEMENTED("Not implemented fused activation: ", activation_); - } - // Get access to the internal threadpool // Temporarily derive concurrency parameters without access to session state auto ctx_internal = static_cast(context); @@ -85,7 +205,7 @@ Status Conv::Compute(OpKernelContext* context) const { strides.data(), output_shape.GetDims().data(), static_cast(M / group_), - &Activation, + &activation_, &WorkingBufferSize, const_cast(thread_pool)); @@ -95,7 +215,7 @@ Status Conv::Compute(OpKernelContext* context) const { MlasConv(&Parameters, Xdata, W->template Data(), - B != nullptr ? 
B->template Data() : nullptr, + Bdata, static_cast(working_buffer.get()), Ydata, const_cast(thread_pool)); @@ -147,13 +267,7 @@ Status Conv::Compute(OpKernelContext* context) const { &CPUMathUtil::Instance()); } - if (B != nullptr) { - auto Ymatrix = EigenMatrixMap(Ydata, output_image_size, M); - auto Bvec = ConstEigenVectorMap(B->template Data(), M); - Ymatrix.rowwise() += Bvec.transpose(); - } - - FuseActivation(activation_, Ydata, Y_offset * group_, alpha_); + MlasActivation(&activation_, Ydata, Bdata, M, output_image_size, output_image_size); Xdata += X_offset * group_; Ydata += Y_offset * group_; @@ -168,4 +282,5 @@ ONNX_CPU_OPERATOR_KERNEL( 1, KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), Conv); + } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/nn/conv.h b/onnxruntime/core/providers/cpu/nn/conv.h index cf6484ab417a2..3e366e1b49775 100644 --- a/onnxruntime/core/providers/cpu/nn/conv.h +++ b/onnxruntime/core/providers/cpu/nn/conv.h @@ -1,23 +1,10 @@ -/** -* Copyright (c) 2016-present, Facebook, Inc. -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ -/* Modifications Copyright (c) Microsoft. */ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. #pragma once #include "core/providers/cpu/nn/conv_base.h" +#include "core/mlas/inc/mlas.h" namespace onnxruntime { @@ -30,4 +17,17 @@ class Conv : public OpKernel, public ConvBase { Status Compute(OpKernelContext* context) const override; }; +template <> +class Conv : public OpKernel, public ConvBase { + public: + Conv(const OpKernelInfo& info) : OpKernel(info), ConvBase(info) { + activation_.ActivationKind = MlasIdentityActivation; + } + + Status Compute(OpKernelContext* context) const override; + + protected: + MLAS_ACTIVATION activation_; +}; + } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/nn/conv_impl.h b/onnxruntime/core/providers/cpu/nn/conv_impl.h deleted file mode 100644 index 679630f4654f3..0000000000000 --- a/onnxruntime/core/providers/cpu/nn/conv_impl.h +++ /dev/null @@ -1,153 +0,0 @@ -/** -* Copyright (c) 2016-present, Facebook, Inc. -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ -/* Modifications Copyright (c) Microsoft. 
*/ - -#pragma once - -#include "core/providers/cpu/nn/conv.h" -#include "core/util/math.h" -#include "core/util/math_cpuonly.h" -#include "core/mlas/inc/mlas.h" - -namespace onnxruntime { - -template -Status Conv::Compute(OpKernelContext* context) const { - size_t num_inputs = OpKernel::Node().InputDefs().size(); - - const auto* X = context->Input(0); - const auto* W = context->Input(1); - const Tensor* B = num_inputs == 3 ? context->Input(2) : nullptr; - const int64_t N = X->Shape()[0]; - const int64_t C = X->Shape()[1]; - const int64_t M = W->Shape()[0]; - ORT_RETURN_IF_ERROR(ValidateInputShape(X, W)); - - std::vector kernel_shape; - ORT_RETURN_IF_ERROR(ComputeKernelShape(W->Shape(), kernel_shape)); - - bool Is2DKernel = kernel_shape.size() == 2; - std::vector pads(pads_); - if (pads.empty()) { - pads.resize(kernel_shape.size() * 2, 0); - } - std::vector dilations(dilations_); - if (dilations.empty()) { - dilations.resize(kernel_shape.size(), 1); - } - std::vector strides(strides_); - if (strides.empty()) { - strides.resize(kernel_shape.size(), 1); - } - - std::vector Y_dims; - Y_dims.insert(Y_dims.begin(), {N, M}); - TensorShape input_shape = X->Shape().Slice(2); - ORT_RETURN_IF_ERROR(InferOutputShape(input_shape, kernel_shape, strides, dilations, &pads, &Y_dims)); - Tensor* Y = context->Output(0, TensorShape(Y_dims)); - TensorShape output_shape = Y->Shape().Slice(2); - - const int64_t input_image_size = input_shape.Size(); - const int64_t output_image_size = output_shape.Size(); - const int64_t kernel_size = TensorShape(kernel_shape).Size(); - const int64_t X_offset = C / group_ * input_image_size; - const int64_t Y_offset = Y->Shape().Size() / Y->Shape()[0] / group_; - const int64_t W_offset = W->Shape().Size() / group_; - const int64_t kernel_dim = C / group_ * kernel_size; - const int64_t col_buffer_size = kernel_dim * output_image_size; - - AllocatorPtr alloc; - ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc)); - - auto col_data = alloc->Alloc(sizeof(T) * col_buffer_size); - BufferUniquePtr col_buffer(col_data, BufferDeleter(alloc)); - T* col_buffer_data = static_cast(col_buffer.get()); - - const T* Xdata = X->template Data(); - T* Ydata = Y->template MutableData(); - - TensorShape image_shape = X->Shape().Slice(1); - std::vector col_buffer_shape{kernel_dim}; - col_buffer_shape.insert(col_buffer_shape.end(), output_shape.GetDims().begin(), - output_shape.GetDims().end()); - - for (int image_id = 0; image_id < N; ++image_id) { - for (int group_id = 0; group_id < group_; ++group_id) { - if (Is2DKernel) { - math::Im2col( - Xdata + group_id * X_offset, - C / group_, - input_shape[0], - input_shape[1], - kernel_shape[0], - kernel_shape[1], - dilations[0], - dilations[1], - pads[0], - pads[1], - pads[2], - pads[3], - strides[0], - strides[1], - col_buffer_data, - &CPUMathUtil::Instance()); - } else { - math::Im2colNd()( - Xdata + group_id * X_offset, - image_shape.GetDims().data(), - col_buffer_shape.data(), - C * input_image_size, - col_buffer_size, - kernel_shape.data(), - strides.data(), - dilations.data(), - pads.data(), - static_cast(kernel_shape.size()), - col_buffer_data, - &CPUMathUtil::Instance()); - } - math::Gemm( - CblasNoTrans, - CblasNoTrans, - M / group_, - output_image_size, - kernel_dim, - 1, - W->template Data() + group_id * W_offset, - col_buffer_data, - 0, - Ydata + group_id * Y_offset, - &CPUMathUtil::Instance()); - } - - if (B != nullptr) { - auto Ymatrix = EigenMatrixMap(Ydata, output_image_size, M); - auto Bvec = ConstEigenVectorMap(B->template Data(), 
M); - Ymatrix.rowwise() += Bvec.transpose(); - } - FuseActivation(activation_, Ydata, Y_offset * group_, alpha_); - - Xdata += X_offset * group_; - Ydata += Y_offset * group_; - } - - return Status::OK(); -} - -template <> -Status Conv::Compute(OpKernelContext* context) const; - -} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/nn/pool.cc b/onnxruntime/core/providers/cpu/nn/pool.cc index d37424c3ed3f9..367a9256a0c16 100644 --- a/onnxruntime/core/providers/cpu/nn/pool.cc +++ b/onnxruntime/core/providers/cpu/nn/pool.cc @@ -3,7 +3,7 @@ #include "core/framework/op_kernel_context_internal.h" #include "core/providers/cpu/nn/pool.h" -#include + using namespace ::onnxruntime::common; namespace onnxruntime { @@ -25,7 +25,7 @@ Status Pool::Compute(OpKernelContext* context) const { } std::vector output_dims = PoolBase::SetOutputSize(x_shape, x_shape[1], &pads, dilations_, ceil_mode_); - Tensor* Y = context->Output(0, TensorShape(output_dims)); + Tensor* Y = context->Output(0, output_dims); const auto* X_data = X->template Data(); auto* Y_data = Y->template MutableData(); @@ -185,7 +185,7 @@ Status PoolBase::Compute(OpKernelContext* context, MLAS_POOLING_KIND kind) const std::vector pads = pads_; std::vector output_dims = PoolBase::SetOutputSize(x_shape, x_shape[1], &pads, dilations_, ceil_mode_); - Tensor* Y = context->Output(0, TensorShape(output_dims)); + Tensor* Y = context->Output(0, output_dims); // Get access to the internal threadpool // Temporarily derive concurrency parameters without access to session state @@ -222,8 +222,9 @@ Status Pool>::Compute(OpKernelContext* context) co // and also if dilation is not required bool need_dilation = false; - for (auto n : dilations_) + for (auto n : dilations_) { need_dilation |= n > 1; + } if (OpKernel::Node().OutputDefs().size() == 1 && !need_dilation) { return PoolBase::Compute(context, MlasMaximumPooling); @@ -238,8 +239,8 @@ Status Pool>::Compute(OpKernelContext* context) co std::vector kernel_shape = kernel_shape_; std::vector output_dims = PoolBase::SetOutputSize(x_shape, x_shape[1], &pads, dilations_, ceil_mode_); - Tensor* Y = context->Output(0, TensorShape(output_dims)); - Tensor* I = context->Output(1, TensorShape(output_dims)); + Tensor* Y = context->Output(0, output_dims); + Tensor* I = context->Output(1, output_dims); const auto* X_data = X->template Data(); auto* Y_data = Y->template MutableData(); @@ -270,14 +271,15 @@ Status Pool>::Compute(OpKernelContext* context) co int64_t* i_d = I_data ? 
I_data + c * y_step : nullptr; for (int64_t ph = 0; ph < pooled_height; ++ph) { int64_t hstart = ph * stride_h() - pads[0]; - int64_t hend = std::min(hstart + kernel_shape[0] * dilation_h - dilation_h + 1, height); - hstart = std::max(hstart, static_cast(0)); + int64_t hend = hstart + kernel_shape[0] * dilation_h; float Yh = std::numeric_limits::lowest(); int64_t h_index = -1; for (int64_t h = hstart; h < hend; h += dilation_h) { - if (x_d[h] > Yh) { - Yh = x_d[h]; - h_index = h; + if (math::is_a_ge_zero_and_a_lt_b(h, height)) { + if (x_d[h] > Yh) { + Yh = x_d[h]; + h_index = h; + } } } y_d[ph] = Yh; @@ -305,23 +307,25 @@ Status Pool>::Compute(OpKernelContext* context) co for (int64_t ph = 0; ph < pooled_height; ++ph) { int64_t hstart = ph * stride_h() - pads[0]; - int64_t hend = std::min(hstart + kernel_shape[0] * dilation_h - dilation_h + 1, height); - hstart = std::max(hstart, static_cast(0)); + int64_t hend = hstart + kernel_shape[0] * dilation_h; for (int64_t pw = 0; pw < pooled_width; ++pw) { int64_t wstart = pw * stride_w() - pads[1]; - int64_t wend = std::min(wstart + kernel_shape[1] * dilation_w - dilation_w + 1, width); - wstart = std::max(wstart, static_cast(0)); + int64_t wend = wstart + kernel_shape[1] * dilation_w; const int64_t pool_index = ph * pooled_width + pw; float Yh = std::numeric_limits::lowest(); int64_t h_index = -1; int64_t w_index = -1; for (int64_t h = hstart; h < hend; h += dilation_h) { - for (int64_t w = wstart; w < wend; w += dilation_w) { - const int64_t input_index = h * width + w; - if (x_d[input_index] > Yh) { - Yh = x_d[input_index]; - h_index = h; - w_index = w; + if (math::is_a_ge_zero_and_a_lt_b(h, height)) { + for (int64_t w = wstart; w < wend; w += dilation_w) { + if (math::is_a_ge_zero_and_a_lt_b(w, width)) { + const int64_t input_index = h * width + w; + if (x_d[input_index] > Yh) { + Yh = x_d[input_index]; + h_index = h; + w_index = w; + } + } } } } @@ -353,16 +357,13 @@ Status Pool>::Compute(OpKernelContext* context) co for (int64_t ph = 0; ph < pooled_height; ++ph) { int64_t hstart = ph * stride_h() - pads[0]; - int64_t hend = std::min(hstart + kernel_shape[0] * dilation_h - dilation_h + 1, height); - hstart = std::max(hstart, static_cast(0)); + int64_t hend = hstart + kernel_shape[0] * dilation_h; for (int64_t pw = 0; pw < pooled_width; ++pw) { int64_t wstart = pw * stride_w() - pads[1]; - int64_t wend = std::min(wstart + kernel_shape[1] * dilation_w - dilation_w + 1, width); - wstart = std::max(wstart, static_cast(0)); + int64_t wend = wstart + kernel_shape[1] * dilation_w; for (int64_t pd = 0; pd < pooled_depth; ++pd) { int64_t dstart = pd * stride_d() - pads[2]; - int64_t dend = std::min(dstart + kernel_shape[2] * dilation_d - dilation_d + 1, depth); - dstart = std::max(dstart, static_cast(0)); + int64_t dend = dstart + kernel_shape[2] * dilation_d; const int64_t pool_index = ph * pooled_width * pooled_depth + pw * pooled_depth + pd; float Yh = std::numeric_limits::lowest(); @@ -370,14 +371,20 @@ Status Pool>::Compute(OpKernelContext* context) co int64_t w_index = -1; int64_t d_index = -1; for (int64_t h = hstart; h < hend; h += dilation_h) { - for (int64_t w = wstart; w < wend; w += dilation_w) { - for (int64_t d = dstart; d < dend; d += dilation_d) { - const int64_t input_index = h * width * depth + w * depth + d; - if (x_d[input_index] > Yh) { - Yh = x_d[input_index]; - h_index = h; - w_index = w; - d_index = d; + if (math::is_a_ge_zero_and_a_lt_b(h, height)) { + for (int64_t w = wstart; w < wend; w += dilation_w) { + if 
(math::is_a_ge_zero_and_a_lt_b(w, width)) { + for (int64_t d = dstart; d < dend; d += dilation_d) { + if (math::is_a_ge_zero_and_a_lt_b(d, depth)) { + const int64_t input_index = h * width * depth + w * depth + d; + if (x_d[input_index] > Yh) { + Yh = x_d[input_index]; + h_index = h; + w_index = w; + d_index = d; + } + } + } } } } diff --git a/onnxruntime/core/providers/cpu/nn/pool_base.h b/onnxruntime/core/providers/cpu/nn/pool_base.h index 11b70ac364b6d..43f81982dd3a9 100644 --- a/onnxruntime/core/providers/cpu/nn/pool_base.h +++ b/onnxruntime/core/providers/cpu/nn/pool_base.h @@ -7,6 +7,7 @@ #include "core/common/common.h" #include "core/framework/op_kernel.h" #include "core/providers/cpu/nn/autopad_type.h" +#include "core/util/math.h" #include "core/mlas/inc/mlas.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/cpu/nn/shrink.cc b/onnxruntime/core/providers/cpu/nn/shrink.cc index e89b437a2f7ff..6af09fba42300 100644 --- a/onnxruntime/core/providers/cpu/nn/shrink.cc +++ b/onnxruntime/core/providers/cpu/nn/shrink.cc @@ -11,7 +11,9 @@ namespace onnxruntime { ONNX_CPU_OPERATOR_KERNEL( Shrink, 9, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::AllNumericTensorTypes()), + KernelDefBuilder() + .MayInplace(0, 0) + .TypeConstraint("T", DataTypeImpl::AllNumericTensorTypes()), Shrink); namespace shrink_internal { diff --git a/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.cc b/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.cc index 04b28b4b2da46..66084547810ad 100644 --- a/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.cc +++ b/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.cc @@ -12,6 +12,7 @@ limitations under the License. /* Modifications Copyright (c) Microsoft. 
*/ #include "non_max_suppression.h" +#include "non_max_suppression_helper.h" #include namespace onnxruntime { @@ -24,128 +25,94 @@ ONNX_OPERATOR_KERNEL_EX( KernelDefBuilder(), NonMaxSuppression); -void NonMaxSuppression::MaxMin(const float& lhs, const float& rhs, float& min, float& max) const { - if (lhs >= rhs) { - min = rhs; - max = lhs; - } else { - min = lhs; - max = rhs; +using namespace nms_helpers; + +// CPU version +namespace nms_helpers { +Status GetThresholdsFromInputs(const PrepareContext& pc, + int64_t& max_output_boxes_per_class, + float& iou_threshold, + float& score_threshold) { + if (pc.max_output_boxes_per_class_ != nullptr) { + max_output_boxes_per_class = std::max(*pc.max_output_boxes_per_class_, 0); } -} -bool NonMaxSuppression::SuppressByIOU(const float* boxes_data, int64_t box_index1, int64_t box_index2, float iou_threshold) const { - float x1_min; - float y1_min; - float x1_max; - float y1_max; - float x2_min; - float y2_min; - float x2_max; - float y2_max; - // center_point_box_ only support 0 or 1 - if (0 == center_point_box_) { - // boxes data format [y1, x1, y2, x2], - MaxMin(boxes_data[4 * box_index1 + 1], boxes_data[4 * box_index1 + 3], x1_min, x1_max); - MaxMin(boxes_data[4 * box_index1 + 0], boxes_data[4 * box_index1 + 2], y1_min, y1_max); - MaxMin(boxes_data[4 * box_index2 + 1], boxes_data[4 * box_index2 + 3], x2_min, x2_max); - MaxMin(boxes_data[4 * box_index2 + 0], boxes_data[4 * box_index2 + 2], y2_min, y2_max); - } else { - // 1 == center_point_box_ => boxes data format [x_center, y_center, width, height] - float box1_width_half = boxes_data[4 * box_index1 + 2] / 2; - float box1_height_half = boxes_data[4 * box_index1 + 3] / 2; - float box2_width_half = boxes_data[4 * box_index2 + 2] / 2; - float box2_height_half = boxes_data[4 * box_index2 + 3] / 2; - - x1_min = boxes_data[4 * box_index1 + 0] - box1_width_half; - x1_max = boxes_data[4 * box_index1 + 0] + box1_width_half; - y1_min = boxes_data[4 * box_index1 + 1] - box1_height_half; - y1_max = boxes_data[4 * box_index1 + 1] + box1_height_half; - - x2_min = boxes_data[4 * box_index2 + 0] - box2_width_half; - x2_max = boxes_data[4 * box_index2 + 0] + box2_width_half; - y2_min = boxes_data[4 * box_index2 + 1] - box2_height_half; - y2_max = boxes_data[4 * box_index2 + 1] + box2_height_half; + if (pc.iou_threshold_ != nullptr) { + iou_threshold = *pc.iou_threshold_; + ORT_RETURN_IF_NOT((iou_threshold >= 0 && iou_threshold <= 1.f), "iou_threshold must be in range [0, 1]."); } - const float intersection_x_min = std::max(x1_min, x2_min); - const float intersection_y_min = std::max(y1_min, y2_min); - const float intersection_x_max = std::min(x1_max, x2_max); - const float intersection_y_max = std::min(y1_max, y2_max); + if (pc.score_threshold_ != nullptr) { + score_threshold = *pc.score_threshold_; + } - const float intersection_area = std::max(intersection_x_max - intersection_x_min, static_cast(0.0)) * - std::max(intersection_y_max - intersection_y_min, static_cast(0.0)); + return Status::OK(); +} +} // namespace nms_helpers - if (intersection_area <= static_cast(0.0)) { - return false; - } +Status NonMaxSuppressionBase::PrepareCompute(OpKernelContext* ctx, PrepareContext& pc) { + const auto* boxes_tensor = ctx->Input(0); + ORT_ENFORCE(boxes_tensor); + pc.boxes_data_ = boxes_tensor->Data(); - const float area1 = (x1_max - x1_min) * (y1_max - y1_min); - const float area2 = (x2_max - x2_min) * (y2_max - y2_min); - const float union_area = area1 + area2 - intersection_area; + const auto* scores_tensor = 
ctx->Input(1); + ORT_ENFORCE(scores_tensor); + pc.scores_data_ = scores_tensor->Data(); - if (area1 <= static_cast(0.0) || area2 <= static_cast(0.0) || union_area <= static_cast(0.0)) { - return false; + const auto num_inputs = ctx->InputCount(); + + if (num_inputs > 2) { + const auto* max_output_boxes_per_class_tensor = ctx->Input(2); + if (max_output_boxes_per_class_tensor != nullptr) { + pc.max_output_boxes_per_class_ = max_output_boxes_per_class_tensor->Data(); + } } - const float intersection_over_union = intersection_area / union_area; + if (num_inputs > 3) { + const auto* iou_threshold_tensor = ctx->Input(3); + if (iou_threshold_tensor != nullptr) { + pc.iou_threshold_ = iou_threshold_tensor->Data(); + } + } - return intersection_over_union > iou_threshold; -} + if (num_inputs > 4) { + const auto* score_threshold_tensor = ctx->Input(4); + if (score_threshold_tensor != nullptr) { + pc.score_threshold_ = score_threshold_tensor->Data(); + } + } + + const auto& boxes_shape = boxes_tensor->Shape(); + pc.boxes_size_ = boxes_shape.Size(); + const auto& scores_shape = scores_tensor->Shape(); + pc.scores_size_ = scores_shape.Size(); -Status NonMaxSuppression::ParepareCompute(OpKernelContext* ctx, const TensorShape& boxes_shape, const TensorShape& scores_shape, - int64_t& max_output_boxes_per_class, float& iou_threshold, float& score_threshold, bool& has_score_threshold) const { ORT_RETURN_IF_NOT(boxes_shape.NumDimensions() == 3, "boxes must be a 3D tensor."); ORT_RETURN_IF_NOT(scores_shape.NumDimensions() == 3, "scores must be a 3D tensor."); auto boxes_dims = boxes_shape.GetDims(); auto scores_dims = scores_shape.GetDims(); ORT_RETURN_IF_NOT(boxes_dims[0] == scores_dims[0], "boxes and scores should have same num_batches."); - ORT_RETURN_IF_NOT(boxes_dims[1] == scores_dims[2], "boxes and scores should have same spatial_dimention."); + ORT_RETURN_IF_NOT(boxes_dims[1] == scores_dims[2], "boxes and scores should have same spatial_dimension."); ORT_RETURN_IF_NOT(boxes_dims[2] == 4, "The most inner dimension in boxes must have 4 data."); - const_cast(num_batches_) = boxes_dims[0]; - const_cast(num_classes_) = scores_dims[1]; - const_cast(num_boxes_) = boxes_dims[1]; - - const auto* max_output_boxes_per_class_tensor = ctx->Input(2); - if (max_output_boxes_per_class_tensor != nullptr) { - max_output_boxes_per_class = *(max_output_boxes_per_class_tensor->Data()); - max_output_boxes_per_class = max_output_boxes_per_class > 0 ? 
max_output_boxes_per_class : 0; - } - - const auto* iou_threshold_tensor = ctx->Input(3); - if (iou_threshold_tensor != nullptr) { - iou_threshold = *(iou_threshold_tensor->Data()); - ORT_RETURN_IF_NOT((iou_threshold >= 0 && iou_threshold <= 1), "iou_threshold must be in range [0, 1]."); - } - - const auto* score_threshold_tensor = ctx->Input(4); - if (score_threshold_tensor != nullptr) { - has_score_threshold = true; - score_threshold = *(score_threshold_tensor->Data()); - } + pc.num_batches_ = boxes_dims[0]; + pc.num_classes_ = scores_dims[1]; + pc.num_boxes_ = boxes_dims[1]; return Status::OK(); } Status NonMaxSuppression::Compute(OpKernelContext* ctx) const { - const auto* boxes = ctx->Input(0); - ORT_ENFORCE(boxes); - const auto* scores = ctx->Input(1); - ORT_ENFORCE(scores); - - auto& boxes_shape = boxes->Shape(); - auto& scores_shape = scores->Shape(); + PrepareContext pc; + auto ret = PrepareCompute(ctx, pc); + ORT_RETURN_IF_NOT(ret.IsOK(), ret.ErrorMessage()); int64_t max_output_boxes_per_class = 0; - float iou_threshold = 0; - // Not so sure for the value range of score_threshold, so set a bool to indicate whether it has this input - bool has_score_threshold = false; - float score_threshold = 0; + float iou_threshold = .0f; + float score_threshold = .0f; - auto ret = ParepareCompute(ctx, boxes_shape, scores_shape, max_output_boxes_per_class, - iou_threshold, score_threshold, has_score_threshold); + ret = GetThresholdsFromInputs(pc, max_output_boxes_per_class, iou_threshold, score_threshold); ORT_RETURN_IF_NOT(ret.IsOK(), ret.ErrorMessage()); if (0 == max_output_boxes_per_class) { @@ -153,63 +120,78 @@ Status NonMaxSuppression::Compute(OpKernelContext* ctx) const { return Status::OK(); } - const auto* boxes_data = boxes->Data(); - const auto* scores_data = scores->Data(); + const auto* const boxes_data = pc.boxes_data_; + const auto* const scores_data = pc.scores_data_; struct ScoreIndexPair { - float score; - int64_t index; - }; + float score_{}; + int64_t index_{}; + + ScoreIndexPair() = default; + explicit ScoreIndexPair(float score, int64_t idx) : score_(score), index_(idx) {} - auto LessCompare = [](const ScoreIndexPair& lhs, const ScoreIndexPair& rhs) { - return lhs.score < rhs.score; + bool operator<(const ScoreIndexPair& rhs) const { + return score_ < rhs.score_; + } }; - std::vector tmp_selected_indices; - for (int64_t batch_index = 0; batch_index < num_batches_; ++batch_index) { - for (int64_t class_index = 0; class_index < num_classes_; ++class_index) { - int64_t box_score_offset = (batch_index * num_classes_ + class_index) * num_boxes_; - int64_t box_offset = batch_index * num_classes_ * num_boxes_ * 4; + const auto center_point_box = GetCenterPointBox(); + + std::vector selected_indices; + for (int64_t batch_index = 0; batch_index < pc.num_batches_; ++batch_index) { + for (int64_t class_index = 0; class_index < pc.num_classes_; ++class_index) { + int64_t box_score_offset = (batch_index * pc.num_classes_ + class_index) * pc.num_boxes_; + int64_t box_offset = batch_index * pc.num_classes_ * pc.num_boxes_ * 4; // Filter by score_threshold_ - std::priority_queue, decltype(LessCompare)> sorted_scores_with_index(LessCompare); - for (int64_t box_index = 0; box_index < num_boxes_; ++box_index) { - if (!has_score_threshold || (has_score_threshold && scores_data[box_score_offset + box_index] > score_threshold)) { - sorted_scores_with_index.emplace(ScoreIndexPair({scores_data[box_score_offset + box_index], box_index})); + std::priority_queue> sorted_scores_with_index; + const 
auto* class_scores = scores_data + box_score_offset; + if (pc.score_threshold_ != nullptr) { + for (int64_t box_index = 0; box_index < pc.num_boxes_; ++box_index, ++class_scores) { + if (*class_scores > score_threshold) { + sorted_scores_with_index.push(ScoreIndexPair(*class_scores, box_index)); + } + } + } else { + for (int64_t box_index = 0; box_index < pc.num_boxes_; ++box_index, ++class_scores) { + sorted_scores_with_index.push(ScoreIndexPair(*class_scores, box_index)); } } ScoreIndexPair next_top_score; std::vector selected_indicies_inside_class; - // Get the next box with top score, filter by iou_threshold_ + // Get the next box with top score, filter by iou_threshold while (!sorted_scores_with_index.empty()) { next_top_score = sorted_scores_with_index.top(); sorted_scores_with_index.pop(); bool selected = true; // Check with existing selected boxes for this class, suppress if exceed the IOU (Intersection Over Union) threshold - for (int64_t selected_indicies_inside_clas : selected_indicies_inside_class) { - if (SuppressByIOU(boxes_data + box_offset, selected_indicies_inside_clas, next_top_score.index, - iou_threshold)) { + for (int64_t selected_index : selected_indicies_inside_class) { + if (SuppressByIOU(boxes_data + box_offset, selected_index, next_top_score.index_, + center_point_box, iou_threshold)) { selected = false; break; } } if (selected) { - if (max_output_boxes_per_class > 0 && static_cast(selected_indicies_inside_class.size()) >= max_output_boxes_per_class) { + if (max_output_boxes_per_class > 0 && + static_cast(selected_indicies_inside_class.size()) >= max_output_boxes_per_class) { break; } - selected_indicies_inside_class.push_back(next_top_score.index); - tmp_selected_indices.emplace_back(batch_index, class_index, next_top_score.index); + selected_indicies_inside_class.push_back(next_top_score.index_); + selected_indices.emplace_back(batch_index, class_index, next_top_score.index_); } } //while } //for class_index } //for batch_index - auto num_selected = static_cast(tmp_selected_indices.size()); - Tensor* selected_indices = ctx->Output(0, {num_selected, 3}); - ORT_ENFORCE(selected_indices); - memcpy(selected_indices->MutableData(), tmp_selected_indices.data(), num_selected * sizeof(selected_index)); + const auto last_dim = 3; + const auto num_selected = selected_indices.size(); + Tensor* output = ctx->Output(0, {static_cast(num_selected), last_dim}); + ORT_ENFORCE(output != nullptr); + static_assert(last_dim * sizeof(int64_t) == sizeof(SelectedIndex), "Possible modification of SelectedIndex"); + memcpy(output->MutableData(), selected_indices.data(), num_selected * sizeof(SelectedIndex)); return Status::OK(); } diff --git a/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.h b/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.h index d3846280133e8..37578539cda58 100644 --- a/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.h +++ b/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.h @@ -8,37 +8,30 @@ namespace onnxruntime { -class NonMaxSuppression final : public OpKernel { - public: - NonMaxSuppression(const OpKernelInfo& info) : OpKernel(info) { +struct PrepareContext; + +class NonMaxSuppressionBase { + protected: + explicit NonMaxSuppressionBase(const OpKernelInfo& info) { center_point_box_ = info.GetAttrOrDefault("center_point_box", 0); ORT_ENFORCE(0 == center_point_box_ || 1 == center_point_box_, "center_point_box only support 0 or 1"); - num_batches_ = 0; - num_classes_ = 0; - num_boxes_ = 
0; } - Status Compute(OpKernelContext* context) const override; + static Status PrepareCompute(OpKernelContext* ctx, PrepareContext& pc); - private: - bool SuppressByIOU(const float* boxes_data, int64_t box_index1, int64_t box_index2, float iou_threshold) const; - void MaxMin(const float& lhs, const float& rhs, float& min, float& max) const; - Status ParepareCompute(OpKernelContext* ctx, const TensorShape& boxes_shape, const TensorShape& scores_shape, - int64_t& max_output_boxes_per_batch, float& iou_threshold, float& score_threshold, bool& has_score_threshold) const; + int64_t GetCenterPointBox() const { + return center_point_box_; + } private: int64_t center_point_box_; +}; - int64_t num_batches_; - int64_t num_classes_; - int64_t num_boxes_; - - struct selected_index { - selected_index(int64_t batch_index, int64_t class_index, int64_t box_index) - : batch_index_(batch_index), class_index_(class_index), box_index_(box_index) {} - int64_t batch_index_ = 0; - int64_t class_index_ = 0; - int64_t box_index_ = 0; - }; +class NonMaxSuppression final : public OpKernel, public NonMaxSuppressionBase { + public: + explicit NonMaxSuppression(const OpKernelInfo& info) : OpKernel(info), NonMaxSuppressionBase(info) { + } + + Status Compute(OpKernelContext* context) const override; }; } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/object_detection/non_max_suppression_helper.h b/onnxruntime/core/providers/cpu/object_detection/non_max_suppression_helper.h new file mode 100644 index 0000000000000..40b0f296182eb --- /dev/null +++ b/onnxruntime/core/providers/cpu/object_detection/non_max_suppression_helper.h @@ -0,0 +1,129 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include + +#ifdef __NVCC__ +#include "core/providers/cuda/cu_inc/common.cuh" +#define ORT_DEVICE __device__ +#define HelperMin(a, b) _Min(a, b) +#define HelperMax(a, b) _Max(a, b) +#else +#include +#define ORT_DEVICE +#define HelperMin(a, b) std::min(a, b) +#define HelperMax(a, b) std::max(a, b) +#endif + +namespace onnxruntime { + +struct PrepareContext { + const float* boxes_data_ = nullptr; + int64_t boxes_size_ = 0ll; + const float* scores_data_ = nullptr; + int64_t scores_size_ = 0ll; + // The below are ptrs since they cab be device specific + const int64_t* max_output_boxes_per_class_ = nullptr; + const float* score_threshold_ = nullptr; + const float* iou_threshold_ = nullptr; + int64_t num_batches_ = 0; + int64_t num_classes_ = 0; + int64_t num_boxes_ = 0; +}; + +struct SelectedIndex { + ORT_DEVICE + SelectedIndex(int64_t batch_index, int64_t class_index, int64_t box_index) + : batch_index_(batch_index), class_index_(class_index), box_index_(box_index) {} + SelectedIndex() = default; + int64_t batch_index_ = 0; + int64_t class_index_ = 0; + int64_t box_index_ = 0; +}; + +#ifdef __NVCC__ +namespace cuda { +#endif +namespace nms_helpers { + +ORT_DEVICE +inline void MaxMin(float lhs, float rhs, float& min, float& max) { + if (lhs >= rhs) { + min = rhs; + max = lhs; + } else { + min = lhs; + max = rhs; + } +} + +ORT_DEVICE +inline bool SuppressByIOU(const float* boxes_data, int64_t box_index1, int64_t box_index2, + int64_t center_point_box, float iou_threshold) { + float x1_min{}; + float y1_min{}; + float x1_max{}; + float y1_max{}; + float x2_min{}; + float y2_min{}; + float x2_max{}; + float y2_max{}; + + const float* box1 = boxes_data + 4 * box_index1; + const float* box2 = boxes_data + 4 * box_index2; + // center_point_box_ only support 
0 or 1 + if (0 == center_point_box) { + // boxes data format [y1, x1, y2, x2], + MaxMin(box1[1], box1[3], x1_min, x1_max); + MaxMin(box1[0], box1[2], y1_min, y1_max); + MaxMin(box2[1], box2[3], x2_min, x2_max); + MaxMin(box2[0], box2[2], y2_min, y2_max); + } else { + // 1 == center_point_box_ => boxes data format [x_center, y_center, width, height] + float box1_width_half = box1[2] / 2; + float box1_height_half = box1[3] / 2; + float box2_width_half = box2[2] / 2; + float box2_height_half = box2[3] / 2; + + x1_min = box1[0] - box1_width_half; + x1_max = box1[0] + box1_width_half; + y1_min = box1[1] - box1_height_half; + y1_max = box1[1] + box1_height_half; + + x2_min = box2[0] - box2_width_half; + x2_max = box2[0] + box2_width_half; + y2_min = box2[1] - box2_height_half; + y2_max = box2[1] + box2_height_half; + } + + const float intersection_x_min = HelperMax(x1_min, x2_min); + const float intersection_y_min = HelperMax(y1_min, y2_min); + const float intersection_x_max = HelperMin(x1_max, x2_max); + const float intersection_y_max = HelperMin(y1_max, y2_max); + + const float intersection_area = HelperMax(intersection_x_max - intersection_x_min, .0f) * + HelperMax(intersection_y_max - intersection_y_min, .0f); + + if (intersection_area <= .0f) { + return false; + } + + const float area1 = (x1_max - x1_min) * (y1_max - y1_min); + const float area2 = (x2_max - x2_min) * (y2_max - y2_min); + const float union_area = area1 + area2 - intersection_area; + + if (area1 <= .0f || area2 <= .0f || union_area <= .0f) { + return false; + } + + const float intersection_over_union = intersection_area / union_area; + + return intersection_over_union > iou_threshold; +} +#ifdef __NVCC__ +} // namespace cuda +#endif +} // nms_helpers +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc b/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc index 007752f83da60..8c9143a238868 100644 --- a/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc @@ -22,6 +22,14 @@ namespace onnxruntime { KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), \ x); +#define REGISTER_UNARY_ELEMENTWISE_KERNEL_DOUBLE_ONLY(x, sinceVersion) \ + ONNX_CPU_OPERATOR_TYPED_KERNEL( \ + x, \ + sinceVersion, \ + double, \ + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + x); + REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceL1, 1); REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceL2, 1); REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceLogSum, 1); @@ -31,7 +39,9 @@ REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceMean, 1); REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceMin, 1); REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceProd, 1); REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceSum, 1); +REGISTER_UNARY_ELEMENTWISE_KERNEL_DOUBLE_ONLY(ReduceSum, 1); REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceSumSquare, 1); +REGISTER_UNARY_ELEMENTWISE_KERNEL_DOUBLE_ONLY(ReduceSumSquare, 1); REGISTER_UNARY_ELEMENTWISE_KERNEL(ArgMax, 1); REGISTER_UNARY_ELEMENTWISE_KERNEL(ArgMin, 1); diff --git a/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.cc b/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.cc index 3c9f30f7b244a..c5be268f59e2d 100644 --- a/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.cc +++ b/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.cc @@ -238,7 +238,7 @@ class UniDirectionalGru { // #define DUMP_MATRIXES to provide lots of diagnostic output #if defined(DUMP_MATRIXES) -#define DumpMatrix(...) 
::onnxruntime::rnn::detail::DumpMatrixImpl(__VA_ARGS__) +#define DumpMatrix(...) onnxruntime::rnn::detail::DumpMatrixImpl(__VA_ARGS__) #else #define DumpMatrix(...) ((void)0) #endif @@ -363,48 +363,29 @@ Status DeepCpuGruOp::ComputeImpl(OpKernelContext& context) const { gsl::span hidden_output_2 = hidden_output.subspan(hidden_output_size_per_direction, hidden_output_size_per_direction); - std::unique_ptr> fw = std::make_unique>( - alloc, - seq_length, - batch_size, - input_size, - hidden_size_, - linear_before_reset_, - Direction::kForward, - bias_1, initial_hidden_1, - activation_funcs_.Entries()[0], - activation_funcs_.Entries()[1], - clip_); - fw->Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1, output_1, hidden_output_1); - - std::unique_ptr> bw = std::make_unique>( - alloc, - seq_length, - batch_size, - input_size, - hidden_size_, - linear_before_reset_, - Direction::kReverse, - bias_2, initial_hidden_2, - activation_funcs_.Entries()[2], - activation_funcs_.Entries()[3], - clip_); - bw->Compute(input, sequence_lens_span, num_directions_, input_weights_2, recurrent_weights_2, output_2, hidden_output_2); + detail::UniDirectionalGru fw(alloc, seq_length, batch_size, input_size, hidden_size_, + linear_before_reset_, Direction::kForward, bias_1, initial_hidden_1, + activation_funcs_.Entries()[0], + activation_funcs_.Entries()[1], + clip_); + fw.Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1, + output_1, hidden_output_1); + + detail::UniDirectionalGru bw(alloc, seq_length, batch_size, input_size, hidden_size_, + linear_before_reset_, Direction::kReverse, bias_2, initial_hidden_2, + activation_funcs_.Entries()[2], + activation_funcs_.Entries()[3], + clip_); + bw.Compute(input, sequence_lens_span, num_directions_, input_weights_2, recurrent_weights_2, + output_2, hidden_output_2); } else { - std::unique_ptr> gru_p = std::make_unique>( - alloc, - seq_length, - batch_size, - input_size, - hidden_size_, - linear_before_reset_, - direction_, - bias_1, initial_hidden_1, - activation_funcs_.Entries()[0], - activation_funcs_.Entries()[1], - clip_); - - gru_p->Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1, output_1, hidden_output_1); + detail::UniDirectionalGru gru_p(alloc, seq_length, batch_size, input_size, hidden_size_, + linear_before_reset_, direction_, bias_1, initial_hidden_1, + activation_funcs_.Entries()[0], + activation_funcs_.Entries()[1], + clip_); + gru_p.Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1, + output_1, hidden_output_1); } if (!output.empty()) @@ -610,8 +591,9 @@ void UniDirectionalGru::Compute(const gsl::span& inputs_arg, // for each item in sequence run all calculations for (int step = 0; step < max_sequence_length; step++) { +#if defined(DUMP_MATRIXES) const std::string seqno_str = " [seqno=" + std::to_string(step) + "]"; - +#endif DumpMatrix("Ht-1" + seqno_str, &*prev_Ht, batch_size_, hidden_size_); out_added_offset = (step * batch_size_) * hidden_size_x3; @@ -676,7 +658,9 @@ void UniDirectionalGru::Compute(const gsl::span& inputs_arg, } } +#if defined(DUMP_MATRIXES) std::string label = linear_before_reset_ ? "rt (.) (Ht-1 * (Rh^T) + Rbh)" : "rt (.) 
Ht-1"; +#endif DumpMatrix(label + seqno_str, &*cur_h_local, batch_size_, hidden_size_); if (linear_before_reset_) { @@ -695,7 +679,9 @@ void UniDirectionalGru::Compute(const gsl::span& inputs_arg, } } } else { +#if defined(DUMP_MATRIXES) label += " * Rh^T"; +#endif // out_H currently contains Xt*(Wh^T). auto out_H = outputZRH_.begin() + out_added_offset + hidden_size_x2; @@ -727,9 +713,11 @@ void UniDirectionalGru::Compute(const gsl::span& inputs_arg, for (int r = 0; r < batch_size_; r++) { if (step >= min_sequence_length && step >= sequence_lengths[r]) { - if (output_sequence) { + // if we need output for every step, + // or we need to set prev_Ht for an empty sequence to avoid warnings about using uninitialized values + if (output_sequence || (step == 0 && sequence_lengths[r] == 0)) { auto fill_output = output + r * hidden_size_; - std::fill_n(fill_output, hidden_size_, T{}); + std::fill_n(&*fill_output, hidden_size_, T{}); } continue; @@ -791,28 +779,29 @@ void UniDirectionalGru::Compute(const gsl::span& inputs_arg, // copy last output to final_hidden_state for (int i = 0; i < batch_size_; i++) { const int seq_len = sequence_lengths[i]; - if (seq_len == 0) { - auto final_hidden_state_dst = final_hidden_state.begin() + i * hidden_size_; - std::fill_n(final_hidden_state_dst, hidden_size_, T{}); - continue; - } if (output_sequence) { - auto src = outputs.subspan((seq_len - 1) * output_step_length + i * hidden_size_, hidden_size_); - auto dest = final_hidden_state.subspan(i * hidden_size_, hidden_size_); - gsl::copy(src, dest); + if (seq_len == 0) { + auto final_hidden_state_dst = final_hidden_state.begin() + i * hidden_size_; + std::fill_n(&*final_hidden_state_dst, hidden_size_, T{}); + } else { + auto src = outputs.subspan((seq_len - 1) * output_step_length + i * hidden_size_, hidden_size_); + auto dest = final_hidden_state.subspan(i * hidden_size_, hidden_size_); + gsl::copy(src, dest); + } } } - // zero any values beyond the evaluated steps + // zero any values beyond the evaluated steps if the maximum explicit sequence length we saw (max_sequence_length) + // was shorter than the maximum possible sequence length (seq_length_) if (output_sequence && max_sequence_length < seq_length_) { if (output_step_length == batch_size_ * hidden_size_) { // contiguous const auto span_to_zero = outputs.subspan( max_sequence_length * output_step_length, (seq_length_ - max_sequence_length) * output_step_length); - std::fill_n(span_to_zero.begin(), span_to_zero.size(), T{}); + std::fill_n(&*span_to_zero.begin(), span_to_zero.size(), T{}); } else { for (int i = max_sequence_length; i < seq_length_; ++i) { // non-contiguous const auto span_to_zero = outputs.subspan(i * output_step_length, batch_size_ * hidden_size_); - std::fill_n(span_to_zero.begin(), span_to_zero.size(), T{}); + std::fill_n(&*span_to_zero.begin(), span_to_zero.size(), T{}); } } } diff --git a/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.cc b/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.cc index b8b569add4548..8f4e8236981f8 100644 --- a/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.cc +++ b/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.cc @@ -418,9 +418,6 @@ Status DeepCpuLstmOp::ComputeImpl(OpKernelContext& context) const { gsl::span last_cell_1 = last_cell.subspan(0, last_cell_size_per_direction); - std::unique_ptr> fw; - std::unique_ptr> bw; - if (direction_ == Direction::kBidirectional) { // spans for second direction gsl::span input_weights_2 = input_weights.subspan(input_weights_size_per_direction, @@ -449,37 +446,37 
@@ Status DeepCpuLstmOp::ComputeImpl(OpKernelContext& context) const { gsl::span last_cell_2 = last_cell.subspan(last_cell_size_per_direction, last_cell_size_per_direction); - fw = std::make_unique>(alloc, logger, - seq_length, batch_size, input_size, - hidden_size_, Direction::kForward, input_forget_, - bias_1, peephole_weights_1, initial_hidden_1, initial_cell_1, - activation_funcs_.Entries()[0], - activation_funcs_.Entries()[1], - activation_funcs_.Entries()[2], - clip_, ttp_); - - bw = std::make_unique>(alloc, logger, - seq_length, batch_size, input_size, - hidden_size_, Direction::kReverse, input_forget_, - bias_2, peephole_weights_2, initial_hidden_2, initial_cell_2, - activation_funcs_.Entries()[3], - activation_funcs_.Entries()[4], - activation_funcs_.Entries()[5], - clip_, ttp_); - - fw->Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1, output_1, hidden_output_1, last_cell_1); - bw->Compute(input, sequence_lens_span, num_directions_, input_weights_2, hidden_weights_2, output_2, hidden_output_2, last_cell_2); + detail::UniDirectionalLstm fw(alloc, logger, seq_length, batch_size, input_size, + hidden_size_, Direction::kForward, input_forget_, + bias_1, peephole_weights_1, initial_hidden_1, initial_cell_1, + activation_funcs_.Entries()[0], + activation_funcs_.Entries()[1], + activation_funcs_.Entries()[2], + clip_, ttp_); + + detail::UniDirectionalLstm bw(alloc, logger, seq_length, batch_size, input_size, + hidden_size_, Direction::kReverse, input_forget_, + bias_2, peephole_weights_2, initial_hidden_2, initial_cell_2, + activation_funcs_.Entries()[3], + activation_funcs_.Entries()[4], + activation_funcs_.Entries()[5], + clip_, ttp_); + + fw.Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1, + output_1, hidden_output_1, last_cell_1); + bw.Compute(input, sequence_lens_span, num_directions_, input_weights_2, hidden_weights_2, + output_2, hidden_output_2, last_cell_2); } else { - fw = std::make_unique>(alloc, logger, - seq_length, batch_size, input_size, - hidden_size_, direction_, input_forget_, - bias_1, peephole_weights_1, initial_hidden_1, initial_cell_1, - activation_funcs_.Entries()[0], - activation_funcs_.Entries()[1], - activation_funcs_.Entries()[2], - clip_, ttp_); - - fw->Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1, output_1, hidden_output_1, last_cell_1); + detail::UniDirectionalLstm fw(alloc, logger, seq_length, batch_size, input_size, + hidden_size_, direction_, input_forget_, + bias_1, peephole_weights_1, initial_hidden_1, initial_cell_1, + activation_funcs_.Entries()[0], + activation_funcs_.Entries()[1], + activation_funcs_.Entries()[2], + clip_, ttp_); + + fw.Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1, + output_1, hidden_output_1, last_cell_1); } if (!output.empty()) diff --git a/onnxruntime/core/providers/cpu/symbols.txt b/onnxruntime/core/providers/cpu/symbols.txt index 265b10260b342..fc7560f5b7696 100644 --- a/onnxruntime/core/providers/cpu/symbols.txt +++ b/onnxruntime/core/providers/cpu/symbols.txt @@ -19,6 +19,7 @@ OrtCreateRunOptions OrtCreateSession OrtCreateSessionFromArray OrtCreateSessionOptions +OrtCreateStatus OrtCreateTensorAsOrtValue OrtCreateTensorTypeAndShapeInfo OrtCreateTensorWithDataAsOrtValue @@ -50,7 +51,7 @@ OrtGetValueCount OrtGetValueType OrtGetVersionString OrtIsTensor -OrtOnnxTypeFromTypeInfo +OrtGetOnnxTypeFromTypeInfo OrtReleaseAllocator OrtReleaseAllocatorInfo 
OrtReleaseCustomOpDomain @@ -68,7 +69,8 @@ OrtRunOptionsGetRunLogVerbosityLevel OrtRunOptionsGetRunTag OrtRunOptionsSetRunLogVerbosityLevel OrtRunOptionsSetRunTag -OrtRunOptionsSetTerminate +OrtRunOptionsEnableTerminate +OrtRunOptionsDisableTerminate OrtSessionGetInputCount OrtSessionGetInputName OrtSessionGetInputTypeInfo diff --git a/onnxruntime/core/providers/cpu/tensor/concat.cc b/onnxruntime/core/providers/cpu/tensor/concat.cc index b970270509dde..afca4d421efe8 100644 --- a/onnxruntime/core/providers/cpu/tensor/concat.cc +++ b/onnxruntime/core/providers/cpu/tensor/concat.cc @@ -21,7 +21,7 @@ Status ConcatBase::PrepareForCompute(OpKernelContext* ctx, int input_count, Prep const size_t inputs_0_rank = inputs_0_dims.size(); ORT_RETURN_IF_NOT(inputs_0_rank > 0, "Cannot concatenate scalars"); - uint64_t axis = static_cast(HandleNegativeAxis(axis_, inputs_0.Shape().NumDimensions())); + p.axis = static_cast(HandleNegativeAxis(axis_, inputs_0.Shape().NumDimensions())); // cache num of elements in tensor for later use // as it's expensive to call Size() on TensorShape over and over @@ -39,7 +39,7 @@ Status ConcatBase::PrepareForCompute(OpKernelContext* ctx, int input_count, Prep // Ensure all the other (non-concat) axes match for (size_t axis_index = 0; axis_index < inputs_0_rank; ++axis_index) { num_elements *= inputs_n_dims[axis_index]; - if (axis_index == axis) + if (axis_index == p.axis) continue; ORT_RETURN_IF_NOT(inputs_n_dims[axis_index] == inputs_0_dims[axis_index], "Non concat axis dimensions must match: Axis ", @@ -53,7 +53,7 @@ Status ConcatBase::PrepareForCompute(OpKernelContext* ctx, int input_count, Prep size_t concat_axis_size = 0; for (int index = 0; index < input_count; index++) { tensor_pointer = ctx->Input(index); - concat_axis_size += tensor_pointer->Shape()[int(axis)]; + concat_axis_size += tensor_pointer->Shape()[int(p.axis)]; } // Calculate the shape of the output tensor @@ -64,7 +64,7 @@ Status ConcatBase::PrepareForCompute(OpKernelContext* ctx, int input_count, Prep num_elements *= inputs_0_dims[dimension_index]; } tensor_num_elements[0] = num_elements; - dims[axis] = concat_axis_size; + dims[p.axis] = concat_axis_size; TensorShape output_shape(dims); auto& concat_result = *ctx->Output(0, output_shape); @@ -78,7 +78,7 @@ Status ConcatBase::PrepareForCompute(OpKernelContext* ctx, int input_count, Prep // The output_axis_pitch is the number of elements to add to move to the next split axis in the output p.output_axis_pitch = 1; - for (size_t i = inputs_0_rank; i-- > axis;) p.output_axis_pitch *= dims[i]; + for (size_t i = inputs_0_rank; i-- > p.axis;) p.output_axis_pitch *= dims[i]; p.inputs.reserve(input_count); for (int input_index = 0; input_index < input_count; input_index++) { @@ -90,7 +90,7 @@ Status ConcatBase::PrepareForCompute(OpKernelContext* ctx, int input_count, Prep // The input_axis_pitch is the number of elements to add to move to the next split axis in the input int64_t input_axis_pitch = 1; const auto& data_dims = data_n.Shape().GetDims(); - for (size_t i = inputs_0_rank; i-- > axis;) input_axis_pitch *= data_dims[i]; + for (size_t i = inputs_0_rank; i-- > p.axis;) input_axis_pitch *= data_dims[i]; p.inputs.push_back({&data_n, tensor_num_elements[input_index], input_axis_pitch}); } diff --git a/onnxruntime/core/providers/cpu/tensor/concat.h b/onnxruntime/core/providers/cpu/tensor/concat.h index f7df2e3d169de..f5267ea3d51f5 100644 --- a/onnxruntime/core/providers/cpu/tensor/concat.h +++ b/onnxruntime/core/providers/cpu/tensor/concat.h @@ -28,6 +28,7 @@ 
class ConcatBase { int64_t output_num_elements; int64_t output_axis_pitch; Tensor* output_tensor; + uint64_t axis; }; Status PrepareForCompute(OpKernelContext* ctx, int input_count, Prepare& p) const; diff --git a/onnxruntime/core/providers/cpu/tensor/identity_op.h b/onnxruntime/core/providers/cpu/tensor/identity_op.h index 5a583e48f1679..7cea426ff9a5d 100644 --- a/onnxruntime/core/providers/cpu/tensor/identity_op.h +++ b/onnxruntime/core/providers/cpu/tensor/identity_op.h @@ -43,7 +43,18 @@ class IdentityOp final : public OpKernel { } if (is_dropout) { - context->Output(1, std::vector()); + Tensor* mask = context->Output(1, shape); + // a 'nullptr' returned would make it an unused optional output + if (mask != nullptr) { + // Opset 7 differs with Opset 10 in that the type of the 'mask' + // output is tied with the type of the input in Opset 7 whereas + // the type of 'mask' in Opset 10 is 'bool' always + // so we have a common solution + void* mask_data = mask->MutableDataRaw(); + // In 'test'/'inference' mode, there are no input values dropped out + // so fill the buffer with 0/false + memset(mask_data, 0, mask->SizeInBytes()); + } } return Status::OK(); diff --git a/onnxruntime/core/providers/cpu/tensor/onehot.cc b/onnxruntime/core/providers/cpu/tensor/onehot.cc index e35e1b6b72729..1dfbaaf37640f 100644 --- a/onnxruntime/core/providers/cpu/tensor/onehot.cc +++ b/onnxruntime/core/providers/cpu/tensor/onehot.cc @@ -45,6 +45,7 @@ REG_ONE_HOT_OP(int64_t, int64_t, int64_t); REG_ONE_HOT_OP(float, int64_t, int64_t); REG_ONE_HOT_OP(int64_t, string, int64_t); REG_ONE_HOT_OP(float, string, int64_t); +REG_ONE_HOT_OP(int64_t, float, int64_t); REG_ONE_HOT_OP(float, float, float); // added this to satisfy onnx model tests REG_ONE_HOT_OP(int64_t, int32_t, float); // added this to satisfy onnx model tests @@ -120,16 +121,28 @@ Status OneHotOp::Compute(OpKernelContext* p_op_ke const auto& indices_dims = indices_shape.GetDims(); const auto indices_num_dims = indices_shape.NumDimensions(); std::vector output_shape(indices_shape.GetDims()); - output_shape.insert(axis_ == -1 ? output_shape.end() : output_shape.begin() + axis_, - depth_val); + + // output rank is always 1 more than the input rank as a new dimension is added to the input shape + const auto output_rank = static_cast(indices_num_dims + 1); + if (axis_ >= output_rank || axis_ < -output_rank) { + std::ostringstream oss; + oss << "'axis' attribute must have a value in the range [" << -output_rank + << "," << indices_num_dims << "]"; + return Status(ONNXRUNTIME, INVALID_ARGUMENT, oss.str()); + } + + auto true_axis = axis_; + if (true_axis < 0) + true_axis += output_rank; + + output_shape.insert(output_shape.begin() + true_axis, depth_val); // allocate output const auto* values_data = values->Data(); Tensor* output = p_op_kernel_context->Output(0, TensorShape(output_shape)); - const int64_t axis = (axis_ == -1) ? 
indices_num_dims : axis_; int64_t prefix_dim_size = 1; - for (int64_t i = 0; i < axis; ++i) { + for (int64_t i = 0; i < true_axis; ++i) { prefix_dim_size *= indices_dims[i]; } const int64_t suffix_dim_size = indices_shape.Size() / prefix_dim_size; diff --git a/onnxruntime/core/providers/cpu/tensor/onehot.h b/onnxruntime/core/providers/cpu/tensor/onehot.h index a05731b777929..495342d99b3e7 100644 --- a/onnxruntime/core/providers/cpu/tensor/onehot.h +++ b/onnxruntime/core/providers/cpu/tensor/onehot.h @@ -14,9 +14,6 @@ class OneHotOp final : public OpKernel { explicit OneHotOp(const OpKernelInfo& op_kernel_info) : OpKernel(op_kernel_info) { int64_t tmp_axis; if (op_kernel_info.GetAttr("axis", &tmp_axis).IsOK()) { - if (tmp_axis < -1) { // as per spec it can be -1 or more - ORT_THROW("Value of axis is < -1"); - } axis_ = tmp_axis; } } diff --git a/onnxruntime/core/providers/cpu/tensor/scatter.cc b/onnxruntime/core/providers/cpu/tensor/scatter.cc index 0cdacbef2eba5..403ef62ec3663 100644 --- a/onnxruntime/core/providers/cpu/tensor/scatter.cc +++ b/onnxruntime/core/providers/cpu/tensor/scatter.cc @@ -45,7 +45,7 @@ Status CopyScatterData(const Tensor* data_input, const Tensor* indices_input, co } const auto input_elements = input_data_shape.Size(); - const auto total_input_bytes = data_input->Size(); + const auto total_input_bytes = data_input->SizeInBytes(); const auto* src_base = static_cast(data_input->DataRaw()); auto* dst_base = static_cast(data_output->MutableDataRaw()); diff --git a/onnxruntime/core/providers/cpu/tensor/size.cc b/onnxruntime/core/providers/cpu/tensor/size.cc index a649560c105f8..675c14b8cfee6 100644 --- a/onnxruntime/core/providers/cpu/tensor/size.cc +++ b/onnxruntime/core/providers/cpu/tensor/size.cc @@ -12,7 +12,7 @@ Status Size::Compute(OpKernelContext* ctx) const { TensorShape scalar_shape; Tensor* p_output_tensor = ctx->Output(0, scalar_shape); auto* p_output_scalar = p_output_tensor->template MutableData(); - assert(p_output_tensor->Size() == sizeof(int64_t)); + assert(p_output_tensor->SizeInBytes() == sizeof(int64_t)); *p_output_scalar = input_tensor->Shape().Size(); diff --git a/onnxruntime/core/providers/cpu/tensor/squeeze.h b/onnxruntime/core/providers/cpu/tensor/squeeze.h index 2387832018ae4..f6489e5cf2e03 100644 --- a/onnxruntime/core/providers/cpu/tensor/squeeze.h +++ b/onnxruntime/core/providers/cpu/tensor/squeeze.h @@ -13,13 +13,15 @@ class SqueezeBase { protected: explicit SqueezeBase(const OpKernelInfo& info) { std::vector axes; - Status status = info.GetAttrs("axes", axes); - ORT_ENFORCE(status.IsOK(), "Attribute axes is not set."); + // Parse attribute 'axes' + Status status = info.GetAttrs("axes", axes); - // Handle out of order and repeating dims. - std::sort(axes.begin(), axes.end()); - axes.erase(std::unique(axes.begin(), axes.end()), axes.end()); - axes_ = axes; + // Handle out of order and repeating dims when 'axes' exists. + if (status.IsOK()) { + std::sort(axes.begin(), axes.end()); + axes.erase(std::unique(axes.begin(), axes.end()), axes.end()); + axes_ = axes; + } } static std::vector ComputeOutputShape( @@ -28,7 +30,8 @@ class SqueezeBase { size_t j = 0; std::vector output_shape; for (size_t i = 0; i < input_shape.NumDimensions(); ++i) { - if (j < axes.NumDimensions() && axes[j] == static_cast(i)) { + if ((j < axes.NumDimensions() && axes[j] == static_cast(i)) || + (axes.NumDimensions() == 0 && input_shape[i] == 1)) { ORT_ENFORCE(input_shape[i] == 1, "Dimension of input ", i, " must be 1 instead of ", input_shape[i], ". 
shape=", input_shape); ++j; @@ -59,4 +62,4 @@ class Squeeze final : public OpKernel, public SqueezeBase { } }; -} // namespace onnxruntime +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/providers/cpu/tensor/upsample.cc b/onnxruntime/core/providers/cpu/tensor/upsample.cc index 6cbacc9b4dc48..95605dbef4a68 100644 --- a/onnxruntime/core/providers/cpu/tensor/upsample.cc +++ b/onnxruntime/core/providers/cpu/tensor/upsample.cc @@ -66,30 +66,121 @@ Status UpsampleNearest(const T* input, return Status(ONNXRUNTIME, FAIL, "Upsample: input/output value is nullptr"); if (input_shape.NumDimensions() != output_shape.NumDimensions()) return Status(ONNXRUNTIME, FAIL, "Upsample: input/output value's dimension mismatch"); - auto n_dim = input_shape.NumDimensions(); - if (scales.size() == 4 && scales[0] == 1 && scales[1] == 1 && scales[2] == 2 && scales[3] == 2) { - UpsampleNearest2x(input_shape[0], input_shape[1], input_shape[2], input_shape[3], input, output); - } else { - for (size_t i = 0, size = output_shape.Size(); i < size; i++) { - size_t old_idx = 0; - size_t cur_idx = i; + if (input_shape.NumDimensions() == 0) { + return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, + "Upsample: input shape needs to be at least a single dimension."); + } - int64_t base = 1; - for (auto j = static_cast(n_dim - 1); j >= 0; j--) { - auto tmp = cur_idx % output_shape[j]; + int64_t n_dim = static_cast(input_shape.NumDimensions()); + + std::vector input_dim_counters(n_dim); + std::vector input_dim_factor(n_dim); + input_dim_factor[n_dim - 1] = 1; // initialize dimension factor + for (int64_t dim_idx = n_dim - 2; dim_idx >= 0; dim_idx--) { + input_dim_factor[dim_idx] = input_dim_factor[dim_idx + 1] * input_shape[dim_idx + 1]; + } + + int64_t output_idx = 0; + int64_t input_idx = 0; - if (scales[j] < 1) { //downsample - old_idx += (std::min(static_cast(std::ceil(tmp / scales[j])), input_shape[j] - 1)) * base; - } else { //upsample - old_idx += (std::min(static_cast(tmp / scales[j]), input_shape[j] - 1)) * base; +#define OneDemensionProcessor(dim_inx) \ + int64_t input_dim##dim_inx##_inx = \ + static_cast(scales[dim_inx] < 1 ? 
std::ceil(output_dim##dim_inx##_inx / scales[dim_inx]) : output_dim##dim_inx##_inx / scales[dim_inx]); \ + if (input_dim##dim_inx##_inx > input_shape[dim_inx] - 1) input_dim##dim_inx##_inx = input_shape[dim_inx] - 1; \ + if (input_dim##dim_inx##_inx != input_dim_counters[dim_inx]) { \ + input_idx += (input_dim##dim_inx##_inx - input_dim_counters[dim_inx]) * input_dim_factor[dim_inx]; \ + input_dim_counters[dim_inx] = input_dim##dim_inx##_inx; \ + } + + if (n_dim == 1) { + for (int64_t output_dim0_inx = 0; output_dim0_inx < output_shape[0]; output_dim0_inx++) { + OneDemensionProcessor(0); + output[output_idx++] = input[input_idx]; + } + return Status::OK(); + } + + if (n_dim == 2) { + for (int64_t output_dim0_inx = 0; output_dim0_inx < output_shape[0]; output_dim0_inx++) { + OneDemensionProcessor(0); + for (int64_t output_dim1_inx = 0; output_dim1_inx < output_shape[1]; output_dim1_inx++) { + OneDemensionProcessor(1); + output[output_idx++] = input[input_idx]; + } + } + return Status::OK(); + } + + if (n_dim == 3) { + for (int64_t output_dim0_inx = 0; output_dim0_inx < output_shape[0]; output_dim0_inx++) { + OneDemensionProcessor(0); + for (int64_t output_dim1_inx = 0; output_dim1_inx < output_shape[1]; output_dim1_inx++) { + OneDemensionProcessor(1); + for (int64_t output_dim2_inx = 0; output_dim2_inx < output_shape[2]; output_dim2_inx++) { + OneDemensionProcessor(2); + output[output_idx++] = input[input_idx]; + } + } + } + return Status::OK(); + } + + if (n_dim == 4) { + if (scales[0] == 1 && scales[1] == 1 && scales[2] == 2 && scales[3] == 2) { + UpsampleNearest2x(input_shape[0], input_shape[1], input_shape[2], input_shape[3], input, output); + return Status::OK(); + } + for (int64_t output_dim0_inx = 0; output_dim0_inx < output_shape[0]; output_dim0_inx++) { + OneDemensionProcessor(0); + for (int64_t output_dim1_inx = 0; output_dim1_inx < output_shape[1]; output_dim1_inx++) { + OneDemensionProcessor(1); + for (int64_t output_dim2_inx = 0; output_dim2_inx < output_shape[2]; output_dim2_inx++) { + OneDemensionProcessor(2); + for (int64_t output_dim3_inx = 0; output_dim3_inx < output_shape[3]; output_dim3_inx++) { + OneDemensionProcessor(3); + output[output_idx++] = input[input_idx]; + } } - base *= input_shape[j]; - cur_idx /= output_shape[j]; } + } + return Status::OK(); + } + +#undef OneDemensionProcessor - output[i] = input[old_idx]; + std::vector output_dim_counter(n_dim); + output_dim_counter[n_dim - 1] = -1; // initialize dimension counter + + for (; output_idx < output_shape.Size(); output_idx++) { + for (int64_t dim_idx = n_dim - 1; dim_idx >= 0; dim_idx--) { + if (++output_dim_counter[dim_idx] < output_shape[dim_idx]) { + int64_t current_input_dim_counter = 0; + if (scales[dim_idx] < 1) //downsample + { + current_input_dim_counter = static_cast(std::ceil(output_dim_counter[dim_idx] / scales[dim_idx])); + } else //upsample + { + current_input_dim_counter = static_cast(output_dim_counter[dim_idx] / scales[dim_idx]); + } + + if (current_input_dim_counter >= input_shape[dim_idx] - 1) + current_input_dim_counter = input_shape[dim_idx] - 1; + + if (current_input_dim_counter != input_dim_counters[dim_idx]) { + input_idx += (current_input_dim_counter - input_dim_counters[dim_idx]) * input_dim_factor[dim_idx]; + input_dim_counters[dim_idx] = current_input_dim_counter; + } + break; + } else { + output_dim_counter[dim_idx] = 0; + input_idx += (0 - input_dim_counters[dim_idx]) * input_dim_factor[dim_idx]; + input_dim_counters[dim_idx] = 0; + } } + + output[output_idx] = input[input_idx]; } + 
return Status::OK(); } @@ -165,15 +256,15 @@ void upsampleBilinear( auto output_width = static_cast(input_width * width_scale); auto output_height = static_cast(input_height * height_scale); - size_t inx_buffer_size = 2 * sizeof(int64_t) * (output_height + output_width); + size_t idx_buffer_size = 2 * sizeof(int64_t) * (output_height + output_width); size_t scale_buffer_size = 2 * sizeof(float_t) * (output_height + output_width); - auto inx_scale_data_buffer = alloc->Alloc(inx_buffer_size + scale_buffer_size); - BufferUniquePtr inx_scale_data_buffer_holder(inx_scale_data_buffer, BufferDeleter(alloc)); - auto* inx_data = static_cast(inx_scale_data_buffer_holder.get()); - int64_t* input_width_mul_y1 = inx_data; - int64_t* input_width_mul_y2 = inx_data + output_height; - int64_t* in_x1 = inx_data + 2 * output_height; - int64_t* in_x2 = inx_data + 2 * output_height + output_width; + auto inx_scale_data_buffer = alloc->Alloc(idx_buffer_size + scale_buffer_size); + BufferUniquePtr idx_scale_data_buffer_holder(inx_scale_data_buffer, BufferDeleter(alloc)); + auto* idx_data = static_cast(idx_scale_data_buffer_holder.get()); + int64_t* input_width_mul_y1 = idx_data; + int64_t* input_width_mul_y2 = idx_data + output_height; + int64_t* in_x1 = idx_data + 2 * output_height; + int64_t* in_x2 = idx_data + 2 * output_height + output_width; auto* scale_data = reinterpret_cast(in_x2 + output_width); float* dy1 = scale_data; @@ -240,12 +331,21 @@ Status Upsample::BaseCompute(OpKernelContext* context, const std::vector Y_dims; + Y_dims.reserve( dims.size() ); for (std::size_t i = 0; i < dims.size(); i++) { - Y_dims.push_back(static_cast(scales[i] * dims[i])); + int64_t dim_y = static_cast(scales[i] * dims[i]); + if (no_scale && dim_y != dims[i]) no_scale = false; + Y_dims.push_back(dim_y); } Tensor* Y = context->Output(0, Y_dims); + if (no_scale) { + memcpy(Y->MutableDataRaw(), X->DataRaw(), Y->SizeInBytes()); + return Status::OK(); + } + switch (mode_) { case UpsampleMode::NN: return UpsampleNearest(X->template Data(), Y->template MutableData(), X->Shape(), Y->Shape(), scales); diff --git a/onnxruntime/core/providers/cuda/cuda_allocator.cc b/onnxruntime/core/providers/cuda/cuda_allocator.cc index b94d9cba9fa15..44cbbd75d0fc2 100644 --- a/onnxruntime/core/providers/cuda/cuda_allocator.cc +++ b/onnxruntime/core/providers/cuda/cuda_allocator.cc @@ -1,17 +1,19 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
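The allocator and fence changes that follow drop the provider-owned CUDA copy streams in favor of a GPUDataTransfer object resolved through the session's data-transfer manager, keyed by the (source, destination) OrtDevice pair. A minimal standalone sketch of that lookup pattern, using hypothetical simplified types as stand-ins for ORT's IDataTransfer interfaces (not the actual API):
```
// Illustrative sketch only: hypothetical, simplified stand-ins for the
// IDataTransfer / data-transfer-manager lookup used by the fences below.
#include <memory>
#include <vector>

enum class DeviceType { CPU, GPU };

struct Device {
  DeviceType type;
  bool cuda_pinned = false;  // CPU memory allocated as CUDA pinned host memory
};

// Each transfer object declares which (src, dst) device pairs it can service.
struct DataTransfer {
  virtual ~DataTransfer() = default;
  virtual bool CanCopy(const Device& src, const Device& dst) const = 0;
};

struct GpuDataTransfer : DataTransfer {
  bool CanCopy(const Device& src, const Device& dst) const override {
    return src.type == DeviceType::GPU || src.cuda_pinned ||
           dst.type == DeviceType::GPU || dst.cuda_pinned;
  }
};

// The registry returns the first registered transfer that accepts the pair,
// which is how a fence can recover the GPU copy streams without holding a
// pointer to the execution provider.
class DataTransferRegistry {
 public:
  void Register(std::unique_ptr<DataTransfer> t) { transfers_.push_back(std::move(t)); }
  const DataTransfer* Find(const Device& src, const Device& dst) const {
    for (const auto& t : transfers_)
      if (t->CanCopy(src, dst)) return t.get();
    return nullptr;
  }

 private:
  std::vector<std::unique_ptr<DataTransfer>> transfers_;
};
```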
-#include "cuda_common.h" #include "cuda_allocator.h" +#include "cuda_common.h" #include "core/framework/allocatormgr.h" #include "core/framework/session_state.h" #include "cuda_fence.h" +#include "gpu_data_transfer.h" namespace onnxruntime { -static const CUDAExecutionProvider* GetCUDAExecutionProvider(const SessionState* session_state) { - return dynamic_cast( - session_state->GetExecutionProviders().Get(onnxruntime::kCudaExecutionProvider)); +static const GPUDataTransfer* GetGPUDataTransfer(const SessionState* session_state) { + OrtDevice gpu_device(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0); + OrtDevice cpu_device; + return dynamic_cast(session_state->GetDataTransferMgr().GetDataTransfer(gpu_device, cpu_device)); } void CUDAAllocator::CheckDevice() const { @@ -43,7 +45,7 @@ const OrtAllocatorInfo& CUDAAllocator::Info() const { } FencePtr CUDAAllocator::CreateFence(const SessionState* session_state) { - return std::make_shared(GetCUDAExecutionProvider(session_state)); + return std::make_shared(GetGPUDataTransfer(session_state)); } void* CUDAPinnedAllocator::Alloc(size_t size) { @@ -59,12 +61,12 @@ void CUDAPinnedAllocator::Free(void* p) { } const OrtAllocatorInfo& CUDAPinnedAllocator::Info() const { - static constexpr OrtAllocatorInfo cuda_allocator_info(CUDA_PINNED, OrtDeviceAllocator, 0, OrtMemTypeCPUOutput); + static constexpr OrtAllocatorInfo cuda_allocator_info(CUDA_PINNED, OrtDeviceAllocator, OrtDevice(OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, 0), 0, OrtMemTypeCPUOutput); return cuda_allocator_info; } FencePtr CUDAPinnedAllocator::CreateFence(const SessionState* session_state) { - return std::make_shared(GetCUDAExecutionProvider(session_state)); + return std::make_shared(GetGPUDataTransfer(session_state)); } } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/cuda_allocator.h b/onnxruntime/core/providers/cuda/cuda_allocator.h index 951b27bccb8ef..06f6caa784c0e 100644 --- a/onnxruntime/core/providers/cuda/cuda_allocator.h +++ b/onnxruntime/core/providers/cuda/cuda_allocator.h @@ -6,12 +6,10 @@ #include "core/framework/allocator.h" namespace onnxruntime { -constexpr const char* CUDA = "Cuda"; -constexpr const char* CUDA_PINNED = "CudaPinned"; class CUDAAllocator : public IDeviceAllocator { public: - CUDAAllocator(int device_id) : info_(CUDA, OrtAllocatorType::OrtDeviceAllocator, device_id, OrtMemTypeDefault) {} + CUDAAllocator(int device_id) : info_(CUDA, OrtAllocatorType::OrtDeviceAllocator, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, device_id), device_id, OrtMemTypeDefault) {} virtual void* Alloc(size_t size) override; virtual void Free(void* p) override; virtual const OrtAllocatorInfo& Info() const override; diff --git a/onnxruntime/core/providers/cuda/cuda_common.h b/onnxruntime/core/providers/cuda/cuda_common.h index 9ce0ffbfb9535..ce271297e4286 100644 --- a/onnxruntime/core/providers/cuda/cuda_common.h +++ b/onnxruntime/core/providers/cuda/cuda_common.h @@ -4,6 +4,7 @@ #pragma once #include "cuda_pch.h" #include "core/common/status.h" +#include "core/framework/data_transfer_manager.h" #include "core/framework/op_kernel.h" #include "core/graph/graph_viewer.h" #include "shared_inc/cuda_call.h" @@ -137,7 +138,7 @@ class CudaKernel : public OpKernel { } inline Status CopyTensor(const Tensor& src, Tensor& dst) const { - return provider_->CopyTensor(src, dst); + return Info().GetDataTransferManager().CopyTensor(src, dst); } inline int GetDeviceId() const { return provider_->GetDeviceId(); } diff --git 
a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index a2854e3b6365f..6509cf01fdf9a 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -64,10 +64,6 @@ CUDAExecutionProvider::PerThreadContext::~PerThreadContext() { CUDAExecutionProvider::CUDAExecutionProvider(const CUDAExecutionProviderInfo& info) : IExecutionProvider{onnxruntime::kCudaExecutionProvider}, device_id_(info.device_id) { CUDA_CALL_THROW(cudaSetDevice(device_id_)); - // create streams, default is nullptr - streams_[kCudaStreamDefault] = nullptr; - CUDA_CALL_THROW(cudaStreamCreateWithFlags(&streams_[kCudaStreamCopyIn], cudaStreamNonBlocking)); - CUDA_CALL_THROW(cudaStreamCreateWithFlags(&streams_[kCudaStreamCopyOut], cudaStreamNonBlocking)); DeviceAllocatorRegistrationInfo default_allocator_info( {OrtMemTypeDefault, [](int id) { return std::make_unique(id); }, std::numeric_limits::max()}); @@ -93,9 +89,6 @@ CUDAExecutionProvider::~CUDAExecutionProvider() { CUDA_CALL_THROW(cudaEventDestroy(e)); it = deferred_release_cpu_ptr_.erase(it); } - CUDA_CALL_THROW(cudaStreamDestroy(streams_[kCudaStreamCopyIn])); - CUDA_CALL_THROW(cudaStreamDestroy(streams_[kCudaStreamCopyOut])); - ReleasePerThreadStuffs(); } @@ -199,52 +192,6 @@ Status CUDAExecutionProvider::OnRunEnd() { return Status::OK(); } -Status CUDAExecutionProvider::CopyTensor(const Tensor& src, Tensor& dst) const { - return CopyTensor(src, dst, kCudaStreamDefault); -} - -Status CUDAExecutionProvider::CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const { - if (src.Shape().Size() != dst.Shape().Size()) { - return Status(ONNXRUNTIME, FAIL, "Tensor size mismatch"); - } - - if (strcmp(src.Location().name, CUDA) != 0 && strcmp(src.Location().name, CUDA_PINNED) != 0 && - strcmp(dst.Location().name, CUDA) != 0 && strcmp(dst.Location().name, CUDA_PINNED) != 0) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unsupported tensor location: src_location is: ", src.Location().name, " and dst_location is: ", dst.Location().name); - } - - size_t bytes = src.Size(); - - const void* src_data = src.DataRaw(); - void* dst_data = dst.MutableDataRaw(); - - if (strcmp(dst.Location().name, CUDA) == 0) { - if (strcmp(src.Location().name, CUDA_PINNED) == 0) { - // copy from pinned memory to GPU, this is non-blocking - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyHostToDevice, streams_[exec_queue_id])); - } else if (strcmp(src.Location().name, CUDA) == 0) { - // copying between GPU, this is non-blocking - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToDevice, streams_[kCudaStreamDefault])); - } else { - // copy from other CPU memory to GPU, this is blocking - CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyHostToDevice)); - } - } else if (strcmp(src.Location().name, CUDA) == 0) { - if (strcmp(dst.Location().name, CUDA_PINNED) == 0) { - // copying from GPU to pinned memory, this is non-blocking - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToHost, streams_[exec_queue_id])); - } else { - // copying from GPU to CPU memory, this is blocking - CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyDeviceToHost)); - } - } else { - // copying between cpu memory - memcpy(dst_data, src_data, bytes); - } - - return Status::OK(); -} - namespace cuda { class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, 
kOnnxDomain, 1, MemcpyFromHost); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MemcpyToHost); @@ -253,7 +200,7 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, Un class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 8, Flatten); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, Squeeze); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, Identity); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, Dropout); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 9, Dropout); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, Gather); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 8, float, Gemm); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 8, double, Gemm); @@ -568,6 +515,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, float, Shrink); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, double, Shrink); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, MLFloat16, Shrink); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, Dropout); static void RegisterCudaKernels(KernelRegistry& kernel_registry) { static const BuildKernelCreateInfoFn function_table[] = { @@ -578,7 +526,7 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -893,6 +841,7 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, }; for (auto& function_table_entry : function_table) { @@ -1083,7 +1032,11 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, if (!force_inside && (not_supported || force_outside)) { defs_outside_cuda.insert(node.OutputDefs().cbegin(), node.OutputDefs().cend()); - LOGS_DEFAULT(WARNING) << "Fallback to CPU execution provider for Op type: " << node.OpType() << " node name: " << node.Name(); + if (not_supported) { + LOGS_DEFAULT(WARNING) << "CUDA kernel not supported. 
Fallback to CPU execution provider for Op type: " << node.OpType() << " node name: " << node.Name(); + } else if (force_outside) { + LOGS_DEFAULT(INFO) << "Force fallback to CPU execution provider for Op type: " << node.OpType() << " node name: " << node.Name(); + } } else { // for nodes placed on CUDA, check if its output is on CPU node.ForEachWithIndex( diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.h b/onnxruntime/core/providers/cuda/cuda_execution_provider.h index b42ffc8854938..bd6e25b18b7bb 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.h +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.h @@ -7,6 +7,7 @@ #include "core/graph/constants.h" #include "core/framework/allocatormgr.h" #include "core/framework/execution_provider.h" +#include "core/providers/cuda/gpu_data_transfer.h" #include "shared_inc/cuda_utils.h" #include @@ -17,13 +18,6 @@ struct CUDAExecutionProviderInfo { int device_id{0}; }; -enum CUDAStreamType : int { - kCudaStreamDefault = 0, - kCudaStreamCopyIn, - kCudaStreamCopyOut, - kTotalCudaStreams, -}; - // Logical device representation. class CUDAExecutionProvider : public IExecutionProvider { public: @@ -38,10 +32,6 @@ class CUDAExecutionProvider : public IExecutionProvider { Status OnRunEnd() override; - Status CopyTensor(const Tensor& src, Tensor& dst) const override; - - Status CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const override; - cublasHandle_t PerThreadCublasHandle() { return GetPerThreadContext().CublasHandle(); } @@ -50,11 +40,6 @@ class CUDAExecutionProvider : public IExecutionProvider { return GetPerThreadContext().CudnnHandle(); } - cudaStream_t GetStream(int queue_id) const { - ORT_ENFORCE(queue_id >= 0 && queue_id < kTotalCudaStreams); - return streams_[queue_id]; - } - template const T* GetConstOnes(size_t count) { return GetPerThreadContext().template GetConstOnes(count); @@ -79,7 +64,6 @@ class CUDAExecutionProvider : public IExecutionProvider { int GetDeviceId() const { return device_id_; } private: - cudaStream_t streams_[kTotalCudaStreams]; int device_id_; struct DeferredReleaseCPUPtrs { diff --git a/onnxruntime/core/providers/cuda/cuda_fence.cc b/onnxruntime/core/providers/cuda/cuda_fence.cc index ce368a7daa5c8..463a560ee8037 100644 --- a/onnxruntime/core/providers/cuda/cuda_fence.cc +++ b/onnxruntime/core/providers/cuda/cuda_fence.cc @@ -3,10 +3,11 @@ #include "cuda_common.h" #include "cuda_fence.h" +#include "gpu_data_transfer.h" namespace onnxruntime { -CUDAFence::CUDAFence(const CUDAExecutionProvider* provider) : provider_(provider) { +CUDAFence::CUDAFence(const GPUDataTransfer* data_transfer) : data_transfer_(data_transfer) { // NOTE: cudaEventBlockingSync may leads to longer wait time because of thread yield/switching in kernel // if lower CPU usage is more important than latency, we should use this flag to avoid spin-loop in WaitOnCPU int event_flags = /*cudaEventBlockingSync |*/ cudaEventDisableTiming; @@ -22,7 +23,7 @@ CUDAFence::~CUDAFence() { void CUDAFence::BeforeUsingAsInput(onnxruntime::ProviderType provider_type, int async_queue_id) { if (provider_type == onnxruntime::kCudaExecutionProvider) { // sync in GPU, the call is non-blocking on CPU - CUDA_CALL_THROW(cudaStreamWaitEvent(provider_->GetStream(async_queue_id), write_event_, 0)); + CUDA_CALL_THROW(cudaStreamWaitEvent(data_transfer_->GetStream(async_queue_id), write_event_, 0)); } else { // sync on CPU for all other providers, this is blocking CUDA_CALL_THROW(cudaEventSynchronize(write_event_)); @@ -32,7 
+33,7 @@ void CUDAFence::BeforeUsingAsInput(onnxruntime::ProviderType provider_type, int void CUDAFence::BeforeUsingAsOutput(onnxruntime::ProviderType provider_type, int queue_id) { if (provider_type == onnxruntime::kCudaExecutionProvider) { // sync in GPU, the call is non-blocking on CPU - cudaStream_t stream = provider_->GetStream(queue_id); + cudaStream_t stream = data_transfer_->GetStream(queue_id); CUDA_CALL_THROW(cudaStreamWaitEvent(stream, read_event_, 0)); CUDA_CALL_THROW(cudaStreamWaitEvent(stream, write_event_, 0)); } else { @@ -49,13 +50,13 @@ bool CUDAFence::CanRelease() { void CUDAFence::AfterUsedAsInput(int queue_id) { // update read fence - cudaStream_t stream = provider_->GetStream(queue_id); + cudaStream_t stream = data_transfer_->GetStream(queue_id); CUDA_CALL_THROW(cudaEventRecord(read_event_, stream)); } void CUDAFence::AfterUsedAsOutput(int queue_id) { // update write fence - cudaStream_t stream = provider_->GetStream(queue_id); + cudaStream_t stream = data_transfer_->GetStream(queue_id); CUDA_CALL_THROW(cudaEventRecord(write_event_, stream)); } diff --git a/onnxruntime/core/providers/cuda/cuda_fence.h b/onnxruntime/core/providers/cuda/cuda_fence.h index c52c55041f97e..e14883d27820a 100644 --- a/onnxruntime/core/providers/cuda/cuda_fence.h +++ b/onnxruntime/core/providers/cuda/cuda_fence.h @@ -3,12 +3,14 @@ #pragma once #include "core/framework/tensor.h" -#include "cuda_execution_provider.h" +#include "core/graph/basic_types.h" + namespace onnxruntime { +class GPUDataTransfer; class CUDAFence : public IFence { public: - CUDAFence(const CUDAExecutionProvider* provider); + CUDAFence(const GPUDataTransfer* data_transfer); virtual ~CUDAFence(); virtual void BeforeUsingAsInput(onnxruntime::ProviderType provider_type, int queue_id) override; virtual void BeforeUsingAsOutput(onnxruntime::ProviderType provider_type, int queue_id) override; @@ -19,7 +21,7 @@ class CUDAFence : public IFence { private: cudaEvent_t read_event_; cudaEvent_t write_event_; - const CUDAExecutionProvider* provider_; + const GPUDataTransfer* data_transfer_; }; } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/cudnn_common.cc b/onnxruntime/core/providers/cuda/cudnn_common.cc index 910f828c01776..1322b016befd7 100644 --- a/onnxruntime/core/providers/cuda/cudnn_common.cc +++ b/onnxruntime/core/providers/cuda/cudnn_common.cc @@ -83,7 +83,8 @@ Status CudnnDataTensor::Set(cudnnDataType_t dataType, const int32_t* seq_lengths) { ORT_RETURN_IF_ERROR(CreateTensorIfNeeded()); - cudnnRNNDataLayout_t layout = CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED; + // CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED works with CUDNN_RNN_PADDED_IO_ENABLED, so that it will auto fill 0 for the shorter sequences + cudnnRNNDataLayout_t layout = CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED; float padding_fill = 0.0f; CUDNN_RETURN_IF_ERROR(cudnnSetRNNDataDescriptor(tensor_, dataType, layout, static_cast(max_seq_length), diff --git a/onnxruntime/core/providers/cuda/cudnn_common.h b/onnxruntime/core/providers/cuda/cudnn_common.h index 158847e8d87af..02a3ba6b694bb 100644 --- a/onnxruntime/core/providers/cuda/cudnn_common.h +++ b/onnxruntime/core/providers/cuda/cudnn_common.h @@ -60,6 +60,53 @@ class CudnnFilterDescriptor final { cudnnFilterDescriptor_t desc_; }; +class CudnnDropout final { + public: + CudnnDropout() : dropout_desc_(nullptr) { + } + + Status GetCudnnDropoutStatesSize(const cudnnHandle_t& cudnnHandle, size_t& stateSize) { + CUDNN_RETURN_IF_ERROR(cudnnDropoutGetStatesSize(cudnnHandle, &stateSize)); + + return 
Status::OK(); + } + + Status Set(const cudnnHandle_t& cudnnHandle, + void* states, + size_t stateSize, + float dropout = 0.0f, + unsigned long long seed = 1) { + ORT_RETURN_IF_ERROR(CreateDescriptorIfNeeded()); + CUDNN_RETURN_IF_ERROR(cudnnSetDropoutDescriptor(dropout_desc_, + cudnnHandle, + dropout, + states, + stateSize, + seed)); + + return Status::OK(); + } + + ~CudnnDropout() { + if (dropout_desc_ != nullptr) { + cudnnDestroyDropoutDescriptor(dropout_desc_); + } + } + + operator cudnnDropoutDescriptor_t() const { + return dropout_desc_; + } + + private: + Status CreateDescriptorIfNeeded() { + if (!dropout_desc_) + CUDNN_RETURN_IF_ERROR(cudnnCreateDropoutDescriptor(&dropout_desc_)); + return Status::OK(); + } + + cudnnDropoutDescriptor_t dropout_desc_; +}; + template struct Consts { static const ElemType Zero; diff --git a/onnxruntime/core/providers/cuda/gpu_data_transfer.cc b/onnxruntime/core/providers/cuda/gpu_data_transfer.cc new file mode 100644 index 0000000000000..8fae7ae8b0d34 --- /dev/null +++ b/onnxruntime/core/providers/cuda/gpu_data_transfer.cc @@ -0,0 +1,60 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/cuda/gpu_data_transfer.h" +#include "cuda_common.h" + +namespace onnxruntime { +GPUDataTransfer::GPUDataTransfer() { + // create streams, default is nullptr + streams_[kCudaStreamDefault] = nullptr; + CUDA_CALL_THROW(cudaStreamCreateWithFlags(&streams_[kCudaStreamCopyIn], cudaStreamNonBlocking)); + CUDA_CALL_THROW(cudaStreamCreateWithFlags(&streams_[kCudaStreamCopyOut], cudaStreamNonBlocking)); +} + +GPUDataTransfer::~GPUDataTransfer() { + CUDA_CALL(cudaStreamDestroy(streams_[kCudaStreamCopyIn])); + CUDA_CALL(cudaStreamDestroy(streams_[kCudaStreamCopyOut])); +} + +bool GPUDataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const { + return src_device.Type() == OrtDevice::GPU || src_device.MemType() == OrtDevice::MemType::CUDA_PINNED + || dst_device.Type() == OrtDevice::GPU || dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED; +} + +common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const { + size_t bytes = src.SizeInBytes(); + const void* src_data = src.DataRaw(); + void* dst_data = dst.MutableDataRaw(); + + auto& src_device = src.Location().device; + auto& dst_device = dst.Location().device; + + if (dst_device.Type() == OrtDevice::GPU) { + if (src_device.Type() == OrtDevice::CPU && src_device.MemType() == OrtDevice::MemType::CUDA_PINNED) { + // copy from pinned memory to GPU, this is non-blocking + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyHostToDevice, streams_[exec_queue_id])); + } else if (src_device.Type() == OrtDevice::GPU) { + // copying between GPU, this is non-blocking + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToDevice, streams_[kCudaStreamDefault])); + } else { + // copy from other CPU memory to GPU, this is blocking + CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyHostToDevice)); + } + } else if (src_device.Type() == OrtDevice::GPU) { + if (dst_device.Type() == OrtDevice::CPU && dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED) { + // copying from GPU to pinned memory, this is non-blocking + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToHost, streams_[exec_queue_id])); + } else { + // copying from GPU to CPU memory, this is blocking + 
CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyDeviceToHost)); + } + } else { + // copying between cpu memory + memcpy(dst_data, src_data, bytes); + } + + return Status::OK(); +} + +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/gpu_data_transfer.h b/onnxruntime/core/providers/cuda/gpu_data_transfer.h new file mode 100644 index 0000000000000..0f3d4687eb5e5 --- /dev/null +++ b/onnxruntime/core/providers/cuda/gpu_data_transfer.h @@ -0,0 +1,36 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "cuda_pch.h" +#include "core/framework/data_transfer.h" + +namespace onnxruntime { + +enum CUDAStreamType : int { + kCudaStreamDefault = 0, + kCudaStreamCopyIn, + kCudaStreamCopyOut, + kTotalCudaStreams, +}; + +class GPUDataTransfer : public IDataTransfer { + public: + GPUDataTransfer(); + ~GPUDataTransfer(); + + bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override; + + common::Status CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const override; + + cudaStream_t GetStream(int queue_id) const { + ORT_ENFORCE(queue_id >= 0 && queue_id < kTotalCudaStreams); + return streams_[queue_id]; + } + + private: + cudaStream_t streams_[kTotalCudaStreams]; +}; + +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/nn/shrink.cc b/onnxruntime/core/providers/cuda/nn/shrink.cc index 90d0e86387057..09eb264b74790 100644 --- a/onnxruntime/core/providers/cuda/nn/shrink.cc +++ b/onnxruntime/core/providers/cuda/nn/shrink.cc @@ -17,6 +17,7 @@ namespace cuda { T, \ kCudaExecutionProvider, \ KernelDefBuilder() \ + .MayInplace(0, 0) \ .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ Shrink); diff --git a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc index 7645fb763fbe4..e45eb16dc5508 100644 --- a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc +++ b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc @@ -83,7 +83,9 @@ Status CudnnRnnBase::SetCudnnRnnDesc() { reverse_ = true; } - cudnn_dropout_desc_.Set(CudnnHandle()); + cudnn_dropout_desc_.GetCudnnDropoutStatesSize(CudnnHandle(), state_size_); + state_buffer_ = GetScratchBuffer(state_size_); + cudnn_dropout_desc_.Set(CudnnHandle(), state_buffer_.get(), state_size_); ORT_RETURN_IF_ERROR(rnn_desc_.Set(CudnnHandle(), hidden_size_, num_layers_, cudnn_dropout_desc_, cudnn_direction, rnn_mode_, CudnnTensor::GetDataType())); @@ -207,19 +209,20 @@ Status CudnnRnnBase::ComputeInternal(OpKernelContext* ctx) const { } IAllocatorUniquePtr x_reversed_data; - T* x_data = const_cast(X->template Data()); + const T* x_data = X->template Data(); if (reverse_) { // reverse input data x_reversed_data = GetScratchBuffer(seq_length * batch_size * input_size); ReverseBySequence(gsl::narrow_cast(seq_length), gsl::narrow_cast(batch_size), gsl::narrow_cast(input_size), - reinterpret_cast(x_data), + reinterpret_cast(x_data), reinterpret_cast(x_reversed_data.get()), seq_length * batch_size * input_size); - x_data = x_reversed_data.get(); } + const T* x_data_input = reverse_ ? x_reversed_data.get() : x_data; + const T* hx_data = (initial_h == nullptr) ? nullptr : initial_h->template Data(); const T* cx_data = (initial_c == nullptr) ? nullptr : initial_c->template Data(); T* y_h_data = (Y_h == nullptr) ? 
nullptr : Y_h->template MutableData(); @@ -233,11 +236,14 @@ Status CudnnRnnBase::ComputeInternal(OpKernelContext* ctx) const { y_alloc_data = GetScratchBuffer(output_size); y_data = y_alloc_data.get(); } + const int32_t* sequence_lens_data = (sequence_lens == nullptr) ? nullptr : sequence_lens->template Data(); + // CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED works with CUDNN_RNN_PADDED_IO_ENABLED, so that it will auto fill 0 for the shorter sequences + CUDNN_RETURN_IF_ERROR(cudnnSetRNNPaddingMode(rnn_desc_, CUDNN_RNN_PADDED_IO_ENABLED)); + size_t workspace_bytes; CUDNN_RETURN_IF_ERROR(cudnnGetRNNWorkspaceSize(CudnnHandle(), rnn_desc_, gsl::narrow_cast(seq_length), x_desc.data(), &workspace_bytes)); - workspace_bytes *= num_directions_; auto workspace_cuda = GetScratchBuffer(workspace_bytes); if (CUDNN_RNN_RELU == rnn_mode_ || CUDNN_RNN_TANH == rnn_mode_ || nullptr == sequence_lens_data) { @@ -245,7 +251,7 @@ Status CudnnRnnBase::ComputeInternal(OpKernelContext* ctx) const { rnn_desc_, gsl::narrow_cast(seq_length), x_desc.data(), - x_data, + x_data_input, hx_desc, hx_data, cx_desc, @@ -269,7 +275,7 @@ Status CudnnRnnBase::ComputeInternal(OpKernelContext* ctx) const { CUDNN_RETURN_IF_ERROR(cudnnRNNForwardInferenceEx(CudnnHandle(), rnn_desc_, x_desc, - x_data, + x_data_input, hx_desc, hx_data, cx_desc, @@ -286,6 +292,10 @@ Status CudnnRnnBase::ComputeInternal(OpKernelContext* ctx) const { nullptr, nullptr, nullptr, nullptr, workspace_cuda.get(), workspace_bytes)); + // Early terminate for this case since Y data is not required, and Y_h is obtained correctly, no need the following code to retrive Y_h from Y data. + if (nullptr == Y) { + return Status::OK(); + } } IAllocatorUniquePtr y_reorganized_data; diff --git a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.h b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.h index 1c08d5af2b7c4..0afd35435cc7c 100644 --- a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.h +++ b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.h @@ -21,43 +21,6 @@ enum RNN_Input_Index { initial_c = 6 }; -class CudnnDropout { - public: - CudnnDropout() : dropout_desc_(nullptr) { - } - - Status Set(const cudnnHandle_t& cudnnHandle, float dropout = 0.0f, unsigned long long seed = 1) { - CUDNN_RETURN_IF_ERROR(cudnnCreateDropoutDescriptor(&dropout_desc_)); - size_t stateSize; - void* states; - CUDNN_RETURN_IF_ERROR(cudnnDropoutGetStatesSize(cudnnHandle, &stateSize)); - - CUDA_CALL(cudaMalloc(&states, stateSize)); - - CUDNN_RETURN_IF_ERROR(cudnnSetDropoutDescriptor(dropout_desc_, - cudnnHandle, - dropout, - states, - stateSize, - seed)); - - return Status::OK(); - } - - ~CudnnDropout() { - if (dropout_desc_ != nullptr) { - cudnnDestroyDropoutDescriptor(dropout_desc_); - } - } - - operator cudnnDropoutDescriptor_t() const { - return dropout_desc_; - } - - private: - cudnnDropoutDescriptor_t dropout_desc_; -}; - class CudnnRNN { public: CudnnRNN() : rnn_desc_(nullptr) { @@ -167,6 +130,8 @@ class CudnnRnnBase : public CudaKernel { CudnnFilterDescriptor filter_desc_; IAllocatorUniquePtr w_data_cache_; bool weight_cached_; + IAllocatorUniquePtr state_buffer_; + size_t state_size_; enum Output_Index { Y = 0, diff --git a/onnxruntime/core/providers/cuda/tensor/concat.cc b/onnxruntime/core/providers/cuda/tensor/concat.cc index a4aeb51785d03..c1f0b066afa56 100644 --- a/onnxruntime/core/providers/cuda/tensor/concat.cc +++ b/onnxruntime/core/providers/cuda/tensor/concat.cc @@ -2,6 +2,7 @@ // Licensed under the MIT License. 
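The cudnn_rnn_base changes above move dropout-state ownership into the kernel and enable padded IO so unpacked, variable-length sequences are zero-filled automatically. A hedged sketch of the same cuDNN calls in isolation (assumes cuDNN 7.2+; plain cudaMalloc stands in for the provider's scratch-buffer allocator used in the real kernel):
```
#include <cudnn.h>
#include <cuda_runtime.h>

void ConfigureRnn(cudnnHandle_t handle, cudnnRNNDescriptor_t rnn_desc,
                  float dropout_ratio, unsigned long long seed) {
  // 1. Query how much device memory the dropout RNG state needs.
  size_t state_size = 0;
  cudnnDropoutGetStatesSize(handle, &state_size);

  // 2. The state buffer must outlive the descriptor; the real kernel keeps it
  //    as a member (state_buffer_). Here it is allocated and never freed.
  void* states = nullptr;
  cudaMalloc(&states, state_size);

  cudnnDropoutDescriptor_t dropout_desc;
  cudnnCreateDropoutDescriptor(&dropout_desc);
  cudnnSetDropoutDescriptor(dropout_desc, handle, dropout_ratio,
                            states, state_size, seed);

  // 3. Padded IO pairs with CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED so that
  //    positions past each sequence's length are implicitly zero.
  cudnnSetRNNPaddingMode(rnn_desc, CUDNN_RNN_PADDED_IO_ENABLED);

  (void)dropout_desc;  // would be passed to the RNN descriptor in real code
}
```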
#include "concat.h" +#include "concat_impl.h" namespace onnxruntime { namespace cuda { @@ -24,25 +25,46 @@ Status Concat::ComputeInternal(OpKernelContext* ctx) const { if (p.output_num_elements == 0) return Status::OK(); - int64_t output_offset = 0; - auto element_bytes = p.output_tensor->DataType()->Size(); - for (int input_index = 0; input_index < input_count; input_index++) { - const auto& prep = p.inputs[input_index]; - // No data in this tensor - so skip it - if (prep.num_elements == 0) - continue; - // Copy the data across. For every 'input_axis_pitch' values copied, we move over by the 'output_axis_pitch' - CUDA_RETURN_IF_ERROR(cudaMemcpy2DAsync( - static_cast(p.output_tensor->MutableDataRaw()) + output_offset * element_bytes, - p.output_axis_pitch * element_bytes, - prep.tensor->DataRaw(), - prep.axis_pitch * element_bytes, - prep.axis_pitch * element_bytes, - prep.num_elements / prep.axis_pitch, - cudaMemcpyDeviceToDevice)); - - output_offset += prep.axis_pitch; + int device_id = GetDeviceId(); + std::vector concat_sizes(input_count); + + CudaAsyncBuffer input_ptr(this, device_id, input_count); + gsl::span input_ptr_cpuspan = input_ptr.CpuSpan(); + std::vector axis_dimension_input_output_mapping(p.output_tensor->Shape()[p.axis]); + int index = 0; + for (int i = 0; i < input_count; ++i) { + auto input = p.inputs[i]; + concat_sizes[i] = input.tensor->Shape()[p.axis]; + input_ptr_cpuspan[i] = input.tensor->DataRaw(); + for (int j = 0; j < input.tensor->Shape()[p.axis]; ++j) { + axis_dimension_input_output_mapping.at(index++) = i; + } + } + std::vector concat_sizes_range(concat_sizes); + for (int i = 1; i < concat_sizes_range.size(); ++i) { + concat_sizes_range[i] += concat_sizes_range[i - 1]; } + + CudaAsyncBuffer concat_sizes_gpu(this, device_id, concat_sizes); + CudaAsyncBuffer axis_dimension_input_output_mapping_gpu(this, device_id, axis_dimension_input_output_mapping); + CudaAsyncBuffer concat_sizes_range_gpu(this, device_id, concat_sizes_range); + concat_sizes_gpu.CopyToGpu(); + axis_dimension_input_output_mapping_gpu.CopyToGpu(); + concat_sizes_range_gpu.CopyToGpu(); + input_ptr.CopyToGpu(); + int block_size_inside_axis_dim = static_cast(p.output_axis_pitch / p.output_tensor->Shape()[p.axis]); + int block_size_including_axis_dim = static_cast(p.output_axis_pitch); + auto element_bytes = p.output_tensor->DataType()->Size(); + ORT_RETURN_IF_ERROR(ConcatImpl(element_bytes, + block_size_including_axis_dim, + block_size_inside_axis_dim, + concat_sizes_gpu.GpuPtr(), + concat_sizes_range_gpu.GpuPtr(), + axis_dimension_input_output_mapping_gpu.GpuPtr(), + input_count, + p.output_tensor->MutableDataRaw(), + input_ptr.GpuPtr(), + p.output_num_elements)); return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/tensor/concat_impl.cu b/onnxruntime/core/providers/cuda/tensor/concat_impl.cu new file mode 100644 index 0000000000000..95569fc441a85 --- /dev/null +++ b/onnxruntime/core/providers/cuda/tensor/concat_impl.cu @@ -0,0 +1,102 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/providers/cuda/cu_inc/common.cuh" +#include "core/providers/cuda/cuda_common.h" +#include "concat_impl.h" + +namespace onnxruntime { +namespace cuda { + +template +__global__ void _ConcatKernel(const fast_divmod block_size_including_axis_dim_div, + const fast_divmod block_size_inside_axis_dim_div, + const int64_t* concat_sizes, + const int64_t* concat_sizes_range, + const int64_t* axis_dimension_input_output_mapping, + const int num_inputs, + T* output_data, + const void** input_ptr, + const CUDA_LONG N) { + CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); + CUDA_LONG input_pos = 0; + + int outter_block_index = 0; + int block_index = 0; + int offset = 0; + + block_size_including_axis_dim_div.divmod(id, outter_block_index, offset); + block_size_inside_axis_dim_div.divmod(offset, block_index, offset); + + int input_index = axis_dimension_input_output_mapping[block_index]; + int64_t range_left = (input_index == 0) ? 0 : concat_sizes_range[input_index - 1]; + int block_offset = block_index - range_left; + + input_pos = (outter_block_index * concat_sizes[input_index] + block_offset) * + block_size_inside_axis_dim_div.d_ + + offset; + + output_data[id] = reinterpret_cast(input_ptr[input_index])[input_pos]; +} + +Status ConcatImpl(const size_t element_bytes, + const int block_size_including_axis_dim, + const int block_size_inside_axis_dim, + const int64_t* concat_sizes, + const int64_t* concat_sizes_range, + const int64_t* axis_dimension_input_output_mapping, + const int num_inputs, + void* output_data, + const void** input_ptr, + const size_t N) { + int blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); + + fast_divmod block_size_including_axis_dim_div = fast_divmod(block_size_including_axis_dim); + fast_divmod block_size_inside_axis_dim_div = fast_divmod(block_size_inside_axis_dim); + + switch (element_bytes) { + case sizeof(int8_t): + _ConcatKernel<<>>( + block_size_including_axis_dim_div, block_size_inside_axis_dim_div, + concat_sizes, concat_sizes_range, axis_dimension_input_output_mapping, + num_inputs, + reinterpret_cast(output_data), + input_ptr, + (CUDA_LONG)N); + break; + case sizeof(int16_t): + _ConcatKernel<<>>( + block_size_including_axis_dim_div, block_size_inside_axis_dim_div, + concat_sizes, concat_sizes_range, axis_dimension_input_output_mapping, + num_inputs, + reinterpret_cast(output_data), + input_ptr, + (CUDA_LONG)N); + break; + case sizeof(int32_t): + _ConcatKernel<<>>( + block_size_including_axis_dim_div, block_size_inside_axis_dim_div, + concat_sizes, concat_sizes_range, axis_dimension_input_output_mapping, + num_inputs, + reinterpret_cast(output_data), + input_ptr, + (CUDA_LONG)N); + break; + case sizeof(int64_t): + _ConcatKernel<<>>( + block_size_including_axis_dim_div, block_size_inside_axis_dim_div, + concat_sizes, concat_sizes_range, axis_dimension_input_output_mapping, + num_inputs, + reinterpret_cast(output_data), + input_ptr, + (CUDA_LONG)N); + break; + default: + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for Concat operator"); + } + + return Status::OK(); +} + +} // namespace cuda +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/concat_impl.h b/onnxruntime/core/providers/cuda/tensor/concat_impl.h new file mode 100644 index 0000000000000..eeddedf64252d --- /dev/null +++ b/onnxruntime/core/providers/cuda/tensor/concat_impl.h @@ -0,0 +1,24 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once +#include +#include "core/providers/cuda/shared_inc/cuda_utils.h" +#include "core/common/common.h" + +namespace onnxruntime { +namespace cuda { + +Status ConcatImpl(const size_t element_bytes, + const int block_size_including_axis_dim, + const int block_size_inside_axis_dim, + const int64_t* concat_sizes, + const int64_t* concat_sizes_range, + const int64_t* axis_dimension_input_output_mapping, + const int num_inputs, + void* output_data, + const void** input_ptr, + const size_t N); + +} // namespace cuda +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/expand.cc b/onnxruntime/core/providers/cuda/tensor/expand.cc index 599fa39f406b2..897e981877ad0 100644 --- a/onnxruntime/core/providers/cuda/tensor/expand.cc +++ b/onnxruntime/core/providers/cuda/tensor/expand.cc @@ -21,6 +21,11 @@ Status Expand::ComputeInternal(OpKernelContext* ctx) const { ORT_RETURN_IF_ERROR(ComputeOutputShape(Node().Name(), input0.Shape(), output_dims, output_shape)); auto rank = output_shape.NumDimensions(); auto& output_tensor = *ctx->Output(0, output_shape); + + if (0 == output_shape.Size()) { + return Status::OK(); + } + auto input_shape = input0.Shape().GetDims(); // pad input_dims with 1 to make ranks match @@ -40,6 +45,8 @@ Status Expand::ComputeInternal(OpKernelContext* ctx) const { for (auto i = 0; i < rank; i++) { in_span[i] = fast_divmod(static_cast(input_shape[i])); out_span[i] = fast_divmod(static_cast(output_shape[i])); + // output_shape[i] won't be 0 here, it's covered in (0 == output_shape.Size()) + // a null output will be returned for that case subdim_size /= output_shape[i]; sdm_span[i] = static_cast(subdim_size); } diff --git a/onnxruntime/core/providers/cuda/tensor/identity_op.cc b/onnxruntime/core/providers/cuda/tensor/identity_op.cc index e9a4125c43188..890bdf5cacf87 100644 --- a/onnxruntime/core/providers/cuda/tensor/identity_op.cc +++ b/onnxruntime/core/providers/cuda/tensor/identity_op.cc @@ -5,13 +5,28 @@ namespace onnxruntime { namespace cuda { +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + Dropout, + kOnnxDomain, + 7, 9, + kCudaExecutionProvider, + KernelDefBuilder() + .TypeConstraint("T", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}) + .Alias(0, 0), + IdentityOp); + ONNX_OPERATOR_KERNEL_EX( Dropout, kOnnxDomain, - 7, + 10, kCudaExecutionProvider, KernelDefBuilder() - .TypeConstraint("T", {DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType()}) + .TypeConstraint("T", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}) + .TypeConstraint("T1", DataTypeImpl::GetTensorType()) .Alias(0, 0), IdentityOp); diff --git a/onnxruntime/core/providers/cuda/tensor/identity_op.h b/onnxruntime/core/providers/cuda/tensor/identity_op.h index d83fd541b3cb6..31dd544030b20 100644 --- a/onnxruntime/core/providers/cuda/tensor/identity_op.h +++ b/onnxruntime/core/providers/cuda/tensor/identity_op.h @@ -30,7 +30,18 @@ class IdentityOp final : public CudaKernel { } if (is_dropout) { - context->Output(1, std::vector()); + Tensor* mask = context->Output(1, shape); + // a 'nullptr' returned would make it an unused optional output + if (mask != nullptr) { + // Opset 7 differs with Opset 10 in that the type of the 'mask' + // output is tied with the type of the input in Opset 7 whereas + // the type of 'mask' in Opset 10 is 'bool' always + // so we have a common solution + void* mask_data = mask->MutableDataRaw(); + // In 'test'/'inference' mode, 
there are no input values dropped out + // so fill the buffer with 0/false + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(mask_data, 0, mask->SizeInBytes())); + } } return Status::OK(); diff --git a/onnxruntime/core/providers/cuda/tensor/split.cc b/onnxruntime/core/providers/cuda/tensor/split.cc index f6dd317c5bcd3..4f325a54cc87e 100644 --- a/onnxruntime/core/providers/cuda/tensor/split.cc +++ b/onnxruntime/core/providers/cuda/tensor/split.cc @@ -44,6 +44,8 @@ Status Split::ComputeInternal(OpKernelContext* ctx) const { int device_id = GetDeviceId(); CudaAsyncBuffer output_ptr(this, device_id, num_outputs); gsl::span output_ptr_span = output_ptr.CpuSpan(); + std::vector axis_dimension_input_output_mapping(input_dims[axis]); + int index = 0; for (int i = 0; i < num_outputs; ++i) { // update size of dimension for axis we're splitting on auto split_size = gsl::narrow(split_sizes[i]); @@ -52,6 +54,9 @@ Status Split::ComputeInternal(OpKernelContext* ctx) const { Tensor* output = ctx->Output(i, TensorShape{output_dimensions}); auto output_data = output->MutableDataRaw(); output_ptr_span[i] = output_data; + for (int j = 0; j < split_size; ++j) { + axis_dimension_input_output_mapping.at(index++) = i; + } } output_ptr.CopyToGpu(); @@ -65,12 +70,16 @@ Status Split::ComputeInternal(OpKernelContext* ctx) const { CudaAsyncBuffer split_sizes_range_gpu(this, device_id, split_sizes_range); split_sizes_range_gpu.CopyToGpu(); + CudaAsyncBuffer axis_dimension_input_output_mapping_gpu(this, device_id, axis_dimension_input_output_mapping); + axis_dimension_input_output_mapping_gpu.CopyToGpu(); + size_t element_size = input_tensor->DataType()->Size(); ORT_RETURN_IF_ERROR(SplitImpl(element_size, block_size_including_axis_dim, block_size_inside_axis_dim, split_sizes_gpu.GpuPtr(), split_sizes_range_gpu.GpuPtr(), + axis_dimension_input_output_mapping_gpu.GpuPtr(), num_outputs, input_data, output_ptr.GpuPtr(), diff --git a/onnxruntime/core/providers/cuda/tensor/split_impl.cu b/onnxruntime/core/providers/cuda/tensor/split_impl.cu index 82cbbd80033c1..0c97d140ca643 100644 --- a/onnxruntime/core/providers/cuda/tensor/split_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/split_impl.cu @@ -13,6 +13,7 @@ __global__ void _SplitKernel(const fast_divmod block_size_including_axis_dim_div const fast_divmod block_size_inside_axis_dim_div, const int64_t* split_sizes, const int64_t* split_sizes_range, + const int64_t* axis_dimension_input_output_mapping, const int num_outputs, const T* input_data, void** output_ptr, @@ -24,20 +25,12 @@ __global__ void _SplitKernel(const fast_divmod block_size_including_axis_dim_div int block_index = 0; int offset = 0; - int output_index = 0; - int block_offset = 0; - block_size_including_axis_dim_div.divmod(id, outter_block_index, offset); block_size_inside_axis_dim_div.divmod(offset, block_index, offset); - for (int i = 0; i < num_outputs; ++i) { - int64_t range_left = (i == 0) ? 0 : split_sizes_range[i - 1]; - if ((range_left <= block_index) && (block_index < split_sizes_range[i])) { - output_index = i; - block_offset = block_index - range_left; - break; - } - } + int output_index = axis_dimension_input_output_mapping[block_index]; + int64_t range_left = (output_index == 0) ? 
0 : split_sizes_range[output_index - 1]; + int block_offset = block_index - range_left; output_pos = (outter_block_index * split_sizes[output_index] + block_offset) * block_size_inside_axis_dim_div.d_ + @@ -51,6 +44,7 @@ Status SplitImpl(const size_t element_size, const int block_size_inside_axis_dim, const int64_t* split_sizes, const int64_t* split_sizes_range, + const int64_t* axis_dimension_input_output_mapping, const int num_outputs, const void* input_data, void** output_ptr, @@ -64,7 +58,7 @@ Status SplitImpl(const size_t element_size, case sizeof(int8_t): _SplitKernel<<>>( block_size_including_axis_dim_div, block_size_inside_axis_dim_div, - split_sizes, split_sizes_range, num_outputs, + split_sizes, split_sizes_range, axis_dimension_input_output_mapping, num_outputs, reinterpret_cast::MappedType*>(input_data), output_ptr, (CUDA_LONG)N); @@ -72,7 +66,7 @@ Status SplitImpl(const size_t element_size, case sizeof(int16_t): _SplitKernel<<>>( block_size_including_axis_dim_div, block_size_inside_axis_dim_div, - split_sizes, split_sizes_range, num_outputs, + split_sizes, split_sizes_range, axis_dimension_input_output_mapping, num_outputs, reinterpret_cast::MappedType*>(input_data), output_ptr, (CUDA_LONG)N); @@ -80,7 +74,7 @@ Status SplitImpl(const size_t element_size, case sizeof(int32_t): _SplitKernel<<>>( block_size_including_axis_dim_div, block_size_inside_axis_dim_div, - split_sizes, split_sizes_range, num_outputs, + split_sizes, split_sizes_range, axis_dimension_input_output_mapping, num_outputs, reinterpret_cast::MappedType*>(input_data), output_ptr, (CUDA_LONG)N); @@ -88,7 +82,7 @@ Status SplitImpl(const size_t element_size, case sizeof(int64_t): _SplitKernel<<>>( block_size_including_axis_dim_div, block_size_inside_axis_dim_div, - split_sizes, split_sizes_range, num_outputs, + split_sizes, split_sizes_range, axis_dimension_input_output_mapping, num_outputs, reinterpret_cast::MappedType*>(input_data), output_ptr, (CUDA_LONG)N); diff --git a/onnxruntime/core/providers/cuda/tensor/split_impl.h b/onnxruntime/core/providers/cuda/tensor/split_impl.h index 0ad6c51b356d2..72eaa5a32c973 100644 --- a/onnxruntime/core/providers/cuda/tensor/split_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/split_impl.h @@ -14,6 +14,7 @@ Status SplitImpl(const size_t element_size, const int block_size_inside_axis_dim, const int64_t* split_sizes, const int64_t* split_sizes_range, + const int64_t* axis_dimension_input_output_mapping, const int num_outputs, const void* input_data, void** output_ptr, diff --git a/onnxruntime/core/providers/cuda/tensor/transpose.cc b/onnxruntime/core/providers/cuda/tensor/transpose.cc index 53f6d327156bb..49dd7a0801f38 100644 --- a/onnxruntime/core/providers/cuda/tensor/transpose.cc +++ b/onnxruntime/core/providers/cuda/tensor/transpose.cc @@ -20,13 +20,13 @@ namespace cuda { .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ Transpose); -// special case acceleration using cublas matrix tranpose +// special case acceleration using cublas matrix transpose std::tuple TryTransposeWithCublas(const std::vector& perm, const TensorShape& input_shape) { int M = 0; int N = 0; if (perm.size() == 4 && input_shape[0] == 1 && perm[0] == 0) { - // NCHW < ->NHWC when N == 1 + // NCHW <-> NHWC when N == 1 if ((perm[1] == 2 && perm[2] == 3 && perm[3] == 1) || (perm[1] == 3 && perm[2] == 1 && perm[3] == 2)) { if (perm[1] == 2) { diff --git a/onnxruntime/core/providers/mkldnn/mkldnn_execution_provider.cc b/onnxruntime/core/providers/mkldnn/mkldnn_execution_provider.cc index 
307cdb7454078..93cb36116f964 100644 --- a/onnxruntime/core/providers/mkldnn/mkldnn_execution_provider.cc +++ b/onnxruntime/core/providers/mkldnn/mkldnn_execution_provider.cc @@ -5,46 +5,25 @@ #pragma warning(disable : 4996) #endif -#include "mkldnn_execution_provider.h" #include "core/framework/allocator.h" -#include "core/framework/memcpy.h" -#include "core/framework/kernel_registry.h" -#include "mkldnn_fwd.h" #include "core/framework/compute_capability.h" +#include "core/framework/kernel_registry.h" #include "core/providers/mkldnn/subgraph/mkldnn_func_kernel.h" +#include "mkldnn_execution_provider.h" +#include "mkldnn_fwd.h" namespace onnxruntime { constexpr const char* MKLDNN = "MklDnn"; constexpr const char* MKLDNN_CPU = "MklDnnCpu"; -namespace mkl_dnn { - -ONNX_OPERATOR_KERNEL_EX( - MemcpyFromHost, - kOnnxDomain, - 1, - kMklDnnExecutionProvider, - KernelDefBuilder().InputMemoryType(0).TypeConstraint("T", DataTypeImpl::AllTensorTypes()), - Memcpy); - -ONNX_OPERATOR_KERNEL_EX( - MemcpyToHost, - kOnnxDomain, - 1, - kMklDnnExecutionProvider, - KernelDefBuilder().OutputMemoryType(0).TypeConstraint("T", DataTypeImpl::AllTensorTypes()), - Memcpy); - -} // namespace mkl_dnn - MKLDNNExecutionProvider::MKLDNNExecutionProvider(const MKLDNNExecutionProviderInfo& info) : IExecutionProvider{onnxruntime::kMklDnnExecutionProvider} { DeviceAllocatorRegistrationInfo default_allocator_info({OrtMemTypeDefault, - [](int) { return std::make_unique(std::make_unique(MKLDNN, OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemTypeDefault)); }, std::numeric_limits::max()}); + [](int) { return std::make_unique(std::make_unique(MKLDNN, OrtAllocatorType::OrtDeviceAllocator)); }, std::numeric_limits::max()}); DeviceAllocatorRegistrationInfo cpu_allocator_info({OrtMemTypeCPUOutput, - [](int) { return std::make_unique(std::make_unique(MKLDNN_CPU, OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemTypeCPUOutput)); }, std::numeric_limits::max()}); + [](int) { return std::make_unique(std::make_unique(MKLDNN_CPU, OrtAllocatorType::OrtDeviceAllocator, OrtDevice(), 0, OrtMemTypeCPUOutput)); }, std::numeric_limits::max()}); if (info.create_arena) { InsertAllocator(CreateAllocator(default_allocator_info)); @@ -62,28 +41,9 @@ MKLDNNExecutionProvider::MKLDNNExecutionProvider(const MKLDNNExecutionProviderIn MKLDNNExecutionProvider::~MKLDNNExecutionProvider() { } -Status MKLDNNExecutionProvider::CopyTensor(const Tensor& src, Tensor& dst) const { - // Support CPU <-> MKLDNN for now - if (!(strcmp(src.Location().name, MKLDNN) == 0 && strcmp(dst.Location().name, CPU) == 0) && - !(strcmp(src.Location().name, CPU) == 0 && strcmp(dst.Location().name, MKLDNN) == 0) && - !(strcmp(src.Location().name, MKLDNN) == 0 && strcmp(dst.Location().name, MKLDNN_CPU) == 0)) { - ORT_NOT_IMPLEMENTED(src.Location().name, " copy to ", dst.Location().name, " is not implemented"); - } - - // Todo: Copy for now. May optimize later to avoid copy. 
- size_t bytes = src.DataType()->Size() * src.Shape().Size(); - const void* src_data = src.DataRaw(); - void* dst_data = dst.MutableDataRaw(); - memcpy(dst_data, src_data, bytes); - - return Status::OK(); -} - namespace mkl_dnn { class ONNX_OPERATOR_KERNEL_CLASS_NAME(kMklDnnExecutionProvider, kOnnxDomain, 1, Conv); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kMklDnnExecutionProvider, kOnnxDomain, 7, Gemm); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kMklDnnExecutionProvider, kOnnxDomain, 1, MemcpyFromHost); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kMklDnnExecutionProvider, kOnnxDomain, 1, MemcpyToHost); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kMklDnnExecutionProvider, kOnnxDomain, 6, Relu); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kMklDnnExecutionProvider, kOnnxDomain, 6, Sum); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kMklDnnExecutionProvider, kOnnxDomain, 7, BatchNormalization); @@ -98,8 +58,6 @@ void RegisterMKLDNNKernels(KernelRegistry& kernel_registry) { static const BuildKernelCreateInfoFn function_table[] = { BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -128,14 +86,14 @@ std::shared_ptr MKLDNNExecutionProvider::GetKernelRegistry() con return kernel_registry; } -bool MKLDNNExecutionProvider::UseSubgraph(const onnxruntime::GraphViewer& graph_viewer, - const std::vector& kernel_registries, - std::vector>& result) const { +bool MKLDNNExecutionProvider::UseSubgraph(const onnxruntime::GraphViewer& graph_viewer) const { // switch between mkldnn-vanilla and mkldnn-subgraph implementation using // MKLDNN_SUBGRAPH environment variable bool use_subgraph = true; bool FP16_graph = false; + bool mkldnn_nodes_in_the_graph = false; + if (graph_viewer.MaxNodeIndex() > 0) { int index = 0; auto node = graph_viewer.GetNode(index); @@ -147,16 +105,27 @@ bool MKLDNNExecutionProvider::UseSubgraph(const onnxruntime::GraphViewer& graph_ FP16_graph = node->InputDefs()[0]->Type()->find("16") != std::string::npos; } - if (FP16_graph) { + for (auto node_index = 0; node_index < graph_viewer.MaxNodeIndex(); node_index++) { + auto node = graph_viewer.GetNode(node_index); + if (node == nullptr) { + node_index++; + continue; + } + auto op_it = mkldnn_ops_.find(node->OpType()); + if (op_it != mkldnn_ops_.end()) { + mkldnn_nodes_in_the_graph = true; + break; + } + } + + if (FP16_graph || !mkldnn_nodes_in_the_graph) { // FP16 not supported yet. 
use_subgraph = false; - result = IExecutionProvider::GetCapability(graph_viewer, kernel_registries); } else { const char* env = getenv("ORT_MKLDNN_SUBGRAPH"); if (env != nullptr) { if (atoi(env) == 0) { use_subgraph = false; - result = IExecutionProvider::GetCapability(graph_viewer, kernel_registries); } } } @@ -226,16 +195,16 @@ std::vector> MKLDNNExecutionProvider::GetCapa const onnxruntime::GraphViewer& graph_viewer, const std::vector& kernel_registries) const { ORT_UNUSED_PARAMETER(kernel_registries); - std::vector> result; // temporary switch to toggle between mkldnn-vanilla and mkldnn-subgraph implementation using // ORT_MKLDNN_SUBGRAPH environment variable - if (UseSubgraph(graph_viewer, kernel_registries, result) == false) { - return result; + if (UseSubgraph(graph_viewer) == false) { + return IExecutionProvider::GetCapability(graph_viewer, kernel_registries); } LOGS_DEFAULT(INFO) << "Using MKL-DNN Subgraph"; // use sub-graph implementation + std::vector> result; mkl_dnn::Subgraph::SubgraphVariables sub_var; std::shared_ptr subgraph_ptr; @@ -260,6 +229,12 @@ std::vector> MKLDNNExecutionProvider::GetCapa if (IsDimensionSupported(node) == false) { node_index++; + if (subgraph_ptr->mkldnn_nodes.size() > 0) { + CreateMetaDef(graph_viewer, subgraph_attributes, subgraph_ptr, sub_var, result); + subgraph_ptr.reset(new mkl_dnn::Subgraph(graph_name)); + subgraph_attributes.clear(); + output_to_source_node_map.clear(); + } continue; } @@ -453,7 +428,7 @@ Status MKLDNNExecutionProvider::Compile(const std::vector& f compute_info.compute_func = [](FunctionState state, const OrtCustomOpApi* api, OrtKernelContext* context) { onnxruntime::mkl_dnn::MkldnnFuncKernel* custom_op = reinterpret_cast*>(state); - return custom_op->Compute(api, context); + return custom_op->Compute(api, context); }; node_compute_funcs.push_back(compute_info); diff --git a/onnxruntime/core/providers/mkldnn/mkldnn_execution_provider.h b/onnxruntime/core/providers/mkldnn/mkldnn_execution_provider.h index 7e844adce5a3c..2869698568bde 100644 --- a/onnxruntime/core/providers/mkldnn/mkldnn_execution_provider.h +++ b/onnxruntime/core/providers/mkldnn/mkldnn_execution_provider.h @@ -35,8 +35,6 @@ class MKLDNNExecutionProvider : public IExecutionProvider { explicit MKLDNNExecutionProvider(const MKLDNNExecutionProviderInfo& info); virtual ~MKLDNNExecutionProvider(); - Status CopyTensor(const Tensor& src, Tensor& dst) const override; - virtual std::shared_ptr GetKernelRegistry() const override; std::shared_ptr GetWeightsMemoryBuffer(const std::string& weight_key) { @@ -100,9 +98,7 @@ class MKLDNNExecutionProvider : public IExecutionProvider { return graph_name; } - bool UseSubgraph(const onnxruntime::GraphViewer& graph_viewer, - const std::vector& kernel_registries, - std::vector>& result) const; + bool UseSubgraph(const onnxruntime::GraphViewer& graph_viewer) const; // Some dimensions are not supported by MKL-DNN // example: Pool with NumDimensions <= 3 is not supported diff --git a/onnxruntime/core/providers/mkldnn/subgraph/mkldnn_conv.h b/onnxruntime/core/providers/mkldnn/subgraph/mkldnn_conv.h index ceafa69e5f9c3..9e421ec365629 100644 --- a/onnxruntime/core/providers/mkldnn/subgraph/mkldnn_conv.h +++ b/onnxruntime/core/providers/mkldnn/subgraph/mkldnn_conv.h @@ -238,13 +238,13 @@ class MklDnnConv : public MklDnnKernel { if (!bias_dims_mkl.empty()) { fwd_desc_.reset(new mkldnn::convolution_forward::desc( - mkldnn::prop_kind::forward, mkldnn::convolution_direct, *src_md_, + mkldnn::prop_kind::forward_inference, 
mkldnn::convolution_direct, *src_md_, *filter_md_, *bias_md_, *primitive_dst_md_, strides_mkl, dilations_mkl, padding_left_mkl, padding_right_mkl, mkldnn::padding_kind::zero)); } else { fwd_desc_.reset(new mkldnn::convolution_forward::desc( - mkldnn::prop_kind::forward, mkldnn::convolution_direct, *src_md_, + mkldnn::prop_kind::forward_inference, mkldnn::convolution_direct, *src_md_, *filter_md_, *primitive_dst_md_, strides_mkl, dilations_mkl, padding_left_mkl, padding_right_mkl, mkldnn::padding_kind::zero)); diff --git a/onnxruntime/core/providers/ngraph/ngraph_execution_provider.cc b/onnxruntime/core/providers/ngraph/ngraph_execution_provider.cc index 7749a21532430..459deae2c81f9 100644 --- a/onnxruntime/core/providers/ngraph/ngraph_execution_provider.cc +++ b/onnxruntime/core/providers/ngraph/ngraph_execution_provider.cc @@ -34,13 +34,13 @@ constexpr const char* NGRAPH = "nGraph"; NGRAPHExecutionProvider::NGRAPHExecutionProvider(const NGRAPHExecutionProviderInfo& info) : IExecutionProvider{onnxruntime::kNGraphExecutionProvider} { DeviceAllocatorRegistrationInfo default_allocator_info({OrtMemTypeDefault, - [](int) { return std::make_unique(std::make_unique(NGRAPH, OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemTypeDefault)); }, + [](int) { return std::make_unique(std::make_unique(NGRAPH, OrtAllocatorType::OrtDeviceAllocator)); }, std::numeric_limits::max()}); InsertAllocator(CreateAllocator(default_allocator_info)); DeviceAllocatorRegistrationInfo cpu_allocator_info({OrtMemTypeCPUOutput, - [](int) { return std::make_unique(std::make_unique(NGRAPH, OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemTypeCPUOutput)); }, + [](int) { return std::make_unique(std::make_unique(NGRAPH, OrtAllocatorType::OrtDeviceAllocator, OrtDevice(), 0, OrtMemTypeCPUOutput)); }, std::numeric_limits::max()}); InsertAllocator(CreateAllocator(cpu_allocator_info)); @@ -76,24 +76,6 @@ bool TensorCopyPossible(const std::string& src_location, const std::string& dst_ }); } -Status NGRAPHExecutionProvider::CopyTensor(const Tensor& src, Tensor& dst) const { - const size_t src_bytes = src.DataType()->Size() * src.Shape().Size(); - const size_t dst_bytes = dst.DataType()->Size() * dst.Shape().Size(); - if (src_bytes != dst_bytes) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, - "nGraph: Source and Destination data sizes are not equal - cannot copy tensors"); - } - - if (!TensorCopyPossible(src.Location().name, dst.Location().name)) { - ORT_NOT_IMPLEMENTED("Copying tensors between '", src.Location().name, "' and '", dst.Location().name, - "' is not implemented in NGRAPHExecutionProvider"); - } - - MEMCPY_S(dst.MutableDataRaw(), src.DataRaw(), dst_bytes, src_bytes); - - return Status::OK(); -} - // Returns true only if op is in a mode that is not currently supported static bool IsUnsupportedOpMode(const Node* node, const onnxruntime::GraphViewer& graph_viewer) { const auto& optype = node->OpType(); diff --git a/onnxruntime/core/providers/ngraph/ngraph_execution_provider.h b/onnxruntime/core/providers/ngraph/ngraph_execution_provider.h index a2af812e31e06..f4081a43a555b 100644 --- a/onnxruntime/core/providers/ngraph/ngraph_execution_provider.h +++ b/onnxruntime/core/providers/ngraph/ngraph_execution_provider.h @@ -24,8 +24,6 @@ class NGRAPHExecutionProvider : public IExecutionProvider { explicit NGRAPHExecutionProvider(const NGRAPHExecutionProviderInfo& info); ~NGRAPHExecutionProvider() = default; - Status CopyTensor(const Tensor& src, Tensor& dst) const override; - std::vector> GetCapability(const onnxruntime::GraphViewer& 
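Both the MKL-DNN and nGraph providers drop their `CopyTensor` overrides in this change. For reference, the deleted implementations were plain CPU-side copies of the shape sketched below; this restates what the removed code did using the `Tensor` accessors visible in the hunks above, and is not part of any new code path.

```
#include <cstring>                     // memcpy
#include "core/framework/tensor.h"     // onnxruntime::Tensor (assumed include path)

// What the deleted overrides amounted to: a byte-for-byte copy between two
// CPU-addressable tensors of identical element type and shape.
static void CopyCpuTensor(const onnxruntime::Tensor& src, onnxruntime::Tensor& dst) {
  const size_t bytes = src.DataType()->Size() * static_cast<size_t>(src.Shape().Size());
  std::memcpy(dst.MutableDataRaw(), src.DataRaw(), bytes);
}
```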
graph_viewer, const std::vector& kernel_registries) const override; diff --git a/onnxruntime/core/providers/nnapi/nnapi_execution_provider.cc b/onnxruntime/core/providers/nnapi/nnapi_execution_provider.cc new file mode 100644 index 0000000000000..d1d0cdf7dc26a --- /dev/null +++ b/onnxruntime/core/providers/nnapi/nnapi_execution_provider.cc @@ -0,0 +1,316 @@ +// Copyright 2019 JD.com Inc. JD AI + +#include "nnapi_execution_provider.h" +#include "core/framework/allocatormgr.h" +#include "core/framework/compute_capability.h" +#include "core/session/onnxruntime_cxx_api.h" +#include "core/session/inference_session.h" +#include "core/graph/model.h" +#include "dnnlibrary/ModelBuilder.h" +#include "dnnlibrary/OnnxReader.h" +#include "tools/onnx2daq/OnnxConverter.h" + +namespace onnxruntime { + +constexpr const char* NNAPI = "Nnapi"; + +NnapiExecutionProvider::NnapiExecutionProvider() + : IExecutionProvider{onnxruntime::kNnapiExecutionProvider} { + DeviceAllocatorRegistrationInfo device_info{OrtMemTypeDefault, + [](int) { return std::make_unique( + std::make_unique(NNAPI, + OrtAllocatorType::OrtDeviceAllocator)); }, + std::numeric_limits::max()}; + InsertAllocator(CreateAllocator(device_info)); + + DeviceAllocatorRegistrationInfo cpu_allocator_info({OrtMemTypeCPUOutput, + [](int) { return std::make_unique(std::make_unique(NNAPI, OrtAllocatorType::OrtDeviceAllocator, OrtDevice(), 0, OrtMemTypeCPUOutput)); }, + std::numeric_limits::max()}); + + InsertAllocator(CreateAllocator(cpu_allocator_info)); +} + +NnapiExecutionProvider::~NnapiExecutionProvider() {} + +std::vector> NnapiExecutionProvider::GetSupportedNodes(const ONNX_NAMESPACE::ModelProto& model_proto) const { + dnn::OnnxConverter converter; + return converter.GetSupportedNodes(model_proto); +} + +std::vector> +NnapiExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, + const std::vector& /*kernel_registries*/) const { + // This method is based on that of TRT EP + // Construct modelproto from graph + onnxruntime::Model model(graph.Name(), true, ModelMetaData(), IOnnxRuntimeOpSchemaRegistryList(), graph.DomainToVersionMap()); + onnxruntime::Graph& graph_build = model.MainGraph(); + const std::vector& node_index = graph.GetNodesInTopologicalOrder(); + std::set all_node_inputs; + for (const auto& node : graph.Nodes()) { + std::vector inputs, outputs; + for (auto input : node.InputDefs()) { + auto& n_input = graph_build.GetOrCreateNodeArg(input->Name(), input->TypeAsProto()); + inputs.push_back(&n_input); + all_node_inputs.insert(&n_input); + } + for (auto output : node.OutputDefs()) { + auto& n_output = graph_build.GetOrCreateNodeArg(output->Name(), output->TypeAsProto()); + outputs.push_back(&n_output); + } + graph_build.AddNode(node.Name(), node.OpType(), node.Description(), inputs, outputs, &node.GetAttributes(), node.Domain()); + } + const auto graph_outputs = graph.GetOutputs(); + //Add initializer to graph + const auto& init_tensors = graph.GetAllInitializedTensors(); + for (const auto& tensor : init_tensors) { + graph_build.AddInitializedTensor(*(tensor.second)); + } + + ORT_ENFORCE(graph_build.Resolve().IsOK()); + ONNX_NAMESPACE::ModelProto model_proto = model.ToProto(); + model_proto.set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + + const auto supported_nodes_vector = GetSupportedNodes(model_proto); + + std::unique_ptr sub_graph = std::make_unique(); + + // Find inputs, initializers and outputs for each supported subgraph + std::vector> result; + + int counter = 0; + + for (const auto& group : 
supported_nodes_vector) { + if (!group.empty()) { + std::unordered_set node_set; + node_set.reserve(group.size()); + for (const auto& index : group) { + node_set.insert(node_index[index]); + } + std::unique_ptr sub_graph = std::make_unique(); + // Find inputs and outputs of the subgraph + std::unordered_map fused_inputs, fused_outputs, fused_outputs_to_add; + std::unordered_set erased; + int input_order = 0; + int output_order = 0; + + for (const auto& index : group) { + sub_graph->nodes.push_back(node_index[index]); + const auto& node = graph.GetNode(node_index[index]); + + for (const auto& input : node->InputDefs()) { + const auto& it = fused_outputs.find(input); + + if (it != fused_outputs.end()) { + fused_outputs.erase(it); + erased.insert(input); + } + //only when input is neither in output list nor erased list, add the input to input list + else if (erased.find(input) == erased.end()) { + fused_inputs[input] = input_order++; + } + } + + // For output searching, there is a special case: + // If node's OutputEdges are more than its outputs, meaning certain output is used more than once, + // if the output is connected to nodes that don't belong to the subgraph, the output need to be added + // to the output list + if (node->GetOutputEdgesCount() > node->OutputDefs().size()) { + for (auto it = node->OutputEdgesBegin(), end = node->OutputEdgesEnd(); it != end; ++it) { + const auto& node_idx = it->GetNode().Index(); + const auto& output = (it->GetNode()).InputDefs()[it->GetDstArgIndex()]; + + if (node_set.find(node_idx) != node_set.end()) { + const auto& iter = fused_inputs.find(output); + + if (iter != fused_inputs.end()) { + fused_inputs.erase(iter); + erased.insert(output); + } else if (erased.find(output) == erased.end()) { + fused_outputs[output] = output_order++; + } + } else { + fused_outputs_to_add[output] = output_order++; + } + } + } else { + for (const auto& output : node->OutputDefs()) { + const auto& it = fused_inputs.find(output); + + if (it != fused_inputs.end()) { + fused_inputs.erase(it); + erased.insert(output); + } + // only when output is neither in input list nor erased list, add the output to output list + else if (erased.find(output) == erased.end()) { + fused_outputs[output] = output_order++; + } + } + } + } + + fused_outputs.insert(fused_outputs_to_add.begin(), fused_outputs_to_add.end()); + + // Sort inputs and outputs by the order they were added + std::multimap inputs, outputs; + + for (auto it = fused_inputs.begin(), end = fused_inputs.end(); it != end; ++it) { + inputs.insert(std::pair(it->second, it->first)); + } + + for (auto it = fused_outputs.begin(), end = fused_outputs.end(); it != end; ++it) { + for (const auto& x : all_node_inputs) { + if (x->Name() == it->first->Name()) { + outputs.insert(std::pair(it->second, it->first)); + break; + } + } + if (std::find(graph_outputs.begin(), graph_outputs.end(), it->first) != graph_outputs.end()) { + outputs.insert(std::pair(it->second, it->first)); + } + } + + // Assign inputs and outputs to subgraph's meta_def + auto meta_def = std::make_unique<::onnxruntime::IndexedSubGraph::MetaDef>(); + meta_def->name = "NNAPI_" + std::to_string(counter++); + meta_def->domain = kMSDomain; + + for (const auto& input : inputs) { + meta_def->inputs.push_back(input.second->Name()); + } + + for (const auto& output : outputs) { + meta_def->outputs.push_back(output.second->Name()); + } + + meta_def->since_version = 1; + sub_graph->SetMetaDef(meta_def); + + result.push_back(std::make_unique(std::move(sub_graph))); + } + } + + return 
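The partitioning above mirrors the TensorRT EP's approach: for each group of supported node indices it treats a value consumed inside the group but produced outside it as a fused input, and a value produced inside but consumed outside (or listed as a graph output) as a fused output, with `erased` tracking values that turn out to be internal. Reduced to a toy edge list, the boundary rule looks like the sketch below; the types and names here are illustrative only, not the provider's.

```
#include <set>
#include <string>
#include <vector>

struct Edge {
  std::string value;  // NodeArg name
  int producer;       // node index, -1 if produced outside the group
  int consumer;       // node index, -1 if consumed outside the group (or a graph output)
};

// Values crossing the group boundary become the fused subgraph's inputs/outputs;
// values with both ends inside the group stay internal to it.
static void PartitionIO(const std::vector<Edge>& edges, const std::set<int>& group,
                        std::vector<std::string>& fused_inputs,
                        std::vector<std::string>& fused_outputs) {
  for (const auto& e : edges) {
    const bool produced_inside = group.count(e.producer) != 0;
    const bool consumed_inside = group.count(e.consumer) != 0;
    if (!produced_inside && consumed_inside) fused_inputs.push_back(e.value);
    if (produced_inside && !consumed_inside) fused_outputs.push_back(e.value);
  }
}
```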
result; +} + +common::Status NnapiExecutionProvider::Compile(const std::vector& fused_nodes, + std::vector& node_compute_funcs) { + for (const auto* fused_node : fused_nodes) { + // Reconstruct graph proto from fused node's function body + const auto* func_body = fused_node->GetFunctionBody(); + if (!func_body) { + return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Function body is empty"); + } + const Graph& graph_body = func_body->Body(); + onnxruntime::Model model(graph_body.Name(), true, ModelMetaData(), + IOnnxRuntimeOpSchemaRegistryList(), graph_body.DomainToVersionMap()); + ONNX_NAMESPACE::ModelProto model_proto = model.ToProto(); + *(model_proto.mutable_graph()) = graph_body.ToGraphProto(); + model_proto.set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + + dnn::OnnxReader onnx_reader; + dnn::ModelBuilder model_builder; + onnx_reader.ReadOnnx(model_proto, model_builder); + model_builder.AllowFp16(true); + auto dnn_model = model_builder.Compile(model_builder.PREFERENCE_SUSTAINED_SPEED); + dnn_models_.emplace(fused_node->Name(), std::move(dnn_model)); + + NodeComputeInfo compute_info; + compute_info.create_state_func = [&](ComputeContext* context, FunctionState* state) { + *state = dnn_models_[context->node_name].get(); + return 0; + }; + + compute_info.release_state_func = [](FunctionState state) { + // the `state` is a dnn::model managed by unique_ptr + ORT_UNUSED_PARAMETER(state); + }; + + compute_info.compute_func = [](FunctionState state, const OrtCustomOpApi* api, OrtKernelContext* context) { + Ort::CustomOpApi ort{*api}; + dnn::Model* model = reinterpret_cast(state); + const size_t num_inputs = ort.KernelContext_GetInputCount(context); + const size_t num_outputs = ort.KernelContext_GetOutputCount(context); + ORT_ENFORCE(model->GetInputs().size() <= num_inputs, "Inconsistent input sizes"); + ORT_ENFORCE(model->GetOutputs().size() == num_outputs, "Inconsistent output sizes"); + // Maintain the created nhwc buffers so that they can be deleted after inferencing + std::vector nhwc_inputs; + std::vector>> nhwc_outputs; + for (size_t i = 0; i < num_outputs; i++) { + const auto output_name = model->GetOutputs()[i]; + const auto output_shape = model->GetShape(output_name); + std::vector int64_output_shape(output_shape.begin(), output_shape.end()); + if (int64_output_shape.size() == 4) { + // NHWC to NCHW + std::swap(int64_output_shape[1], int64_output_shape[3]); + std::swap(int64_output_shape[2], int64_output_shape[3]); + float* nhwc_output = new float[model->GetSize(output_name)]; + model->SetOutputBuffer(i, nhwc_output); + nhwc_outputs.push_back(std::make_tuple(i, nhwc_output, int64_output_shape)); + } else { + auto* output_tensor = ort.KernelContext_GetOutput(context, i, int64_output_shape.data(), int64_output_shape.size()); + model->SetOutputBuffer(i, ort.GetTensorMutableData(output_tensor)); + } + } + std::vector inputs; + for (size_t i = 0; i < model->GetInputs().size(); i++) { + const OrtValue* input_tensor = ort.KernelContext_GetInput(context, i); + float* input = const_cast(ort.GetTensorData(input_tensor)); + + const auto tensor_info = ort.GetTensorTypeAndShape(input_tensor); + const auto& tensor_shape = ort.GetTensorShape(tensor_info); + + if (tensor_shape.size() == 4) { + // Transpose nchw -> nhwc manually + const int N = tensor_shape[0], C = tensor_shape[1], H = tensor_shape[2], W = tensor_shape[3]; + float* nhwc_input = new float[N * C * H * W]; + for (int n = 0; n < N; n++) { + for (int c = 0; c < C; c++) { + for (int h = 0; h < H; h++) { + for (int 
w = 0; w < W; w++) { + nhwc_input[n * H * W * C + h * W * C + w * C + c] = input[n * C * H * W + c * H * W + h * W + w]; + } + } + } + } + inputs.push_back(nhwc_input); + nhwc_inputs.push_back(nhwc_input); + } else { + inputs.push_back(input); + } + ort.ReleaseTensorTypeAndShapeInfo(tensor_info); + } + model->Predict(inputs); + // Transpose nhwc -> nchw manually + for (size_t i = 0; i < nhwc_outputs.size(); i++) { + const auto output = nhwc_outputs[i]; + size_t index; + float* nhwc_data; + std::vector nchw_shape; + std::tie(index, nhwc_data, nchw_shape) = output; + auto* output_tensor = ort.KernelContext_GetOutput(context, index, nchw_shape.data(), nchw_shape.size()); + const int N = nchw_shape[0], C = nchw_shape[1], H = nchw_shape[2], W = nchw_shape[3]; + float* nchw_output = ort.GetTensorMutableData(output_tensor); + for (int n = 0; n < N; n++) { + for (int c = 0; c < C; c++) { + for (int h = 0; h < H; h++) { + for (int w = 0; w < W; w++) { + nchw_output[n * C * H * W + c * H * W + h * W + w] = nhwc_data[n * H * W * C + h * W * C + w * C + c]; + } + } + } + } + } + for (auto nhwc_input : nhwc_inputs) { + delete[] nhwc_input; + } + for (auto nhwc_output : nhwc_outputs) { + delete[] std::get<1>(nhwc_output); + } + return Status::OK(); + }; + + node_compute_funcs.push_back(compute_info); + } + return Status::OK(); +} +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/nnapi/nnapi_execution_provider.h b/onnxruntime/core/providers/nnapi/nnapi_execution_provider.h new file mode 100644 index 0000000000000..a03eb5c7f4c73 --- /dev/null +++ b/onnxruntime/core/providers/nnapi/nnapi_execution_provider.h @@ -0,0 +1,25 @@ +// Copyright 2019 JD.com Inc. JD AI + +#pragma once + +#include "core/framework/execution_provider.h" +#include "core/graph/onnx_protobuf.h" +#include "dnnlibrary/Model.h" + +namespace onnxruntime { +class NnapiExecutionProvider : public IExecutionProvider { + public: + NnapiExecutionProvider(); + virtual ~NnapiExecutionProvider(); + + std::vector> + GetCapability(const onnxruntime::GraphViewer& graph, + const std::vector& /*kernel_registries*/) const override; + common::Status Compile(const std::vector& fused_nodes, + std::vector& node_compute_funcs) override; + + private: + std::unordered_map> dnn_models_; + std::vector> GetSupportedNodes(const ONNX_NAMESPACE::ModelProto& model_proto) const; +}; +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/nnapi/nnapi_provider_factory.cc b/onnxruntime/core/providers/nnapi/nnapi_provider_factory.cc new file mode 100644 index 0000000000000..b72ddc6dab52c --- /dev/null +++ b/onnxruntime/core/providers/nnapi/nnapi_provider_factory.cc @@ -0,0 +1,32 @@ +// Copyright 2019 JD.com Inc. 
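The NNAPI `Compile` callback above transposes 4-D inputs from ONNX's NCHW layout into the NHWC layout DNNLibrary expects, then transposes the outputs back. The index arithmetic is easier to read in isolation; the sketch below restates the mapping used in those loops and is not a new helper in the provider.

```
#include <vector>

// NCHW -> NHWC: element (n, c, h, w) moves from
//   src[((n*C + c)*H + h)*W + w]   to   dst[((n*H + h)*W + w)*C + c].
static std::vector<float> NchwToNhwc(const std::vector<float>& src, int N, int C, int H, int W) {
  std::vector<float> dst(src.size());
  for (int n = 0; n < N; ++n)
    for (int c = 0; c < C; ++c)
      for (int h = 0; h < H; ++h)
        for (int w = 0; w < W; ++w)
          dst[n * H * W * C + h * W * C + w * C + c] = src[n * C * H * W + c * H * W + h * W + w];
  return dst;
}
```

The inverse direction used for the outputs simply swaps the two index expressions, which is exactly what the second set of loops in `compute_func` does.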
JD AI + +#include "core/providers/nnapi/nnapi_provider_factory.h" +#include "nnapi_execution_provider.h" +#include "core/session/abi_session_options_impl.h" + +using namespace onnxruntime; + +namespace onnxruntime { + +struct NnapiProviderFactory : IExecutionProviderFactory { + NnapiProviderFactory() {} + ~NnapiProviderFactory() override {} + + std::unique_ptr CreateProvider() override; +}; + +std::unique_ptr NnapiProviderFactory::CreateProvider() { + return std::make_unique(); +} + +std::shared_ptr CreateExecutionProviderFactory_Nnapi() { + return std::make_shared(); +} +} // namespace onnxruntime + +ORT_API_STATUS_IMPL(OrtSessionOptionsAppendExecutionProvider_Nnapi, _In_ OrtSessionOptions* options) { + options->provider_factories.push_back(onnxruntime::CreateExecutionProviderFactory_Nnapi()); + return nullptr; +} + + diff --git a/onnxruntime/core/providers/nnapi/symbols.txt b/onnxruntime/core/providers/nnapi/symbols.txt new file mode 100644 index 0000000000000..54df5ca1457b9 --- /dev/null +++ b/onnxruntime/core/providers/nnapi/symbols.txt @@ -0,0 +1 @@ +OrtSessionOptionsAppendExecutionProvider_Nnapi diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index a4b37d5ea8b6e..f2a15562bb0b0 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -14,12 +14,13 @@ #include "core/framework/compute_capability.h" #include "core/framework/tensorprotoutils.h" #include "core/session/onnxruntime_cxx_api.h" +#include "core/util/protobuf_parsing_utils.h" +#include "core/common/logging/logging.h" #include "openvino_execution_provider.h" #include "core/graph/model.h" #include "openvino_graph.h" -#include "core/util/protobuf_parsing_utils.h" namespace onnxruntime { constexpr const char* OpenVINO = "OpenVINO"; @@ -28,7 +29,7 @@ OpenVINOExecutionProvider::OpenVINOExecutionProvider(OpenVINOExecutionProviderIn : IExecutionProvider{onnxruntime::kOpenVINOExecutionProvider} { ORT_UNUSED_PARAMETER(info); - DeviceAllocatorRegistrationInfo device_info({OrtMemTypeDefault, [](int) { return std::make_unique(std::make_unique(OPENVINO, OrtDeviceAllocator, 0, OrtMemTypeDefault)); }, std::numeric_limits::max()}); + DeviceAllocatorRegistrationInfo device_info({OrtMemTypeDefault, [](int) { return std::make_unique(std::make_unique(OPENVINO, OrtDeviceAllocator)); }, std::numeric_limits::max()}); InsertAllocator(CreateAllocator(device_info)); } @@ -63,146 +64,131 @@ static ONNX_NAMESPACE::ModelProto GetModelProtoFromFusedNode(const onnxruntime:: } //Gets the input count of given node -int GetInputCount(const Node* node, const InitializedTensorSet& initializer_set){ - - int count = 0; - for(const auto& input : node->InputDefs()){ - auto name = input->Name(); - auto it = initializer_set.find(name); - if(it == initializer_set.end()){ - count++; - } +int GetInputCount(const Node* node, const InitializedTensorSet& initializer_set) { + int count = 0; + for (const auto& input : node->InputDefs()) { + auto name = input->Name(); + auto it = initializer_set.find(name); + if (it == initializer_set.end()) { + count++; } - return count; + } + return count; } //Checks whether the dimensions of a given node are supported in OpenVINO -bool IsDimensionSupported(const Node* node, std::string dev_id){ - - auto node_inputs = node->InputDefs(); - size_t input_dims = 0; - if(node_inputs[0]->Shape() != nullptr){ - input_dims = 
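With `nnapi_provider_factory.cc` and the exported `OrtSessionOptionsAppendExecutionProvider_Nnapi` symbol above, an application opts into the new provider through session options. A hedged usage sketch follows, assuming the companion header `core/providers/nnapi/nnapi_provider_factory.h` (included by the .cc above but not shown in this diff) declares the function and pulls in the C API types.

```
#include "core/providers/nnapi/nnapi_provider_factory.h"  // assumed to declare the symbol below

// Appends the NNAPI EP to existing session options. The factory above returns
// nullptr on success and an OrtStatus* describing the failure otherwise; nodes
// NNAPI cannot take are left for the other registered providers.
static bool AppendNnapiProvider(OrtSessionOptions* session_options) {
  OrtStatus* status = OrtSessionOptionsAppendExecutionProvider_Nnapi(session_options);
  return status == nullptr;
}
```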
node_inputs[0]->Shape()->dim_size(); - } - - if(node->OpType().find("Pool") != std::string::npos){ +bool IsDimensionSupported(const Node* node, std::string dev_id) { + auto node_inputs = node->InputDefs(); + size_t input_dims = 0; + if (node_inputs[0]->Shape() != nullptr) { + input_dims = node_inputs[0]->Shape()->dim_size(); + } - if(dev_id == "MYRIAD" || dev_id == "HDDL"){ - if(input_dims != 3 || input_dims != 4) - return false; - } else if(input_dims < 4 || input_dims > 5){ - return false; - } + if (node->OpType().find("Pool") != std::string::npos) { + if (dev_id == "MYRIAD" || dev_id == "HDDL") { + if (input_dims != 3 && input_dims != 4) + return false; + } else if (input_dims != 4 && input_dims != 5) { + return false; } + } - //Only support 4D and 5D Transposes - if(node->OpType() == "Transpose"){ + //Only support 4D and 5D Transposes + if (node->OpType() == "Transpose") { + if (input_dims == 2 || input_dims == 3 || input_dims > 5) + return false; + } - if(input_dims == 2 || input_dims == 3 || input_dims > 5) - return false; + if (node->OpType() == "Unsqueeze") { + auto attributes = node->GetAttributes(); + auto axes = attributes["axes"].ints(); + if (input_dims + axes.size() > 5) + return false; + if (dev_id == "MYRIAD" || dev_id == "HDDL") { + if (node_inputs[0]->Shape() != nullptr && node_inputs[0]->Shape()->dim(0).dim_value() != 1) + return false; } + } - if(node->OpType() == "Unsqueeze"){ + if (node->OpType() == "Reshape") { + //Don't support Reshape without output dims + auto node_outputs = node->OutputDefs(); + if (node_outputs[0]->Shape() != nullptr && node_outputs[0]->Shape()->dim_size() == 0) + return false; - auto attributes = node->GetAttributes(); - auto axes = attributes["axes"].ints(); - if(input_dims + axes.size() > 5) - return false; - if(dev_id == "MYRIAD" || dev_id == "HDDL"){ - if(node_inputs[0]->Shape() != nullptr && node_inputs[0]->Shape()->dim(0).dim_value() != 1) - return false; - } + if (dev_id == "MYRIAD" || dev_id == "HDDL") { + if (node_inputs[0]->Shape() != nullptr && node_inputs[0]->Shape()->dim(0).dim_value() != 1) + return false; } + } - if(node->OpType() == "Reshape"){ - - //Don't support Reshape without output dims - auto node_outputs = node->OutputDefs(); - if(node_outputs[0]->Shape() != nullptr && node_outputs[0]->Shape()->dim_size() == 0) - return false; - - if(dev_id == "MYRIAD" || dev_id == "HDDL"){ - - if(node_inputs[0]->Shape() != nullptr && node_inputs[0]->Shape()->dim(0).dim_value() != 1) - return false; - - } + if (node->OpType() == "Softmax") { + //First dimension of Softmax input has to be 1 + if (input_dims != 0) { + if (node_inputs[0]->Shape()->dim(0).dim_value() != 1) + return false; } - if(node->OpType() == "Softmax"){ - - //First dimension of Softmax input has to be 1 - if(input_dims != 0 ){ - if(node_inputs[0]->Shape()->dim(0).dim_value() != 1) - return false; - } - - //Only 2D input supported on MYRIAD and HDDL - if(dev_id == "MYRIAD" || dev_id == "HDDL"){ - if(input_dims != 2) - return false; - } + //3D input not supported on GPU, MYRIAD and HDDL + if (dev_id == "GPU" || dev_id == "MYRIAD" || dev_id == "HDDL") { + if (input_dims == 3) + return false; } - - //Only 2D MatMul is supported - if(node->OpType() == "MatMul"){ - for(size_t i = 0; i < node_inputs.size(); i++){ - - if(node_inputs[i]->Shape() != nullptr){ - if(node_inputs[i]->Shape()->dim_size() != 2) - return false; - } - } + } + //Only 2D MatMul is supported + if (node->OpType() == "MatMul") { + for (size_t i = 0; i < node_inputs.size(); i++) { + if (node_inputs[i]->Shape() 
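One of the quieter fixes in the reformatted `IsDimensionSupported` above is the MYRIAD/HDDL pooling check: `input_dims != 3 || input_dims != 4` is true for every value, so the old predicate rejected all Pool inputs on those devices. The corrected `&&` form accepts exactly 3-D and 4-D inputs (and, on the other devices, exactly 4-D and 5-D). Shown standalone:

```
// Old:  (dims != 3 || dims != 4)  -> always true, so nothing ever passed.
// New:  (dims != 3 && dims != 4)  -> rejects only dims outside {3, 4}.
static bool PoolDimsOkOnMyriadOrHddl(size_t dims) {
  return dims == 3 || dims == 4;  // equivalent to !(dims != 3 && dims != 4)
}
```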
!= nullptr) { + if (node_inputs[i]->Shape()->dim_size() != 2) + return false; + } } + } - if(node->OpType() == "Flatten"){ - - if(dev_id == "MYRIAD" || dev_id == "HDDL"){ - if(node_inputs[0]->Shape() != nullptr && node_inputs[0]->Shape()->dim(0).dim_value() != 1) - return false; - } + if (node->OpType() == "Flatten") { + if (dev_id == "MYRIAD" || dev_id == "HDDL") { + if (node_inputs[0]->Shape() != nullptr && node_inputs[0]->Shape()->dim(0).dim_value() != 1) + return false; } + } - return true; + return true; } //Checks whether the node is supported by OpenVINO -bool IsOpSupported(std::string name){ - - std::set supported_ops = { - "Add", - "BatchNormalization", - "Conv", - "GlobalAveragePool", - "Relu", - "Reshape", - "Flatten", - "Gemm", - "MaxPool", - "AveragePool", - "Concat", - "Dropout", - "LRN", - "Softmax", - "Mul", - "Sum", - "Transpose", - "Identity", - "MatMul", - "Unsqueeze", - "ImageScaler", - "LeakyRelu", - "GlobalMaxPool"}; - - auto iter = supported_ops.find(name); - return iter != supported_ops.end(); +bool IsOpSupported(std::string name) { + std::set supported_ops = { + "Add", + "BatchNormalization", + "Conv", + "GlobalAveragePool", + "Relu", + "Reshape", + "Flatten", + "Gemm", + "MaxPool", + "AveragePool", + "Concat", + "Dropout", + "LRN", + "Softmax", + "Mul", + "Sum", + "Transpose", + "Identity", + "MatMul", + "Unsqueeze", + "ImageScaler", + "LeakyRelu", + "GlobalMaxPool"}; + + auto iter = supported_ops.find(name); + return iter != supported_ops.end(); } +//Checks if the entire graph is supported by OpenVINO EP and throws eception if any. -//Checks if the entire graph is supported by OpenVINO EP and returns false if it is not. - -bool IsGraphSupported(const onnxruntime::GraphViewer& graph_viewer, std::string dev_id){ - +void CheckGraphSupported(const onnxruntime::GraphViewer& graph_viewer, std::string dev_id) { const auto& initializers = graph_viewer.GetAllInitializedTensors(); auto node_indexes = graph_viewer.GetNodesInTopologicalOrder(); @@ -215,16 +201,26 @@ bool IsGraphSupported(const onnxruntime::GraphViewer& graph_viewer, std::string int num_inputs = graph_viewer.GetInputs().size(); int num_outputs = graph_viewer.GetOutputs().size(); - if (num_inputs != 0) - input_dims = graph_proto->input(0).type().tensor_type().shape().dim_size(); + //GPU Plugin does not support 1D and 5D input + if (dev_id == "GPU") { + for (int i = 0; i < num_inputs; i++) { + input_dims = graph_proto->input(i).type().tensor_type().shape().dim_size(); - if (num_outputs != 0) - output_dims = graph_proto->output(0).type().tensor_type().shape().dim_size(); + if (input_dims == 1 || input_dims == 5) { + throw "GPU plugin doesn't support 1D and 5D input"; + } + } + } - //GPU Plugin does not support single dimensional input and 5 dimensional input + //GPU Plugin does not support 5D output if (dev_id == "GPU") { - if (input_dims == 1 || input_dims == 5 || output_dims == 5) - return false; + for (int i = 0; i < num_outputs; i++) { + output_dims = graph_proto->output(i).type().tensor_type().shape().dim_size(); + + if (output_dims == 5) { + throw "GPU plugin doesn't support 5D output"; + } + } } for (auto index : node_indexes) { @@ -232,104 +228,108 @@ bool IsGraphSupported(const onnxruntime::GraphViewer& graph_viewer, std::string //Check if the Operation is Supported by OpenVINO if (!IsOpSupported(node->OpType())) { - return false; + { + throw "Operation is not supported by OpenVINO"; + } } auto node_inputs = node->InputDefs(); //Zero dimension check - for(size_t i = 0; i < node_inputs.size(); i++){ - 
if(node_inputs[i]->Shape() != nullptr){ - - if(node_inputs[i]->Shape()->dim_size() == 0) - return false; + for (size_t i = 0; i < node_inputs.size(); i++) { + if (node_inputs[i]->Shape() != nullptr) { + if (node_inputs[i]->Shape()->dim_size() == 0) { + throw "Node_input is zero dimension"; } + } } - - //BatchNormalization cannot take more than 1 input - if(node->OpType() == "BatchNormalization"){ - - if(GetInputCount(node,initializers) > 1) - return false; + if (node->OpType() == "BatchNormalization") { + if (GetInputCount(node, initializers) > 1) { + throw "BatchNormalization: Cannot take more than 1 input"; + } } - //Conv cannot take more than 1 input - if(node->OpType() == "Conv"){ - - if(GetInputCount(node,initializers) > 1) - return false; + if (node->OpType() == "Conv") { + if (GetInputCount(node, initializers) > 1) { + throw "Conv: Cannot take more than 1 input"; + } } - //Reshape should have shape as initializer - if(node->OpType() == "Reshape"){ - - int input_count = GetInputCount(node,initializers); + if (node->OpType() == "Reshape") { + int input_count = GetInputCount(node, initializers); - if(input_count > 1) - return false; - - //Myriad and HDDL plugins do not support Reshape with two initializers - if(dev_id == "MYRIAD" || dev_id == "HDDL") - if(input_count == 0) - return false; + if (input_count > 1) { + throw "Reshape: Shape should be an initializer"; + } - if(!IsDimensionSupported(node,dev_id)){ - return false; + //Myriad and HDDL plugins do not support Reshape with two initializers + if (dev_id == "MYRIAD" || dev_id == "HDDL") + if (input_count == 0) { + throw "Myriad and HDDL plugins do not support Reshape with two initializers "; } - } - if(node->OpType() == "Flatten"){ + if (!IsDimensionSupported(node, dev_id)) { + throw "Reshape: Dimensions are not supported"; + } + } - if(!IsDimensionSupported(node,dev_id)) - return false; + if (node->OpType() == "Flatten") { + if (!IsDimensionSupported(node, dev_id)) { + throw "Flatten: Dimensions are not supported"; + } - //Only default axis is supported for MYRIAD and HDDL plugins - auto attributes = node->GetAttributes(); - auto axis = attributes["axis"].i(); - if (dev_id == "MYRIAD" || dev_id == "HDDL") { - if (axis != 1) - return false; + //Only default axis is supported for MYRIAD and HDDL plugins + auto attributes = node->GetAttributes(); + auto axis = attributes["axis"].i(); + if (dev_id == "MYRIAD" || dev_id == "HDDL") { + if (axis != 1) { + throw "Only default axis is supported for MYRIAD and HDDL plugins"; } + } } //MatMul is only supported if it is followed by Add if (node->OpType() == "MatMul") { for (size_t i = 0; i < node->InputDefs().size(); i++) { if (node->InputDefs()[i]->TypeAsProto()->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT) { - return false; + throw "Input data type should be float"; } } auto iter = node->OutputNodesBegin(); if (iter == node->OutputNodesEnd()) { - return false; + throw "MatMul should be followed by Add"; } for (auto it = node->OutputNodesBegin(); it != node->OutputNodesEnd(); ++it) { const auto out_node = graph_viewer.GetNode((*it).Index()); if (out_node->OpType() != "Add") { - return false; + { + throw "Matmul should be followed by Add"; + } } } - if(!IsDimensionSupported(node,dev_id)) - return false; - + if (!IsDimensionSupported(node, dev_id)) { + throw "Dimension is not supported"; + } } //Dropout , Identity and Concat can't have graph inputs if (node->OpType() == "Dropout" || node->OpType() == "Identity" || node->OpType() == "Concat") { 
auto graph_inputs = graph_viewer.GetInputs(); - for (auto input : node->InputDefs()) { + for (const auto& input : node->InputDefs()) { auto it = find(graph_inputs.begin(), graph_inputs.end(), input); if (it != graph_inputs.end()) { - return false; + { + throw "Dropout, Identity and Concat can't have graph inputs"; + } } } } @@ -337,42 +337,48 @@ bool IsGraphSupported(const onnxruntime::GraphViewer& graph_viewer, std::string //Attribute auto pad for MaxPool and Average Pool must not be empty or SAME_LOWER //Only support 4D and 5D blobs for CPU,GPU //Only support 3D and 4D blobs for MYRIAD and HDDL - if (node->OpType() == "MaxPool" || node->OpType() == "AveragePool"){ + if (node->OpType() == "MaxPool" || node->OpType() == "AveragePool") { auto attributes = node->GetAttributes(); auto auto_pad = attributes["auto_pad"].s(); - if (auto_pad == "" || auto_pad == "SAME_LOWER") - return false; + if (auto_pad == "" || auto_pad == "SAME_LOWER") { + throw "Attribute auto pad shouldn't be empty or SAME_LOWER for MaxPool and Average Pool"; + } auto strides_ints = attributes["strides"].ints(); - if(auto_pad == "SAME_UPPER" && strides_ints.size() == 0) - return false; + if (auto_pad == "SAME_UPPER" && strides_ints.size() == 0) { + throw "Pooling: Generic Error"; + } //Dilations have to be 1 auto dilations_ints = attributes["dilations"].ints(); if (dilations_ints.size() != 0) { - if (dilations_ints[0] > 1) - return false; + if (dilations_ints[0] > 1) { + throw "Pooling: Generic error"; + } } //Don't support ceil_mode = 1 auto ceil_mode = attributes["ceil_mode"].i(); - if (ceil_mode != 0) - return false; + if (ceil_mode != 0) { + throw "Pooling: Ceil mode should be 1"; + } //Don't support multiple outputs for Pooling - if (node->OutputDefs().size() > 1) - return false; + if (node->OutputDefs().size() > 1) { + throw "Pooling: Multiple outputs not supported"; + } - if(!IsDimensionSupported(node,dev_id)) - return false; + if (!IsDimensionSupported(node, dev_id)) { + throw "Pooling: Dimensions not supported"; + } } //Only support 4D and 5D blobs for CPU,GPU //Only support 3D and 4D blobs for MYRIAD and HDDL - if(node->OpType() == "GlobalMaxPool" || node->OpType() == "GlobalAveragePool"){ - - if(!IsDimensionSupported(node,dev_id)) - return false; + if (node->OpType() == "GlobalMaxPool" || node->OpType() == "GlobalAveragePool") { + if (!IsDimensionSupported(node, dev_id)) { + throw "Pooling: Only support 4D and 5D blobs for CPU,GPU, Only support 3D and 4D blobs for MYRIAD and HDDL"; + } } //Transpose with no attr is not supported @@ -380,45 +386,50 @@ bool IsGraphSupported(const onnxruntime::GraphViewer& graph_viewer, std::string auto attributes = node->GetAttributes(); auto perm = attributes["perm"].ints(); if (perm.size() == 0 || perm.size() > 5) { - return false; + throw " Transpose: Tranpose with no attr is not supported. 
Perm size shouldn't be zero or greater than five"; } //String data type is not supported const auto* type_proto = node->InputDefs()[0]->TypeAsProto(); if (type_proto->tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_STRING) { - return false; + throw "Transpose: String data type is not supported "; } - if(!IsDimensionSupported(node,dev_id)) - return false; + if (!IsDimensionSupported(node, dev_id)) { + throw "Transpose: Dimensions are not supported "; + } } - if (node->OpType() == "Unsqueeze") { - - if(!IsDimensionSupported(node,dev_id)) - return false; + if (!IsDimensionSupported(node, dev_id)) { + throw "Unsqueeze: Dimensions are not supported "; + } const auto* type_proto = node->InputDefs()[0]->TypeAsProto(); - if (type_proto->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT) - return false; + if (type_proto->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT) { + throw "Unsqueeze: Datatype should be float"; + } } //Only support 2D input and axis 1 if (node->OpType() == "Softmax") { - - if(!IsDimensionSupported(node,dev_id)) - return false; + if (!IsDimensionSupported(node, dev_id)) { + throw "Softmax: Dimensions are not supported "; + } auto attributes = node->GetAttributes(); auto axis = attributes["axis"].i(); - if (axis != 1) - return false; + if (axis != 1) { + throw "Softmax: Only default axis is supported"; + } } + //Don't support only one input + if (node->OpType() == "Sum") { + if (node->InputDefs().size() == 1) { + throw "Sum: Doesn't support only one input "; + } + } } - - return true; - } std::vector> OpenVINOExecutionProvider::GetCapability( @@ -442,86 +453,93 @@ std::vector> OpenVINOExecutionProvider::GetCa device_id = "MYRIAD"; #endif -#ifdef OPENVINO_CONFIG_VAD_R +#ifdef OPENVINO_CONFIG_VAD_M precision_fp32 = false; device_id = "HDDL"; #endif int counter = 0; + std::unique_ptr sub_graph = std::make_unique(); auto model_proto = GetModelProtoFromFusedNode(graph_viewer); - std::set fused_inputs, fused_outputs; + std::set fused_inputs, fused_outputs; - if (IsGraphSupported(graph_viewer,device_id)) { - std::string model_proto_strbuf; - model_proto.SerializeToString(&model_proto_strbuf); + try { + CheckGraphSupported(graph_viewer, device_id); + } catch (const char* error_msg) { + LOGS_DEFAULT(WARNING) << openvino_ep::OpenVINOGraph::log_tag << "Rejecting as graph has unsupported operations." << error_msg; + return result; + } - std::string xml_string, weights_string; + std::string model_proto_strbuf; + model_proto.SerializeToString(&model_proto_strbuf); - // Try converting with OpenVINO's Model Optimizer - try { - openvino_ep::OpenVINOGraph::ConvertONNXModelToOpenVINOIR(model_proto_strbuf, xml_string, weights_string, precision_fp32); - } catch (const char* msg) { - // Model Optimizer cannot convert this model. - return result; - } + std::string xml_string, weights_string; - auto node_indexes = graph_viewer.GetNodesInTopologicalOrder(); + // Try converting with OpenVINO's Model Optimizer + try { + openvino_ep::OpenVINOGraph::ConvertONNXModelToOpenVINOIR(model_proto_strbuf, xml_string, weights_string, precision_fp32); + } catch (const char* msg) { + // Model Optimizer cannot convert this model. + LOGS_DEFAULT(WARNING) << openvino_ep::OpenVINOGraph::log_tag << "Rejecting as Model Optimizer cannot convert this model." 
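`IsGraphSupported` is reshaped above into `CheckGraphSupported`: instead of threading `return false` through every check, each rejection throws a C-string reason, and `GetCapability` catches `const char*`, logs it under the new `log_tag`, and returns an empty capability list so the nodes fall back to other providers. The shape of that control flow, reduced to a self-contained sketch with illustrative names (`CheckSupported` and `GetCapabilitySketch` are not the EP's symbols):

```
#include <iostream>
#include <vector>

static void CheckSupported(bool ok, const char* reason) {
  if (!ok) throw reason;  // same `throw "..."` style as the checks above
}

std::vector<int> GetCapabilitySketch(bool graph_ok) {
  std::vector<int> result;
  try {
    CheckSupported(graph_ok, "Rejecting as graph has unsupported operations.");
  } catch (const char* error_msg) {
    std::cerr << "[OpenVINO-EP] " << error_msg << '\n';  // stands in for LOGS_DEFAULT(WARNING)
    return result;  // empty list => the nodes stay with other providers
  }
  result.push_back(0);  // otherwise: claim the graph as a single fused subgraph
  return result;
}
```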
<< msg; + return result; + } - for (auto index : node_indexes) { - sub_graph->nodes.push_back(index); - const auto node = graph_viewer.GetNode(index); + auto node_indexes = graph_viewer.GetNodesInTopologicalOrder(); - // Track graph inputs and initializers - for (auto input_def : node->InputDefs()) { - if (fused_outputs.find(input_def) == fused_outputs.end()) { - fused_inputs.insert(input_def); - } else { - fused_outputs.erase(input_def); - } - } + for (auto index : node_indexes) { + sub_graph->nodes.push_back(index); + const auto node = graph_viewer.GetNode(index); - // Track graph outputs - for (auto output_def : node->OutputDefs()) { - if (fused_inputs.find(output_def) == fused_inputs.end()) { - fused_outputs.insert(output_def); - } else { - fused_inputs.erase(output_def); - } + // Track graph inputs and initializers + for (const auto& input_def : node->InputDefs()) { + if (fused_outputs.find(input_def) == fused_outputs.end()) { + fused_inputs.insert(input_def); + } else { + fused_outputs.erase(input_def); } } - ONNX_NAMESPACE::AttributeProto xml_str_attr; - xml_str_attr.set_name("xml_str"); - xml_str_attr.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_STRING); - xml_str_attr.set_s(xml_string); - - ONNX_NAMESPACE::AttributeProto weights_str_attr; - weights_str_attr.set_name("weights_str"); - weights_str_attr.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_STRING); - weights_str_attr.set_s(weights_string); - - auto meta_def = std::make_unique<::onnxruntime::IndexedSubGraph::MetaDef>(); - meta_def->attributes["xml_str"] = xml_str_attr; - meta_def->attributes["weights_str"] = weights_str_attr; - meta_def->name = "OpenVINOKernel_" + std::to_string(counter++); - meta_def->domain = "OpenVINO"; - meta_def->since_version = 1; - - for (auto input : fused_inputs) { - meta_def->inputs.push_back(input->Name()); + // Track graph outputs + for (const auto& output_def : node->OutputDefs()) { + if (fused_inputs.find(output_def) == fused_inputs.end()) { + fused_outputs.insert(output_def); + } else { + fused_inputs.erase(output_def); + } } + } - for (auto output : fused_outputs) { - meta_def->outputs.push_back(output->Name()); - } + ONNX_NAMESPACE::AttributeProto xml_str_attr; + xml_str_attr.set_name("xml_str"); + xml_str_attr.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_STRING); + xml_str_attr.set_s(xml_string); + + ONNX_NAMESPACE::AttributeProto weights_str_attr; + weights_str_attr.set_name("weights_str"); + weights_str_attr.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_STRING); + weights_str_attr.set_s(weights_string); + + auto meta_def = std::make_unique<::onnxruntime::IndexedSubGraph::MetaDef>(); + meta_def->attributes["xml_str"] = xml_str_attr; + meta_def->attributes["weights_str"] = weights_str_attr; + meta_def->name = "OpenVINOKernel_" + std::to_string(counter++); + meta_def->domain = "OpenVINO"; + meta_def->since_version = 1; + + for (auto input : fused_inputs) { + meta_def->inputs.push_back(input->Name()); + } - sub_graph->SetMetaDef(meta_def); - result.push_back(std::make_unique(std::move(sub_graph))); + for (auto output : fused_outputs) { + meta_def->outputs.push_back(output->Name()); } + sub_graph->SetMetaDef(meta_def); + result.push_back(std::make_unique(std::move(sub_graph))); + return result; } @@ -534,6 +552,7 @@ common::Status OpenVINOExecutionProvider::Compile( openvino_graph = std::make_shared(fused_node); } catch (const char* msg) { + 
LOGS_DEFAULT(ERROR) << openvino_ep::OpenVINOGraph::log_tag << "Compilation error: " << msg; return Status(common::StatusCategory::ONNXRUNTIME, common::StatusCode::NOT_IMPLEMENTED, msg); } diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index 6b6c27df72593..5ec06a1feed7d 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -47,16 +47,6 @@ class OpenVINOExecutionProvider : public IExecutionProvider { return std::make_shared(); } - common::Status CopyTensor(const Tensor& src, Tensor& dst) const override { - // TODO: Copy for now. May optimize later to avoid copy. - size_t bytes = src.DataType()->Size() * src.Shape().Size(); - const void* src_data = src.DataRaw(); - void* dst_data = dst.MutableDataRaw(); - memcpy(dst_data, src_data, bytes); - - return Status::OK(); - } - const void* GetExecutionHandle() const noexcept override { return nullptr; } diff --git a/onnxruntime/core/providers/openvino/openvino_graph.cc b/onnxruntime/core/providers/openvino/openvino_graph.cc index f8bdea49bbcd0..b64dce7951d0d 100644 --- a/onnxruntime/core/providers/openvino/openvino_graph.cc +++ b/onnxruntime/core/providers/openvino/openvino_graph.cc @@ -16,14 +16,16 @@ #include "core/graph/graph.h" #include "core/framework/tensorprotoutils.h" #include "core/session/onnxruntime_cxx_api.h" +#include "core/common/logging/logging.h" #include "openvino_graph.h" -namespace onnxruntime{ +namespace onnxruntime { namespace openvino_ep { -OpenVINOGraph::OpenVINOGraph(const onnxruntime::Node* fused_node) { +const std::string OpenVINOGraph::log_tag = "[OpenVINO-EP] "; +OpenVINOGraph::OpenVINOGraph(const onnxruntime::Node* fused_node) { device_id_ = "CPU"; precision_ = InferenceEngine::Precision::FP32; std::string precision_str = "FP32"; @@ -48,7 +50,7 @@ OpenVINOGraph::OpenVINOGraph(const onnxruntime::Node* fused_node) { precision_ = InferenceEngine::Precision::FP16; precision_str = "FP16"; #endif -#ifdef OPENVINO_CONFIG_VAD_R +#ifdef OPENVINO_CONFIG_VAD_M device_id_ = "HDDL"; precision_ = InferenceEngine::Precision::FP16; precision_str = "FP16"; @@ -63,8 +65,8 @@ OpenVINOGraph::OpenVINOGraph(const onnxruntime::Node* fused_node) { // operations associated with the Infer Requests may be scheduled in parallel. // Infer Requests hold resources representing the entire network on their target hardware. So, // having more Infer Requests than needed would waste system resources. - // In VAD-R (HDDL) accelerator, there are 8 parallel execution units. So, creating 8 instances - // of Infer Requests only if the VAD-R accelerator is being used. + // In VAD-M (HDDL) accelerator, there are 8 parallel execution units. So, creating 8 instances + // of Infer Requests only if the VAD-M accelerator is being used. // sets number of maximum parallel inferences num_inf_reqs_ = (device_id_ == "HDDL") ? 
8 : 1; @@ -95,7 +97,26 @@ OpenVINOGraph::OpenVINOGraph(const onnxruntime::Node* fused_node) { openvino_network_ = BuildOpenVINONetworkWithMO(); // Create hardware specific OpenVINO network representation - infer_requests_ = GetExecutableHandle(openvino_network_, device_id_); + GetExecutableHandle(openvino_network_); + + std::vector plugin_path = GetEnvLdLibraryPath(); + plugin_path.push_back(""); + plugin_ = InferenceEngine::PluginDispatcher( + plugin_path) + .getPluginByDevice(device_id_); + + //Loading model to the plugin + InferenceEngine::ExecutableNetwork exeNetwork = plugin_.LoadNetwork(*openvino_network_, {}); + + LOGS_DEFAULT(INFO) << log_tag << "Network loaded into accelerator plug-in succesfully"; + + //Create infer request + for (size_t i = 0; i < num_inf_reqs_; i++) { + auto infRequest = exeNetwork.CreateInferRequestPtr(); + + infer_requests_.push_back(infRequest); + } + LOGS_DEFAULT(INFO) << log_tag << "Infer requests created: " << num_inf_reqs_; } std::vector OpenVINOGraph::GetEnvLdLibraryPath() const { @@ -112,8 +133,7 @@ std::vector OpenVINOGraph::GetEnvLdLibraryPath() const { } void OpenVINOGraph::ConvertONNXModelToOpenVINOIR(const std::string& onnx_model, - std::string& openvino_xml, std::string& openvino_bin, bool precision_fp32) { - + std::string& openvino_xml, std::string& openvino_bin, bool precision_fp32) { Py_Initialize(); if (!Py_IsInitialized()) { throw "Python environment initialization failure"; @@ -176,7 +196,6 @@ void OpenVINOGraph::ConvertONNXModelToOpenVINOIR(const std::string& onnx_model, } std::shared_ptr OpenVINOGraph::BuildOpenVINONetworkWithMO() { - const auto& attributes = fused_node_->GetAttributes(); std::string xml_string = attributes.at("xml_str").s(); std::string weights_string = attributes.at("weights_str").s(); @@ -196,36 +215,28 @@ std::shared_ptr OpenVINOGraph::BuildOpenVINONetwork InferenceEngine::Precision OpenVINOGraph::ConvertPrecisionONNXToOpenVINO( ONNX_NAMESPACE::DataType onnx_type) { - - if(*onnx_type == "float" || *onnx_type == "tensor(float)") { + if (*onnx_type == "float" || *onnx_type == "tensor(float)") { return InferenceEngine::Precision::FP32; - } else if ( *onnx_type == "float16" || *onnx_type == "tensor(float16)") { + } else if (*onnx_type == "float16" || *onnx_type == "tensor(float16)") { return InferenceEngine::Precision::FP16; - } else if ( *onnx_type == "int32" || *onnx_type == "tensor(int32)") { + } else if (*onnx_type == "int32" || *onnx_type == "tensor(int32)") { return InferenceEngine::Precision::I32; - } else if ( *onnx_type == "int16" || *onnx_type == "tensor(int16)") { + } else if (*onnx_type == "int16" || *onnx_type == "tensor(int16)") { return InferenceEngine::Precision::I16; - } else if ( *onnx_type == "int8" || *onnx_type == "tensor(int8)") { + } else if (*onnx_type == "int8" || *onnx_type == "tensor(int8)") { return InferenceEngine::Precision::I8; - } else if ( *onnx_type == "uint16" || *onnx_type == "tensor(uint16)") { + } else if (*onnx_type == "uint16" || *onnx_type == "tensor(uint16)") { return InferenceEngine::Precision::U16; - } else if ( *onnx_type == "uint8" || *onnx_type == "tensor(uint8)") { + } else if (*onnx_type == "uint8" || *onnx_type == "tensor(uint8)") { return InferenceEngine::Precision::U8; } else { throw "Unsupported Data type"; } } -std::vector OpenVINOGraph::GetExecutableHandle( - std::shared_ptr network, const std::string& device) { - - - // Load Plugin for inference engine - std::vector plugin_path = GetEnvLdLibraryPath(); - plugin_path.push_back(""); - InferenceEngine::InferencePlugin 
plugin = InferenceEngine::PluginDispatcher( - plugin_path) - .getPluginByDevice(device); +void OpenVINOGraph::GetExecutableHandle( + std::shared_ptr network) { + LOGS_DEFAULT(INFO) << log_tag << "Loaded plugins"; // Configure input & output // Prepare input blobs @@ -235,7 +246,6 @@ std::vector OpenVINOGraph::GetExecutableHand int input_idx = 0; for (auto iter = inputInfo.begin(); iter != inputInfo.end(); ++iter, ++input_idx) { - // Get the onnx index for the corresponding input (ignoring initializers) auto tracked_input_idx = input_indexes_[input_idx]; auto precision = ConvertPrecisionONNXToOpenVINO(onnx_input_defs[tracked_input_idx]->Type()); @@ -270,7 +280,6 @@ std::vector OpenVINOGraph::GetExecutableHand int output_idx = 0; for (auto iter = outputInfo.begin(); iter != outputInfo.end(); ++iter, ++output_idx) { - auto precision = ConvertPrecisionONNXToOpenVINO(onnx_output_defs[output_idx]->Type()); iter->second->setPrecision(precision); @@ -296,17 +305,6 @@ std::vector OpenVINOGraph::GetExecutableHand throw "Invalid Dims type for output data map for: " + iter->first; } } - - // Loading model to the plugin - InferenceEngine::ExecutableNetwork exeNetwork = plugin.LoadNetwork(*network, - {}); - - // Create infer request - std::vector infer_requests; - for (size_t i = 0; i < num_inf_reqs_; i++) { - infer_requests.push_back(exeNetwork.CreateInferRequestPtr()); - } - return infer_requests; } size_t OpenVINOGraph::DeduceBatchSize(Ort::CustomOpApi ort, const OrtValue* input_tensor, @@ -321,6 +319,8 @@ size_t OpenVINOGraph::DeduceBatchSize(Ort::CustomOpApi ort, const OrtValue* inpu batch_size = input_shape[0]; } + LOGS_DEFAULT(INFO) << log_tag << "Deduced batch size: " << batch_size; + return batch_size; } @@ -329,14 +329,12 @@ size_t OpenVINOGraph::DeduceBatchSize(Ort::CustomOpApi ort, const OrtValue* inpu void OpenVINOGraph::StartAsyncInference(Ort::CustomOpApi ort, const OrtValue* input_tensors[], size_t batch_slice_idx, size_t infer_req_idx) { - auto infer_request = infer_requests_[infer_req_idx]; auto graph_input_info = openvino_network_->getInputsInfo(); size_t i = 0; for (auto input_info_iter = graph_input_info.begin(); input_info_iter != graph_input_info.end(); ++input_info_iter, ++i) { - // Get OpenVINO's input buffer auto graph_input_blob = infer_request->GetBlob(input_info_iter->first); auto graph_input_buffer = @@ -359,7 +357,6 @@ void OpenVINOGraph::StartAsyncInference(Ort::CustomOpApi ort, const OrtValue* in void OpenVINOGraph::CompleteAsyncInference(Ort::CustomOpApi ort, OrtValue* output_tensors[], size_t batch_slice_idx, size_t infer_req_idx) { - auto infer_request = infer_requests_[infer_req_idx]; // Wait for Async inference completion @@ -424,6 +421,8 @@ void OpenVINOGraph::Infer(Ort::CustomOpApi ort, OrtKernelContext* context) { // Currently allows only one Infer execution at a time std::lock_guard lock(compute_lock_); + LOGS_DEFAULT(INFO) << log_tag << "Starting inference"; + // Get Input and Output tensors size_t input_count = openvino_network_->getInputsInfo().size(); size_t output_count = openvino_network_->getOutputsInfo().size(); @@ -465,7 +464,9 @@ void OpenVINOGraph::Infer(Ort::CustomOpApi ort, OrtKernelContext* context) { size_t batch_slice_idx = full_parallel_runs * num_inf_reqs_ + inf_req_idx; CompleteAsyncInference(ort, output_tensors, batch_slice_idx, inf_req_idx); } + + LOGS_DEFAULT(INFO) << log_tag << "Inference successful"; } } // namespace openvino_ep -} // namespace onnxruntime +} // namespace onnxruntime diff --git 
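The `Infer` path above, together with `num_inf_reqs_` (8 on HDDL/VAD-M, 1 elsewhere), appears to spread batch slices round-robin over the pre-created infer requests: full waves of `num_inf_reqs_` slices, then a final partial wave indexed as `full_parallel_runs * num_inf_reqs_ + inf_req_idx`. A self-contained sketch of that scheduling under this reading, with `StartSlice`/`WaitSlice` standing in for `StartAsyncInference`/`CompleteAsyncInference`:

```
#include <cstddef>
#include <functional>

static void RunBatched(size_t batch_size, size_t num_inf_reqs,
                       const std::function<void(size_t slice, size_t req)>& StartSlice,
                       const std::function<void(size_t slice, size_t req)>& WaitSlice) {
  const size_t full_runs = batch_size / num_inf_reqs;
  for (size_t run = 0; run < full_runs; ++run) {
    for (size_t req = 0; req < num_inf_reqs; ++req) StartSlice(run * num_inf_reqs + req, req);
    for (size_t req = 0; req < num_inf_reqs; ++req) WaitSlice(run * num_inf_reqs + req, req);
  }
  const size_t remainder = batch_size % num_inf_reqs;  // final partial wave
  for (size_t req = 0; req < remainder; ++req) StartSlice(full_runs * num_inf_reqs + req, req);
  for (size_t req = 0; req < remainder; ++req) WaitSlice(full_runs * num_inf_reqs + req, req);
}
```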
a/onnxruntime/core/providers/openvino/openvino_graph.h b/onnxruntime/core/providers/openvino/openvino_graph.h index c01ddfa146852..58943b376e676 100644 --- a/onnxruntime/core/providers/openvino/openvino_graph.h +++ b/onnxruntime/core/providers/openvino/openvino_graph.h @@ -27,14 +27,15 @@ class OpenVINOGraph { static void ConvertONNXModelToOpenVINOIR(const std::string& onnx_model, std::string& openvino_xml, std::string& openvino_bin, bool precision_fp32); + static const std::string log_tag; + private: std::shared_ptr BuildOpenVINONetworkWithMO(); InferenceEngine::Precision ConvertPrecisionONNXToOpenVINO(ONNX_NAMESPACE::DataType onnx_type); - std::vector GetExecutableHandle( - std::shared_ptr network, - const std::string& device); + void GetExecutableHandle( + std::shared_ptr network); size_t DeduceBatchSize(Ort::CustomOpApi ort, const OrtValue* input_tensor, InferenceEngine::SizeVector graph_dims); @@ -52,6 +53,7 @@ class OpenVINOGraph { const onnxruntime::Node* fused_node_; std::shared_ptr openvino_network_; size_t num_inf_reqs_; + InferenceEngine::InferencePlugin plugin_; std::vector infer_requests_; std::string device_id_; mutable std::mutex compute_lock_; diff --git a/onnxruntime/core/providers/openvino/openvino_mo/openvino_emitter.py b/onnxruntime/core/providers/openvino/openvino_mo/openvino_emitter.py index ea78c301179e0..26605d57a2a6d 100644 --- a/onnxruntime/core/providers/openvino/openvino_mo/openvino_emitter.py +++ b/onnxruntime/core/providers/openvino/openvino_mo/openvino_emitter.py @@ -2,29 +2,40 @@ Copyright(C) 2019 Intel Corporation Licensed under the MIT License """ -import sys -import os - -import hashlib -import xml.dom.minidom -from xml.etree.ElementTree import Element, SubElement, tostring -from mo.front.extractor import update_ie_fields -from mo.graph.graph import * -from mo.utils.unsupported_ops import UnsupportedOps -from mo.utils.utils import refer_to_faq_msg from mo.utils.version import get_version - +from mo.utils.utils import refer_to_faq_msg +from mo.utils.unsupported_ops import UnsupportedOps +from mo.graph.graph import * +from mo.front.extractor import update_ie_fields +from xml.etree.ElementTree import Element, SubElement, tostring +import xml.dom.minidom +import hashlib +import sys +import os ov_root = os.environ['INTEL_CVSDK_DIR'] -mo_path = os.path.join(ov_root, "deployment_tools", "model_optimizer") +if '2019' in ov_root: + version = 'R1' +else: + version = 'R5' +mo_path = ov_root + "/deployment_tools/model_optimizer" sys.path.append(mo_path) def create_const_nodes(graph: nx.MultiDiGraph, start_data_nodes_are_not_allowed: bool = True): for node_name in list(graph.nodes()): - node = NodeWrap(graph, node_name) - if (node.has('kind') and node.kind == 'data' and ((len(node.out_edges()) == 1 and 'bin' not in node.out_edge(0)) or node.has_and_set('is_output')) and len(node.in_nodes()) == 0): + if 'R5' in version: + node = NodeWrap(graph, node_name) + else: + node = Node(graph, node_name) + if ( + node.has('kind') and + node.kind == 'data' and ( + (len(node.out_edges()) == 1 and 'bin' not in node.out_edge(0)) or + node.has_and_set('is_output') + ) and + len(node.in_nodes()) == 0): if node.has_valid('value'): const_node_name = node.id + '_const' @@ -74,7 +85,10 @@ def serialize_constants_recursively(weights, graph: nx.MultiDiGraph, data_type, elif (data_type == np.float16): precision = 2 for node in nodes: - node = NodeWrap(graph, node) + if 'R5' in version: + node = NodeWrap(graph, node) + else: + node = Node(graph, node) if node.kind == 'data' and node.value is 
not None and any('bin' in d for u, v, d in graph.out_edges(node.node, data=True)): blob = node.value @@ -101,7 +115,10 @@ def serialize_constants_recursively(weights, graph: nx.MultiDiGraph, data_type, graph, node.soft_get('name'), node.id, node.shape, node.offset, node.size)) for node in nodes: - node = NodeWrap(graph, node) + if 'R5' in version: + node = NodeWrap(graph, node) + else: + node = Node(graph, node) # Dump blobs recursively if sub-graphs are present in the node if node.has_valid('sub_graphs'): for sub_graph_attr_name in node.sub_graphs: @@ -131,6 +148,8 @@ def xml_shape(shape: np.ndarray, element: xml.etree.ElementTree.Element): dim = SubElement(element, 'dim') if d <= 0: d = 1 + # raise Error('The value "{}" for shape is less or equal to 0. May be the input shape of the topology is ' + # 'wrong.'.format(d)) if int(d) != d: raise Error('The value "{}" for shape is not integer.'.format(d)) if not isinstance(d, np.int64): @@ -140,10 +159,24 @@ def xml_shape(shape: np.ndarray, element: xml.etree.ElementTree.Element): dim.text = str(d) +def sorted_inputs(node): + if 'R5' in version: + return get_sorted_inputs(node) + else: + return node.get_sorted_inputs(node) + + +def sorted_outputs(node): + if 'R5' in version: + return get_sorted_outputs(node) + else: + return node.get_sorted_outputs(node) + + def xml_ports(node: Node, element: xml.etree.ElementTree.Element, edges: xml.etree.ElementTree.Element): # input ports inputs = None # will create input section only if at least one input is available - for u, d in get_sorted_inputs(node): + for u, d in sorted_inputs(node): if 'bin' not in d and ('xml_skip' not in d or not d['xml_skip']): if inputs is None: inputs = SubElement(element, 'input') @@ -166,7 +199,7 @@ def xml_ports(node: Node, element: xml.etree.ElementTree.Element, edges: xml.etr # output ports outputs = None - for v, d in get_sorted_outputs(node): + for v, d in sorted_outputs(node): if 'xml_skip' not in d or not d['xml_skip']: if outputs is None: outputs = SubElement(element, 'output') @@ -180,7 +213,7 @@ def xml_ports(node: Node, element: xml.etree.ElementTree.Element, edges: xml.etr def xml_consts(graph: nx.MultiDiGraph, node: Node, element: xml.etree.ElementTree.Element): blobs = None # sub-element that will be created on-demand - for u, d in get_sorted_inputs(node): + for u, d in sorted_inputs(node): if 'bin' in d: if not blobs: blobs = SubElement(element, 'blobs') @@ -346,7 +379,10 @@ def serialize_network(graph, net_element, unsupported): return nodes = sorted(graph.nodes()) for node in nodes: - node = NodeWrap(graph, node) + if 'R5' in version: + node = NodeWrap(graph, node) + else: + node = Node(graph, node) if not node.has('IE'): continue if node.kind == 'op' and (not node.has('type') or node.type is None): @@ -394,12 +430,15 @@ def generate_ie_ir(graph: nx.MultiDiGraph, file_name: str, input_names: tuple = def port_renumber(graph: nx.MultiDiGraph): for node in list(graph.nodes()): - node = NodeWrap(graph, node) + if 'R5' in version: + node = NodeWrap(graph, node) + else: + node = Node(graph, node) if node.kind == 'op': base = 0 - for u, d in get_sorted_inputs(node): + for u, d in sorted_inputs(node): d['in'] = base base += 1 - for v, d in get_sorted_outputs(node): + for v, d in sorted_outputs(node): d['out'] = base base += 1 diff --git a/onnxruntime/core/providers/openvino/openvino_mo/openvino_mo.py b/onnxruntime/core/providers/openvino/openvino_mo/openvino_mo.py index fa998265ec068..b1bd8db440274 100644 --- 
a/onnxruntime/core/providers/openvino/openvino_mo/openvino_mo.py +++ b/onnxruntime/core/providers/openvino/openvino_mo/openvino_mo.py @@ -12,35 +12,25 @@ from __future__ import unicode_literals from mo.pipeline.common import determined_sort, get_fw_tensor_debug_info, get_sorted_outputs, collect_sub_graphs, relabel_nodes_inplace_safe -from mo.middle.passes import tensor_names, convert_data_type -from mo.graph.graph import Node, unique_id from openvino_emitter import port_renumber, serialize_mean_image, create_const_nodes, serialize_network, add_meta_data, generate_ie_ir, serialize_constants, serialize_constants_recursively import openvino_emitter -import networkx as nx from operator import itemgetter -from mo.graph.graph import check_empty_graph -from mo.utils.error import Error from mo.utils import class_registration from mo.middle.passes.shape import convert_reshape, reverse_input_channels, \ fuse_sequence_of_reshapes, merge_nodes_permutations, permute_data_nodes_attrs, permute_op_nodes_attrs from mo.middle.passes.mean_scale_values import move_scaleshift_to_preprocess -from mo.middle.passes.eliminate import get_nodes_with_attributes -from mo.middle.passes.infer import scale_input, override_placeholder_shapes, convert_mul_add_to_power, \ - add_mean_scale_values, override_batch, exit_bound_edges, control_flow_infer # , partial_infer, update_fully_connected_shapes from mo.middle.passes.fusing.mark_unfused_nodes import mark_unfused_nodes from mo.middle.passes.fusing.fuse_linear_seq import fuse_mul_add_sequence from mo.middle.passes.fusing.fuse_linear_ops import fuse_linear_ops from mo.middle.passes.fusing.fuse_grouped_conv import grouped_convolutions_fusing from mo.middle.passes.fusing.decomposition import convert_batch_norm, convert_scale_shift_to_mul_add -from mo.middle.passes.eliminate import graph_clean_up, remove_op_nodes, remove_useless_split -from mo.middle.passes.conv import convert_add_to_scaleshift, convert_gemm_to_fully_connected, \ - convert_muladd_to_scaleshift_or_power, fuse_pad, convert_dilated_convolution, convert_mul_to_scaleshift +from mo.middle.passes.infer import partial_infer, update_fully_connected_shapes +from mo.middle.passes import infer, tensor_names, convert_data_type from mo.front.onnx.loader import load_onnx_model, protobuf2nx from mo.front.onnx.extractor import common_onnx_fields, onnx_op_extractor, onnx_op_extractors from mo.front.extractor import add_output_ops, add_input_ops, \ extract_node_attrs, create_tensor_nodes, remove_output_ops, user_data_repack -from mo.front.common.register_custom_ops import update_extractors_with_extensions -from mo.front.common.register_custom_ops import check_for_duplicates +from mo.front.common.register_custom_ops import check_for_duplicates, update_extractors_with_extensions from extensions.middle.NormalizeFullyConnected import NormalizeFullyConnected from extensions.middle.EltwiseInputNormalization import EltwiseInputNormalize from mo.utils.versions_checker import check_requirements @@ -52,10 +42,10 @@ from mo.utils.cli_parser import get_placeholder_shapes, get_tuple_values, get_model_name, \ get_common_cli_options, get_caffe_cli_options, get_tf_cli_options, get_mxnet_cli_options, get_kaldi_cli_options, \ get_onnx_cli_options, get_mean_scale_dictionary, parse_tuple_pairs, get_meta_info -from mo.utils import import_extensions from mo.utils.versions_checker import check_python_version import onnx import numpy as np +import networkx as nx from collections import OrderedDict import traceback import logging as log @@ -64,12 +54,35 
@@ import sys import os ov_root = os.environ['INTEL_CVSDK_DIR'] -mo_path = os.path.join(ov_root, "deployment_tools", "model_optimizer") -mo_extensions = os.path.join(mo_path, "extensions") +if '2019.1' in ov_root: + version = 'R1' +elif '2018.5' in ov_root: + version = 'R5' +else: + version = 'unsupported' + print('You are using unsupported version of OpenVINO. Please refer to BUILD.md for supported versions of OpenVINO.') +mo_path = ov_root + "/deployment_tools/model_optimizer" +mo_extensions = mo_path + "/extensions" sys.path.append(mo_path) -# from mo.back.ie_ir_ver_2.emitter import port_renumber, serialize_mean_image, create_const_nodes #, generate_ie_ir, serialize_constants, serialize_constants_recursively +if 'R5' in version: + from mo.utils import import_extensions + from mo.middle.passes.conv import convert_add_to_scaleshift, convert_gemm_to_fully_connected, \ + convert_muladd_to_scaleshift_or_power, fuse_pad, convert_dilated_convolution, convert_mul_to_scaleshift + from mo.middle.passes.eliminate import graph_clean_up, remove_op_nodes, remove_useless_split, get_nodes_with_attributes + from mo.middle.passes.infer import scale_input, override_placeholder_shapes, convert_mul_add_to_power, \ + add_mean_scale_values, override_batch, exit_bound_edges, control_flow_infer + from mo.graph.graph import check_empty_graph, Node, unique_id +else: + from mo.utils import import_extensions, class_registration + from mo.middle.passes.conv import convert_add_or_mul_to_scaleshift, convert_muladd_to_scaleshift_or_power, fuse_pad + from mo.middle.passes.eliminate import remove_const_ops, mark_output_reachable_nodes, mark_undead_nodes, mark_const_producer_nodes, \ + eliminate_dead_nodes, add_constant_operations, shape_inference, remove_op_nodes, get_nodes_with_attributes + from mo.middle.passes.infer import override_placeholder_shapes, convert_mul_add_to_power, override_batch, exit_bound_edges, control_flow_infer + from extensions.back.CreateConstNodes import CreateConstNodesReplacement + from mo.middle.pattern_match import for_graph_and_each_sub_graph_recursively + from mo.graph.graph import check_empty_graph, Node, Graph def is_fully_defined_shape(shape: np.ndarray): @@ -78,160 +91,11 @@ def is_fully_defined_shape(shape: np.ndarray): return True -def partial_infer(graph: nx.MultiDiGraph, start_node: str = None): - - cycle_nodes = get_nodes_with_attributes(graph, is_cyclic=True) - cycle_nodes = [Node(graph, node).out_node().id for node in cycle_nodes] - ebunch_cyclic = list(graph.out_edges( - nbunch=cycle_nodes, data=True, keys=True)) - ebunch_reconnected = exit_bound_edges( - graph, sources=cycle_nodes, end_node_attrs={'op': 'Exit'}) - graph.remove_edges_from(ebunch_cyclic) - graph.add_edges_from(ebunch_reconnected) - - try: - nodes = list(nx.topological_sort(graph)) - except: - raise Error('Graph contains a cycle. Can not proceed. 
' + - refer_to_faq_msg(97)) - - graph.remove_edges_from(ebunch_reconnected) - graph.add_edges_from(ebunch_cyclic) - - # Mark all nodes as not inferred yet - if start_node is not None: - start_index = nodes.index(start_node) - nx.set_node_attributes(G=graph.subgraph( - nodes[start_index:]), name='is_partial_inferred', values=False) - else: - nx.set_node_attributes( - G=graph, name='is_partial_inferred', values=False) - debug_logger = log.getLogger().isEnabledFor(log.DEBUG) - - nx.set_node_attributes(G=graph, name='executable', - values={n: True for n in get_nodes_with_attributes(graph, kind='data')}) - - for n in nodes: - # Data Flow Infer - try: - node = Node(graph, n) - node_name = node.soft_get('name') - if node.has('is_partial_inferred') and not node.is_partial_inferred: - if node.has('infer') and node.infer is not None: - log.debug('-' * 20) - log.debug('Partial infer for {}'.format( - node.soft_get('name'))) - log.debug('Op: {}'.format(node.soft_get('op'))) - node.infer(node) - out_nodes = node.out_nodes() - - # propagate nchw_layout attributes to data nodes - if node.has('nchw_layout'): - for out_node in out_nodes.values(): - out_node['nchw_layout'] = node.nchw_layout - - # In debug print current node attributes, input shapes/values and output shape/values - if debug_logger: - log.debug('Inputs:') - log_debug_dict(node.in_nodes(), 'input') - log.debug('Outputs:') - log_debug_dict(node.out_nodes(), 'output') - - for out_port, out_node in out_nodes.items(): - not_all_output_shapes = False - if not out_node.has_valid('shape'): - log.error('Shape is not defined for output {} of "{}".'.format( - out_port, node_name)) - not_all_output_shapes = True - elif not is_fully_defined_shape(out_node.shape): - log.error( - ('Shape {} is not fully defined for output {} of "{}". ' + - 'Use --input_shape with positive integers to override model input shapes.').format( - out_node.shape, - out_port, - node_name - ) - ) - not_all_output_shapes = True - - if not_all_output_shapes: - raise Error('Not all output shapes were inferred or fully defined for node "{}". ' + - refer_to_faq_msg(40), - node_name) - elif node.kind != 'data': - raise Error( - 'There is no registered "infer" function for node "{}" with op = "{}". ' + - 'Please implement this function in the extensions. ' + - refer_to_faq_msg(37), - node_name, - node.soft_get('op') - ) - node.is_partial_inferred = True - - except Exception as err: - log.error('Cannot infer shapes or values for node "{}".'.format( - node.soft_get('name'))) - log.error(str(err)) - log.error('') - log.error('It can happen due to bug in custom shape infer function {}.'.format( - node.soft_get('infer'))) - log.error('Or because the node inputs have incorrect values/shapes.') - log.error( - 'Or because input shapes are incorrect (embedded to the model or passed via --input_shape).') - debug_messages = '\n'.join( - ['Layer "' + node_name + '": ' + node_attrs['debug_message'] for node_name, node_attrs in - graph.nodes(data=True) if 'debug_message' in node_attrs]) - if debug_messages != "": - log.error('') - log.error('Other possible failure reasons are listed below:') - log.error(debug_messages) - if not debug_logger: - log.error( - 'Run Model Optimizer with --log_level=DEBUG for more information.') - else: - log.debug('Node "{}" attributes: {}'.format( - node.soft_get('name'), node.graph.node[node.id])) - raise Error('Stopped shape/value propagation at "{}" node. 
'.format(node.soft_get('name')) + - refer_to_faq_msg(38)) from err - control_flow_infer(graph, n) - - not_fully_inferred = get_nodes_with_attributes( - graph, is_not_fully_inferred=True) - for n in not_fully_inferred: - node = Node(graph, n) - if node.has('infer') and node.infer is not None: - node.infer(node) - - # delete_not_executable(graph) - return graph - - -def update_fully_connected_shapes(graph: nx.MultiDiGraph): - nodes = nx.topological_sort(graph) - while True: - should_infer = False - for n in nodes: - node = Node(graph, n) - if node.has('type') and node.type == 'FullyConnected' and node.in_node(0).shape.size == 3: - log.debug("node.in_node(0).shape = {}".format( - node.in_node(0).shape)) - log.debug("channel_dims = {}".format(node.channel_dims)) - assert (node.in_node(0).shape.size == - 3 and node.channel_dims > 0) - node.in_node(0).shape = np.delete(node.in_node(0).shape, 1) - if node.out_node().shape.size == 3: - node.channel_dims = node.channel_dims - 1 - log.debug( - "Initiated partial infer from update_fully_connected_shapes") - graph = partial_infer(graph, node.in_node(0).id) - should_infer = True - break - if not should_infer: - break +infer.is_fully_defined_shape = is_fully_defined_shape def prepare_emit_ir(graph: nx.MultiDiGraph, data_type: str, output_dir: str, output_model_name: str, - mean_data: [list, None]=None, input_names: list=[], meta_info: dict=dict()): + mean_data: [list, None] = None, input_names: list = [], meta_info: dict = dict()): for sub_graph in [graph] + collect_sub_graphs(graph): create_const_nodes( @@ -257,7 +121,6 @@ def prepare_emit_ir(graph: nx.MultiDiGraph, data_type: str, output_dir: str, out if mean_data: mean_offset, mean_size = serialize_mean_image( bin_file, mean_data=mean_data) - xml_string = generate_ie_ir(graph=graph, file_name=os.path.join( output_dir, '{}.xml'.format(output_model_name)), @@ -266,16 +129,33 @@ def prepare_emit_ir(graph: nx.MultiDiGraph, data_type: str, output_dir: str, out mean_size=mean_size, meta_info=meta_info) - # tensor_names.output_tensor_names_map(graph, os.path.join(output_dir, '{}.mapping'.format(output_model_name))) return weights, xml_string -# argv: argparse.Namespace +if 'R1' in version: + def graph_clean_up(graph: Graph, undead_node_types: list = None): + if undead_node_types is None: + undead_node_types = [] + + if 'Shape' in undead_node_types: + undead_node_types.remove('Shape') + + mark_output_reachable_nodes(graph) + mark_undead_nodes(graph, undead_node_types) + mark_const_producer_nodes(graph) + eliminate_dead_nodes(graph) + # Add Const op for constant data nodes + add_constant_operations(graph) + shape_inference(graph) -def driver(onnx_modelproto_bytes, precision: str, output_model_name: str, outputs: list, output_dir: str, - scale: float, - user_shapes: [None, list, np.array]=None, - mean_scale_values: [dict, list]=()): + def graph_clean_up_onnx(graph: Graph): + graph_clean_up(graph, ['Shape']) + + +def driver_R5(onnx_modelproto_bytes, precision: str, output_model_name: str, outputs: list, output_dir: str, + scale: float, + user_shapes: [None, list, np.array] = None, + mean_scale_values: [dict, list] = ()): try: model_proto = onnx.load_from_string(bytes(onnx_modelproto_bytes)) @@ -320,7 +200,6 @@ def driver(onnx_modelproto_bytes, precision: str, output_model_name: str, output output_op_nodes = add_output_ops(graph, packed_outputs) input_op_nodes = add_input_ops(graph, packed_user_shapes, True) - # this call of 'graph_clean_up' removes child nodes of outputs which is useful when custom output is 
specified graph_clean_up(graph) check_empty_graph(graph, 'add_output_ops and add_input_ops') extract_node_attrs(graph, lambda node: onnx_op_extractor( @@ -404,6 +283,101 @@ def driver(onnx_modelproto_bytes, precision: str, output_model_name: str, output return weights, xml_string +def driver_R1(onnx_modelproto_bytes, precision: str, output_model_name: str, outputs: list, output_dir: str, + scale: float, + user_shapes: [None, list, np.array] = None, + mean_scale_values: [dict, list] = ()): + + try: + model_proto = onnx.load_from_string(bytes(onnx_modelproto_bytes)) + except Exception as e: + print("[python] onnx exception: ", str(e)) + + model_graph = model_proto.graph # pylint: disable=no-member + + update_extractors_with_extensions(onnx_op_extractors) + + try: + graph = protobuf2nx(model_proto) + log.debug("Number of nodes in NX graph: {}".format( + graph.number_of_nodes())) + graph.__setattr__( + 'name', output_model_name if output_model_name else model_proto.graph.name) # pylint: disable=no-member + graph.graph['layout'] = 'NCHW' + graph.graph['cmd_params'] = argparse.Namespace(batch=None, data_type='float', disable_fusing=False, disable_gfusing=False, disable_resnet_optimization=False, enable_concat_optimization=False, extensions=mo_extensions, finegrain_fusing=None, framework='onnx', freeze_placeholder_with_value=None, generate_deprecated_IR_V2=False, + input=None, input_model=None, input_shape=None, keep_shape_ops=False, log_level='ERROR', mean_scale_values={}, mean_values=(), model_name=None, move_to_preprocess=False, output=None, output_dir='.', placeholder_shapes=None, reverse_input_channels=False, scale=None, scale_values=(), silent=False, version=False) + graph.graph['fw'] = 'onnx' + graph.graph['feature_dim'] = 1 if graph.graph['layout'] == 'NCHW' else 3 + graph.graph['ir_version'] = 4 + extract_node_attrs(graph, lambda node: ( + True, common_onnx_fields(node))) + except Exception as e: + raise Error( + 'Cannot pre-process ONNX graph after reading from model file "{}". ' + 'File is corrupt or has unsupported format. Details: {}. ' + + refer_to_faq_msg(44), + model_file_name, + str(e) + ) from e + graph.check_empty_graph( + 'protobuf2nx. 
It may happen due to problems with loaded model') + extract_node_attrs(graph, lambda node: onnx_op_extractor( + node, check_for_duplicates(onnx_op_extractors))) + + # --------------------------------- LOAD END ------------------------------------------------------ + class_registration.apply_replacements( + graph, class_registration.ClassType.FRONT_REPLACER) + partial_infer(graph) + graph.check_empty_graph('partial_infer') + class_registration.apply_replacements( + graph, class_registration.ClassType.MIDDLE_REPLACER) + + fuse_pad(graph) + graph_clean_up_onnx(graph) + + mark_unfused_nodes(graph, 'False') + convert_batch_norm(graph) + graph_clean_up_onnx(graph) + + convert_muladd_to_scaleshift_or_power(graph) + graph_clean_up_onnx(graph) + + convert_mul_add_to_power(graph) + graph_clean_up_onnx(graph) + + convert_reshape(graph) + graph_clean_up_onnx(graph) + convert_add_or_mul_to_scaleshift(graph) # scale = 1 + graph_clean_up_onnx(graph) + + fuse_pad(graph) + graph_clean_up_onnx(graph) + + fuse_sequence_of_reshapes(graph) + graph_clean_up_onnx(graph) + + pattern = EltwiseInputNormalize() + pattern.find_and_replace_pattern(graph) + + merge_nodes_permutations(graph) + permute_data_nodes_attrs(graph) + permute_op_nodes_attrs(graph) + + class_registration.apply_replacements( + graph, class_registration.ClassType.BACK_REPLACER) + + for_graph_and_each_sub_graph_recursively(graph, remove_const_ops) + + CreateConstNodesReplacement().find_and_replace_pattern(graph) + + for_graph_and_each_sub_graph_recursively(graph, remove_output_ops) + + weights, xml_string = prepare_emit_ir(graph=graph, data_type=precision, output_dir=output_dir, output_model_name=output_model_name, + meta_info={'unset': []}) + + return weights, xml_string + + def driver_entry(onnx_modelproto_bytes, precision: str): start_time = datetime.datetime.now() @@ -414,11 +388,19 @@ def driver_entry(onnx_modelproto_bytes, precision: str): scale_values = {} mean_scale = {} - from mo.front.onnx.register_custom_ops import update_registration - import_extensions.load_dirs('onnx', [mo_extensions], update_registration) - weights, xml_string = driver(onnx_modelproto_bytes, precision, model_name, outputs, ".", None, - user_shapes=placeholder_shapes, - mean_scale_values=mean_scale) + if 'R5' in version: + from mo.front.onnx.register_custom_ops import update_registration + import_extensions.load_dirs( + 'onnx', [mo_extensions], update_registration) + weights, xml_string = driver_R5(onnx_modelproto_bytes, precision, model_name, outputs, ".", None, + user_shapes=placeholder_shapes, + mean_scale_values=mean_scale) + else: + from mo.front.onnx.register_custom_ops import get_front_classes + import_extensions.load_dirs('onnx', [mo_extensions], get_front_classes) + weights, xml_string = driver_R1(onnx_modelproto_bytes, precision, model_name, outputs, ".", None, + user_shapes=placeholder_shapes, + mean_scale_values=mean_scale) return weights, xml_string @@ -459,5 +441,11 @@ def convert_fp32(onnx_modelproto_bytes): sys.exit(ret_code) from mo.utils.cli_parser import get_onnx_cli_parser + if '2019' in ov_root: + print('2019 R1 version') + else: + print('2018 R5 version') weights_string, final_string = convert_fp32() + print(weights_string) + sys.exit(0) diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_allocator.h b/onnxruntime/core/providers/tensorrt/tensorrt_allocator.h index f6d70bdde0bda..ff2e74714691b 100755 --- a/onnxruntime/core/providers/tensorrt/tensorrt_allocator.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_allocator.h @@ -12,7 +12,7 @@ 
class TensorrtPinnedAllocator : public CPUAllocator { public: virtual const OrtAllocatorInfo& Info() const override { static OrtAllocatorInfo tensorrt_cpu_allocator_info(TRT, - OrtAllocatorType::OrtDeviceAllocator, 0, + OrtAllocatorType::OrtDeviceAllocator, OrtDevice(), 0, OrtMemType::OrtMemTypeCPU); return tensorrt_cpu_allocator_info; } @@ -25,8 +25,7 @@ class TensorrtAllocator : public CPUAllocator { public: virtual const OrtAllocatorInfo& Info() const override { static OrtAllocatorInfo tensorrt_default_allocator_info(TRT, - OrtAllocatorType::OrtDeviceAllocator, 0, - OrtMemType::OrtMemTypeDefault); + OrtAllocatorType::OrtDeviceAllocator); return tensorrt_default_allocator_info; } }; diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index af0bbdfe66902..1ff6839af4dc5 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -280,12 +280,6 @@ TensorrtExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, return result; } -common::Status TensorrtExecutionProvider::CopyTensor(const Tensor& src, Tensor& dst) const { - ORT_UNUSED_PARAMETER(src); - ORT_UNUSED_PARAMETER(dst); - return Status::OK(); -} - common::Status TensorrtExecutionProvider::Compile(const std::vector& fused_nodes, std::vector& node_compute_funcs) { for (const auto* fused_node : fused_nodes) { diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h index 723375ce4042f..7f7b27b60bbcc 100755 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h @@ -62,8 +62,6 @@ class TensorrtExecutionProvider : public IExecutionProvider { common::Status Compile(const std::vector& fused_nodes, std::vector& node_compute_funcs) override; - Status CopyTensor(const Tensor& src, Tensor& dst) const override; - void SetMaxBatchSize(const int batch_size) { max_batch_size_ = batch_size; } diff --git a/onnxruntime/core/session/IOBinding.h b/onnxruntime/core/session/IOBinding.h index 1896bd778b69f..703a1a66e43b2 100644 --- a/onnxruntime/core/session/IOBinding.h +++ b/onnxruntime/core/session/IOBinding.h @@ -44,7 +44,7 @@ class IOBinding { * copy it to the desired location. This copy may or may not be async. It depends on the exec provider. * If the input ort_value is not at the desired location, it should be preallocated * If the input ort_value isn't preallocated, it should have memtype of OrtMemTypeDefault - * For copying it leverages IExecutionProvider::CopyTensor(). + * For copying it leverages DataTransferManager::CopyTensor(). 
*/ common::Status BindInput(const std::string& name, const OrtValue& ml_value); diff --git a/onnxruntime/core/session/abi_session_options.cc b/onnxruntime/core/session/abi_session_options.cc index aeaab0b2488da..710ab2db8121f 100644 --- a/onnxruntime/core/session/abi_session_options.cc +++ b/onnxruntime/core/session/abi_session_options.cc @@ -28,7 +28,7 @@ ORT_API(void, OrtReleaseSessionOptions, OrtSessionOptions* ptr) { delete ptr; } -ORT_API_STATUS_IMPL(OrtCloneSessionOptions, OrtSessionOptions* input, OrtSessionOptions** out) { +ORT_API_STATUS_IMPL(OrtCloneSessionOptions, const OrtSessionOptions* input, OrtSessionOptions** out) { API_IMPL_BEGIN *out = new OrtSessionOptions(*input); return nullptr; @@ -89,16 +89,18 @@ ORT_API_STATUS_IMPL(OrtSetSessionLogId, _In_ OrtSessionOptions* options, const c } ///< applies to session load, initialization, etc -ORT_API_STATUS_IMPL(OrtSetSessionLogVerbosityLevel, _In_ OrtSessionOptions* options, uint32_t session_log_verbosity_level) { +ORT_API_STATUS_IMPL(OrtSetSessionLogVerbosityLevel, _In_ OrtSessionOptions* options, int session_log_verbosity_level) { options->value.session_log_verbosity_level = session_log_verbosity_level; return nullptr; } // Set Graph optimization level. -// Returns 0 on success and -1 otherwise // Available options are : 0, 1, 2. -ORT_API_STATUS_IMPL(OrtSetSessionGraphOptimizationLevel, _In_ OrtSessionOptions* options, uint32_t graph_optimization_level) { - if (graph_optimization_level >= static_cast(onnxruntime::TransformerLevel::MaxTransformerLevel)) +ORT_API_STATUS_IMPL(OrtSetSessionGraphOptimizationLevel, _In_ OrtSessionOptions* options, int graph_optimization_level) { + if (graph_optimization_level < 0) { + return OrtCreateStatus(ORT_INVALID_ARGUMENT, "graph_optimization_level is not valid"); + } + if (graph_optimization_level >= static_cast(onnxruntime::TransformerLevel::MaxTransformerLevel)) return OrtCreateStatus(ORT_INVALID_ARGUMENT, "graph_optimization_level is not valid"); options->value.graph_optimization_level = static_cast(graph_optimization_level); return nullptr; diff --git a/onnxruntime/core/session/custom_ops.cc b/onnxruntime/core/session/custom_ops.cc index 7d4218022af12..38b7699aa4d35 100644 --- a/onnxruntime/core/session/custom_ops.cc +++ b/onnxruntime/core/session/custom_ops.cc @@ -29,30 +29,48 @@ ORT_API_STATUS_IMPL(OrtKernelInfoGetAttribute_int64, _In_ const OrtKernelInfo* i return onnxruntime::ToOrtStatus(status); } -ORT_API_STATUS_IMPL(OrtKernelContext_GetInputCount, const OrtKernelContext* context, _Out_ size_t* out) { +ORT_API_STATUS_IMPL(OrtKernelContext_GetInputCount, _In_ const OrtKernelContext* context, _Out_ size_t* out) { *out = reinterpret_cast(context)->InputCount(); return nullptr; }; -ORT_API_STATUS_IMPL(OrtKernelContext_GetOutputCount, const OrtKernelContext* context, _Out_ size_t* out) { +ORT_API_STATUS_IMPL(OrtKernelContext_GetOutputCount, _In_ const OrtKernelContext* context, _Out_ size_t* out) { *out = reinterpret_cast(context)->OutputCount(); return nullptr; }; -ORT_API_STATUS_IMPL(OrtKernelContext_GetInput, const OrtKernelContext* context, _In_ size_t index, _Out_ const OrtValue** out) { +ORT_API_STATUS_IMPL(OrtKernelContext_GetInput, _In_ const OrtKernelContext* context, _In_ size_t index, _Out_ const OrtValue** out) { *out = reinterpret_cast(reinterpret_cast(context)->GetInputMLValue(index)); return nullptr; }; -ORT_API_STATUS_IMPL(OrtKernelContext_GetOutput, OrtKernelContext* context, _In_ size_t index, _In_ const int64_t* dim_values, size_t dim_count, _Out_ OrtValue** out) { 
+ORT_API_STATUS_IMPL(OrtKernelContext_GetOutput, _Inout_ OrtKernelContext* context, _In_ size_t index, _In_ const int64_t* dim_values, size_t dim_count, _Out_ OrtValue** out) { onnxruntime::TensorShape shape(dim_values, dim_count); *out = reinterpret_cast(reinterpret_cast(context)->OutputMLValue(index, shape)); return nullptr; }; +ORT_API_STATUS_IMPL(OrtKernelInfoGetAttribute_string, _In_ const OrtKernelInfo* info, _In_ const char* name, _Out_ char* out, _Inout_ size_t* size) { + std::string value; + auto status = reinterpret_cast(info)->GetAttr(name, &value); + if (status.IsOK()) { + if (*size >= value.size() + 1) { + std::memcpy(out, value.data(), value.size()); + out[value.size()] = '\0'; + *size = value.size(); + return nullptr; + } else { + *size = value.size() + 1; + return OrtCreateStatus(ORT_INVALID_ARGUMENT, "Result buffer is not large enough"); + } + } + return onnxruntime::ToOrtStatus(status); +} + constexpr OrtCustomOpApi g_custom_op_api = { &OrtKernelInfoGetAttribute_float, &OrtKernelInfoGetAttribute_int64, + &OrtKernelInfoGetAttribute_string, &OrtGetTensorTypeAndShape, diff --git a/onnxruntime/core/session/default_cpu_allocator_c_api.cc b/onnxruntime/core/session/default_cpu_allocator_c_api.cc index e699f605c5931..a4a7e7ea51612 100644 --- a/onnxruntime/core/session/default_cpu_allocator_c_api.cc +++ b/onnxruntime/core/session/default_cpu_allocator_c_api.cc @@ -17,7 +17,7 @@ struct OrtDefaultAllocator : OrtAllocatorImpl { OrtAllocator::Alloc = [](OrtAllocator* this_, size_t size) { return static_cast(this_)->Alloc(size); }; OrtAllocator::Free = [](OrtAllocator* this_, void* p) { static_cast(this_)->Free(p); }; OrtAllocator::Info = [](const OrtAllocator* this_) { return static_cast(this_)->Info(); }; - ORT_THROW_ON_ERROR(OrtCreateAllocatorInfo("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault, &cpuAllocatorInfo)); + ORT_THROW_ON_ERROR(OrtCreateCpuAllocatorInfo(OrtDeviceAllocator, OrtMemTypeDefault, &cpuAllocatorInfo)); } ~OrtDefaultAllocator() override { OrtReleaseAllocatorInfo(cpuAllocatorInfo); } diff --git a/onnxruntime/core/session/environment.cc b/onnxruntime/core/session/environment.cc index 7d8e6541d4b7f..ef253bbb61ad2 100644 --- a/onnxruntime/core/session/environment.cc +++ b/onnxruntime/core/session/environment.cc @@ -32,6 +32,7 @@ Status Environment::Initialize() { // Register Microsoft domain with min/max op_set version as 1/1. std::call_once(schemaRegistrationOnceFlag, []() { ONNX_NAMESPACE::OpSchemaRegistry::DomainToVersionRange::Instance().AddDomainToVersion(onnxruntime::kMSDomain, 1, 1); + ONNX_NAMESPACE::OpSchemaRegistry::DomainToVersionRange::Instance().AddDomainToVersion(onnxruntime::kMSNchwcDomain, 1, 1); // Register contributed schemas. // The corresponding kernels are registered inside the appropriate execution provider. 
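Note: the custom_ops.cc hunk above adds OrtKernelInfoGetAttribute_string, which uses a size-negotiation protocol: a call with a too-small buffer fails and reports the required size (including the terminating '\0'), and a second call with a large-enough buffer copies the value and returns its length. The following standalone sketch mirrors only that contract; the function, attribute value and names are local to the sketch, not onnxruntime API.

```
// Toy stand-in for the attribute getter; returns 0 on success, -1 otherwise.
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

int GetAttrString(const std::string& value, char* out, size_t* size) {
  if (*size >= value.size() + 1) {
    std::memcpy(out, value.data(), value.size());
    out[value.size()] = '\0';
    *size = value.size();        // on success, size excludes the terminator
    return 0;
  }
  *size = value.size() + 1;      // on failure, size includes the terminator
  return -1;
}

int main() {
  const std::string attr = "some_attribute_value";  // hypothetical attribute
  size_t size = 0;
  if (GetAttrString(attr, nullptr, &size) != 0) {   // first call: query required size
    std::vector<char> buf(size);
    GetAttrString(attr, buf.data(), &size);         // second call: fill the buffer
    std::cout << buf.data() << " (" << size << " chars)\n";
  }
  return 0;
}
```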
#ifndef DISABLE_CONTRIB_OPS diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index 4d7906d8b84e5..10ceef943af0d 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "core/common/logging/logging.h" @@ -44,6 +45,9 @@ #include "core/optimizer/insert_cast_transformer.h" #include "core/optimizer/transformer_memcpy.h" #include "core/providers/cpu/cpu_execution_provider.h" +#ifdef USE_CUDA +#include "core/providers/cuda/gpu_data_transfer.h" +#endif #include "core/session/IOBinding.h" #include "core/session/custom_ops.h" #include "core/util/protobuf_parsing_utils.h" @@ -88,7 +92,8 @@ inline std::basic_string GetCurrentTimeString() { } } // namespace -InferenceSession::InferenceSession(const SessionOptions& session_options, logging::LoggingManager* logging_manager) +InferenceSession::InferenceSession(const SessionOptions& session_options, + logging::LoggingManager* logging_manager) : session_options_{session_options}, graph_transformation_mgr_{session_options_.max_num_graph_transformation_steps}, logging_manager_{logging_manager}, @@ -100,6 +105,8 @@ InferenceSession::InferenceSession(const SessionOptions& session_options, loggin InitLogger(logging_manager); + session_state_.SetDataTransferMgr(&data_transfer_mgr_); + // The threadpool is currently evolving. We will always create a per session threadpool. // Beyond this, we will create a global thread pool to share across sessions. { @@ -213,7 +220,7 @@ common::Status InferenceSession::Load(std::functionSetLogger(*session_logger_); // Pass threadpool to subgraph subgraph_session_state->SetThreadPool(session_state.GetThreadPool()); - + // Pass data transfer manager to subgraph. + subgraph_session_state->SetDataTransferMgr(&session_state.GetDataTransferMgr()); // Pass fused function manager to subgraph subgraph_session_state->GetMutableFuncMgr().SetFusedFuncs(session_state.GetFuncMgr()); @@ -471,6 +479,16 @@ common::Status InferenceSession::Initialize() { std::make_unique(epi))); } + // Register data transfer methods. + data_transfer_mgr_.RegisterDataTransfer(std::make_unique()); +#ifdef USE_CUDA + // TODO: this should be refactored later by exposing separate API to allow users to register different data transfers for different devices. 
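Note: the RegisterDataTransfer calls added here replace the per-provider CopyTensor overrides removed elsewhere in this diff (see the TensorRT provider and the IOBinding comment), centralizing copies in a manager that is also handed to subgraph session states. Below is a simplified sketch of that registration-and-dispatch pattern; these are not the actual onnxruntime classes (the real ones registered here are DataTransferManager, CPUDataTransfer and, when a CUDA or TensorRT provider is in use, GPUDataTransfer).

```
#include <memory>
#include <stdexcept>
#include <utility>
#include <vector>

enum class Device { CPU, GPU };
struct Tensor { Device device; /* payload omitted */ };

struct IDataTransfer {
  virtual ~IDataTransfer() = default;
  virtual bool CanCopy(Device src, Device dst) const = 0;
  virtual void Copy(const Tensor& src, Tensor& dst) const = 0;
};

struct CpuDataTransfer : IDataTransfer {
  bool CanCopy(Device s, Device d) const override { return s == Device::CPU && d == Device::CPU; }
  void Copy(const Tensor&, Tensor&) const override { /* host-to-host memcpy */ }
};

class DataTransferRegistry {
 public:
  void Register(std::unique_ptr<IDataTransfer> t) { transfers_.push_back(std::move(t)); }
  void CopyTensor(const Tensor& src, Tensor& dst) const {
    // Dispatch to the first registered transfer that can handle this device pair.
    for (const auto& t : transfers_)
      if (t->CanCopy(src.device, dst.device)) { t->Copy(src, dst); return; }
    throw std::runtime_error("no registered data transfer for this device pair");
  }
 private:
  std::vector<std::unique_ptr<IDataTransfer>> transfers_;
};
```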
+ bool is_nvidia_gpu_used = (nullptr != execution_providers_.Get(kCudaExecutionProvider)) || (nullptr != execution_providers_.Get(kTensorrtExecutionProvider)); + if (is_nvidia_gpu_used) { + data_transfer_mgr_.RegisterDataTransfer(std::make_unique()); + } +#endif + if (!session_options_.enable_sequential_execution && execution_providers_.Get(onnxruntime::kCudaExecutionProvider)) { LOGS(*session_logger_, ERROR) << "Parallel execution is currently not supported " @@ -532,7 +550,7 @@ common::Status InferenceSession::Initialize() { LOGS(*session_logger_, ERROR) << status.ErrorMessage(); } - if (session_profiler_.FEnabled()) { + if (session_profiler_.IsEnabled()) { session_profiler_.EndTimeAndRecordEvent(profiling::SESSION_EVENT, "session_initialization", tp); } return status; @@ -542,7 +560,45 @@ int InferenceSession::GetCurrentNumRuns() const { return current_num_runs_.load(); } -common::Status InferenceSession::CheckTypes(MLDataType actual, MLDataType expected) { +common::Status InferenceSession::CheckShapes(const std::string& input_name, + const TensorShape& input_shape, + const TensorShape& expected_shape) const { + auto input_shape_sz = input_shape.NumDimensions(); + auto expected_shape_sz = expected_shape.NumDimensions(); + if (input_shape_sz != expected_shape_sz) { + std::ostringstream ostr; + ostr << "Invalid rank for input: " << input_name + << " Got: " << input_shape_sz << " Expected: " << expected_shape_sz + << " Please fix either the inputs or the model."; + LOGS(*session_logger_, WARNING) << ostr.str(); + return Status::OK(); + } + + std::vector invalid_dim_indices; + for (size_t i = 0; i < input_shape_sz; ++i) { + if (expected_shape[i] < 0) { + continue; // this represents a symbolic shape dimension + } + if (input_shape[i] != expected_shape[i]) { + invalid_dim_indices.push_back(i); + } + } + + if (!invalid_dim_indices.empty()) { + std::ostringstream ostr; + ostr << "Got invalid dimensions for input: " << input_name << " for the following indices\n"; + for (size_t i = 0, end = invalid_dim_indices.size(); i < end; ++i) { + int idx = invalid_dim_indices[i]; + ostr << " index: " << idx << " Got: " << input_shape[idx] << " Expected: " << expected_shape[idx] << "\n"; + } + ostr << " Please fix either the inputs or the model."; + LOGS(*session_logger_, WARNING) << ostr.str(); + } + + return Status::OK(); +} + +static common::Status CheckTypes(MLDataType actual, MLDataType expected) { if (actual == expected) { return Status::OK(); } @@ -553,7 +609,7 @@ common::Status InferenceSession::CheckTypes(MLDataType actual, MLDataType expect } common::Status InferenceSession::ValidateInputs(const std::vector& feed_names, - const std::vector& feeds) { + const std::vector& feeds) const { if (feed_names.size() != feeds.size()) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Size mismatch: feed_names has ", @@ -561,21 +617,34 @@ common::Status InferenceSession::ValidateInputs(const std::vector& feeds.size(), " elements."); } - // More feeds are offered. - // In the case of overriding some initializers (which are also taken as graph inputs). 
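Note: the CheckShapes routine added above treats a negative expected dimension as symbolic (e.g. a dynamic batch size) and skips it, and it only logs a warning on mismatches while still returning Status::OK(). A minimal standalone sketch of that convention, using hypothetical shapes:

```
#include <cstdint>
#include <iostream>
#include <vector>

// Returns the indices of concrete dimensions that disagree; symbolic
// (negative) expected dimensions match anything, as in CheckShapes.
std::vector<size_t> MismatchedDims(const std::vector<int64_t>& input,
                                   const std::vector<int64_t>& expected) {
  std::vector<size_t> bad;
  if (input.size() != expected.size()) return bad;  // rank mismatch is handled separately
  for (size_t i = 0; i < input.size(); ++i) {
    if (expected[i] < 0) continue;                  // symbolic dimension
    if (input[i] != expected[i]) bad.push_back(i);
  }
  return bad;
}

int main() {
  std::vector<int64_t> expected{-1, 3, 224, 224};                            // dynamic batch
  std::cout << MismatchedDims({8, 3, 224, 224}, expected).size() << "\n";    // 0 mismatches
  std::cout << MismatchedDims({8, 3, 224, 225}, expected).size() << "\n";    // 1 mismatch (index 3)
  return 0;
}
```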
for (size_t i = 0; i < feeds.size(); ++i) { - auto iter = input_def_map_.find(feed_names[i]); + const auto& feed_name = feed_names[i]; + + auto iter = input_def_map_.find(feed_name); if (input_def_map_.end() == iter) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Invalid Feed Input Name:", feed_names[i]); + "Invalid Feed Input Name:", feed_name); } - auto expected_type = utils::GetMLDataType(*iter->second); + auto expected_type = iter->second.ml_data_type; auto& input_ml_value = feeds.at(i); if (input_ml_value.IsTensor()) { + // check for type + if (!expected_type->IsTensorType()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input with name: ", + feed_name, " is not expected to be of type tensor."); + } + auto expected_element_type = expected_type->AsTensorType()->GetElementType(); auto input_element_type = input_ml_value.Get().DataType(); ORT_RETURN_IF_ERROR(CheckTypes(input_element_type, expected_element_type)); + + // check for shape + const auto& expected_shape = iter->second.tensor_shape; + if (expected_shape.NumDimensions() > 0) { + const auto& input_shape = input_ml_value.Get().Shape(); + ORT_RETURN_IF_ERROR(CheckShapes(feed_name, input_shape, expected_shape)); + } } else { auto input_type = input_ml_value.Type(); ORT_RETURN_IF_ERROR(CheckTypes(input_type, expected_type)); @@ -586,8 +655,8 @@ common::Status InferenceSession::ValidateInputs(const std::vector& } common::Status InferenceSession::ValidateOutputs(const std::vector& output_names, - const std::vector* p_fetches) { - if (!p_fetches) { + const std::vector* p_fetches) const { + if (p_fetches == nullptr) { return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Output vector pointer is NULL"); } @@ -630,8 +699,6 @@ Status InferenceSession::Run(const RunOptions& run_options, const std::vector InferenceSession::GetModelInputs( } } - return std::make_pair(common::Status::OK(), &required_input_def_list_); + // return required inputs (excludes any inputs used for overriding initializers) + return std::make_pair(common::Status::OK(), &model_->MainGraph().GetInputs()); } std::pair InferenceSession::GetModelOutputs() const { @@ -791,7 +859,12 @@ void InferenceSession::StartProfiling(const logging::Logger* logger_ptr) { std::string InferenceSession::EndProfiling() { if (is_model_loaded_) { - return session_profiler_.EndProfiling(); + if (session_profiler_.IsEnabled()) { + return session_profiler_.EndProfiling(); + } else { + LOGS(*session_logger_, VERBOSE) << "Profiler is disabled."; + return std::string(); + } } LOGS(*session_logger_, ERROR) << "Could not write a profile because no model was loaded."; return std::string(); @@ -816,15 +889,31 @@ common::Status InferenceSession::SaveModelMetadata(const onnxruntime::Model& mod model_metadata_.custom_metadata_map = model.MetaData(); model_metadata_.graph_name = graph.Name(); - // save required inputs - const auto& required_inputs = graph.GetInputs(); // inputs excluding initializers - required_input_def_list_ = required_inputs; // A direct copy of required inputs + for (auto input : graph.GetInputs()) { + required_inputs_.insert(input->Name()); + } + + auto add_inputs = [this](const InputDefList& inputs) { + input_def_map_.reserve(inputs.size()); + for (auto elem : inputs) { + auto elem_type = utils::GetMLDataType(*elem); + auto elem_shape_proto = elem->Shape(); + input_def_map_.insert({elem->Name(), InputDefMetaData(elem, + elem_type, + elem_shape_proto + ? 
utils::GetTensorShapeFromTensorShapeProto(*elem_shape_proto) + : TensorShape())}); + } + }; - // save all valid inputs - auto& all_inputs = graph.GetInputsIncludingInitializers(); - input_def_map_.reserve(all_inputs.size()); - for (auto elem : all_inputs) { - input_def_map_.insert({elem->Name(), elem}); + if (graph.CanOverrideInitializer()) { + // for IR 4 or higher it is optional to have a matching graph input for an initializer, and if one exists the + // initializer is explicitly overridable. + add_inputs(graph.GetInputsIncludingInitializers()); + } else { + // for IR < 4 we don't allow overriding initializers so that they can be treated as constant. exclude them from + // the list of valid inputs by just using the GetInputs() list. + add_inputs(graph.GetInputs()); } // save outputs @@ -859,13 +948,12 @@ const logging::Logger& InferenceSession::CreateLoggerForRun(const RunOptions& ru run_log_id += run_options.run_tag; logging::Severity severity = logging::Severity::kWARNING; - - if (run_options.run_log_severity_level < 0) { + if (run_options.run_log_severity_level == -1) { severity = session_logger_->GetSeverity(); } else { ORT_ENFORCE(run_options.run_log_severity_level >= 0 && run_options.run_log_severity_level <= static_cast(logging::Severity::kFATAL), - "Invalid run log severity level. Must be a valid onnxruntime::logging::Severity value. Got ", + "Invalid run log severity level. Not a valid onnxruntime::logging::Severity value: ", run_options.run_log_severity_level); severity = static_cast(run_options.run_log_severity_level); } @@ -888,26 +976,22 @@ const logging::Logger& InferenceSession::CreateLoggerForRun(const RunOptions& ru void InferenceSession::InitLogger(logging::LoggingManager* logging_manager) { // create logger for session, using provided logging manager if possible - if (logging_manager != nullptr) { - std::string session_logid = !session_options_.session_logid.empty() - ? session_options_.session_logid - : "InferenceSession"; // there's probably a better default... - + if (logging_manager != nullptr && !session_options_.session_logid.empty()) { logging::Severity severity = logging::Severity::kWARNING; - - if (session_options_.session_log_severity_level < 0) { + if (session_options_.session_log_severity_level == -1) { severity = logging::LoggingManager::DefaultLogger().GetSeverity(); } else { ORT_ENFORCE(session_options_.session_log_severity_level >= 0 && session_options_.session_log_severity_level <= static_cast(logging::Severity::kFATAL), - "Invalid session log severity level. Must be a valid onnxruntime::logging::Severity value. Got ", + "Invalid session log severity level. 
Not a valid onnxruntime::logging::Severity value: ", session_options_.session_log_severity_level); severity = static_cast(session_options_.session_log_severity_level); } - owned_session_logger_ = logging_manager_->CreateLogger(session_logid, severity, false, + owned_session_logger_ = logging_manager_->CreateLogger(session_options_.session_logid, + severity, + false, session_options_.session_log_verbosity_level); - session_logger_ = owned_session_logger_.get(); } else { session_logger_ = &logging::LoggingManager::DefaultLogger(); @@ -939,6 +1023,10 @@ void InferenceSession::AddPredefinedTransformers(GraphTransformerManager& transf if ((graph_optimization_level >= TransformerLevel::Level2) || !custom_list.empty()) { add_transformers(TransformerLevel::Level2); } + + if ((graph_optimization_level >= TransformerLevel::Level3) || !custom_list.empty()) { + add_transformers(TransformerLevel::Level3); + } } common::Status InferenceSession::WaitForNotification(Notification* p_executor_done, int64_t timeout_in_ms) { diff --git a/onnxruntime/core/session/inference_session.h b/onnxruntime/core/session/inference_session.h index 2a2371d439207..761c121e95a42 100644 --- a/onnxruntime/core/session/inference_session.h +++ b/onnxruntime/core/session/inference_session.h @@ -77,7 +77,7 @@ struct SessionOptions { /// See https://github.com/microsoft/onnxruntime/blob/master/include/onnxruntime/core/common/logging/severity.h /// Default = -1 (use default logger severity) int session_log_severity_level = -1; - unsigned session_log_verbosity_level = 0; ///< VLOG level if debug build and session_log_severity_level is 0 (VERBOSE). + int session_log_verbosity_level = 0; ///< VLOG level if debug build and session_log_severity_level is 0 (VERBOSE). unsigned max_num_graph_transformation_steps = 5; // TODO choose a good default here? @@ -177,7 +177,7 @@ class InferenceSession { * The order of invocation indicates the reversed preference order: Register your most * preferred registry at the end. * Calling this API is optional. - * This API is not thread safe. + * This API is not thread safe. * @return OK if success. */ common::Status RegisterCustomRegistry(std::shared_ptr custom_registry); @@ -210,7 +210,7 @@ class InferenceSession { * Initializes a previously loaded model. Initialization includes but is not * limited to graph transformations, construction of kernels, etc. * This method assumes that a method has been loaded previously. - * This API is thread-safe. + * This API is thread-safe. * @return OK if success */ common::Status Initialize(); @@ -367,11 +367,13 @@ class InferenceSession { void InitLogger(logging::LoggingManager* logging_manager); - static common::Status CheckTypes(MLDataType actual, MLDataType expected); + common::Status CheckShapes(const std::string& input_name, + const TensorShape& input_shape, + const TensorShape& expected_shape) const; - common::Status ValidateInputs(const std::vector& feed_names, const std::vector& feeds); + common::Status ValidateInputs(const std::vector& feed_names, const std::vector& feeds) const; - common::Status ValidateOutputs(const std::vector& output_names, const std::vector* p_fetches); + common::Status ValidateOutputs(const std::vector& output_names, const std::vector* p_fetches) const; common::Status WaitForNotification(Notification* p_executor_done, int64_t timeout_in_ms); @@ -391,14 +393,15 @@ class InferenceSession { std::vector transformers_to_enable_; /// Logging manager if provided. 
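Note: the logger changes above switch the sentinel check from `< 0` to `== -1`, so only -1 now means "inherit the default logger's severity"; other negative values appear to fall through to the ORT_ENFORCE and are rejected instead of silently inheriting the default. A small sketch of that resolution logic; the enum is a stand-in for illustration, not the onnxruntime logging::Severity type.

```
#include <stdexcept>

enum class Severity { kVerbose = 0, kInfo, kWarning, kError, kFatal };

// -1 inherits the default severity; any other value must map onto a valid level.
Severity ResolveSeverity(int requested, Severity default_severity) {
  if (requested == -1) return default_severity;
  if (requested < 0 || requested > static_cast<int>(Severity::kFatal))
    throw std::invalid_argument("not a valid logging severity value");
  return static_cast<Severity>(requested);
}
```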
- logging::LoggingManager* logging_manager_; + logging::LoggingManager* logging_manager_ = nullptr; /// Logger for this session. WARNING: Will contain nullptr if logging_manager_ is nullptr. - std::unique_ptr owned_session_logger_; + std::unique_ptr owned_session_logger_ = nullptr; // Profiler for this session. profiling::Profiler session_profiler_; + // The list of execution providers. ExecutionProviders execution_providers_; protected: @@ -414,12 +417,23 @@ class InferenceSession { std::vector> executors_; // TODO do we need this vector? ModelMetadata model_metadata_; - InputDefList required_input_def_list_; - std::unordered_map input_def_map_; + std::unordered_set required_inputs_; + + struct InputDefMetaData { + InputDefMetaData(const NodeArg* node_arg0, MLDataType ml_data_type0, TensorShape&& tensor_shape0) + : node_arg(node_arg0), ml_data_type(ml_data_type0), tensor_shape(std::move(tensor_shape0)) { + } + const NodeArg* node_arg; + MLDataType ml_data_type; + TensorShape tensor_shape; // not applicable if the input is non-tensor type + }; + std::unordered_map input_def_map_; OutputDefList output_def_list_; // Threadpool for this session std::unique_ptr thread_pool_; + // Data transfer manager. + DataTransferManager data_transfer_mgr_; // Number of concurrently running executors std::atomic current_num_runs_; diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index fa37c8784ada6..4cc41de5b54ea 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -92,7 +92,7 @@ class LoggingWrapper : public ISink { ORT_API_STATUS_IMPL(OrtCreateEnvWithCustomLogger, OrtLoggingFunction logging_function, _In_opt_ void* logger_param, OrtLoggingLevel default_warning_level, _In_ const char* logid, - _Out_ OrtEnv** out) { + _Outptr_ OrtEnv** out) { API_IMPL_BEGIN std::string name = logid; std::unique_ptr logger = std::make_unique(logging_function, logger_param); @@ -113,7 +113,7 @@ ORT_API(const char*, OrtGetVersionString) { } ORT_API_STATUS_IMPL(OrtCreateEnv, OrtLoggingLevel default_warning_level, - _In_ const char* logid, _Out_ OrtEnv** out) { + _In_ const char* logid, _Outptr_ OrtEnv** out) { API_IMPL_BEGIN std::string name = logid; auto default_logging_manager = std::make_unique(std::unique_ptr{new CLogSink{}}, @@ -147,7 +147,7 @@ ORT_API_STATUS_IMPL(OrtGetStringTensorDataLength, _In_ const OrtValue* value, _O API_IMPL_END } -ORT_API_STATUS_IMPL(OrtFillStringTensor, _In_ OrtValue* value, _In_ const char* const* s, size_t s_len) { +ORT_API_STATUS_IMPL(OrtFillStringTensor, _Inout_ OrtValue* value, _In_ const char* const* s, size_t s_len) { TENSOR_READWRITE_API_BEGIN auto* dst = tensor->MutableData(); auto len = static_cast(tensor->Shape().Size()); @@ -206,7 +206,7 @@ OrtStatus* CreateTensorImpl(const int64_t* shape, size_t shape_len, const OrtAll */ ORT_API_STATUS_IMPL(OrtCreateTensorWithDataAsOrtValue, _In_ const OrtAllocatorInfo* info, _Inout_ void* p_data, size_t p_data_len, _In_ const int64_t* shape, size_t shape_len, - ONNXTensorElementDataType type, _Out_ OrtValue** out) { + ONNXTensorElementDataType type, _Outptr_ OrtValue** out) { API_IMPL_BEGIN std::unique_ptr tensor; switch (type) { @@ -272,7 +272,7 @@ ORT_API_STATUS_IMPL(OrtCreateTensorWithDataAsOrtValue, _In_ const OrtAllocatorIn ORT_API_STATUS_IMPL(OrtCreateTensorAsOrtValue, _Inout_ OrtAllocator* allocator, _In_ const int64_t* shape, size_t shape_len, ONNXTensorElementDataType type, - _Out_ OrtValue** out) { + _Outptr_ OrtValue** 
out) { API_IMPL_BEGIN std::unique_ptr tensor; switch (type) { @@ -336,7 +336,7 @@ ORT_API_STATUS_IMPL(OrtCreateTensorAsOrtValue, _Inout_ OrtAllocator* allocator, API_IMPL_END } -ORT_API_STATUS_IMPL(OrtCreateCustomOpDomain, _In_ const char* domain, _Out_ OrtCustomOpDomain** out) { +ORT_API_STATUS_IMPL(OrtCreateCustomOpDomain, _In_ const char* domain, _Outptr_ OrtCustomOpDomain** out) { API_IMPL_BEGIN auto custom_op_domain = std::make_unique(); custom_op_domain->domain_ = domain; @@ -365,8 +365,8 @@ ORT_API_STATUS_IMPL(OrtAddCustomOpDomain, _In_ OrtSessionOptions* options, OrtCu namespace { template -OrtStatus* CreateSessionImpl(_In_ OrtEnv* env, _In_ const OrtSessionOptions* options, - Loader loader, _Out_ OrtSession** out) { +OrtStatus* CreateSessionImpl(_In_ const OrtEnv* env, _In_ const OrtSessionOptions* options, + Loader loader, _Outptr_ OrtSession** out) { auto sess = std::make_unique<::onnxruntime::InferenceSession>( options == nullptr ? onnxruntime::SessionOptions() : options->value, env->loggingManager); Status status; @@ -395,8 +395,8 @@ OrtStatus* CreateSessionImpl(_In_ OrtEnv* env, _In_ const OrtSessionOptions* opt } } // namespace -ORT_API_STATUS_IMPL(OrtCreateSession, _In_ OrtEnv* env, _In_ const ORTCHAR_T* model_path, - _In_ const OrtSessionOptions* options, _Out_ OrtSession** out) { +ORT_API_STATUS_IMPL(OrtCreateSession, _In_ const OrtEnv* env, _In_ const ORTCHAR_T* model_path, + _In_ const OrtSessionOptions* options, _Outptr_ OrtSession** out) { API_IMPL_BEGIN const auto loader = [model_path](InferenceSession& sess) { return sess.Load(model_path); @@ -405,8 +405,8 @@ ORT_API_STATUS_IMPL(OrtCreateSession, _In_ OrtEnv* env, _In_ const ORTCHAR_T* mo API_IMPL_END } -ORT_API_STATUS_IMPL(OrtCreateSessionFromArray, _In_ OrtEnv* env, _In_ const void* model_data, size_t model_data_length, - _In_ const OrtSessionOptions* options, _Out_ OrtSession** out) { +ORT_API_STATUS_IMPL(OrtCreateSessionFromArray, _In_ const OrtEnv* env, _In_ const void* model_data, size_t model_data_length, + _In_ const OrtSessionOptions* options, _Outptr_ OrtSession** out) { API_IMPL_BEGIN const auto loader = [model_data, model_data_length](InferenceSession& sess) { return sess.Load(model_data, static_cast(model_data_length)); @@ -415,10 +415,10 @@ ORT_API_STATUS_IMPL(OrtCreateSessionFromArray, _In_ OrtEnv* env, _In_ const void API_IMPL_END } -ORT_API_STATUS_IMPL(OrtRun, _In_ OrtSession* sess, +ORT_API_STATUS_IMPL(OrtRun, _Inout_ OrtSession* sess, _In_ const OrtRunOptions* run_options, _In_ const char* const* input_names, _In_ const OrtValue* const* input, size_t input_len, - _In_ const char* const* output_names1, size_t output_names_len, _Out_ OrtValue** output) { + _In_ const char* const* output_names1, size_t output_names_len, _Outptr_ OrtValue** output) { API_IMPL_BEGIN auto session = reinterpret_cast<::onnxruntime::InferenceSession*>(sess); const int queue_id = 0; @@ -477,7 +477,7 @@ ORT_API_STATUS_IMPL(OrtRun, _In_ OrtSession* sess, API_IMPL_END } -ORT_API_STATUS_IMPL(OrtGetTensorMutableData, _In_ OrtValue* value, _Out_ void** output) { +ORT_API_STATUS_IMPL(OrtGetTensorMutableData, _Inout_ OrtValue* value, _Outptr_ void** output) { TENSOR_READWRITE_API_BEGIN //TODO: test if it's a string tensor *output = tensor->MutableDataRaw(); @@ -522,10 +522,10 @@ ORT_API_STATUS_IMPL(OrtGetStringTensorContent, _In_ const OrtValue* value, ORT_API_STATUS_IMPL(OrtTensorProtoToOrtValue, _In_ const void* input, int input_len, _In_opt_ const ORTCHAR_T* input_file_path, _Inout_ void* preallocated, size_t preallocated_size, 
- _Out_ OrtValue** out, _Out_ OrtCallback** deleter) { + _Outptr_ OrtValue** out, _Outptr_ OrtCallback** deleter) { API_IMPL_BEGIN OrtAllocatorInfo* cpuAllocatorInfo; - auto st = OrtCreateAllocatorInfo("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault, &cpuAllocatorInfo); + auto st = OrtCreateCpuAllocatorInfo(OrtDeviceAllocator, OrtMemTypeDefault, &cpuAllocatorInfo); if (st != nullptr) return st; ::ONNX_NAMESPACE::TensorProto proto; if (!proto.ParseFromArray(input, input_len)) { @@ -596,7 +596,7 @@ ORT_API_STATUS_IMPL(OrtSessionGetOutputCount, _In_ const OrtSession* sess, _Out_ API_IMPL_END } -ORT_API_STATUS_IMPL(OrtSessionGetInputTypeInfo, _In_ const OrtSession* sess, size_t index, _Out_ struct OrtTypeInfo** out) { +ORT_API_STATUS_IMPL(OrtSessionGetInputTypeInfo, _In_ const OrtSession* sess, size_t index, _Outptr_ struct OrtTypeInfo** out) { API_IMPL_BEGIN auto session = reinterpret_cast(sess); std::pair p = session->GetModelInputs(); @@ -608,7 +608,7 @@ ORT_API_STATUS_IMPL(OrtSessionGetInputTypeInfo, _In_ const OrtSession* sess, siz return OrtTypeInfo::FromDataTypeImpl(type_proto, out); API_IMPL_END } -ORT_API_STATUS_IMPL(OrtSessionGetOutputTypeInfo, _In_ const OrtSession* sess, size_t index, _Out_ struct OrtTypeInfo** out) { +ORT_API_STATUS_IMPL(OrtSessionGetOutputTypeInfo, _In_ const OrtSession* sess, size_t index, _Outptr_ struct OrtTypeInfo** out) { API_IMPL_BEGIN auto session = reinterpret_cast(sess); std::pair p = session->GetModelOutputs(); @@ -630,7 +630,7 @@ static char* StrDup(const std::string& str, OrtAllocator* allocator) { static OrtStatus* GetInputOutputNameImpl(_In_ const OrtSession* sess, size_t index, _Inout_ OrtAllocator* allocator, bool is_input, - _Out_ char** output) { + _Outptr_ char** output) { auto session = reinterpret_cast(sess); std::pair p = is_input ? 
session->GetModelInputs() : session->GetModelOutputs(); if (!p.first.IsOK()) @@ -650,7 +650,7 @@ ORT_API_STATUS_IMPL(OrtIsTensor, _In_ const OrtValue* value, int* out) { return nullptr; } -ORT_API_STATUS_IMPL(OrtAllocatorAlloc, _Inout_ OrtAllocator* ptr, size_t size, _Out_ void** out) { +ORT_API_STATUS_IMPL(OrtAllocatorAlloc, _Inout_ OrtAllocator* ptr, size_t size, _Outptr_ void** out) { API_IMPL_BEGIN *out = ptr->Alloc(ptr, size); return nullptr; @@ -664,7 +664,7 @@ ORT_API_STATUS_IMPL(OrtAllocatorFree, _Inout_ OrtAllocator* ptr, void* p) { API_IMPL_END } -ORT_API_STATUS_IMPL(OrtAllocatorGetInfo, _In_ const OrtAllocator* ptr, _Out_ const struct OrtAllocatorInfo** out) { +ORT_API_STATUS_IMPL(OrtAllocatorGetInfo, _In_ const OrtAllocator* ptr, _Outptr_ const struct OrtAllocatorInfo** out) { API_IMPL_BEGIN *out = ptr->Info(ptr); return nullptr; @@ -672,14 +672,14 @@ ORT_API_STATUS_IMPL(OrtAllocatorGetInfo, _In_ const OrtAllocator* ptr, _Out_ con } ORT_API_STATUS_IMPL(OrtSessionGetInputName, _In_ const OrtSession* sess, size_t index, - _Inout_ OrtAllocator* allocator, _Out_ char** output) { + _Inout_ OrtAllocator* allocator, _Outptr_ char** output) { API_IMPL_BEGIN return GetInputOutputNameImpl(sess, index, allocator, true, output); API_IMPL_END } ORT_API_STATUS_IMPL(OrtSessionGetOutputName, _In_ const OrtSession* sess, size_t index, - _Inout_ OrtAllocator* allocator, _Out_ char** output) { + _Inout_ OrtAllocator* allocator, _Outptr_ char** output) { API_IMPL_BEGIN return GetInputOutputNameImpl(sess, index, allocator, false, output); API_IMPL_END @@ -933,7 +933,7 @@ ORT_API_STATUS_IMPL(OrtGetValue, const OrtValue* value, int index, OrtAllocator* /////////////////// // OrtCreateValue template -static OrtStatus* OrtCreateValueImplSeqHelperMap(OrtValue** const in, size_t num_values, OrtValue** out) { +static OrtStatus* OrtCreateValueImplSeqHelperMap(const OrtValue* const* in, size_t num_values, OrtValue** out) { using SeqType = std::vector; auto vec_ptr = std::make_unique(); vec_ptr->reserve(num_values); @@ -951,7 +951,7 @@ static OrtStatus* OrtCreateValueImplSeqHelperMap(OrtValue** const in, size_t num } template -static OrtStatus* OrtCreateValueImplSeqHelper(OrtValue** in, size_t num_values, OrtValue** out) { +static OrtStatus* OrtCreateValueImplSeqHelper(const OrtValue* const* in, size_t num_values, OrtValue** out) { using SeqType = std::vector; auto vec_ptr = std::make_unique(); vec_ptr->reserve(num_values); @@ -972,7 +972,7 @@ static OrtStatus* OrtCreateValueImplSeqHelper(OrtValue** in, size_t num_values, return nullptr; } -static OrtStatus* OrtCreateValueImplSeq(OrtValue** in, size_t num_values, OrtValue** out) { +static OrtStatus* OrtCreateValueImplSeq(const OrtValue* const* in, size_t num_values, OrtValue** out) { // We only support limited sequence types. 
For the sake of simplicity the type of the first // OrtValue* in OrtValue** will determine the type of the vector used to create the output OrtValue // this type should be either a tensor of limited types or map of limited types @@ -1069,7 +1069,7 @@ static OrtStatus* OrtCreateValueImplMapHelper(const Tensor& key_tensor, const Te } } -static OrtStatus* OrtCreateValueImplMap(OrtValue** in, size_t num_values, OrtValue** out) { +static OrtStatus* OrtCreateValueImplMap(const OrtValue* const* in, size_t num_values, OrtValue** out) { if (num_values != NUM_MAP_INDICES) { return OrtCreateStatus(ORT_FAIL, "For map type num_values MUST be 2"); } @@ -1102,7 +1102,7 @@ static OrtStatus* OrtCreateValueImplMap(OrtValue** in, size_t num_values, OrtVal return OrtCreateStatus(ORT_FAIL, "Key type is not supported yet."); } -static OrtStatus* OrtCreateValueImpl(OrtValue** in, size_t num_values, enum ONNXType value_type, OrtValue** out) { +static OrtStatus* OrtCreateValueImpl(const OrtValue* const* in, size_t num_values, enum ONNXType value_type, OrtValue** out) { if (num_values <= 0) { return OrtCreateStatus(ORT_FAIL, "Number of values should be at least 1."); } @@ -1115,7 +1115,7 @@ static OrtStatus* OrtCreateValueImpl(OrtValue** in, size_t num_values, enum ONNX return OrtCreateStatus(ORT_FAIL, "Input is not of type sequence or map."); } -ORT_API_STATUS_IMPL(OrtCreateValue, OrtValue** in, size_t num_values, enum ONNXType value_type, OrtValue** out) { +ORT_API_STATUS_IMPL(OrtCreateValue, const OrtValue* const* in, size_t num_values, enum ONNXType value_type, OrtValue** out) { API_IMPL_BEGIN return OrtCreateValueImpl(in, num_values, value_type, out); API_IMPL_END diff --git a/onnxruntime/core/util/math.h b/onnxruntime/core/util/math.h index 70d9cb3630dd8..21593c69326ee 100644 --- a/onnxruntime/core/util/math.h +++ b/onnxruntime/core/util/math.h @@ -1,7 +1,3 @@ -/** -* Derived from caffe2, need copyright announcement here. -*/ - /** * Copyright (c) 2016-present, Facebook, Inc. 
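The `OrtCreateValue` entry point above now accepts `const OrtValue* const*`, so callers can hand over an array of const tensor pointers without casting away constness. A minimal call-site sketch, assuming `t0` and `t1` are tensor `OrtValue*` objects created elsewhere (only the function names that appear in the patch are real; the error-handling comment is a placeholder):

```
/* Sketch: build a sequence value from two existing tensors via the
   const-correct OrtCreateValue signature shown above. */
const OrtValue* elems[] = {t0, t1};
OrtValue* seq = nullptr;
OrtStatus* status = OrtCreateValue(elems, 2, ONNX_TYPE_SEQUENCE, &seq);
if (status != nullptr) {
  /* inspect the status, release it, and bail out */
}
```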
* @@ -36,7 +32,6 @@ extern "C" { #endif #include "core/common/common.h" -#include "core/framework/data_types.h" #include "core/framework/tensor.h" namespace onnxruntime { @@ -47,131 +42,39 @@ enum StorageOrder { NCHW = 2, }; -#define FLOAT_TYPE DataTypeImpl::GetType() - namespace math { template void Exp(int N, const T* x, T* y, Provider* provider); template -void Log(int N, const T* x, T* y, Provider* provider); -template -void Cos(int N, const T* x, T* y, Provider* provider); -template -void Sin(int N, const T* x, T* y, Provider* provider); -template -void SinCos(int N, const T* x, T* ys, T* yc, Provider* provider); -template -void Abs(int N, const T* x, T* y, Provider* provider); -template -void Sqrt(int N, const T* x, T* y, Provider* provider); -template -void InvSqrt(int N, const T* x, T* y, Provider* provider); -template void Sqr(int N, const T* x, T* y, Provider* provider); -template -void Not(int N, const T* x, T* y, Provider* provider); - template void Powx(int N, const T* a, T b, T* y, Provider* provider); -#define DECLARE_BINARY_OP_BINARY_RESULT(name) \ - template \ - void name(int N, const T* a, const T* b, bool* y, Provider* provider); \ - template \ - void name##ToRow(int M, int N, const T* a, const T* b, bool* y, Provider* provider); - -DECLARE_BINARY_OP_BINARY_RESULT(LT); -DECLARE_BINARY_OP_BINARY_RESULT(LE); -DECLARE_BINARY_OP_BINARY_RESULT(GT); -DECLARE_BINARY_OP_BINARY_RESULT(GE); - -DECLARE_BINARY_OP_BINARY_RESULT(And); -DECLARE_BINARY_OP_BINARY_RESULT(Or); -DECLARE_BINARY_OP_BINARY_RESULT(Xor); - -#undef DECLARE_BINARY_OP_BINARY_RESULT - #define DECLARE_BINARY_OP(name) \ template \ - void name(int N, const T* a, const T* b, T* y, Provider* provider); \ - template \ - void name##ToRow(int M, int N, const T* a, const T* b, T* y, Provider* provider); \ - template \ - void name##ToRow(int M, int N, const T* x, T* y, Provider* provider); \ - template \ - void name##ToCol(int M, int N, const T* x, T* y, Provider* provider); + void name(int N, const T* a, const T* b, T* y, Provider* provider); DECLARE_BINARY_OP(Add); -DECLARE_BINARY_OP(Sub); DECLARE_BINARY_OP(Mul); -DECLARE_BINARY_OP(Div); #undef DECLARE_BINARY_OP -template -void ReduceMin( - int N, - const T* x, - T* y, - Tensor* scratch_ptr, - Provider* provider); -template -void ReduceMax( - int N, - const T* x, - T* y, - Tensor* scratch_ptr, - Provider* provider); - -// Adds batch sub-tensors elementwise to output. Stripe is the stripe length -// and N is the number of elements to add (size of Y). -template -void AddStripedBatch( - int N, - const T* first, - T* y, - int stripe, - int batch, - Provider* provider); - -// Compute the row-wise sum of a N*D matrix X, and write it to a N -// dimensional vector y. -template -void RowwiseSum(int N, int D, const T* x, T* y, - Provider* provider); - -// Compute the column-wise sum of a N*D matrix X, and write it to a D -// dimensional vector y. -template -void ColwiseSum(int N, int D, const T* x, T* y, - Provider* provider); - // Compute the row-wise max of a N*D matrix X, and write it to a N // dimensional vector y. template void RowwiseMax(int N, int D, const T* x, T* y, Provider* provider); -// Compute the column-wise max of a N*D matrix X, and write it to a D -// dimensional vector y. -template -void ColwiseMax(int N, int D, const T* x, T* y, - Provider* provider); - -// Elemwise maximum of vector x and vector y. z[i] = max(x[i], y[i]) -template -void ElemwiseMax(int N, const T* x, const T* y, T* z, Provider* provider); - -// Elemwise maximum of vector x and scalar alpha. 
y[i] = max(x[i], alpha) -template -void Maximum( +template +void MatMul( + int M, int N, - float alpha, - const T* x, - T* y, - Provider* provider); + int K, + const T* A, + const T* B, + T* C); // Decaf gemm provides a simpler interface to the gemm functions, with the // limitation that the data has to be contiguous in memory. @@ -187,10 +90,7 @@ void Gemm( const T* B, float beta, T* C, - Provider* provider, - //Caffe2 use this type to control on GPU, what presicion do we want to do the calculation - //But not sure is this a good design for us. Keep it here for now. - MLDataType math_type = FLOAT_TYPE); + Provider* provider); // We also provide a gemm that has explicit lda, ldb and ldc specified. // In most cases you probably want to use the function above, though. @@ -211,27 +111,6 @@ void GemmEx( int ldc, Provider* provider); -// GemmBatched provides a simple abstraction into library routines -template -void GemmBatched( - CBLAS_TRANSPOSE TransA, - CBLAS_TRANSPOSE TransB, - int A_size, - int A_batches, - int B_size, - int B_batches, - int M, - int N, - int K, - float alpha, - const T* A, - const T* B, - float beta, - T* C, - Provider* provider, - Tensor* scratch = nullptr, - MLDataType math_type = DataTypeImpl::FLOAT_TYPE); - // Gemv always takes in a M*N matrix A, and depending on whether we set TransA // to Trans, the output is: // CblasNoTrans: x is an N dim vector and y is an M dim vector. @@ -246,59 +125,10 @@ void Gemv( const T* x, float beta, T* y, - Provider* provider, - MLDataType math_type = DataTypeImpl::FLOAT_TYPE); -template -void Set(int64_t N, T alpha, T* X, Provider* provider); - -template -void RandUniform(int n, T a, T b, const T* r, Provider* provider); - -template -void RandUniformUnique( - size_t n, - T a, - T b, - T* r, - size_t m, - const T* avoid, Provider* provider); template -void RandGaussian(int n, T mean, T std, const T* r, Provider* provider); - -// Dot matrix of vector a and b, and writes the result to a single value y. -template -void Dot(int N, const T* a, const T* b, T* y, Provider* provider); - -// Sum of vector x, and writes the result to a single value y. -template -void Sum(int N, const T* x, T* y, Provider* provider, - Tensor* scratch_ptr = nullptr); - -// Sum of squares of vector x, and writes the result to a single value y. -template -void SumSqr( - int N, - const T* x, - T* y, - Provider* provider, - Tensor* scratch_ptr = nullptr); - -// Select does index selection of the rows a N*D matrix x, and gives the N -// dimensional vector y that contains the selected data. -template -void Select(int N, int D, const T* x, const int* idx, T* y, - Provider* provider); - -template -void Scale(int N, float alpha, const T* x, T* y, Provider* provider); - -// Different from the Scale function above, if alpha is passed in -// as a pointer, we will assume that it lives on the correct execution provider, -// for example on GPU. 
-template -void Scale(int N, const float* alpha, const T* x, T* y, Provider* provider); +void Set(int64_t N, T alpha, T* X, Provider* provider); template void Axpy(int N, float alpha, const T* x, T* y, Provider* provider); @@ -309,15 +139,6 @@ void Axpy(int N, float alpha, const T* x, T* y, Provider* provider); template void Axpy(int N, const float* alpha, const T* x, T* y, Provider* provider); -template -void Axpby( - int N, - float alpha, - const T* x, - T b, - T* y, - Provider* provider); - template struct Im2colNd { void operator()( @@ -479,8 +300,6 @@ void CopyMatrix( template void CopyVector(int N, const T* A, T* B, Provider* provider); -uint32_t randomNumberSeed(); - // Function uses casting from int64_t to uint64_t to compare if value of // parameter a is greater or equal to zero and lower than value of // parameter b. The b parameter is of type signed and is always diff --git a/onnxruntime/core/util/math_cpu.cc b/onnxruntime/core/util/math_cpu.cc index 3359a2f3343d2..822d58ec63140 100644 --- a/onnxruntime/core/util/math_cpu.cc +++ b/onnxruntime/core/util/math_cpu.cc @@ -13,92 +13,29 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - -// Implements the math functions for CPU. -// The implementation in this file allows us to route the underlying numerical -// computation library to different backends. Notably: -// (1) For all BLAS-related functions, one can explicitly request a BLAS backend -// such as MKL, openblas or Atlas. To see the set of supported backends -// currently provided, check //third_party/blas/. -// (2) If one chooses to link against MKL, we utilize MKL's vector math library -// (VML) for a few functions such as Exp and Log. -// (3) Fallback implementations are provided in Eigen for cross-platform -// support. Since Eigen is a header-only library and supports a number of -// platforms, it allows one to quickly port Caffe2 to different platforms -// where BLAS may not be present. // Modifications Copyright (c) Microsoft. #include -#include -#include -#include -#include -#include "core/platform/env.h" -#include "core/common/logging/logging.h" -#include "core/providers/cpu/cpu_execution_provider.h" #include "core/util/math.h" #include "core/util/math_cpuonly.h" -#include "Eigen/src/Core/arch/GPU/Half.h" - -#if defined(USE_MLAS) #include "core/mlas/inc/mlas.h" -#endif +#include "Eigen/src/Core/arch/GPU/Half.h" namespace onnxruntime { namespace math { -// Gemm implementation purely based on Eigen. -template -void GemmEigen( - CBLAS_TRANSPOSE TransA, - CBLAS_TRANSPOSE TransB, - int64_t M, - int64_t N, - int64_t K, - float alpha, - const T* A, - const T* B, - float beta, - T* C) { - auto C_mat = EigenMatrixMap(C, N, M); - if (beta == 0) { - C_mat.setZero(); - } else { - C_mat *= static_cast(beta); +// MatMul implementation purely based on Eigen. 
+#define EIGEN_MATMUL_FUNCTION(T) \ + template <> \ + void MatMul(int M, int N, int K, const T* A, const T* B, T* C) { \ + auto C_mat = EigenMatrixMap(C, N, M); \ + C_mat.noalias() = ConstEigenMatrixMap(B, N, K) * ConstEigenMatrixMap(A, K, M); \ } - switch (TransA) { - case CblasNoTrans: { - switch (TransB) { - case CblasNoTrans: - C_mat.noalias() += static_cast(alpha) * (ConstEigenMatrixMap(B, N, K) * - ConstEigenMatrixMap(A, K, M)); - return; - case CblasTrans: - C_mat.noalias() += static_cast(alpha) * (ConstEigenMatrixMap(B, K, N).transpose() * - ConstEigenMatrixMap(A, K, M)); - return; - default: - ORT_THROW("CblasNoTrans Unexpected CBLAS_TRANSPOSE for TransB of ", TransB); - } - } - case CblasTrans: { - switch (TransB) { - case CblasNoTrans: - C_mat.noalias() += static_cast(alpha) * (ConstEigenMatrixMap(B, N, K) * - ConstEigenMatrixMap(A, M, K).transpose()); - return; - case CblasTrans: - C_mat.noalias() += static_cast(alpha) * (ConstEigenMatrixMap(B, K, N).transpose() * - ConstEigenMatrixMap(A, M, K).transpose()); - return; - default: - ORT_THROW("CblasTrans Unexpected CBLAS_TRANSPOSE for TransB of ", TransB); - } - } - default: - ORT_THROW("Unexpected CBLAS_TRANSPOSE for TransA of ", TransA); - } -} + +EIGEN_MATMUL_FUNCTION(int32_t) +EIGEN_MATMUL_FUNCTION(uint32_t) +EIGEN_MATMUL_FUNCTION(int64_t) +EIGEN_MATMUL_FUNCTION(uint64_t) //////////////////////////////////////////////////////////////////////////////// // BLAS alternatives. @@ -127,115 +64,31 @@ void GemmEigen( template <> void Gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int64_t M, const int64_t N, const int64_t K, float alpha, const float* A, const float* B, float beta, - float* C, CPUMathUtil* /*provider*/, MLDataType /*math_type*/) { -#if defined(USE_MLAS) + float* C, CPUMathUtil* /*provider*/) { int lda = static_cast((TransA == CblasNoTrans) ? K : M); int ldb = static_cast((TransB == CblasNoTrans) ? N : K); // TODO: Make this use the operator threadpool MlasSgemm(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N, nullptr); -#else - GemmEigen(TransA, TransB, M, N, K, alpha, A, B, beta, C); -#endif } template <> -void Gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int64_t M, - const int64_t N, const int64_t K, float alpha, const double* A, const double* B, - float beta, double* C, CPUMathUtil* /*provider*/, MLDataType /*math_type*/) { - // No double precision Gemm offering from MLAS or MKLDNN. Directly fallback to Eigen. - GemmEigen(TransA, TransB, M, N, K, alpha, A, B, beta, C); -} - -template <> -void Gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int64_t M, - const int64_t N, const int64_t K, float alpha, const int32_t* A, const int32_t* B, - float beta, int32_t* C, CPUMathUtil* /*provider*/, MLDataType /*math_type*/) { - // No int32_t Gemm offering from MLAS or MKLDNN. Directly fallback to Eigen. - GemmEigen(TransA, TransB, M, N, K, alpha, A, B, beta, C); -} - -template <> -void Gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int64_t M, - const int64_t N, const int64_t K, float alpha, const uint32_t* A, const uint32_t* B, - float beta, uint32_t* C, CPUMathUtil* /*provider*/, MLDataType /*math_type*/) { - // No uint32_t Gemm offering from MLAS or MKLDNN. Directly fallback to Eigen. 
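As the MLAS and Eigen bodies suggest, the new `math::MatMul<T>` helper takes row-major buffers with no transpose options: `EIGEN_MATMUL_FUNCTION` maps the row-major arrays into Eigen's column-major view (computing C^T = B^T * A^T), and the float path calls `MlasSgemm` with lda=K, ldb=N, ldc=N. A small usage sketch under that layout assumption:

```
// Usage sketch for the new math::MatMul helper: row-major, no transposes,
// C (MxN) = A (MxK) * B (KxN). A 2x3 by 3x2 product yields a 2x2 result.
#include "core/util/math.h"

void MatMulExample() {
  const float A[6] = {1, 2, 3, 4, 5, 6};     // 2x3, row-major
  const float B[6] = {7, 8, 9, 10, 11, 12};  // 3x2, row-major
  float C[4] = {0, 0, 0, 0};                 // 2x2 output
  onnxruntime::math::MatMul<float>(/*M*/ 2, /*N*/ 2, /*K*/ 3, A, B, C);
  // C now holds {58, 64, 139, 154}
}
```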
- GemmEigen(TransA, TransB, M, N, K, alpha, A, B, beta, C); -} - -template <> -void Gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int64_t M, - const int64_t N, const int64_t K, float alpha, const int64_t* A, const int64_t* B, - float beta, int64_t* C, CPUMathUtil* /*provider*/, MLDataType /*math_type*/) { - // No int64_t Gemm offering from MLAS or MKLDNN. Directly fallback to Eigen. - GemmEigen(TransA, TransB, M, N, K, alpha, A, B, beta, C); +void MatMul(int M, int N, int K, const float* A, const float* B, float* C) { + // TODO: Make this use the operator threadpool + MlasSgemm(CblasNoTrans, CblasNoTrans, M, N, K, 1.f, A, K, B, N, 0.f, C, N, nullptr); } -template <> -void Gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int64_t M, - const int64_t N, const int64_t K, float alpha, const uint64_t* A, const uint64_t* B, - float beta, uint64_t* C, CPUMathUtil* /*provider*/, MLDataType /*math_type*/) { - // No uint64_t Gemm offering from MLAS or MKLDNN. Directly fallback to Eigen. - GemmEigen(TransA, TransB, M, N, K, alpha, A, B, beta, C); -} +EIGEN_MATMUL_FUNCTION(double) template <> void GemmEx(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, int M, int N, int K, float alpha, const float* A, int lda, const float* B, int ldb, float beta, float* C, int ldc, CPUMathUtil*) { -#if defined(USE_MLAS) MlasSgemm(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, nullptr); -#else - using OuterStride = Eigen::OuterStride; - using StridedMap = Eigen::Map; - using ConstStridedMap = Eigen::Map; - auto C_mat = StridedMap(C, N, M, OuterStride(ldc)); - if (beta == 0) { - C_mat.setZero(); - } else { - C_mat *= beta; - } - switch (TransA) { - case CblasNoTrans: { - switch (TransB) { - case CblasNoTrans: - C_mat.noalias() += - alpha * (ConstStridedMap(B, N, K, OuterStride(ldb)) * - ConstStridedMap(A, K, M, OuterStride(lda))); - return; - case CblasTrans: - C_mat.noalias() += - alpha * (ConstStridedMap(B, K, N, OuterStride(ldb)).transpose() * - ConstStridedMap(A, K, M, OuterStride(lda))); - return; - default: - ORT_THROW("CblasNoTrans Unexpected CBLAS_TRANSPOSE for TransB of ", TransB); - } - } - case CblasTrans: { - switch (TransB) { - case CblasNoTrans: - C_mat.noalias() += - alpha * (ConstStridedMap(B, N, K, OuterStride(ldb)) * - ConstStridedMap(A, M, K, OuterStride(lda)).transpose()); - return; - case CblasTrans: - C_mat.noalias() += - alpha * (ConstStridedMap(B, K, N, OuterStride(ldb)).transpose() * - ConstStridedMap(A, M, K, OuterStride(lda)).transpose()); - return; - default: - ORT_THROW("CblasTrans Unexpected CBLAS_TRANSPOSE for TransB of ", TransB); - } - } - default: - ORT_THROW("Unexpected CBLAS_TRANSPOSE for TransA of ", TransA); - } -#endif } template <> void Gemv(const CBLAS_TRANSPOSE TransA, int M, int N, float alpha, const float* A, const float* x, - float beta, float* y, CPUMathUtil* /*provider*/, MLDataType /*math_type*/) { + float beta, float* y, CPUMathUtil* /*provider*/) { EigenVectorMap y_vec(y, TransA == CblasNoTrans ? 
M : N); if (beta == 0) { // In Caffe2 we often do a lazy initialization, which may contain NaNs in @@ -260,26 +113,6 @@ void Gemv(const CBLAS_TRANSPOSE TransA, int M, int N, float } } -#define SPECIALIZED_SCALE(T) \ - template <> \ - void Scale(int n, float alpha, const T* x, T* y, CPUMathUtil* /*provider*/) { \ - EigenVectorMap(y, n) = ConstEigenVectorMap(x, n) * alpha; \ - } \ - template <> \ - void Scale(int n, const float* alpha, const T* x, T* y, CPUMathUtil* /*provider*/) { \ - EigenVectorMap(y, n) = ConstEigenVectorMap(x, n) * (*alpha); \ - } -SPECIALIZED_SCALE(float) -#undef SPECIALIZED_SCALE - -#define SPECIALIZED_DOT(T) \ - template <> \ - void Dot(int N, const T* a, const T* b, T* y, CPUMathUtil* /*provider*/) { \ - *y = ConstEigenVectorMap(a, N).dot(ConstEigenVectorMap(b, N)); \ - } -SPECIALIZED_DOT(float) -#undef SPECIALIZED_DOT - #define SPECIALIZED_AXPY(T) \ template <> \ void Axpy(int N, const T alpha, const T* x, T* Y, CPUMathUtil* /*provider*/) { \ @@ -292,21 +125,12 @@ SPECIALIZED_DOT(float) SPECIALIZED_AXPY(float) #undef SPECIALIZED_AXPY -#define SPECIALIZED_AXPBY(T) \ - template <> \ - void Axpby(int N, const T alpha, const T* x, const T beta, T* y, CPUMathUtil* /*context*/) { \ - EigenVectorMap y_vec(y, N); \ - y_vec = y_vec * beta + ConstEigenVectorMap(x, N) * alpha; \ - } -SPECIALIZED_AXPBY(float) -#undef SPECIALIZED_AXPBY - #else // USE_EIGEN_FOR_BLAS template <> void Gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int64_t M, const int64_t N, const int64_t K, float alpha, const float* A, const float* B, float beta, - float* C, CPUMathUtil* /*context*/, MLDataType /*math_type*/) { + float* C, CPUMathUtil* /*context*/) { int lda = gsl::narrow_cast((TransA == CblasNoTrans) ? K : M); int ldb = gsl::narrow_cast((TransB == CblasNoTrans) ? N : K); cblas_sgemm(CblasRowMajor, TransA, TransB, @@ -318,46 +142,13 @@ void Gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOS } template <> -void Gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int64_t M, - const int64_t N, const int64_t K, float alpha, const double* A, const double* B, - float beta, double* C, CPUMathUtil* /*provider*/, MLDataType /*math_type*/) { - int lda = gsl::narrow_cast((TransA == CblasNoTrans) ? K : M); - int ldb = gsl::narrow_cast((TransB == CblasNoTrans) ? N : K); - cblas_dgemm(CblasRowMajor, TransA, TransB, gsl::narrow_cast(M), gsl::narrow_cast(N), - gsl::narrow_cast(K), gsl::narrow_cast(alpha), A, lda, B, ldb, gsl::narrow_cast(beta), - C, gsl::narrow_cast(N)); +void MatMul(int M, int N, int K, const float* A, const float* B, float* C) { + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, 1, A, K, B, N, 0, C, N); } template <> -void Gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int64_t M, - const int64_t N, const int64_t K, float alpha, const int32_t* A, const int32_t* B, - float beta, int32_t* C, CPUMathUtil* /*provider*/, MLDataType /*math_type*/) { - // No int32_t Gemm offering from MKLML. Directly fallback to Eigen. - GemmEigen(TransA, TransB, M, N, K, alpha, A, B, beta, C); -} - -template <> -void Gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int64_t M, - const int64_t N, const int64_t K, float alpha, const uint32_t* A, const uint32_t* B, - float beta, uint32_t* C, CPUMathUtil* /*provider*/, MLDataType /*math_type*/) { - // No uint32_t Gemm offering from MKLML. Directly fallback to Eigen. 
- GemmEigen(TransA, TransB, M, N, K, alpha, A, B, beta, C); -} - -template <> -void Gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int64_t M, - const int64_t N, const int64_t K, float alpha, const int64_t* A, const int64_t* B, - float beta, int64_t* C, CPUMathUtil* /*provider*/, MLDataType /*math_type*/) { - // No int64_t Gemm offering from MKLML. Directly fallback to Eigen. - GemmEigen(TransA, TransB, M, N, K, alpha, A, B, beta, C); -} - -template <> -void Gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int64_t M, - const int64_t N, const int64_t K, float alpha, const uint64_t* A, const uint64_t* B, - float beta, uint64_t* C, CPUMathUtil* /*provider*/, MLDataType /*math_type*/) { - // No uint64_t Gemm offering from MKLML. Directly fallback to Eigen. - GemmEigen(TransA, TransB, M, N, K, alpha, A, B, beta, C); +void MatMul(int M, int N, int K, const double* A, const double* B, double* C) { + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, 1, A, K, B, N, 0, C, N); } template <> @@ -370,32 +161,10 @@ void GemmEx(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSP template <> void Gemv(const CBLAS_TRANSPOSE TransA, int M, int N, float alpha, const float* A, const float* x, - float beta, float* y, CPUMathUtil* /*context*/, MLDataType /*math_type*/) { + float beta, float* y, CPUMathUtil* /*context*/) { cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); } -#define CAFFE2_SPECIALIZED_SCALE(T, prefix) \ - template <> \ - void Scale(int n, float alpha, const T* x, T* y, CPUMathUtil*) { \ - if (y != x) cblas_##prefix##copy(n, x, 1, y, 1); \ - cblas_##prefix##scal(n, static_cast(alpha), y, 1); \ - } \ - template <> \ - void Scale(int n, const float* alpha, const T* x, T* y, CPUMathUtil*) { \ - if (y != x) cblas_##prefix##copy(n, x, 1, y, 1); \ - cblas_##prefix##scal(n, static_cast(*alpha), y, 1); \ - } -CAFFE2_SPECIALIZED_SCALE(float, s) -#undef CAFFE2_SPECIALIZED_SCALE - -#define CAFFE2_SPECIALIZED_DOT(T, prefix) \ - template <> \ - void Dot(int N, const T* a, const T* b, T* y, CPUMathUtil*) { \ - *y = cblas_##prefix##dot(N, a, 1, b, 1); \ - } -CAFFE2_SPECIALIZED_DOT(float, s) -#undef CAFFE2_SPECIALIZED_DOT - #define CAFFE2_SPECIALIZED_AXPY(T, prefix) \ template <> \ void Axpy(int N, const T alpha, const T* x, T* y, CPUMathUtil*) { \ @@ -408,78 +177,17 @@ CAFFE2_SPECIALIZED_DOT(float, s) CAFFE2_SPECIALIZED_AXPY(float, s) #undef CAFFE2_SPECIALIZED_AXPY -#define CAFFE2_SPECIALIZED_AXPBY(T, prefix) \ - template <> \ - void Axpby(int N, const T alpha, const T* x, const T beta, T* y, CPUMathUtil*) { \ - cblas_##prefix##scal(N, beta, y, 1); \ - cblas_##prefix##axpy(N, alpha, x, 1, y, 1); \ - } -CAFFE2_SPECIALIZED_AXPBY(float, s) -#undef CAFFE2_SPECIALIZED_AXPBY - #endif // USE_EIGEN_FOR_BLAS -template <> -void GemmBatched(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, int A_size, - int A_batches, int B_size, int B_batches, int M, int N, int K, float /*alpha*/, - const float* A, const float* B, float /*beta*/, float* C, CPUMathUtil* provider, - Tensor*, /* scratch */ - MLDataType /* math_type */) { - auto a_offset = A_size / A_batches; - auto b_offset = B_size / B_batches; - auto y_offset = M * N; - // loop over matrices in the batch - for (int i = 0; i < A_batches; ++i) { - math::Gemm( - TransA, - TransB, - M, - N, - K, - 1, - A + a_offset * i, - B + b_offset * i, - 0, - C + y_offset * i, - provider); - } -} - - // MKL will be implmenet as an execution provider - 
//////////////////////////////////////////////////////////////////////////////// - // MKL VML alternatives. - // Depending on whether we are using MKL, we will delegate the Caffe math - // functions that are VML-related to either the VML call or the Eigen - // implementation. If you are setting the flags (such as AVX) right for your CPU - // architecture, usually Eigen will deliver a throughput as fast as the VML - // functions. - //////////////////////////////////////////////////////////////////////////////// - #define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, expr) \ template <> \ void Funcname(int N, const T* x, T* y, CPUMathUtil*) { \ EigenVectorMap(y, N) = ConstEigenVectorMap(x, N).array().expr(); \ } DELEGATE_SIMPLE_UNARY_FUNCTION(float, Exp, exp) -DELEGATE_SIMPLE_UNARY_FUNCTION(float, Log, log) -DELEGATE_SIMPLE_UNARY_FUNCTION(float, Cos, cos) -DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sin, sin) -DELEGATE_SIMPLE_UNARY_FUNCTION(float, Abs, abs) -DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sqrt, sqrt) -DELEGATE_SIMPLE_UNARY_FUNCTION(float, InvSqrt, rsqrt) DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sqr, square) #undef DELEGATE_SIMPLE_UNARY_FUNCTION -#define DELEGATE_SINCOS_FUNCTION(T) \ - template <> \ - void SinCos(int N, const T* x, T* ys, T* yc, CPUMathUtil*) { \ - EigenVectorMap(ys, N) = ConstEigenVectorMap(x, N).array().sin(); \ - EigenVectorMap(yc, N) = ConstEigenVectorMap(x, N).array().cos(); \ - } -DELEGATE_SINCOS_FUNCTION(float) -DELEGATE_SINCOS_FUNCTION(double) -#undef DELEGATE_SINCOS_FUNCTION - #define DELEGATE_POWX_FUNCTION(T) \ template <> \ void Powx(int N, const T* a, T b, T* y, CPUMathUtil*) { \ @@ -500,9 +208,7 @@ DELEGATE_POWX_FUNCTION(float) EIGEN_SIMPLE_BINARY_FUNCTION(int64_t, Funcname, expr) DEFINE_SIMPLE_BINARY_FUNCTION(Add, +) -DEFINE_SIMPLE_BINARY_FUNCTION(Sub, -) DEFINE_SIMPLE_BINARY_FUNCTION(Mul, *) -DEFINE_SIMPLE_BINARY_FUNCTION(Div, /) #undef EIGEN_SIMPLE_BINARY_FUNCTION #undef DEFINE_FLOAT_BINARY_FUNCTION @@ -513,41 +219,6 @@ DEFINE_SIMPLE_BINARY_FUNCTION(Div, /) // Eigen or via custom code. 
//////////////////////////////////////////////////////////////////////////////// -#define SPECIALIZED_REDUCEMIN(T) \ - template <> \ - void ReduceMin(int N, const T* x, T* y, Tensor* /*scratch_ptr*/, CPUMathUtil* /*context*/) { \ - *y = *std::min_element(x, x + N); \ - } -SPECIALIZED_REDUCEMIN(float) -#undef SPECIALIZED_REDUCEMIN - -#define SPECIALIZED_REDUCEMAX(T) \ - template <> \ - void ReduceMax(int N, const T* x, T* y, Tensor* /*scratch_ptr*/, CPUMathUtil* /*context*/) { \ - *y = *std::max_element(x, x + N); \ - } -SPECIALIZED_REDUCEMAX(float) -SPECIALIZED_REDUCEMAX(int32_t) -SPECIALIZED_REDUCEMAX(int64_t) - -#undef SPECIALIZED_REDUCEMAX - -#define SPECIALIZED_ROWWISESUM(T) \ - template <> \ - void RowwiseSum(int N, int D, const T* x, T* y, CPUMathUtil*) { \ - EigenVectorMap(y, N) = ConstEigenMatrixMap(x, D, N).colwise().sum(); \ - } -SPECIALIZED_ROWWISESUM(float) -#undef SPECIALIZED_ROWWISESUM - -#define SPECIALIZED_COLWISESUM(T) \ - template <> \ - void ColwiseSum(int N, int D, const T* x, T* y, CPUMathUtil*) { \ - EigenVectorMap(y, D) = ConstEigenMatrixMap(x, D, N).rowwise().sum(); \ - } -SPECIALIZED_COLWISESUM(float) -#undef SPECIALIZED_COLWISESUM - #define SPECIALIZED_ROWWISEMAX(T) \ template <> \ void RowwiseMax(int N, int D, const T* x, T* y, CPUMathUtil*) { \ @@ -556,61 +227,6 @@ SPECIALIZED_COLWISESUM(float) SPECIALIZED_ROWWISEMAX(float) #undef SPECIALIZED_ROWWISEMAX -#define SPECIALIZED_COLWISEMAX(T) \ - template <> \ - void ColwiseMax(int N, int D, const T* x, T* y, CPUMathUtil*) { \ - EigenVectorMap(y, D) = ConstEigenMatrixMap(x, D, N).rowwise().maxCoeff(); \ - } -SPECIALIZED_COLWISEMAX(float) -#undef SPECIALIZED_COLWISEMAX - -#define SPECIALIZED_ELEMWISEMAX(T) \ - template <> \ - void ElemwiseMax(int N, const T* x, const T* y, T* z, CPUMathUtil* /*context*/) { \ - std::transform(x, x + N, y, z, [](const T& x_i, const T& y_i) { return std::max(x_i, y_i); }); \ - } -SPECIALIZED_ELEMWISEMAX(float) -#undef SPECIALIZED_ELEMWISEMAX - -#define SPECIALIZED_MAXIMUM(T) \ - template <> \ - void Maximum(int N, float alpha, const T* x, T* y, CPUMathUtil* /*provider*/) { \ - std::transform(x, x + N, y, [&alpha](const T& x_i) { return std::max(x_i, alpha); }); \ - } -SPECIALIZED_MAXIMUM(float) -#undef SPECIALIZED_MAXIMUM - -// AddToRow and AddToCol adds the corresponding row/col vector b to the matrix a -// of shape M x N. The actual implementation uses eigen which is column major, -// so notice the row/column swap in the actual implementation. 
-#define DELEGATE_BROADCAST_BINARY_FUNCTION(T, Funcname, expr) \ - template <> \ - void Funcname##ToRow(int M, int N, const T* a, const T* b, T* y, CPUMathUtil*) { \ - EigenArrayMap(y, N, M) = ConstEigenArrayMap(a, N, M).colwise() expr ConstEigenVectorArrayMap(b, N); \ - } \ - /* inplace versions */ \ - template <> \ - void Funcname##ToRow(int M, int N, const T* x, T* y, CPUMathUtil*) { \ - EigenArrayMap(y, N, M).colwise() expr## = ConstEigenVectorArrayMap(x, N); \ - } \ - template <> \ - void Funcname##ToCol(int M, int N, const T* x, T* y, CPUMathUtil*) { \ - EigenArrayMap(y, N, M).rowwise() expr## = ConstEigenVectorArrayMap(x, M).transpose(); \ - } - -#define DEFINE_BROADCAST_BINARY_FUNCTION(name, op) \ - DELEGATE_BROADCAST_BINARY_FUNCTION(int32_t, name, op) \ - DELEGATE_BROADCAST_BINARY_FUNCTION(int64_t, name, op) \ - DELEGATE_BROADCAST_BINARY_FUNCTION(float, name, op) - -DEFINE_BROADCAST_BINARY_FUNCTION(Add, +) -DEFINE_BROADCAST_BINARY_FUNCTION(Sub, -) -DEFINE_BROADCAST_BINARY_FUNCTION(Mul, *) -DEFINE_BROADCAST_BINARY_FUNCTION(Div, /) - -#undef DEFINE_BROADCAST_BINARY_FUNCTION -#undef DELEGATE_BROADCAST_BINARY_FUNCTION - #define SPECIALIZED_SET(T) \ template <> \ void Set(const int64_t N, const T alpha, T* Y, CPUMathUtil*) { \ @@ -633,150 +249,6 @@ SPECIALIZED_SET(uint8_t); SPECIALIZED_SET(uint16_t); #undef SPECIALIZED_SET -#define INSTANTIATE_BINARY_OP(name, op, T) \ - template <> \ - void name(int n, const T* a, const T* b, bool* y, CPUMathUtil*) { \ - for (int i = 0; i < n; ++i) { \ - y[i] = a[i] op b[i]; \ - } \ - } \ - template <> \ - void name##ToRow(int m, int n, const T* a, const T* b, bool* y, CPUMathUtil*) { \ - for (int i = 0; i < n * m; ++i) { \ - y[i] = a[i] op b[i % n]; \ - } \ - } - -#define DEFINE_BINARY_OP(name, op) \ - INSTANTIATE_BINARY_OP(name, op, float) \ - INSTANTIATE_BINARY_OP(name, op, int32_t) \ - INSTANTIATE_BINARY_OP(name, op, int64_t) - -DEFINE_BINARY_OP(LT, <); -DEFINE_BINARY_OP(LE, <=); -DEFINE_BINARY_OP(GT, >); -DEFINE_BINARY_OP(GE, >=); - -INSTANTIATE_BINARY_OP(Or, |, bool); -INSTANTIATE_BINARY_OP(And, &, bool); -INSTANTIATE_BINARY_OP(Xor, ^, bool); - -template <> -void Not(int n, const bool* x, bool* y, CPUMathUtil* /*context*/) { - for (int i = 0; i < n; ++i) { - y[i] = !x[i]; - } -} - -#undef DEFINE_BINARY_OP -#undef INSTANTIATE_BINARY_OP - -#define SPECIALIZED_CPU_ADD_STRIPED_BATCH(T) \ - template <> \ - void AddStripedBatch(int N, const T* first, T* y, int stripe, int batch, CPUMathUtil* provider) { \ - for (int j = 0; j < batch; j++) { \ - Add(N, first + j * stripe, y, y, provider); \ - } \ - } - -SPECIALIZED_CPU_ADD_STRIPED_BATCH(float); -#undef SPECIALIZED_CPU_ADD_STRIPED_BATCH - -template <> -void RandUniform(int n, float a, float b, const float* r, CPUMathUtil* /*provider*/) { - std::uniform_real_distribution distribution(a, b); - //todo: need implmenet "RandGenerator()" in execution provider - ORT_UNUSED_PARAMETER(n); - ORT_UNUSED_PARAMETER(r); - ORT_NOT_IMPLEMENTED(__FUNCTION__, " is not implemented"); - /*for (int i = 0; i < n; ++i) { - r[i] = distribution(context->RandGenerator()); - }*/ -} - -template <> -void RandUniform(int n, int a, int b, const int* r, CPUMathUtil* /*provider*/) { - std::uniform_int_distribution distribution(a, b); - //todo: need implmenet "RandGenerator()" in execution provider - ORT_UNUSED_PARAMETER(n); - ORT_UNUSED_PARAMETER(r); - ORT_NOT_IMPLEMENTED(__FUNCTION__, " is not implemented"); - /*for (int i = 0; i < n; ++i) { - r[i] = distribution(context->RandGenerator()); - }*/ -} - -//todo: need implmenet 
"RandGenerator()" in execution provider - -//#define CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(T) \ -// template <> \ -// void RandUniformUnique( \ -// const size_t n, \ -// const T a, \ -// const T b, \ -// T* r, \ -// const size_t m, \ -// const T* avoid, \ -// CPUContext* context) { \ -// CAFFE_ENFORCE_LE( \ -// n, b - a - m + 1, "Cannot satisfy the unique requirement"); \ -// std::unordered_set avoid_set(n); \ -// if (m) { \ -// avoid_set.insert(avoid, avoid + m); \ -// CAFFE_ENFORCE_EQ(m, avoid_set.size(), "Avoid should be unique"); \ -// } \ -// std::uniform_int_distribution distribution(a, b); \ -// T v = 0; \ -// for (size_t i = 0; i < n; ++i) { \ -// do { \ -// v = distribution(context->RandGenerator()); \ -// } while (avoid_set.count(v)); \ -// r[i] = v; \ -// avoid_set.insert(v); \ -// } \ -// } -// -// CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(int32_t); -// CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(int64_t); -//#undef CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE - -template <> -void RandGaussian(int n, float mean, float std, const float* r, CPUMathUtil* /*provider*/) { - std::normal_distribution distribution(mean, std); - ORT_UNUSED_PARAMETER(n); - ORT_UNUSED_PARAMETER(r); - ORT_NOT_IMPLEMENTED(__FUNCTION__, " is not implemented"); - /*for (int i = 0; i < n; ++i) { - r[i] = distribution(context->RandGenerator()); - }*/ -} - -#define SPECIALIZED_SUM(T) \ - template <> \ - void Sum(int N, const T* x, T* y, CPUMathUtil* /* unused */, Tensor* /* unused */) { \ - *y = ConstEigenVectorMap(x, N).sum(); \ - } - -SPECIALIZED_SUM(float); -SPECIALIZED_SUM(int32_t); -SPECIALIZED_SUM(int64_t); - -#undef SPECIALIZED_SUM - -template <> -void SumSqr(int N, const float* x, float* y, CPUMathUtil* /*context*/ /* unused */, - Tensor* /*scratch_ptr*/ /* unused */) { - *y = ConstEigenVectorMap(x, N).squaredNorm(); -} - -template <> -void Select(int N, int D, const float* x, const int* idx, float* y, CPUMathUtil* /*context*/) { - for (int i = 0; i < N; ++i) { - ORT_ENFORCE(idx[i] < D); - y[i] = x[i * D + idx[i]]; - } -} - template <> void Col2imNd(const float* data_col, const int64_t* img_shape, const int64_t* col_shape, int64_t img_size, int64_t col_size, @@ -1092,24 +564,6 @@ void Col2im(const float* data_col, int64 SPECIALIZED_COPYVECTOR(float) #undef SPECIALIZED_COPYVECTOR -uint32_t randomNumberSeed() { - // Originally copied from folly::randomNumberSeed (at 418ad4) - // modified to use chrono instead of sys/time.h - static std::atomic seedInput(0); - auto tv = std::chrono::system_clock::now().time_since_epoch(); - uint64_t usec = static_cast( - std::chrono::duration_cast(tv).count()); - uint32_t tv_sec = static_cast(usec / 1000000); - uint32_t tv_usec = static_cast(usec % 1000000); - const uint32_t kPrime0 = 51551; - const uint32_t kPrime1 = 61631; - const uint32_t kPrime2 = 64997; - const uint32_t kPrime3 = 111857; - static const uint32_t pid = static_cast(Env::Default().GetSelfPid()); - return kPrime0 * (seedInput++) + kPrime1 * pid + - kPrime2 * tv_sec + kPrime3 * tv_usec; -} - uint16_t floatToHalf(float f) { return Eigen::half_impl::float_to_half_rtne(f).x; } diff --git a/onnxruntime/python/_ld_preload.py b/onnxruntime/python/_ld_preload.py new file mode 100644 index 0000000000000..a67f27f68da86 --- /dev/null +++ b/onnxruntime/python/_ld_preload.py @@ -0,0 +1,10 @@ +#------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+#-------------------------------------------------------------------------- + +# This file can be modified by setup.py when building a manylinux2010 wheel +# When modified, it will preload some libraries needed for the python C extension +# Do not remove or move the following comment + +# LD_PRELOAD_BEGIN_MARK diff --git a/onnxruntime/python/_pybind_state.py b/onnxruntime/python/_pybind_state.py index 2feda322a331d..c48bb8bdd0778 100644 --- a/onnxruntime/python/_pybind_state.py +++ b/onnxruntime/python/_pybind_state.py @@ -5,6 +5,7 @@ import sys import os import warnings +import onnxruntime.capi._ld_preload try: from onnxruntime.capi.onnxruntime_pybind11_state import * # noqa diff --git a/onnxruntime/python/datasets/logreg_iris.onnx b/onnxruntime/python/datasets/logreg_iris.onnx index e15b27a5da40e..0326870f825d1 100644 Binary files a/onnxruntime/python/datasets/logreg_iris.onnx and b/onnxruntime/python/datasets/logreg_iris.onnx differ diff --git a/onnxruntime/python/datasets/mul_1.pb b/onnxruntime/python/datasets/mul_1.onnx similarity index 100% rename from onnxruntime/python/datasets/mul_1.pb rename to onnxruntime/python/datasets/mul_1.onnx diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index e29c6cf3f8850..83b6c878385d6 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -21,7 +21,6 @@ #define BACKEND_OPENMP "" #endif - #if USE_MKLDNN #define BACKEND_MKLDNN "-MKL-DNN" #include "core/providers/mkldnn/mkldnn_execution_provider.h" @@ -48,8 +47,6 @@ #define BACKEND_OPENVINO "" #endif - - #if USE_OPENBLAS #define BACKEND_OPENBLAS "-OPENBLAS" #else @@ -465,8 +462,11 @@ including arg name, arg type (contains both type and shape).)pbdoc") if (shape->dim(i).has_dim_value()) { res << shape->dim(i).dim_value(); } else if (shape->dim(i).has_dim_param()) { + res << "'" << shape->dim(i).dim_param() << "'"; + } else { res << "None"; } + if (i < shape->dim_size() - 1) { res << ", "; } @@ -491,6 +491,8 @@ including arg name, arg type (contains both type and shape).)pbdoc") if (shape->dim(i).has_dim_value()) { arr[i] = py::cast(shape->dim(i).dim_value()); } else if (shape->dim(i).has_dim_param()) { + arr[i] = py::cast(shape->dim(i).dim_param()); + } else { arr[i] = py::none(); } } diff --git a/onnxruntime/python/tools/quantization/README.md b/onnxruntime/python/tools/quantization/README.md index 19c8e2cd55c64..6f0ccfe48e45e 100644 --- a/onnxruntime/python/tools/quantization/README.md +++ b/onnxruntime/python/tools/quantization/README.md @@ -62,7 +62,7 @@ onnx.save(quantized_model, 'path/to/the/quantized_model.onnx') See below for a description of all the options to quantize(): - **model**: ModelProto to quantize -- **per_channel**: *default: True* +- **per_channel**: *default: False* If True, weights of Conv nodes are quantized per output channel. If False, they are quantized per tensor. Refer [QLinearConv](https://github.com/onnx/onnx/blob/master/docs/Operators.md#qlinearconv) for more information. 
- **nbits**: *default: 8* diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py index 3bf180e84c3cf..deb4dddf33b45 100644 --- a/onnxruntime/python/tools/quantization/quantize.py +++ b/onnxruntime/python/tools/quantization/quantize.py @@ -990,7 +990,7 @@ def _quantize_matmul(self, node, new_nodes_list): return [node] -def quantize(model, per_channel=True, nbits=8, quantization_mode=QuantizationMode.IntegerOps, +def quantize(model, per_channel=False, nbits=8, quantization_mode=QuantizationMode.IntegerOps, static=False, asymmetric_input_types=False, input_quantization_params=None, output_quantization_params=None): ''' Given an onnx model, create a quantized onnx model and save it into a file diff --git a/onnxruntime/server/converter.cc b/onnxruntime/server/converter.cc index 7be3e1fe72546..523c022de7545 100644 --- a/onnxruntime/server/converter.cc +++ b/onnxruntime/server/converter.cc @@ -1,74 +1,76 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include -#include "core/common/logging/logging.h" -#include "core/framework/data_types.h" -#include "core/session/environment.h" -#include "core/framework/framework_common.h" -#include "core/framework/mem_buffer.h" -#include "core/framework/ml_value.h" -#include "core/framework/tensor.h" -#include "core/framework/tensorprotoutils.h" +#include "core/session/onnxruntime_cxx_api.h" #include "onnx-ml.pb.h" #include "predict.pb.h" #include "converter.h" +#include "serializing/mem_buffer.h" namespace onnxruntime { namespace server { namespace protobufutil = google::protobuf::util; -onnx::TensorProto_DataType MLDataTypeToTensorProtoDataType(const onnxruntime::DataTypeImpl* cpp_type) { - if (cpp_type == onnxruntime::DataTypeImpl::GetType()) { - return onnx::TensorProto_DataType_FLOAT; - } else if (cpp_type == onnxruntime::DataTypeImpl::GetType()) { - return onnx::TensorProto_DataType_UINT8; - } else if (cpp_type == onnxruntime::DataTypeImpl::GetType()) { - return onnx::TensorProto_DataType_INT8; - } else if (cpp_type == onnxruntime::DataTypeImpl::GetType()) { - return onnx::TensorProto_DataType_UINT16; - } else if (cpp_type == onnxruntime::DataTypeImpl::GetType()) { - return onnx::TensorProto_DataType_INT16; - } else if (cpp_type == onnxruntime::DataTypeImpl::GetType()) { - return onnx::TensorProto_DataType_INT32; - } else if (cpp_type == onnxruntime::DataTypeImpl::GetType()) { - return onnx::TensorProto_DataType_INT64; - } else if (cpp_type == onnxruntime::DataTypeImpl::GetType()) { - return onnx::TensorProto_DataType_STRING; - } else if (cpp_type == onnxruntime::DataTypeImpl::GetType()) { - return onnx::TensorProto_DataType_BOOL; - } else if (cpp_type == onnxruntime::DataTypeImpl::GetType()) { - return onnx::TensorProto_DataType_FLOAT16; - } else if (cpp_type == onnxruntime::DataTypeImpl::GetType()) { - return onnx::TensorProto_DataType_BFLOAT16; - } else if (cpp_type == onnxruntime::DataTypeImpl::GetType()) { - return onnx::TensorProto_DataType_DOUBLE; - } else if (cpp_type == onnxruntime::DataTypeImpl::GetType()) { - return onnx::TensorProto_DataType_UINT32; - } else if (cpp_type == onnxruntime::DataTypeImpl::GetType()) { - return onnx::TensorProto_DataType_UINT64; - } else { - return onnx::TensorProto_DataType_UNDEFINED; +onnx::TensorProto_DataType MLDataTypeToTensorProtoDataType(ONNXTensorElementDataType onnx_enum) { + switch (onnx_enum) { + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: + return 
onnx::TensorProto_DataType::TensorProto_DataType_FLOAT; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8: + return onnx::TensorProto_DataType::TensorProto_DataType_UINT8; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8: + return onnx::TensorProto_DataType::TensorProto_DataType_INT8; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16: + return onnx::TensorProto_DataType::TensorProto_DataType_UINT16; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16: + return onnx::TensorProto_DataType::TensorProto_DataType_INT16; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: + return onnx::TensorProto_DataType::TensorProto_DataType_INT32; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: + return onnx::TensorProto_DataType::TensorProto_DataType_INT64; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING: + return onnx::TensorProto_DataType::TensorProto_DataType_STRING; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL: + return onnx::TensorProto_DataType::TensorProto_DataType_BOOL; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: + return onnx::TensorProto_DataType::TensorProto_DataType_FLOAT16; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE: + return onnx::TensorProto_DataType::TensorProto_DataType_DOUBLE; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32: + return onnx::TensorProto_DataType::TensorProto_DataType_UINT32; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64: + return onnx::TensorProto_DataType::TensorProto_DataType_UINT64; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX64: + return onnx::TensorProto_DataType::TensorProto_DataType_COMPLEX64; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX128: + return onnx::TensorProto_DataType::TensorProto_DataType_COMPLEX128; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16: + return onnx::TensorProto_DataType::TensorProto_DataType_BFLOAT16; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED: + default: + return onnx::TensorProto_DataType::TensorProto_DataType_UNDEFINED; } } -common::Status MLValueToTensorProto(const OrtValue& ml_value, bool using_raw_data, - std::unique_ptr logger, - /* out */ onnx::TensorProto& tensor_proto) { +void MLValueToTensorProto(Ort::Value& ml_value, bool using_raw_data, + const std::shared_ptr& logger, + /* out */ onnx::TensorProto& tensor_proto) { + if (!ml_value.IsTensor()) { + throw Ort::Exception("Don't support Non-Tensor values", OrtErrorCode::ORT_NOT_IMPLEMENTED); + } // Tensor in MLValue - const auto& tensor = ml_value.Get(); + const auto& shape = ml_value.GetTensorTypeAndShapeInfo(); // dims field - const onnxruntime::TensorShape& tensor_shape = tensor.Shape(); - for (const auto& dim : tensor_shape.GetDims()) { + for (const auto& dim : shape.GetShape()) { tensor_proto.add_dims(dim); } + auto elem_count = shape.GetElementCount(); // data_type field - onnx::TensorProto_DataType data_type = MLDataTypeToTensorProtoDataType(tensor.DataType()); + onnx::TensorProto_DataType data_type = MLDataTypeToTensorProtoDataType(shape.GetElementType()); tensor_proto.set_data_type(data_type); // data_location field: Data is stored in raw_data (if set) otherwise in type-specified field. @@ -81,181 +83,181 @@ common::Status MLValueToTensorProto(const OrtValue& ml_value, bool using_raw_dat // exactly one of the *_data fields is used to store the elements of the tensor. 
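With the signature change above, the server converter works purely against the public `Ort::Value` wrapper plus a spdlog logger instead of internal framework types. A hedged call-site sketch; `value` and `logger` are assumptions standing in for a tensor produced by a finished inference call and a logger obtained from `ServerEnvironment::GetLogger(request_id)`:

```
// Sketch: serialize a tensor-typed Ort::Value into an onnx::TensorProto.
// `value` and `logger` are assumed to exist already; only MLValueToTensorProto
// and its parameters come from the patch above.
onnx::TensorProto proto;
onnxruntime::server::MLValueToTensorProto(value, /*using_raw_data=*/true, logger, proto);
// Non-tensor inputs throw Ort::Exception with ORT_NOT_IMPLEMENTED, so callers
// should be prepared to catch it.
```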
switch (data_type) { case onnx::TensorProto_DataType_FLOAT: { // Target: raw_data or float_data - const auto* data = tensor.Data(); + const auto* data = ml_value.GetTensorMutableData(); if (using_raw_data) { - tensor_proto.set_raw_data(data, tensor.Size()); + tensor_proto.set_raw_data(data, sizeof(float) * elem_count); } else { - for (size_t i = 0, count = tensor.Shape().Size(); i < count; ++i) { + for (size_t i = 0, count = elem_count; i < count; ++i) { tensor_proto.add_float_data(data[i]); } } break; } case onnx::TensorProto_DataType_INT32: { // Target: raw_data or int32_data - const auto* data = tensor.Data(); + const auto* data = ml_value.GetTensorMutableData(); if (using_raw_data) { - tensor_proto.set_raw_data(data, tensor.Size()); + tensor_proto.set_raw_data(data, sizeof(int32_t) * elem_count); } else { - for (size_t i = 0, count = tensor.Shape().Size(); i < count; ++i) { + for (size_t i = 0, count = elem_count; i < count; ++i) { tensor_proto.add_int32_data(data[i]); } } break; } case onnx::TensorProto_DataType_UINT8: { // Target: raw_data or int32_data - const auto* data = tensor.Data(); + const auto* data = ml_value.GetTensorMutableData(); if (using_raw_data) { - tensor_proto.set_raw_data(data, tensor.Size()); + tensor_proto.set_raw_data(data, sizeof(uint8_t) * elem_count); } else { - auto i32data = reinterpret_cast(data); - for (size_t i = 0, count = 1 + ((tensor.Size() - 1) / sizeof(int32_t)); i < count; ++i) { - tensor_proto.add_int32_data(i32data[i]); + for (size_t i = 0, count = elem_count; i < count; ++i) { + tensor_proto.add_int32_data(data[i]); } } break; } case onnx::TensorProto_DataType_INT8: { // Target: raw_data or int32_data - const auto* data = tensor.Data(); + const auto* data = ml_value.GetTensorMutableData(); if (using_raw_data) { - tensor_proto.set_raw_data(data, tensor.Size()); + tensor_proto.set_raw_data(data, sizeof(int8_t) * elem_count); } else { - auto i32data = reinterpret_cast(data); - for (size_t i = 0, count = 1 + ((tensor.Size() - 1) / sizeof(int32_t)); i < count; ++i) { - tensor_proto.add_int32_data(i32data[i]); + for (size_t i = 0, count = elem_count; i < count; ++i) { + tensor_proto.add_int32_data(data[i]); } } break; } case onnx::TensorProto_DataType_UINT16: { // Target: raw_data or int32_data - const auto* data = tensor.Data(); + const auto* data = ml_value.GetTensorMutableData(); if (using_raw_data) { - tensor_proto.set_raw_data(data, tensor.Size()); + tensor_proto.set_raw_data(data, sizeof(uint16_t) * elem_count); } else { - auto i32data = reinterpret_cast(data); - for (size_t i = 0, count = 1 + ((tensor.Size() - 1) / sizeof(int32_t)); i < count; ++i) { - tensor_proto.add_int32_data(i32data[i]); + for (size_t i = 0, count = elem_count; i < count; ++i) { + tensor_proto.add_int32_data(data[i]); } } break; } case onnx::TensorProto_DataType_INT16: { // Target: raw_data or int32_data - const auto* data = tensor.Data(); + const auto* data = ml_value.GetTensorMutableData(); if (using_raw_data) { - tensor_proto.set_raw_data(data, tensor.Size()); + tensor_proto.set_raw_data(data, sizeof(int16_t) * elem_count); } else { - auto i32data = reinterpret_cast(data); - for (size_t i = 0, count = 1 + ((tensor.Size() - 1) / sizeof(int32_t)); i < count; ++i) { - tensor_proto.add_int32_data(i32data[i]); + for (size_t i = 0, count = elem_count; i < count; ++i) { + tensor_proto.add_int32_data(data[i]); } } break; } case onnx::TensorProto_DataType_BOOL: { // Target: raw_data or int32_data - const auto* data = tensor.Data(); + const auto* data = 
ml_value.GetTensorMutableData(); if (using_raw_data) { - tensor_proto.set_raw_data(data, tensor.Size()); + tensor_proto.set_raw_data(data, sizeof(bool) * elem_count); } else { - auto i32data = reinterpret_cast(data); - for (size_t i = 0, count = 1 + ((tensor.Size() - 1) / sizeof(int32_t)); i < count; ++i) { - tensor_proto.add_int32_data(i32data[i]); + for (size_t i = 0, count = elem_count; i < count; ++i) { + tensor_proto.add_int32_data(data[i]); } } break; } case onnx::TensorProto_DataType_FLOAT16: { // Target: raw_data or int32_data - const auto* data = tensor.Data(); + const auto* data = ml_value.GetTensorMutableData(); if (using_raw_data) { - tensor_proto.set_raw_data(data, tensor.Size()); + tensor_proto.set_raw_data(data, sizeof(onnxruntime::MLFloat16) * elem_count); } else { - auto i32data = reinterpret_cast(data); - for (size_t i = 0, count = 1 + ((tensor.Size() - 1) / sizeof(int32_t)); i < count; ++i) { - tensor_proto.add_int32_data(i32data[i]); + for (size_t i = 0, count = elem_count; i < count; ++i) { + tensor_proto.add_int32_data(reinterpret_cast(data)[i]); } } break; } case onnx::TensorProto_DataType_BFLOAT16: { // Target: raw_data or int32_data - const auto* data = tensor.Data(); - const auto raw_data_size = tensor.Shape().Size(); + const auto* data = ml_value.GetTensorMutableData(); std::vector raw_data; - raw_data.reserve(raw_data_size); - for (int i = 0; i < raw_data_size; ++i) { + raw_data.reserve(elem_count); + for (size_t i = 0; i < elem_count; ++i) { raw_data.push_back(data[i].val); } if (using_raw_data) { tensor_proto.set_raw_data(raw_data.data(), raw_data.size() * sizeof(uint16_t)); } else { - auto i32data = reinterpret_cast(raw_data.data()); - for (size_t i = 0, count = 1 + ((tensor.Size() - 1) / sizeof(int32_t)); i < count; ++i) { - tensor_proto.add_int32_data(i32data[i]); + for (size_t i = 0, count = elem_count; i < count; ++i) { + tensor_proto.add_int32_data(raw_data[i]); } } break; } case onnx::TensorProto_DataType_STRING: { // Target: string_data // string could not be written into "raw_data" - const auto* data = tensor.Data(); - for (size_t i = 0, count = tensor.Shape().Size(); i < count; ++i) { - tensor_proto.add_string_data(data[i]); + auto length = ml_value.GetStringTensorDataLength(); + std::vector buffer; + std::vector offsets; + buffer.reserve(length); + offsets.reserve(elem_count); + ml_value.GetStringTensorContent(buffer.data(), length, offsets.data(), elem_count); + size_t start = 0; + for (size_t i = 1; i < elem_count; ++i) { + auto end = offsets[i]; + tensor_proto.add_string_data(&buffer[start], end - start); + start = end; } + tensor_proto.add_string_data(&buffer[start], length - start); break; } case onnx::TensorProto_DataType_INT64: { // Target: raw_data or int64_data - const auto* data = tensor.Data(); + const auto* data = ml_value.GetTensorMutableData(); if (using_raw_data) { - tensor_proto.set_raw_data(data, tensor.Size()); + tensor_proto.set_raw_data(data, sizeof(int64_t) * elem_count); } else { - for (size_t x = 0, loop_length = tensor.Shape().Size(); x < loop_length; ++x) { + for (size_t x = 0, loop_length = elem_count; x < loop_length; ++x) { tensor_proto.add_int64_data(data[x]); } } break; } case onnx::TensorProto_DataType_UINT32: { // Target: raw_data or uint64_data - const auto* data = tensor.Data(); + const auto* data = ml_value.GetTensorMutableData(); if (using_raw_data) { - tensor_proto.set_raw_data(data, tensor.Size()); + tensor_proto.set_raw_data(data, sizeof(uint32_t) * elem_count); } else { - auto u64data = reinterpret_cast(data); 
- for (size_t i = 0, count = 1 + ((tensor.Size() - 1) / sizeof(uint64_t)); i < count; ++i) { - tensor_proto.add_uint64_data(u64data[i]); + for (size_t i = 0, count = elem_count; i < count; ++i) { + tensor_proto.add_uint64_data(data[i]); } } break; } case onnx::TensorProto_DataType_UINT64: { // Target: raw_data or uint64_data - const auto* data = tensor.Data(); + const auto* data = ml_value.GetTensorMutableData(); if (using_raw_data) { - tensor_proto.set_raw_data(data, tensor.Size()); + tensor_proto.set_raw_data(data, sizeof(uint64_t) * elem_count); } else { - for (size_t x = 0, loop_length = tensor.Shape().Size(); x < loop_length; ++x) { + for (size_t x = 0, loop_length = elem_count; x < loop_length; ++x) { tensor_proto.add_uint64_data(data[x]); } } break; } case onnx::TensorProto_DataType_DOUBLE: { // Target: raw_data or double_data - auto data = tensor.Data(); + auto data = ml_value.GetTensorMutableData(); if (using_raw_data) { - tensor_proto.set_raw_data(data, tensor.Size()); + tensor_proto.set_raw_data(data, sizeof(double) * elem_count); } else { - for (size_t x = 0, loop_length = tensor.Shape().Size(); x < loop_length; ++x) { + for (size_t x = 0, loop_length = elem_count; x < loop_length; ++x) { tensor_proto.add_double_data(data[x]); } } break; } default: { - LOGS(*logger, ERROR) << "Unsupported TensorProto DataType: " << data_type; - return common::Status(common::StatusCategory::ONNXRUNTIME, - common::StatusCode::NOT_IMPLEMENTED, - "Unsupported TensorProto DataType: " + std::to_string(data_type)); + logger->error("Unsupported TensorProto DataType: {}", data_type); + std::ostringstream ostr; + ostr << "Initialized tensor with unexpected type: " << tensor_proto.data_type(); + throw Ort::Exception(ostr.str(), OrtErrorCode::ORT_INVALID_ARGUMENT); } } - return common::Status::OK(); + return; } } // namespace server } // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/server/converter.h b/onnxruntime/server/converter.h index 9d635d9cc963a..3e705f5766204 100644 --- a/onnxruntime/server/converter.h +++ b/onnxruntime/server/converter.h @@ -2,6 +2,7 @@ // Licensed under the MIT License. #pragma once +#include "core/session/onnxruntime_cxx_api.h" #include @@ -13,7 +14,7 @@ namespace onnxruntime { namespace server { -onnx::TensorProto_DataType MLDataTypeToTensorProtoDataType(const onnxruntime::DataTypeImpl* cpp_type); +onnx::TensorProto_DataType MLDataTypeToTensorProtoDataType(ONNXTensorElementDataType cpp_type); // Convert MLValue to TensorProto. Some fields are ignored: // * name field: could not get from MLValue @@ -21,9 +22,9 @@ onnx::TensorProto_DataType MLDataTypeToTensorProtoDataType(const onnxruntime::Da // * segment field: we do not expect very large tensors in the prediction output // * external_data field: we do not expect very large tensors in the prediction output // Note: If any input data is in raw_data field, all outputs tensor data will be put into raw_data field. 
-common::Status MLValueToTensorProto(const OrtValue& ml_value, bool using_raw_data, - std::unique_ptr logger, - /* out */ onnx::TensorProto& tensor_proto); +void MLValueToTensorProto(Ort::Value& ml_value, bool using_raw_data, + const std::shared_ptr& logger, + /* out */ onnx::TensorProto& tensor_proto); } // namespace server } // namespace onnxruntime diff --git a/onnxruntime/server/core/request_id.cc b/onnxruntime/server/core/request_id.cc new file mode 100644 index 0000000000000..8e0c5495d6879 --- /dev/null +++ b/onnxruntime/server/core/request_id.cc @@ -0,0 +1,20 @@ +#include "request_id.h" +// boost random is using a deprecated header in 1.69 +// See: https://github.com/boostorg/random/issues/49 +#define BOOST_PENDING_INTEGER_LOG2_HPP +#include +#include +#include +#include + +namespace onnxruntime { +namespace server { +namespace util { +std::string InternalRequestId() { + return boost::uuids::to_string(boost::uuids::random_generator()()); +} +const std::string MS_REQUEST_ID_HEADER = "x-ms-request-id"; +const std::string MS_CLIENT_REQUEST_ID_HEADER = "x-ms-client-request-id"; +} // namespace util +} // namespace server +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/server/core/request_id.h b/onnxruntime/server/core/request_id.h new file mode 100644 index 0000000000000..b45ca7113ba1a --- /dev/null +++ b/onnxruntime/server/core/request_id.h @@ -0,0 +1,13 @@ +#pragma once + +#include + +namespace onnxruntime { +namespace server { +namespace util { +std::string InternalRequestId(); +extern const std::string MS_REQUEST_ID_HEADER; +extern const std::string MS_CLIENT_REQUEST_ID_HEADER; +} // namespace util +} // namespace server +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/server/environment.cc b/onnxruntime/server/environment.cc index 2faed1f5376d5..6ebcb90b4cef5 100644 --- a/onnxruntime/server/environment.cc +++ b/onnxruntime/server/environment.cc @@ -2,70 +2,80 @@ // Licensed under the MIT License. 
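The new `request_id` helpers above expose a UUID-style id generator together with the two standard header names. A minimal sketch of pairing them up; the `std::cout` line stands in for whatever response/header plumbing the caller actually has:

```
// Sketch: generate a fresh request id and print it next to the exported header name.
#include <iostream>
#include <string>
#include "request_id.h"

int main() {
  const std::string id = onnxruntime::server::util::InternalRequestId();
  std::cout << onnxruntime::server::util::MS_REQUEST_ID_HEADER << ": " << id << "\n";
  return 0;
}
```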
#include -#include "core/common/logging/logging.h" - #include "environment.h" -#include "log_sink.h" +#include "core/session/onnxruntime_cxx_api.h" namespace onnxruntime { namespace server { -ServerEnvironment::ServerEnvironment(logging::Severity severity, logging::LoggingManager::InstanceType instance_type, bool env_init) : severity_(severity), - logger_id_("ServerApp"), - default_logging_manager_( - std::unique_ptr{new LogSink{}}, - severity, - /* default_filter_user_data */ false, - instance_type, - &logger_id_) { - if (env_init) { - auto status = onnxruntime::Environment::Create(runtime_environment_); +static spdlog::level::level_enum Convert(OrtLoggingLevel in) { + switch (in) { + case OrtLoggingLevel::ORT_LOGGING_LEVEL_VERBOSE: + return spdlog::level::level_enum::debug; + case OrtLoggingLevel::ORT_LOGGING_LEVEL_INFO: + return spdlog::level::level_enum::info; + case OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING: + return spdlog::level::level_enum::warn; + case OrtLoggingLevel::ORT_LOGGING_LEVEL_ERROR: + return spdlog::level::level_enum::err; + case OrtLoggingLevel::ORT_LOGGING_LEVEL_FATAL: + return spdlog::level::level_enum::critical; + default: + return spdlog::level::level_enum::off; } +} - // The session initialization MUST BE AFTER environment creation - session = std::make_unique(options_, &default_logging_manager_); +void ORT_API_CALL Log(void* param, OrtLoggingLevel severity, const char* category, const char* logid, const char* code_location, + const char* message) { + spdlog::logger* logger = static_cast(param); + logger->log(Convert(severity), "[{} {} {}]: {}", logid, category, code_location, message); + return; } -common::Status ServerEnvironment::InitializeModel(const std::string& model_path) { - auto status = session->Load(model_path); - if (!status.IsOK()) { - return status; - } +ServerEnvironment::ServerEnvironment(OrtLoggingLevel severity, spdlog::sinks_init_list sink) : severity_(severity), + logger_id_("ServerApp"), + sink_(sink), + default_logger_(std::make_shared(logger_id_, sink)), + runtime_environment_(severity, logger_id_.c_str(), Log, default_logger_.get()), + session(nullptr) { + spdlog::set_automatic_registration(false); + spdlog::set_level(Convert(severity_)); + spdlog::initialize_logger(default_logger_); +} - auto outputs = session->GetModelOutputs(); - if (!outputs.first.IsOK()) { - return outputs.first; - } +void ServerEnvironment::InitializeModel(const std::string& model_path) { + session = Ort::Session(runtime_environment_, model_path.c_str(), Ort::SessionOptions()); - for (const auto* output_node : *(outputs.second)) { - model_output_names_.push_back(output_node->Name()); - } + auto output_count = session.GetOutputCount(); - return common::Status::OK(); + auto allocator = Ort::Allocator::CreateDefault(); + for (size_t i = 0; i < output_count; i++) { + auto name = session.GetOutputName(i, allocator); + model_output_names_.push_back(name); + allocator.Free(name); + } } const std::vector& ServerEnvironment::GetModelOutputNames() const { return model_output_names_; } -const logging::Logger& ServerEnvironment::GetAppLogger() const { - return default_logging_manager_.DefaultLogger(); -} - -logging::Severity ServerEnvironment::GetLogSeverity() const { +OrtLoggingLevel ServerEnvironment::GetLogSeverity() const { return severity_; } -std::unique_ptr ServerEnvironment::GetLogger(const std::string& id) { - if (id.empty()) { - LOGS(GetAppLogger(), WARNING) << "Request id is null or empty string"; - } +const Ort::Session& ServerEnvironment::GetSession() const { + 
return session; +} - return default_logging_manager_.CreateLogger(id, severity_, false); +std::shared_ptr ServerEnvironment::GetLogger(const std::string& request_id) const { + auto logger = std::make_shared(request_id, sink_.begin(), sink_.end()); + spdlog::initialize_logger(logger); + return logger; } -onnxruntime::InferenceSession* ServerEnvironment::GetSession() const { - return session.get(); +std::shared_ptr ServerEnvironment::GetAppLogger() const { + return default_logger_; } } // namespace server diff --git a/onnxruntime/server/environment.h b/onnxruntime/server/environment.h index fc57531300589..332ab6e064ac5 100644 --- a/onnxruntime/server/environment.h +++ b/onnxruntime/server/environment.h @@ -6,38 +6,35 @@ #include #include -#include "core/session/environment.h" -#include "core/common/logging/logging.h" -#include "core/session/inference_session.h" +#include "core/session/onnxruntime_cxx_api.h" +#include namespace onnxruntime { namespace server { -namespace logging = logging; - class ServerEnvironment { public: - explicit ServerEnvironment(logging::Severity severity, logging::LoggingManager::InstanceType instance_type = logging::LoggingManager::Default, bool env_init = true); + explicit ServerEnvironment(OrtLoggingLevel severity, spdlog::sinks_init_list sink); ~ServerEnvironment() = default; ServerEnvironment(const ServerEnvironment&) = delete; - const logging::Logger& GetAppLogger() const; - std::unique_ptr GetLogger(const std::string& id); - logging::Severity GetLogSeverity() const; + OrtLoggingLevel GetLogSeverity() const; - onnxruntime::InferenceSession* GetSession() const; - common::Status InitializeModel(const std::string& model_path); + const Ort::Session& GetSession() const; + void InitializeModel(const std::string& model_path); const std::vector& GetModelOutputNames() const; - + std::shared_ptr GetLogger(const std::string& request_id) const; + std::shared_ptr GetAppLogger() const; private: - const logging::Severity severity_; + const OrtLoggingLevel severity_; const std::string logger_id_; - logging::LoggingManager default_logging_manager_; + const std::vector sink_; + const std::shared_ptr default_logger_; - std::unique_ptr runtime_environment_; - onnxruntime::SessionOptions options_; - std::unique_ptr session; + Ort::Env runtime_environment_; + Ort::SessionOptions options_; + Ort::Session session; std::vector model_output_names_; }; diff --git a/onnxruntime/server/executor.cc b/onnxruntime/server/executor.cc index 45b64ab184059..e7cae666f45b4 100644 --- a/onnxruntime/server/executor.cc +++ b/onnxruntime/server/executor.cc @@ -2,15 +2,15 @@ // Licensed under the MIT License. 
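The environment rework above routes ONNX Runtime's own log output into spdlog via the Log callback handed to Ort::Env. A stripped-down sketch of that wiring, with a plain stderr printer standing in for the spdlog logger (the callback body and the commented-out model path are illustrative):
```
#include <cstdio>
#include "core/session/onnxruntime_cxx_api.h"

// Stand-in for the spdlog-backed callback in environment.cc: just print to stderr.
static void ORT_API_CALL StderrLog(void* /*param*/, OrtLoggingLevel severity, const char* category,
                                   const char* logid, const char* code_location, const char* message) {
  std::fprintf(stderr, "[%d %s %s %s]: %s\n", static_cast<int>(severity), logid, category,
               code_location, message);
}

int main() {
  // Every session created from this Env reports through StderrLog.
  Ort::Env env{ORT_LOGGING_LEVEL_WARNING, "ServerApp", StderrLog, nullptr};
  Ort::SessionOptions options;
  // Ort::Session session{env, "model.onnx", options};
  return 0;
}
```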
#include -#include #include "core/common/logging/logging.h" #include "core/framework/data_types.h" #include "core/session/environment.h" #include "core/framework/framework_common.h" -#include "core/framework/mem_buffer.h" +#include "serializing/mem_buffer.h" #include "core/framework/ml_value.h" #include "core/framework/tensor.h" -#include "core/framework/tensorprotoutils.h" +#include "serializing/tensorprotoutils.h" +#include "core/common/callback.h" #include "onnx-ml.pb.h" #include "predict.pb.h" @@ -27,30 +27,33 @@ namespace protobufutil = google::protobuf::util; protobufutil::Status Executor::SetMLValue(const onnx::TensorProto& input_tensor, MemBufferArray& buffers, OrtAllocatorInfo* cpu_allocator_info, - /* out */ MLValue& ml_value) { + /* out */ Ort::Value& ml_value) { auto logger = env_->GetLogger(request_id_); size_t cpu_tensor_length = 0; - auto status = onnxruntime::utils::GetSizeInBytesFromTensorProto<0>(input_tensor, &cpu_tensor_length); - if (!status.IsOK()) { - LOGS(*logger, ERROR) << "GetSizeInBytesFromTensorProto() failed. Error Message: " << status.ToString(); - return GenerateProtobufStatus(status, "GetSizeInBytesFromTensorProto() failed: " + status.ToString()); + try { + onnxruntime::server::GetSizeInBytesFromTensorProto<0>(input_tensor, &cpu_tensor_length); + } catch (const Ort::Exception& e) { + logger->error("GetSizeInBytesFromTensorProto() failed. Error Message: {}", e.what()); + return GenerateProtobufStatus(e.GetOrtErrorCode(), e.what()); } - OrtCallback deleter; auto* buf = buffers.AllocNewBuffer(cpu_tensor_length); - status = onnxruntime::utils::TensorProtoToMLValue(onnxruntime::Env::Default(), nullptr, input_tensor, - onnxruntime::MemBuffer(buf, cpu_tensor_length, *cpu_allocator_info), - ml_value, deleter); - if (!status.IsOK()) { - LOGS(*logger, ERROR) << "TensorProtoToMLValue() failed. Message: " << status.ToString(); - return GenerateProtobufStatus(status, "TensorProtoToMLValue() failed:" + status.ToString()); + try { + onnxruntime::server::TensorProtoToMLValue(input_tensor, + onnxruntime::server::MemBuffer(buf, cpu_tensor_length, *cpu_allocator_info), + ml_value); + + } catch (const Ort::Exception& e) { + logger->error("TensorProtoToMLValue() failed. 
Message: {}", e.what()); + return GenerateProtobufStatus(e.GetOrtErrorCode(), e.what()); } return protobufutil::Status::OK; } -protobufutil::Status Executor::SetNameMLValueMap(onnxruntime::NameMLValMap& name_value_map, +protobufutil::Status Executor::SetNameMLValueMap(std::vector& input_names, + std::vector& input_values, const onnxruntime::server::PredictRequest& request, MemBufferArray& buffers) { auto logger = env_->GetLogger(request_id_); @@ -59,49 +62,68 @@ protobufutil::Status Executor::SetNameMLValueMap(onnxruntime::NameMLValMap& name auto ort_status = OrtCreateCpuAllocatorInfo(OrtArenaAllocator, OrtMemTypeDefault, &allocator_info); if (ort_status != nullptr || allocator_info == nullptr) { - LOGS(*logger, ERROR) << "OrtCreateAllocatorInfo failed"; - return protobufutil::Status(protobufutil::error::Code::RESOURCE_EXHAUSTED, "OrtCreateAllocatorInfo() failed"); + logger->error("OrtCreateCpuAllocatorInfo failed"); + return protobufutil::Status(protobufutil::error::Code::RESOURCE_EXHAUSTED, "OrtCreateCpuAllocatorInfo() failed"); } - // Prepare the MLValue object + // Prepare the Value object for (const auto& input : request.inputs()) { using_raw_data_ = using_raw_data_ && input.second.has_raw_data(); - MLValue ml_value; + Ort::Value ml_value{nullptr}; auto status = SetMLValue(input.second, buffers, allocator_info, ml_value); if (status != protobufutil::Status::OK) { OrtReleaseAllocatorInfo(allocator_info); - LOGS(*logger, ERROR) << "SetMLValue() failed! Input name: " << input.first; + logger->error("SetMLValue() failed! Input name: {}", input.first); return status; } - auto insertion_result = name_value_map.insert(std::make_pair(input.first, ml_value)); - if (!insertion_result.second) { - OrtReleaseAllocatorInfo(allocator_info); - LOGS(*logger, ERROR) << "SetNameMLValueMap() failed! 
Input name: " << input.first << " Trying to overwrite existing input value"; - return protobufutil::Status(protobufutil::error::Code::INVALID_ARGUMENT, "SetNameMLValueMap() failed: Cannot have two inputs with the same name"); - } + input_names.push_back(input.first); + input_values.push_back(std::move(ml_value)); } OrtReleaseAllocatorInfo(allocator_info); return protobufutil::Status::OK; } +std::vector Run(const Ort::Session& session, const Ort::RunOptions& options, const std::vector& input_names, const std::vector& input_values, const std::vector& output_names) { + size_t input_count = input_names.size(); + size_t output_count = output_names.size(); + + std::vector input_ptrs{}; + input_ptrs.reserve(input_count); + for (const auto& input : input_names) { + input_ptrs.push_back(input.data()); + } + std::vector output_ptrs{}; + output_ptrs.reserve(output_count); + for (const auto& output : output_names) { + output_ptrs.push_back(output.data()); + } + + return const_cast(session).Run(options, input_ptrs.data(), const_cast(input_values.data()), input_count, output_ptrs.data(), output_count); +} + protobufutil::Status Executor::Predict(const std::string& model_name, const std::string& model_version, - onnxruntime::server::PredictRequest& request, + const onnxruntime::server::PredictRequest& request, /* out */ onnxruntime::server::PredictResponse& response) { auto logger = env_->GetLogger(request_id_); // Convert PredictRequest to NameMLValMap MemBufferArray buffer_array; - onnxruntime::NameMLValMap name_ml_value_map{}; - auto conversion_status = SetNameMLValueMap(name_ml_value_map, request, buffer_array); + std::vector input_names; + std::vector input_values; + auto conversion_status = SetNameMLValueMap(input_names, input_values, request, buffer_array); if (conversion_status != protobufutil::Status::OK) { return conversion_status; } - // Prepare the output names and vector + Ort::RunOptions run_options{}; + run_options.SetRunLogVerbosityLevel(static_cast(env_->GetLogSeverity())); + run_options.SetRunTag(request_id_.c_str()); + + // Prepare the output names std::vector output_names; if (!request.output_filter().empty()) { @@ -113,36 +135,28 @@ protobufutil::Status Executor::Predict(const std::string& model_name, output_names = env_->GetModelOutputNames(); } - std::vector outputs(output_names.size()); - - // Run - OrtRunOptions run_options{}; - run_options.run_log_verbosity_level = static_cast(env_->GetLogSeverity()); - run_options.run_tag = request_id_; - - auto status = env_->GetSession()->Run(run_options, name_ml_value_map, output_names, &outputs); - - if (!status.IsOK()) { - LOGS(*logger, ERROR) << "Run() failed." - << ". Error Message: " << status.ToString(); - return GenerateProtobufStatus(status, "Run() failed: " + status.ToString()); + std::vector outputs; + try { + outputs = Run(env_->GetSession(), run_options, input_names, input_values, output_names); + } catch (const Ort::Exception& e) { + return GenerateProtobufStatus(e.GetOrtErrorCode(), e.what()); } // Build the response for (size_t i = 0, sz = outputs.size(); i < sz; ++i) { onnx::TensorProto output_tensor{}; - status = MLValueToTensorProto(outputs[i], using_raw_data_, std::move(logger), output_tensor); - logger = env_->GetLogger(request_id_); - - if (!status.IsOK()) { - LOGS(*logger, ERROR) << "MLValueToTensorProto() failed. Output name: " << output_names[i] << ". 
Error Message: " << status.ToString(); - return GenerateProtobufStatus(status, "MLValueToTensorProto() failed: " + status.ToString()); + try { + MLValueToTensorProto(outputs[i], using_raw_data_, logger, output_tensor); + } catch (const Ort::Exception& e) { + logger = env_->GetLogger(request_id_); + logger->error("MLValueToTensorProto() failed. Output name: {}. Error Message: {}", output_names[i], e.what()); + return GenerateProtobufStatus(e.GetOrtErrorCode(), e.what()); } auto insertion_result = response.mutable_outputs()->insert({output_names[i], output_tensor}); if (!insertion_result.second) { - LOGS(*logger, ERROR) << "SetNameMLValueMap() failed. Output name: " << output_names[i] << " Trying to overwrite existing output value"; + logger->error("SetNameMLValueMap() failed. Output name: {}. Trying to overwrite existing output value", output_names[i]); return protobufutil::Status(protobufutil::error::Code::INVALID_ARGUMENT, "SetNameMLValueMap() failed: Cannot have two outputs with the same name"); } } diff --git a/onnxruntime/server/executor.h b/onnxruntime/server/executor.h index 179d8915fa84f..b67720287c1d0 100644 --- a/onnxruntime/server/executor.h +++ b/onnxruntime/server/executor.h @@ -8,6 +8,7 @@ #include "environment.h" #include "predict.pb.h" #include "util.h" +#include "core/session/onnxruntime_cxx_api.h" namespace onnxruntime { namespace server { @@ -21,7 +22,7 @@ class Executor { // Prediction method google::protobuf::util::Status Predict(const std::string& model_name, const std::string& model_version, - onnxruntime::server::PredictRequest& request, + const onnxruntime::server::PredictRequest& request, /* out */ onnxruntime::server::PredictResponse& response); private: @@ -32,9 +33,10 @@ class Executor { google::protobuf::util::Status SetMLValue(const onnx::TensorProto& input_tensor, MemBufferArray& buffers, OrtAllocatorInfo* cpu_allocator_info, - /* out */ MLValue& ml_value); + /* out */ Ort::Value& ml_value); - google::protobuf::util::Status SetNameMLValueMap(onnxruntime::NameMLValMap& name_value_map, + google::protobuf::util::Status SetNameMLValueMap(/* out */ std::vector& input_names, + /* out */ std::vector& input_values, const onnxruntime::server::PredictRequest& request, MemBufferArray& buffers); }; diff --git a/onnxruntime/server/grpc/grpc_app.cc b/onnxruntime/server/grpc/grpc_app.cc new file mode 100644 index 0000000000000..fe73b91abdc4c --- /dev/null +++ b/onnxruntime/server/grpc/grpc_app.cc @@ -0,0 +1,25 @@ +#include "grpc_app.h" +#include +#include +#include +namespace onnx_grpc = onnxruntime::server::grpc; + +namespace onnxruntime { +namespace server { +GRPCApp::GRPCApp(const std::shared_ptr& env, const std::string& host, const unsigned short port) : prediction_service_implementation_(env) { + ::grpc::EnableDefaultHealthCheckService(true); + ::grpc::channelz::experimental::InitChannelzService(); + ::grpc::reflection::InitProtoReflectionServerBuilderPlugin(); + ::grpc::ServerBuilder builder; + builder.RegisterService(&prediction_service_implementation_); + builder.AddListeningPort(host + ":" + std::to_string(port), ::grpc::InsecureServerCredentials()); + + server_ = builder.BuildAndStart(); + server_->GetHealthCheckService()->SetServingStatus(PredictionService::service_full_name(), true); +} + +void GRPCApp::Run() { + server_->Wait(); +} +} // namespace server +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/server/grpc/grpc_app.h b/onnxruntime/server/grpc/grpc_app.h new file mode 100644 index 0000000000000..27cea08d15155 --- /dev/null 
+++ b/onnxruntime/server/grpc/grpc_app.h @@ -0,0 +1,25 @@ +#pragma once +#include +#include "prediction_service_impl.h" +#include "environment.h" + +namespace onnxruntime { +namespace server { +class GRPCApp { + public: + GRPCApp(const std::shared_ptr& env, const std::string& host, const unsigned short port); + ~GRPCApp() = default; + GRPCApp(const GRPCApp& other) = delete; + GRPCApp(GRPCApp&& other) = delete; + + GRPCApp& operator=(const GRPCApp&) = delete; + + //Block until the server shuts down. + void Run(); + + private: + grpc::PredictionServiceImpl prediction_service_implementation_; + std::unique_ptr<::grpc::Server> server_; +}; +} // namespace server +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/server/grpc/prediction_service_impl.cc b/onnxruntime/server/grpc/prediction_service_impl.cc new file mode 100644 index 0000000000000..cd2817abab300 --- /dev/null +++ b/onnxruntime/server/grpc/prediction_service_impl.cc @@ -0,0 +1,39 @@ +#include "prediction_service_impl.h" +#include "request_id.h" + +namespace onnxruntime { +namespace server { +namespace grpc { + +PredictionServiceImpl::PredictionServiceImpl(const std::shared_ptr& env) : environment_(env) {} + +::grpc::Status PredictionServiceImpl::Predict(::grpc::ServerContext* context, const ::onnxruntime::server::PredictRequest* request, ::onnxruntime::server::PredictResponse* response) { + auto request_id = SetRequestContext(context); + onnxruntime::server::Executor executor(environment_.get(), request_id); + //TODO: (csteegz) Add modelspec for both paths. + auto status = executor.Predict("default", "1", *request, *response); // Currently only support one model so hard coded. + if (!status.ok()) { + return ::grpc::Status(::grpc::StatusCode(status.error_code()), status.error_message()); + } + return ::grpc::Status::OK; +} + +std::string PredictionServiceImpl::SetRequestContext(::grpc::ServerContext* context) { + auto metadata = context->client_metadata(); + auto request_id = util::InternalRequestId(); + context->AddInitialMetadata(util::MS_REQUEST_ID_HEADER, request_id); + auto logger = environment_->GetLogger(request_id); + auto search = metadata.find(util::MS_CLIENT_REQUEST_ID_HEADER); + if (search != metadata.end()) { + std::string id{search->second.data(), search->second.length()}; + context->AddInitialMetadata(util::MS_CLIENT_REQUEST_ID_HEADER, id); + logger->info("{}: [{}]", util::MS_CLIENT_REQUEST_ID_HEADER, id); + } + + return request_id; +} + +} // namespace grpc +} // namespace server + +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/server/grpc/prediction_service_impl.h b/onnxruntime/server/grpc/prediction_service_impl.h new file mode 100644 index 0000000000000..b1024d31c28c2 --- /dev/null +++ b/onnxruntime/server/grpc/prediction_service_impl.h @@ -0,0 +1,24 @@ +#pragma once +#include "prediction_service.grpc.pb.h" +#include "environment.h" +#include "executor.h" +#include + +namespace onnxruntime { +namespace server { +namespace grpc { +class PredictionServiceImpl final : public onnxruntime::server::PredictionService::Service { + public: + PredictionServiceImpl(const std::shared_ptr& env); + ::grpc::Status Predict(::grpc::ServerContext* context, const ::onnxruntime::server::PredictRequest* request, ::onnxruntime::server::PredictResponse* response); + + private: + std::shared_ptr environment_; + + //Extract customer request ID and set request ID for response. 
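From the client side, the x-ms-client-request-id metadata consumed by SetRequestContext above can be supplied through a standard gRPC ClientContext. The sketch below is a hypothetical client (endpoint and id values are illustrative) built against the generated prediction_service stubs:
```
#include <grpcpp/grpcpp.h>
#include "prediction_service.grpc.pb.h"

int main() {
  auto channel = grpc::CreateChannel("localhost:50051", grpc::InsecureChannelCredentials());
  auto stub = onnxruntime::server::PredictionService::NewStub(channel);

  grpc::ClientContext context;
  // Echoed back by the server as x-ms-client-request-id (see SetRequestContext above).
  context.AddMetadata("x-ms-client-request-id", "my-trace-id-123");

  onnxruntime::server::PredictRequest request;
  onnxruntime::server::PredictResponse response;
  grpc::Status status = stub->Predict(&context, request, &response);
  return status.ok() ? 0 : 1;
}
```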
+ std::string SetRequestContext(::grpc::ServerContext* context); +}; +} // namespace grpc +} // namespace server + +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/server/http/core/context.h b/onnxruntime/server/http/core/context.h index b9598762fd5a5..919b3c68ab529 100644 --- a/onnxruntime/server/http/core/context.h +++ b/onnxruntime/server/http/core/context.h @@ -3,17 +3,10 @@ #pragma once -// boost random is using a deprecated header in 1.69 -// See: https://github.com/boostorg/random/issues/49 -#define BOOST_PENDING_INTEGER_LOG2_HPP -#include - #include #include -#include -#include -#include +#include "request_id.h" namespace onnxruntime { namespace server { @@ -33,7 +26,7 @@ class HttpContext { http::status error_code; std::string error_message; - HttpContext() : request_id(boost::uuids::to_string(boost::uuids::random_generator()())), + HttpContext() : request_id(util::InternalRequestId()), client_request_id(""), error_code(http::status::internal_server_error), error_message("An unknown server error has occurred") {} diff --git a/onnxruntime/server/http/core/listener.h b/onnxruntime/server/http/core/listener.h index 3295e6a448cb4..52957622395f8 100644 --- a/onnxruntime/server/http/core/listener.h +++ b/onnxruntime/server/http/core/listener.h @@ -41,4 +41,3 @@ class Listener : public std::enable_shared_from_this { } // namespace server } // namespace onnxruntime - diff --git a/onnxruntime/server/http/core/routes.h b/onnxruntime/server/http/core/routes.h index 5681f2437d605..5609393674ecf 100644 --- a/onnxruntime/server/http/core/routes.h +++ b/onnxruntime/server/http/core/routes.h @@ -38,4 +38,3 @@ class Routes { } //namespace server } // namespace onnxruntime - diff --git a/onnxruntime/server/http/core/session.cc b/onnxruntime/server/http/core/session.cc index 1b31ec06296c8..1d415a9f90751 100644 --- a/onnxruntime/server/http/core/session.cc +++ b/onnxruntime/server/http/core/session.cc @@ -86,7 +86,7 @@ void HttpSession::Send(Msg&& msg) { http::async_write(self_->socket_, *ptr, net::bind_executor(strand_, - [ self_, close = ptr->need_eof() ](beast::error_code ec, std::size_t bytes) { + [self_, close = ptr->need_eof()](beast::error_code ec, std::size_t bytes) { self_->OnWrite(ec, bytes, close); })); } @@ -117,8 +117,8 @@ http::status HttpSession::ExecuteUserFunction(HttpContext& context) { std::string model_name, model_version, action; HandlerFn func; - if (context.request.find("x-ms-client-request-id") != context.request.end()) { - context.client_request_id = context.request["x-ms-client-request-id"].to_string(); + if (context.request.find(util::MS_CLIENT_REQUEST_ID_HEADER) != context.request.end()) { + context.client_request_id = context.request[util::MS_CLIENT_REQUEST_ID_HEADER].to_string(); } if (path == "/score") { diff --git a/onnxruntime/server/http/core/util.h b/onnxruntime/server/http/core/util.h index 54faea9629ff8..3535a9af3a4aa 100644 --- a/onnxruntime/server/http/core/util.h +++ b/onnxruntime/server/http/core/util.h @@ -18,4 +18,3 @@ void ErrorHandling(beast::error_code ec, char const* what); } // namespace server } // namespace onnxruntime - diff --git a/onnxruntime/server/http/json_handling.cc b/onnxruntime/server/http/json_handling.cc index 6bc46b7878338..7568486276b4f 100644 --- a/onnxruntime/server/http/json_handling.cc +++ b/onnxruntime/server/http/json_handling.cc @@ -43,13 +43,27 @@ std::string escape_string(const std::string& message) { std::ostringstream o; for (char c : message) { switch (c) { - case '"': o << "\\\""; break; - 
case '\\': o << "\\\\"; break; - case '\b': o << "\\b"; break; - case '\f': o << "\\f"; break; - case '\n': o << "\\n"; break; - case '\r': o << "\\r"; break; - case '\t': o << "\\t"; break; + case '"': + o << "\\\""; + break; + case '\\': + o << "\\\\"; + break; + case '\b': + o << "\\b"; + break; + case '\f': + o << "\\f"; + break; + case '\n': + o << "\\n"; + break; + case '\r': + o << "\\r"; + break; + case '\t': + o << "\\t"; + break; default: if ('\x00' <= c && c <= '\x1f') { o << "\\u" diff --git a/onnxruntime/server/http/json_handling.h b/onnxruntime/server/http/json_handling.h index 1e3d8f7239db1..f8f07004d216f 100644 --- a/onnxruntime/server/http/json_handling.h +++ b/onnxruntime/server/http/json_handling.h @@ -31,4 +31,3 @@ std::string escape_string(const std::string& message); } // namespace server } // namespace onnxruntime - diff --git a/onnxruntime/server/http/predict_request_handler.cc b/onnxruntime/server/http/predict_request_handler.cc index 30e259a77cb1e..10c5b6280e6ed 100644 --- a/onnxruntime/server/http/predict_request_handler.cc +++ b/onnxruntime/server/http/predict_request_handler.cc @@ -14,18 +14,18 @@ namespace server { namespace protobufutil = google::protobuf::util; -#define GenerateErrorResponse(logger, error_code, message, context) \ - { \ - auto http_error_code = (error_code); \ - (context).response.insert("x-ms-request-id", ((context).request_id)); \ - if (!(context).client_request_id.empty()) { \ - (context).response.insert("x-ms-client-request-id", (context).client_request_id); \ - } \ - auto json_error_message = CreateJsonError(http_error_code, (message)); \ - LOGS((*logger), VERBOSE) << json_error_message; \ - (context).response.result(http_error_code); \ - (context).response.body() = json_error_message; \ - (context).response.set(http::field::content_type, "application/json"); \ +#define GenerateErrorResponse(logger, error_code, message, context) \ + { \ + auto http_error_code = (error_code); \ + (context).response.insert(util::MS_REQUEST_ID_HEADER, ((context).request_id)); \ + if (!(context).client_request_id.empty()) { \ + (context).response.insert(util::MS_CLIENT_REQUEST_ID_HEADER, (context).client_request_id); \ + } \ + auto json_error_message = CreateJsonError(http_error_code, (message)); \ + logger->debug(json_error_message); \ + (context).response.result(http_error_code); \ + (context).response.body() = json_error_message; \ + (context).response.set(http::field::content_type, "application/json"); \ } static bool ParseRequestPayload(const HttpContext& context, SupportedContentType request_type, @@ -37,10 +37,10 @@ void Predict(const std::string& name, /* in, out */ HttpContext& context, const std::shared_ptr& env) { auto logger = env->GetLogger(context.request_id); - LOGS(*logger, INFO) << "Model Name: " << name << ", Version: " << version << ", Action: " << action; + logger->info("Model Name: {}, Version: {}, Action: {}", name, version, action); if (!context.client_request_id.empty()) { - LOGS(*logger, INFO) << "x-ms-client-request-id: [" << context.client_request_id << "]"; + logger->info("{}: [{}]", util::MS_CLIENT_REQUEST_ID_HEADER, context.client_request_id); } // Request and Response content type information @@ -89,9 +89,9 @@ void Predict(const std::string& name, } // Build HTTP response - context.response.insert("x-ms-request-id", context.request_id); + context.response.insert(util::MS_REQUEST_ID_HEADER, context.request_id); if (!context.client_request_id.empty()) { - context.response.insert("x-ms-client-request-id", 
context.client_request_id); + context.response.insert(util::MS_CLIENT_REQUEST_ID_HEADER, context.client_request_id); } context.response.body() = response_body; context.response.result(http::status::ok); diff --git a/onnxruntime/server/http/util.h b/onnxruntime/server/http/util.h index ba38b3976a1cb..e8c90c8f2ef65 100644 --- a/onnxruntime/server/http/util.h +++ b/onnxruntime/server/http/util.h @@ -7,7 +7,7 @@ #include #include -#include "server/http/core/context.h" +#include "http/core/context.h" namespace onnxruntime { namespace server { diff --git a/onnxruntime/server/logging/console_sink.h b/onnxruntime/server/logging/console_sink.h deleted file mode 100644 index b0d32507318fb..0000000000000 --- a/onnxruntime/server/logging/console_sink.h +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once - -#include -#include "core/common/logging/logging.h" -#include "core/common/logging/sinks/ostream_sink.h" - -namespace onnxruntime { -namespace server { - -class ConsoleSink : public onnxruntime::logging::OStreamSink { - public: - ConsoleSink() : OStreamSink(std::cout, /*flush*/ true) { - } -}; -} // namespace server -} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/server/logging/log_sink.h b/onnxruntime/server/logging/log_sink.h deleted file mode 100644 index 471dfb1d27d61..0000000000000 --- a/onnxruntime/server/logging/log_sink.h +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once - -#include -#include "core/common/logging/sinks/composite_sink.h" - -#ifdef USE_SYSLOG -#include "core/platform/posix/logging/syslog_sink.h" -#endif - -#include "console_sink.h" - -namespace onnxruntime { -namespace server { - -class LogSink : public onnxruntime::logging::CompositeSink { - public: - LogSink() { - this->AddSink(std::make_unique()); -#ifdef USE_SYSLOG - this->AddSink(std::make_unique(nullptr)); -#endif - } -}; -} // namespace server -} // namespace onnxruntime diff --git a/onnxruntime/server/main.cc b/onnxruntime/server/main.cc index 2a3a105f7af43..724b563689b60 100644 --- a/onnxruntime/server/main.cc +++ b/onnxruntime/server/main.cc @@ -5,10 +5,16 @@ #include "http_server.h" #include "predict_request_handler.h" #include "server_configuration.h" +#include "grpc/grpc_app.h" +#include +#include +#include +#include +#include #define VALUE_TO_STRING(x) #x #define VALUE(x) VALUE_TO_STRING(x) -#define VAR_NAME_VALUE(var) #var "=" VALUE(var) +#define VAR_NAME_VALUE(var) #var "=" VALUE(var) #define LOCAL_BUILD_VERSION "local_build" #if !defined(SRV_VERSION) @@ -30,12 +36,12 @@ int main(int argc, char* argv[]) { // Here we use std::cout print out the version and latest commit id, // to make sure in case even logger has problem, we still have the version information and commit id. 
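main.cc (continued below) now builds the ServerEnvironment with an spdlog::sinks_init_list, so the same messages can fan out to several sinks. A minimal sketch of that pattern, with a color console sink standing in for whatever sinks the server actually configures:
```
#include <memory>
#include <spdlog/spdlog.h>
#include <spdlog/sinks/stdout_color_sinks.h>

int main() {
  // One logger, any number of sinks; the console sink here is illustrative.
  auto logger = std::make_shared<spdlog::logger>(
      "ServerApp",
      spdlog::sinks_init_list{std::make_shared<spdlog::sinks::stdout_color_sink_mt>()});
  spdlog::initialize_logger(logger);
  logger->info("Model path: {}", "/path/to/model.onnx");
  return 0;
}
```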
std::string version = SRV_VERSION; - if (version.empty()){ + if (version.empty()) { version = LOCAL_BUILD_VERSION; } std::string commit_id = LATEST_COMMIT_ID; - if (commit_id.empty()){ + if (commit_id.empty()) { commit_id = DEFAULT_COMMIT_ID; } @@ -52,48 +58,47 @@ int main(int argc, char* argv[]) { exit(EXIT_FAILURE); } - const auto env = std::make_shared(config.logging_level); + const auto env = std::make_shared(config.logging_level, spdlog::sinks_init_list{std::make_shared(), std::make_shared()}); auto logger = env->GetAppLogger(); - LOGS(logger, VERBOSE) << "Logging manager initialized."; - LOGS(logger, INFO) << "Model path: " << config.model_path; + logger->info("Model path: {}", config.model_path); - auto status = env->InitializeModel(config.model_path); - if (!status.IsOK()) { - LOGS(logger, FATAL) << "Initialize Model Failed: " << status.Code() << " ---- Error: [" << status.ErrorMessage() << "]"; + try { + env->InitializeModel(config.model_path); + logger->debug("Initialize Model Successfully!"); + } catch (const Ort::Exception& ex) { + logger->critical("Initialize Model Failed: {} ---- Error: [{}]", ex.GetOrtErrorCode(), ex.what()); exit(EXIT_FAILURE); - } else { - LOGS(logger, VERBOSE) << "Initialize Model Successfully!"; } - status = env->GetSession()->Initialize(); - if (!status.IsOK()) { - LOGS(logger, FATAL) << "Session Initialization Failed:" << status.Code() << " ---- Error: [" << status.ErrorMessage() << "]"; - exit(EXIT_FAILURE); - } else { - LOGS(logger, VERBOSE) << "Initialize Session Successfully!"; - } + //Setup GRPC Server + auto const grpc_address = config.address; + auto const grpc_port = config.grpc_port; + + server::GRPCApp grpc_app{env, grpc_address, grpc_port}; + logger->info("GRPC Listening at: {}:{}", grpc_address, grpc_port); + + //Setup HTTP Server auto const boost_address = boost::asio::ip::make_address(config.address); server::App app{}; app.RegisterStartup( [&env](const auto& details) -> void { auto logger = env->GetAppLogger(); - LOGS(logger, INFO) << "Listening at: " - << "http://" << details.address << ":" << details.port; + logger->info("Listening at: http://{}:{}", details.address.to_string(), details.port); }); app.RegisterError( [&env](auto& context) -> void { auto logger = env->GetLogger(context.request_id); - LOGS(*logger, VERBOSE) << "Error code: " << context.error_code; - LOGS(*logger, VERBOSE) << "Error message: " << context.error_message; + logger->debug("Error code: {}", context.error_code); + logger->debug("Error message: {}", context.error_message); context.response.result(context.error_code); context.response.insert("Content-Type", "application/json"); - context.response.insert("x-ms-request-id", context.request_id); + context.response.insert(server::util::MS_REQUEST_ID_HEADER, context.request_id); if (!context.client_request_id.empty()) { - context.response.insert("x-ms-client-request-id", (context).client_request_id); + context.response.insert(server::util::MS_CLIENT_REQUEST_ID_HEADER, (context).client_request_id); } context.response.body() = server::CreateJsonError(context.error_code, context.error_message); }); @@ -108,5 +113,7 @@ int main(int argc, char* argv[]) { .NumThreads(config.num_http_threads) .Run(); + grpc_app.Run(); + return EXIT_SUCCESS; } diff --git a/onnxruntime/server/protobuf/onnx-ml.proto b/onnxruntime/server/protobuf/onnx-ml.proto deleted file mode 120000 index 8e7e1a6a3db65..0000000000000 --- a/onnxruntime/server/protobuf/onnx-ml.proto +++ /dev/null @@ -1 +0,0 @@ -../../core/protobuf/onnx-ml.proto3 \ No newline at 
end of file diff --git a/onnxruntime/server/protobuf/onnx-ml.proto b/onnxruntime/server/protobuf/onnx-ml.proto new file mode 100644 index 0000000000000..857eb6f68edd1 --- /dev/null +++ b/onnxruntime/server/protobuf/onnx-ml.proto @@ -0,0 +1,599 @@ +// +// WARNING: This file is automatically generated! Please edit onnx.in.proto. +// + + +// Copyright (c) ONNX Project Contributors. +// Licensed under the MIT license. + +syntax = "proto2"; + +package onnx; + +// Overview +// +// ONNX is an open specification that is comprised of the following components: +// +// 1) A definition of an extensible computation graph model. +// 2) Definitions of standard data types. +// 3) Definitions of built-in operators. +// +// This document describes the syntax of models and their computation graphs, +// as well as the standard data types. Together, they are referred to as the ONNX +// Intermediate Representation, or 'IR' for short. +// +// The normative semantic specification of the ONNX IR is found in docs/IR.md. +// Definitions of the built-in neural network operators may be found in docs/Operators.md. +// Definitions of the built-in classical machine learning operators may be found in +// docs/Operators-ml.md. + +// Notes +// +// Release +// +// We are still in the very early stage of defining ONNX. The current +// version of ONNX is a starting point. While we are actively working +// towards a complete spec, we would like to get the community involved +// by sharing our working version of ONNX. +// +// Protobuf compatibility +// +// To simplify framework compatibility, ONNX is defined using the subset of protobuf +// that is compatible with both protobuf v2 and v3. This means that we do not use any +// protobuf features that are only available in one of the two versions. +// +// Here are the most notable contortions we have to carry out to work around +// these limitations: +// +// - No 'map' (added protobuf 3.0). We instead represent mappings as lists +// of key-value pairs, where order does not matter and duplicates +// are not allowed. + + +// Versioning +// +// ONNX versioning is specified in docs/IR.md and elaborated on in docs/Versioning.md +// +// To be compatible with both proto2 and proto3, we will use a version number +// that is not defined by the default value but an explicit enum number. +enum Version { + // proto3 requires the first enum value to be zero. + // We add this just to appease the compiler. + _START_VERSION = 0; + // The version field is always serialized and we will use it to store the + // version that the graph is generated from. This helps us set up version + // control. + // For the IR, we are using simple numbers starting with with 0x00000001, + // which was the version we published on Oct 10, 2017. + IR_VERSION_2017_10_10 = 0x0000000000000001; + + // IR_VERSION 2 published on Oct 30, 2017 + // - Added type discriminator to AttributeProto to support proto3 users + IR_VERSION_2017_10_30 = 0x0000000000000002; + + // IR VERSION 3 published on Nov 3, 2017 + // - For operator versioning: + // - Added new message OperatorSetIdProto + // - Added opset_import in ModelProto + // - For vendor extensions, added domain in NodeProto + IR_VERSION_2017_11_3 = 0x0000000000000003; + + // IR VERSION 4 published on Jan 22, 2019 + // - Relax constraint that initializers should be a subset of graph inputs + // - Add type BFLOAT16 + IR_VERSION_2019_1_22 = 0x0000000000000004; + + // IR VERSION 5 published on March 18, 2019 + // - Add message TensorAnnotation. 
+ // - Add quantization annotation in GraphProto to map tensor with its scale and zero point quantization parameters. + IR_VERSION = 0x0000000000000005; +} + +// Attributes +// +// A named attribute containing either singular float, integer, string, graph, +// and tensor values, or repeated float, integer, string, graph, and tensor values. +// An AttributeProto MUST contain the name field, and *only one* of the +// following content fields, effectively enforcing a C/C++ union equivalent. +message AttributeProto { + + // Note: this enum is structurally identical to the OpSchema::AttrType + // enum defined in schema.h. If you rev one, you likely need to rev the other. + enum AttributeType { + UNDEFINED = 0; + FLOAT = 1; + INT = 2; + STRING = 3; + TENSOR = 4; + GRAPH = 5; + + FLOATS = 6; + INTS = 7; + STRINGS = 8; + TENSORS = 9; + GRAPHS = 10; + } + + // The name field MUST be present for this version of the IR. + optional string name = 1; // namespace Attribute + + // if ref_attr_name is not empty, ref_attr_name is the attribute name in parent function. + // In this case, this AttributeProto does not contain data, and it's a reference of attribute + // in parent scope. + // NOTE: This should ONLY be used in function (sub-graph). It's invalid to be used in main graph. + optional string ref_attr_name = 21; + + // A human-readable documentation for this attribute. Markdown is allowed. + optional string doc_string = 13; + + // The type field MUST be present for this version of the IR. + // For 0.0.1 versions of the IR, this field was not defined, and + // implementations needed to use has_field hueristics to determine + // which value field was in use. For IR_VERSION 0.0.2 or later, this + // field MUST be set and match the f|i|s|t|... field in use. This + // change was made to accomodate proto3 implementations. + optional AttributeType type = 20; // discriminator that indicates which field below is in use + + // Exactly ONE of the following fields must be present for this version of the IR + optional float f = 2; // float + optional int64 i = 3; // int + optional bytes s = 4; // UTF-8 string + optional TensorProto t = 5; // tensor value + optional GraphProto g = 6; // graph + // Do not use field below, it's deprecated. + // optional ValueProto v = 12; // value - subsumes everything but graph + + repeated float floats = 7; // list of floats + repeated int64 ints = 8; // list of ints + repeated bytes strings = 9; // list of UTF-8 strings + repeated TensorProto tensors = 10; // list of tensors + repeated GraphProto graphs = 11; // list of graph +} + +// Defines information on value, including the name, the type, and +// the shape of the value. +message ValueInfoProto { + // This field MUST be present in this version of the IR. + optional string name = 1; // namespace Value + // This field MUST be present in this version of the IR. + optional TypeProto type = 2; + // A human-readable documentation for this value. Markdown is allowed. + optional string doc_string = 3; +} + +// Nodes +// +// Computation graphs are made up of a DAG of nodes, which represent what is +// commonly called a "layer" or "pipeline stage" in machine learning frameworks. +// +// For example, it can be a node of type "Conv" that takes in an image, a filter +// tensor and a bias tensor, and produces the convolved output. +message NodeProto { + repeated string input = 1; // namespace Value + repeated string output = 2; // namespace Value + + // An optional identifier for this node in a graph. 
+ // This field MAY be absent in ths version of the IR. + optional string name = 3; // namespace Node + + // The symbolic identifier of the Operator to execute. + optional string op_type = 4; // namespace Operator + // The domain of the OperatorSet that specifies the operator named by op_type. + optional string domain = 7; // namespace Domain + + // Additional named attributes. + repeated AttributeProto attribute = 5; + + // A human-readable documentation for this node. Markdown is allowed. + optional string doc_string = 6; +} + +// Models +// +// ModelProto is a top-level file/container format for bundling a ML model and +// associating its computation graph with metadata. +// +// The semantics of the model are described by the associated GraphProto. +message ModelProto { + // The version of the IR this model targets. See Version enum above. + // This field MUST be present. + optional int64 ir_version = 1; + + // The OperatorSets this model relies on. + // All ModelProtos MUST have at least one entry that + // specifies which version of the ONNX OperatorSet is + // being imported. + // + // All nodes in the ModelProto's graph will bind against the operator + // with the same-domain/same-op_type operator with the HIGHEST version + // in the referenced operator sets. + repeated OperatorSetIdProto opset_import = 8; + + // The name of the framework or tool used to generate this model. + // This field SHOULD be present to indicate which implementation/tool/framework + // emitted the model. + optional string producer_name = 2; + + // The version of the framework or tool used to generate this model. + // This field SHOULD be present to indicate which implementation/tool/framework + // emitted the model. + optional string producer_version = 3; + + // Domain name of the model. + // We use reverse domain names as name space indicators. For example: + // `com.facebook.fair` or `com.microsoft.cognitiveservices` + // + // Together with `model_version` and GraphProto.name, this forms the unique identity of + // the graph. + optional string domain = 4; + + // The version of the graph encoded. See Version enum below. + optional int64 model_version = 5; + + // A human-readable documentation for this model. Markdown is allowed. + optional string doc_string = 6; + + // The parameterized graph that is evaluated to execute the model. + optional GraphProto graph = 7; + + // kezhan: This field is not in ONNX, and will be pushed into ONNX with good use cases in microsoft. + repeated FunctionProto functions = 100; + + // Named metadata values; keys should be distinct. + repeated StringStringEntryProto metadata_props = 14; +}; + +// StringStringEntryProto follows the pattern for cross-proto-version maps. +// See https://developers.google.com/protocol-buffers/docs/proto3#maps +message StringStringEntryProto { + optional string key = 1; + optional string value= 2; +}; + +message TensorAnnotation { + optional string tensor_name = 1; + // pairs to annotate tensor specified by above. + // The keys used in the mapping below must be pre-defined in ONNX spec. + // For example, for 8-bit linear quantization case, 'SCALE_TENSOR', 'ZERO_POINT_TENSOR' will be pre-defined as + // quantization parameter keys. + repeated StringStringEntryProto quant_parameter_tensor_names = 2; +} + +// Graphs +// +// A graph defines the computational logic of a model and is comprised of a parameterized +// list of nodes that form a directed acyclic graph based on their inputs and outputs. 
+// This is the equivalent of the "network" or "graph" in many deep learning +// frameworks. +message GraphProto { + // The nodes in the graph, sorted topologically. + repeated NodeProto node = 1; + + // The name of the graph. + optional string name = 2; // namespace Graph + + // A list of named tensor values, used to specify constant inputs of the graph. + // Each TensorProto entry must have a distinct name (within the list) that + // MAY also appear in the input list. + repeated TensorProto initializer = 5; + + // A human-readable documentation for this graph. Markdown is allowed. + optional string doc_string = 10; + + // The inputs and outputs of the graph. + repeated ValueInfoProto input = 11; + repeated ValueInfoProto output = 12; + + // Information for the values in the graph. The ValueInfoProto.name's + // must be distinct. It is optional for a value to appear in value_info list. + repeated ValueInfoProto value_info = 13; + + // This field carries information to indicate the mapping among a tensor and its + // quantization parameter tensors. For example: + // For tensor 'a', it may have {'SCALE_TENSOR', 'a_scale'} and {'ZERO_POINT_TENSOR', 'a_zero_point'} annotated, + // which means, tensor 'a_scale' and tensor 'a_zero_point' are scale and zero point of tensor 'a' in the model. + repeated TensorAnnotation quantization_annotation = 14; + + // DO NOT USE the following fields, they were deprecated from earlier versions. + // repeated string input = 3; + // repeated string output = 4; + // optional int64 ir_version = 6; + // optional int64 producer_version = 7; + // optional string producer_tag = 8; + // optional string domain = 9; +} + +// Tensors +// +// A serialized tensor value. +message TensorProto { + enum DataType { + UNDEFINED = 0; + // Basic types. + FLOAT = 1; // float + UINT8 = 2; // uint8_t + INT8 = 3; // int8_t + UINT16 = 4; // uint16_t + INT16 = 5; // int16_t + INT32 = 6; // int32_t + INT64 = 7; // int64_t + STRING = 8; // string + BOOL = 9; // bool + + // IEEE754 half-precision floating-point format (16 bits wide). + // This format has 1 sign bit, 5 exponent bits, and 10 mantissa bits. + FLOAT16 = 10; + + DOUBLE = 11; + UINT32 = 12; + UINT64 = 13; + COMPLEX64 = 14; // complex with float32 real and imaginary components + COMPLEX128 = 15; // complex with float64 real and imaginary components + + // Non-IEEE floating-point format based on IEEE754 single-precision + // floating-point number truncated to 16 bits. + // This format has 1 sign bit, 8 exponent bits, and 7 mantissa bits. + BFLOAT16 = 16; + + // Future extensions go here. + } + + // The shape of the tensor. + repeated int64 dims = 1; + + // The data type of the tensor. + // This field MUST have a valid TensorProto.DataType value + optional int32 data_type = 2; + + // For very large tensors, we may want to store them in chunks, in which + // case the following fields will specify the segment that is stored in + // the current TensorProto. + message Segment { + optional int64 begin = 1; + optional int64 end = 2; + } + optional Segment segment = 3; + + // Tensor content must be organized in row-major order. + // + // Depending on the data_type field, exactly one of the fields below with + // name ending in _data is used to store the elements of the tensor. 
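As a concrete reading of the dims/data_type contract above: the element count of a TensorProto is the product of its dims, and an empty dims list denotes a scalar with one element. A small sketch against the generated API (the helper name is illustrative):
```
#include <cstdint>
#include "onnx-ml.pb.h"

// Number of elements implied by the dims field; empty dims means a scalar.
int64_t ElementCount(const onnx::TensorProto& t) {
  int64_t count = 1;
  for (int i = 0; i < t.dims_size(); ++i) {
    count *= t.dims(i);
  }
  return count;
}
```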
+ + // For float and complex64 values + // Complex64 tensors are encoded as a single array of floats, + // with the real components appearing in odd numbered positions, + // and the corresponding imaginary component apparing in the + // subsequent even numbered position. (e.g., [1.0 + 2.0i, 3.0 + 4.0i] + // is encoded as [1.0, 2.0 ,3.0 ,4.0] + // When this field is present, the data_type field MUST be FLOAT or COMPLEX64. + repeated float float_data = 4 [packed = true]; + + // For int32, uint8, int8, uint16, int16, bool, and float16 values + // float16 values must be bit-wise converted to an uint16_t prior + // to writing to the buffer. + // When this field is present, the data_type field MUST be + // INT32, INT16, INT8, UINT16, UINT8, BOOL, or FLOAT16 + repeated int32 int32_data = 5 [packed = true]; + + // For strings. + // Each element of string_data is a UTF-8 encoded Unicode + // string. No trailing null, no leading BOM. The protobuf "string" + // scalar type is not used to match ML community conventions. + // When this field is present, the data_type field MUST be STRING + repeated bytes string_data = 6; + + // For int64. + // When this field is present, the data_type field MUST be INT64 + repeated int64 int64_data = 7 [packed = true]; + + // Optionally, a name for the tensor. + optional string name = 8; // namespace Value + + // A human-readable documentation for this tensor. Markdown is allowed. + optional string doc_string = 12; + + // Serializations can either use one of the fields above, or use this + // raw bytes field. The only exception is the string case, where one is + // required to store the content in the repeated bytes string_data field. + // + // When this raw_data field is used to store tensor value, elements MUST + // be stored in as fixed-width, little-endian order. + // Floating-point data types MUST be stored in IEEE 754 format. + // Complex64 elements must be written as two consecutive FLOAT values, real component first. + // Complex128 elements must be written as two consecutive DOUBLE values, real component first. + // Boolean type MUST be written one byte per tensor element (00000001 for true, 00000000 for false). + // + // Note: the advantage of specific field rather than the raw_data field is + // that in some cases (e.g. int data), protobuf does a better packing via + // variable length storage, and may lead to smaller binary footprint. + // When this field is present, the data_type field MUST NOT be STRING or UNDEFINED + optional bytes raw_data = 9; + + // Data can be stored inside the protobuf file using type-specific fields or raw_data. + // Alternatively, raw bytes data can be stored in an external file, using the external_data field. + // external_data stores key-value pairs describing data location. Recognized keys are: + // - "location" (required) - POSIX filesystem path relative to the directory where the ONNX + // protobuf model was stored + // - "offset" (optional) - position of byte at which stored data begins. Integer stored as string. + // Offset values SHOULD be multiples 4096 (page size) to enable mmap support. + // - "length" (optional) - number of bytes containing data. Integer stored as string. + // - "checksum" (optional) - SHA1 digest of file specified in under 'location' key. + repeated StringStringEntryProto external_data = 13; + + // Location of the data for this tensor. MUST be one of: + // - DEFAULT - data stored inside the protobuf message. Data is stored in raw_data (if set) otherwise in type-specified field. 
+ // - EXTERNAL - data stored in an external location as described by external_data field. + enum DataLocation { + DEFAULT = 0; + EXTERNAL = 1; + } + + // If value not set, data is stored in raw_data (if set) otherwise in type-specified field. + optional DataLocation data_location = 14; + + // For double + // Complex128 tensors are encoded as a single array of doubles, + // with the real components appearing in odd numbered positions, + // and the corresponding imaginary component apparing in the + // subsequent even numbered position. (e.g., [1.0 + 2.0i, 3.0 + 4.0i] + // is encoded as [1.0, 2.0 ,3.0 ,4.0] + // When this field is present, the data_type field MUST be DOUBLE or COMPLEX128 + repeated double double_data = 10 [packed = true]; + + // For uint64 and uint32 values + // When this field is present, the data_type field MUST be + // UINT32 or UINT64 + repeated uint64 uint64_data = 11 [packed = true]; +} + +// Defines a tensor shape. A dimension can be either an integer value +// or a symbolic variable. A symbolic variable represents an unknown +// dimension. +message TensorShapeProto { + message Dimension { + oneof value { + int64 dim_value = 1; + string dim_param = 2; // namespace Shape + }; + // Standard denotation can optionally be used to denote tensor + // dimensions with standard semantic descriptions to ensure + // that operations are applied to the correct axis of a tensor. + // Refer to https://github.com/onnx/onnx/blob/master/docs/DimensionDenotation.md#denotation-definition + // for pre-defined dimension denotations. + optional string denotation = 3; + }; + repeated Dimension dim = 1; +} + +// Types +// +// The standard ONNX data types. +message TypeProto { + + message Tensor { + // This field MUST NOT have the value of UNDEFINED + // This field MUST have a valid TensorProto.DataType value + // This field MUST be present for this version of the IR. + optional int32 elem_type = 1; + optional TensorShapeProto shape = 2; + } + + + // repeated T + message Sequence { + // The type and optional shape of each element of the sequence. + // This field MUST be present for this version of the IR. + optional TypeProto elem_type = 1; + }; + + // map + message Map { + // This field MUST have a valid TensorProto.DataType value + // This field MUST be present for this version of the IR. + // This field MUST refer to an integral type ([U]INT{8|16|32|64}) or STRING + optional int32 key_type = 1; + // This field MUST be present for this version of the IR. + optional TypeProto value_type = 2; + }; + + message Opaque { + // When missing, the domain is the same as the model's. + optional string domain = 1; + // The name is optional but significant when provided. + optional string name = 2; + // parameters that help defining the type + // DEPRECATED do not use. + // repeated TypeProto parameters = 3; + } + + message SparseTensor { + // This field MUST NOT have the value of UNDEFINED + // This field MUST have a valid TensorProto.DataType value + // This field MUST be present for this version of the IR. + optional int32 elem_type = 1; + optional TensorShapeProto shape = 2; + } + + + oneof value { + // The type of a tensor. + Tensor tensor_type = 1; + + + // NOTE: DNN-only implementations of ONNX MAY elect to not support non-tensor values + // as input and output to graphs and nodes. These types are needed to naturally + // support classical ML operators. DNN operators SHOULD restrict their input + // and output types to tensors. + + // The type of a sequence. 
+ Sequence sequence_type = 4; + + // The type of a map. + Map map_type = 5; + + Opaque opaque_type = 7; + + SparseTensor sparse_tensor_type = 8; + + } + + // An optional denotation can be used to denote the whole + // type with a standard semantic description as to what is + // stored inside. Refer to https://github.com/onnx/onnx/blob/master/docs/TypeDenotation.md#type-denotation-definition + // for pre-defined type denotations. + optional string denotation = 6; +} + +// Operator Sets +// +// OperatorSets are uniquely identified by a (domain, opset_version) pair. +message OperatorSetIdProto { + // The domain of the operator set being identified. + // The empty string ("") or absence of this field implies the operator + // set that is defined as part of the ONNX specification. + // This field MUST be present in this version of the IR when referring to any other operator set. + optional string domain = 1; + + // The version of the operator set being identified. + // This field MUST be present in this version of the IR. + optional int64 version = 2; +} + +// Operator/function status. +enum OperatorStatus { + EXPERIMENTAL = 0; + STABLE = 1; +} + +message FunctionProto { + // The name of the function, similar usage of op_type in OperatorProto. + optional string name = 1; + + // The first version of a function set which contains this function. + // When there's any breaking change for this function, the function set + // contains the function needs to bump its version, and since_version of + // the updated function will be changed to the updated function set version. + optional int64 since_version = 2; + + // This field indicates whether the syntax, semantics, or presence + // of this function is in an experimental or stable stage. Once an + // function is published as STABLE, its syntax and semantics MUST NOT + // change in subsequent versions of the operator set. + // When a function is published as EXPERIMENTAL, the syntax and semantics + // of the function MAY change across operator set versions. + // Functions "become" stable by deprecating the experimental version and + // introducing a new stable function with the same name. + optional OperatorStatus status = 3; + + // The inputs and outputs of the function. + repeated string input = 4; + repeated string output = 5; + + // The attributes of the function. + repeated string attribute= 6; + + // The nodes in the function. + repeated NodeProto node = 7; + // A human-readable documentation for this function. Markdown is allowed. + optional string doc_string = 8; +} \ No newline at end of file diff --git a/onnxruntime/server/protobuf/predict.proto b/onnxruntime/server/protobuf/predict.proto index 21b04386353eb..e71d4c98147c0 100644 --- a/onnxruntime/server/protobuf/predict.proto +++ b/onnxruntime/server/protobuf/predict.proto @@ -4,24 +4,24 @@ import "onnx-ml.proto"; package onnxruntime.server; -// PredictRequest specifies how inputs are mapped to tensors -// and how outputs are filtered before returning to user. -message PredictRequest { - reserved 1; +// PredictRequest specifies how inputs are mapped to tensors +// and how outputs are filtered before returning to user. +message PredictRequest { + reserved 1; - // Input Tensors. - // This is a mapping between output name and tensor. - map inputs = 2; + // Input Tensors. + // This is a mapping between output name and tensor. + map inputs = 2; - // Output Filters. - // This field is to specify which output fields need to be returned. - // If the list is empty, all outputs will be included. 
- repeated string output_filter = 3; -} + // Output Filters. + // This field is to specify which output fields need to be returned. + // If the list is empty, all outputs will be included. + repeated string output_filter = 3; +} -// Response for PredictRequest on successful run. +// Response for PredictRequest on successful run. message PredictResponse { - // Output Tensors. - // This is a mapping between output name and tensor. - map outputs = 1; + // Output Tensors. + // This is a mapping between output name and tensor. + map outputs = 1; } \ No newline at end of file diff --git a/onnxruntime/server/protobuf/prediction_service.proto b/onnxruntime/server/protobuf/prediction_service.proto new file mode 100644 index 0000000000000..268010ac10c27 --- /dev/null +++ b/onnxruntime/server/protobuf/prediction_service.proto @@ -0,0 +1,8 @@ +syntax = "proto3"; +import "predict.proto"; + +package onnxruntime.server; + +service PredictionService { + rpc Predict(PredictRequest) returns (PredictResponse); +} \ No newline at end of file diff --git a/onnxruntime/server/serializing/mem_buffer.h b/onnxruntime/server/serializing/mem_buffer.h new file mode 100644 index 0000000000000..5675466a2492b --- /dev/null +++ b/onnxruntime/server/serializing/mem_buffer.h @@ -0,0 +1,21 @@ +#pragma once +#include "core/common/common.h" + +namespace onnxruntime { +namespace server { +class MemBuffer { + public: + MemBuffer(void* buffer, size_t len, const OrtAllocatorInfo& alloc_info) + : buffer_(buffer), len_(len), alloc_info_(alloc_info) {} + void* GetBuffer() const { return buffer_; } + + size_t GetLen() const { return len_; } + const OrtAllocatorInfo& GetAllocInfo() const { return alloc_info_; } + + private: + void* const buffer_; + const size_t len_; + const OrtAllocatorInfo& alloc_info_; +}; +} // namespace server +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/server/serializing/tensorprotoutils.cc b/onnxruntime/server/serializing/tensorprotoutils.cc new file mode 100644 index 0000000000000..9fa76192c7559 --- /dev/null +++ b/onnxruntime/server/serializing/tensorprotoutils.cc @@ -0,0 +1,431 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "tensorprotoutils.h" + +#include +#include +#include +#include +#include "core/framework/data_types.h" +#include "core/framework/allocator.h" +#include "onnx-ml.pb.h" +#include "core/session/onnxruntime_cxx_api.h" + +namespace onnxruntime { +namespace server { +#ifdef __GNUC__ +constexpr inline bool IsLittleEndianOrder() noexcept { return __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__; } +#else +// On Windows and Mac, this function should always return true +GSL_SUPPRESS(type .1) // allow use of reinterpret_cast for this special case +inline bool IsLittleEndianOrder() noexcept { + static int n = 1; + return (*reinterpret_cast(&n) == 1); +} +#endif + +//From core common +inline void MakeStringInternal(std::ostringstream& /*ss*/) noexcept { +} + +template +inline void MakeStringInternal(std::ostringstream& ss, const T& t) noexcept { + ss << t; +} + +template +inline void MakeStringInternal(std::ostringstream& ss, const T& t, const Args&... args) noexcept { + ::onnxruntime::MakeStringInternal(ss, t); + ::onnxruntime::MakeStringInternal(ss, args...); +} + +template +std::string MakeString(const Args&... args) { + std::ostringstream ss; + ::onnxruntime::MakeStringInternal(ss, args...); + return std::string(ss.str()); +} + +// Specializations for already-a-string types. 
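+// MakeString streams each argument through a std::ostringstream, e.g.
+// MakeString("expected ", 4, ", got ", 7) yields "expected 4, got 7".
+// The overloads below return their argument directly and skip the stream round-trip.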
+template <> +inline std::string MakeString(const std::string& str) { + return str; +} +inline std::string MakeString(const char* p_str) { + return p_str; +} + +std::vector GetTensorShapeFromTensorProto(const onnx::TensorProto& tensor_proto) { + const auto& dims = tensor_proto.dims(); + std::vector tensor_shape_vec(static_cast(dims.size())); + for (int i = 0; i < dims.size(); ++i) { + tensor_shape_vec[i] = dims[i]; + } + + return tensor_shape_vec; +} + +// This function doesn't support string tensors +template +static void UnpackTensorWithRawData(const void* raw_data, size_t raw_data_length, size_t expected_size, + /*out*/ T* p_data) { + // allow this low level routine to be somewhat unsafe. assuming it's thoroughly tested and valid + GSL_SUPPRESS(type) // type.1 reinterpret-cast; type.4 C-style casts; type.5 'T result;' is uninitialized; + GSL_SUPPRESS(bounds .1) // pointer arithmetic + GSL_SUPPRESS(f .23) // buff and temp_bytes never tested for nullness and could be gsl::not_null + { + size_t expected_size_in_bytes; + if (!onnxruntime::IAllocator::CalcMemSizeForArray(expected_size, sizeof(T), &expected_size_in_bytes)) { + throw Ort::Exception("size overflow", OrtErrorCode::ORT_FAIL); + } + if (raw_data_length != expected_size_in_bytes) + throw Ort::Exception(MakeString("UnpackTensor: the pre-allocated size does not match the raw data size, expected ", + expected_size_in_bytes, ", got ", raw_data_length), + OrtErrorCode::ORT_FAIL); + if (IsLittleEndianOrder()) { + memcpy(p_data, raw_data, raw_data_length); + } else { + const size_t type_size = sizeof(T); + const char* buff = reinterpret_cast(raw_data); + for (size_t i = 0; i < raw_data_length; i += type_size, buff += type_size) { + T result; + const char* temp_bytes = reinterpret_cast(&result); + for (size_t j = 0; j < type_size; ++j) { + memcpy((void*)&temp_bytes[j], (void*)&buff[type_size - 1 - i], 1); + } + p_data[i] = result; + } + } + } +} + +// This macro doesn't work for Float16/bool/string tensors +#define DEFINE_UNPACK_TENSOR(T, Type, field_name, field_size) \ + template <> \ + void UnpackTensor(const onnx::TensorProto& tensor, const void* raw_data, size_t raw_data_len, \ + /*out*/ T* p_data, int64_t expected_size) { \ + if (nullptr == p_data) { \ + const size_t size = raw_data != nullptr ? 
raw_data_len : tensor.field_size(); \ + if (size == 0) return; \ + throw Ort::Exception("", OrtErrorCode::ORT_INVALID_ARGUMENT); \ + } \ + if (nullptr == p_data || Type != tensor.data_type()) { \ + throw Ort::Exception("", OrtErrorCode::ORT_INVALID_ARGUMENT); \ + } \ + if (raw_data != nullptr) { \ + UnpackTensorWithRawData(raw_data, raw_data_len, expected_size, p_data); \ + return; \ + } \ + if (tensor.field_size() != expected_size) \ + throw Ort::Exception(MakeString("corrupted protobuf data: tensor shape size(", expected_size, \ + ") does not match the data size(", tensor.field_size(), ") in proto"), \ + OrtErrorCode::ORT_FAIL); \ + auto& data = tensor.field_name(); \ + for (auto data_iter = data.cbegin(); data_iter != data.cend(); ++data_iter) \ + *p_data++ = *reinterpret_cast(data_iter); \ + return; \ + } + +// TODO: complex64 complex128 +DEFINE_UNPACK_TENSOR(float, onnx::TensorProto_DataType_FLOAT, float_data, float_data_size) +DEFINE_UNPACK_TENSOR(double, onnx::TensorProto_DataType_DOUBLE, double_data, double_data_size); +DEFINE_UNPACK_TENSOR(uint8_t, onnx::TensorProto_DataType_UINT8, int32_data, int32_data_size) +DEFINE_UNPACK_TENSOR(int8_t, onnx::TensorProto_DataType_INT8, int32_data, int32_data_size) +DEFINE_UNPACK_TENSOR(int16_t, onnx::TensorProto_DataType_INT16, int32_data, int32_data_size) +DEFINE_UNPACK_TENSOR(uint16_t, onnx::TensorProto_DataType_UINT16, int32_data, int32_data_size) +DEFINE_UNPACK_TENSOR(int32_t, onnx::TensorProto_DataType_INT32, int32_data, int32_data_size) +DEFINE_UNPACK_TENSOR(int64_t, onnx::TensorProto_DataType_INT64, int64_data, int64_data_size) +DEFINE_UNPACK_TENSOR(uint64_t, onnx::TensorProto_DataType_UINT64, uint64_data, uint64_data_size) +DEFINE_UNPACK_TENSOR(uint32_t, onnx::TensorProto_DataType_UINT32, uint64_data, uint64_data_size) + +// doesn't support raw data +template <> +void UnpackTensor(const onnx::TensorProto& tensor, const void* /*raw_data*/, size_t /*raw_data_len*/, + /*out*/ std::string* p_data, int64_t expected_size) { + if (nullptr == p_data) { + if (tensor.string_data_size() == 0) return; + throw Ort::Exception("", OrtErrorCode::ORT_INVALID_ARGUMENT); + } + if (onnx::TensorProto_DataType_STRING != tensor.data_type()) { + throw Ort::Exception("", OrtErrorCode::ORT_INVALID_ARGUMENT); + } + + if (tensor.string_data_size() != expected_size) + throw Ort::Exception( + "UnpackTensor: the pre-allocate size does not match the size in proto", OrtErrorCode::ORT_FAIL); + + auto& string_data = tensor.string_data(); + for (const auto& iter : string_data) { + *p_data++ = iter; + } + + return; +} +template <> +void UnpackTensor(const onnx::TensorProto& tensor, const void* raw_data, size_t raw_data_len, + /*out*/ bool* p_data, int64_t expected_size) { + if (nullptr == p_data) { + const size_t size = raw_data != nullptr ? 
raw_data_len : tensor.int32_data_size(); + if (size == 0) return; + throw Ort::Exception("", OrtErrorCode::ORT_INVALID_ARGUMENT); + } + if (onnx::TensorProto_DataType_BOOL != tensor.data_type()) { + throw Ort::Exception("", OrtErrorCode::ORT_INVALID_ARGUMENT); + } + + if (raw_data != nullptr) { + return UnpackTensorWithRawData(raw_data, raw_data_len, expected_size, p_data); + } + + if (tensor.int32_data_size() != expected_size) + throw Ort::Exception( + "UnpackTensor: the pre-allocate size does not match the size in proto", OrtErrorCode::ORT_FAIL); + for (int iter : tensor.int32_data()) { + *p_data++ = static_cast(iter); + } + + return; +} +template <> +void UnpackTensor(const onnx::TensorProto& tensor, const void* raw_data, size_t raw_data_len, + /*out*/ MLFloat16* p_data, int64_t expected_size) { + if (nullptr == p_data) { + const size_t size = raw_data != nullptr ? raw_data_len : tensor.int32_data_size(); + if (size == 0) return; + throw Ort::Exception("", OrtErrorCode::ORT_INVALID_ARGUMENT); + } + if (onnx::TensorProto_DataType_FLOAT16 != tensor.data_type()) { + throw Ort::Exception("", OrtErrorCode::ORT_INVALID_ARGUMENT); + } + + if (raw_data != nullptr) { + return UnpackTensorWithRawData(raw_data, raw_data_len, expected_size, p_data); + } + + if (tensor.int32_data_size() != expected_size) + throw Ort::Exception( + "UnpackTensor: the pre-allocate size does not match the size in proto", OrtErrorCode::ORT_FAIL); + + constexpr int max_value = std::numeric_limits::max(); + for (int i = 0; i < static_cast(expected_size); i++) { + int v = tensor.int32_data()[i]; + if (v < 0 || v > max_value) { + throw Ort::Exception( + "data overflow", OrtErrorCode::ORT_FAIL); + } + p_data[i] = MLFloat16(static_cast(v)); + } + + return; +} + +template <> +void UnpackTensor(const onnx::TensorProto& tensor, const void* raw_data, size_t raw_data_len, + /*out*/ BFloat16* p_data, int64_t expected_size) { + if (nullptr == p_data) { + const size_t size = raw_data != nullptr ? 
raw_data_len : tensor.int32_data_size(); + if (size == 0) + return; + + throw Ort::Exception("", OrtErrorCode::ORT_INVALID_ARGUMENT); + } + if (onnx::TensorProto_DataType_BFLOAT16 != tensor.data_type()) { + throw Ort::Exception("", OrtErrorCode::ORT_INVALID_ARGUMENT); + } + + if (raw_data != nullptr) { + return UnpackTensorWithRawData(raw_data, raw_data_len, expected_size, p_data); + } + + if (tensor.int32_data_size() != expected_size) + throw Ort::Exception( + "UnpackTensor: the pre-allocate size does not match the size in proto", OrtErrorCode::ORT_FAIL); + + constexpr int max_value = std::numeric_limits::max(); + for (int i = 0; i < static_cast(expected_size); i++) { + int v = tensor.int32_data()[i]; + if (v < 0 || v > max_value) { + throw Ort::Exception( + "data overflow", OrtErrorCode::ORT_FAIL); + } + p_data[i] = BFloat16(static_cast(v)); + } + + return; +} + +#define CASE_PROTO_TRACE(X, Y) \ + case onnx::TensorProto_DataType::TensorProto_DataType_##X: \ + if (!IAllocator::CalcMemSizeForArrayWithAlignment(size, sizeof(Y), out)) { \ + throw Ort::Exception("Invalid TensorProto", OrtErrorCode::ORT_FAIL); \ + } \ + break; + +template +void GetSizeInBytesFromTensorProto(const onnx::TensorProto& tensor_proto, size_t* out) { + const auto& dims = tensor_proto.dims(); + size_t size = 1; + for (google::protobuf::int64 dim : dims) { + if (dim < 0 || static_cast(dim) >= std::numeric_limits::max()) { + throw Ort::Exception("Invalid TensorProto", OrtErrorCode::ORT_FAIL); + } + if (!IAllocator::CalcMemSizeForArray(size, static_cast(dim), &size)) { + throw Ort::Exception("Invalid TensorProto", OrtErrorCode::ORT_FAIL); + } + } + switch (tensor_proto.data_type()) { + CASE_PROTO_TRACE(FLOAT, float); + CASE_PROTO_TRACE(DOUBLE, double); + CASE_PROTO_TRACE(BOOL, bool); + CASE_PROTO_TRACE(INT8, int8_t); + CASE_PROTO_TRACE(INT16, int16_t); + CASE_PROTO_TRACE(INT32, int32_t); + CASE_PROTO_TRACE(INT64, int64_t); + CASE_PROTO_TRACE(UINT8, uint8_t); + CASE_PROTO_TRACE(UINT16, uint16_t); + CASE_PROTO_TRACE(UINT32, uint32_t); + CASE_PROTO_TRACE(UINT64, uint64_t); + CASE_PROTO_TRACE(FLOAT16, MLFloat16); + CASE_PROTO_TRACE(BFLOAT16, BFloat16); + CASE_PROTO_TRACE(STRING, std::string); + default: + throw Ort::Exception("", OrtErrorCode::ORT_NOT_IMPLEMENTED); + } + return; +} + +struct UnInitializeParam { + void* preallocated; + size_t preallocated_size; + ONNXTensorElementDataType ele_type; +}; + +void OrtInitializeBufferForTensor(void* input, size_t input_len, + ONNXTensorElementDataType type) { + try { + if (type != ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING || input == nullptr) return; + size_t tensor_size = input_len / sizeof(std::string); + std::string* ptr = reinterpret_cast(input); + for (size_t i = 0, n = tensor_size; i < n; ++i) { + new (ptr + i) std::string(); + } + } catch (std::exception& ex) { + throw Ort::Exception(ex.what(), OrtErrorCode::ORT_RUNTIME_EXCEPTION); + } + return; +} + +#define CASE_PROTO(X, Y) \ + case onnx::TensorProto_DataType::TensorProto_DataType_##X: \ + ::onnxruntime::server::UnpackTensor(tensor_proto, raw_data, raw_data_len, (Y*)preallocated, tensor_size); \ + break; + +#define CASE_TYPE(X) \ + case onnx::TensorProto_DataType_##X: \ + return ONNX_TENSOR_ELEMENT_DATA_TYPE_##X; + +ONNXTensorElementDataType CApiElementTypeFromProtoType(int type) { + switch (type) { + CASE_TYPE(FLOAT) + CASE_TYPE(UINT8) + CASE_TYPE(INT8) + CASE_TYPE(UINT16) + CASE_TYPE(INT16) + CASE_TYPE(INT32) + CASE_TYPE(INT64) + CASE_TYPE(STRING) + CASE_TYPE(BOOL) + CASE_TYPE(FLOAT16) + CASE_TYPE(DOUBLE) + CASE_TYPE(UINT32) 
+ CASE_TYPE(UINT64) + CASE_TYPE(COMPLEX64) + CASE_TYPE(COMPLEX128) + CASE_TYPE(BFLOAT16) + default: + return ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; + } +} + +ONNXTensorElementDataType GetTensorElementType(const onnx::TensorProto& tensor_proto) { + return CApiElementTypeFromProtoType(tensor_proto.data_type()); +} + +void TensorProtoToMLValue(const onnx::TensorProto& tensor_proto, const MemBuffer& m, Ort::Value& value) { + const OrtAllocatorInfo& allocator = m.GetAllocInfo(); + ONNXTensorElementDataType ele_type = server::GetTensorElementType(tensor_proto); + const void* raw_data = nullptr; + size_t raw_data_len = 0; + void* tensor_data; + { + if (tensor_proto.data_location() == onnx::TensorProto_DataLocation::TensorProto_DataLocation_EXTERNAL) { + throw Ort::Exception("Server doesn't support external data.", OrtErrorCode::ORT_INVALID_ARGUMENT); + } else if (tensor_proto.has_raw_data()) { + if (ele_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING) + throw Ort::Exception("String tensor cannot have raw data.", OrtErrorCode::ORT_FAIL); + raw_data = tensor_proto.raw_data().data(); + raw_data_len = tensor_proto.raw_data().size(); + } + { + void* preallocated = m.GetBuffer(); + size_t preallocated_size = m.GetLen(); + int64_t tensor_size = 1; + { + for (auto i : tensor_proto.dims()) { + if (i < 0) throw Ort::Exception("Tensor can't contain negative dims", OrtErrorCode::ORT_FAIL); + tensor_size *= i; + } + } + // tensor_size could be zero. see test_slice_start_out_of_bounds\test_data_set_0\output_0.pb + if (static_cast(tensor_size) > SIZE_MAX) { + throw Ort::Exception("Size overflow", OrtErrorCode::ORT_INVALID_ARGUMENT); + } + size_t size_to_allocate; + GetSizeInBytesFromTensorProto<0>(tensor_proto, &size_to_allocate); + + if (preallocated && preallocated_size < size_to_allocate) + throw Ort::Exception(MakeString( + "The buffer planner is not consistent with tensor buffer size, expected ", + size_to_allocate, ", got ", preallocated_size), + OrtErrorCode::ORT_FAIL); + switch (tensor_proto.data_type()) { + CASE_PROTO(FLOAT, float); + CASE_PROTO(DOUBLE, double); + CASE_PROTO(BOOL, bool); + CASE_PROTO(INT8, int8_t); + CASE_PROTO(INT16, int16_t); + CASE_PROTO(INT32, int32_t); + CASE_PROTO(INT64, int64_t); + CASE_PROTO(UINT8, uint8_t); + CASE_PROTO(UINT16, uint16_t); + CASE_PROTO(UINT32, uint32_t); + CASE_PROTO(UINT64, uint64_t); + CASE_PROTO(FLOAT16, MLFloat16); + CASE_PROTO(BFLOAT16, BFloat16); + case onnx::TensorProto_DataType::TensorProto_DataType_STRING: + if (preallocated != nullptr) { + OrtInitializeBufferForTensor(preallocated, preallocated_size, ele_type); + } + ::onnxruntime::server::UnpackTensor(tensor_proto, raw_data, raw_data_len, + (std::string*)preallocated, tensor_size); + break; + default: { + std::ostringstream ostr; + ostr << "Initialized tensor with unexpected type: " << tensor_proto.data_type(); + throw Ort::Exception(ostr.str(), OrtErrorCode::ORT_INVALID_ARGUMENT); + } + } + tensor_data = preallocated; + } + } + std::vector tensor_shape_vec = GetTensorShapeFromTensorProto(tensor_proto); + // Note: We permit an empty tensor_shape_vec, and treat it as a scalar (a tensor of size 1). 
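+  // CreateTensor with an explicit data pointer wraps the caller-provided buffer instead of
+  // copying it, so the memory backing the MemBuffer must outlive the returned Ort::Value.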
+ value = Ort::Value::CreateTensor(&allocator, tensor_data, m.GetLen(), tensor_shape_vec.data(), tensor_shape_vec.size(), (ONNXTensorElementDataType)tensor_proto.data_type()); + return; +} +template void GetSizeInBytesFromTensorProto<256>(const onnx::TensorProto& tensor_proto, + size_t* out); +template void GetSizeInBytesFromTensorProto<0>(const onnx::TensorProto& tensor_proto, size_t* out); +} // namespace server +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/server/serializing/tensorprotoutils.h b/onnxruntime/server/serializing/tensorprotoutils.h new file mode 100644 index 0000000000000..243b122ef64b5 --- /dev/null +++ b/onnxruntime/server/serializing/tensorprotoutils.h @@ -0,0 +1,37 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include +#include +#include "core/session/onnxruntime_c_api.h" +#include "core/session/onnxruntime_cxx_api.h" + +#include "mem_buffer.h" + +#include "onnx-ml.pb.h" +#include "predict.pb.h" + +namespace onnxruntime { +namespace server { +// How much memory it will need for putting the content of this tensor into a plain array +// complex64/complex128 tensors are not supported. +// The output value could be zero or -1. +template +void GetSizeInBytesFromTensorProto(const onnx::TensorProto& tensor_proto, size_t* out); +/** + * deserialize a TensorProto into a preallocated memory buffer. + * Impl must correspond to onnxruntime/core/framework/tensorprotoutils.cc + * This implementation does not support external data so as to reduce dependency surface. + */ +void TensorProtoToMLValue(const onnx::TensorProto& input, const server::MemBuffer& m, /* out */ Ort::Value& value); + +template +void UnpackTensor(const onnx::TensorProto& tensor, const void* raw_data, size_t raw_data_len, + /*out*/ T* p_data, int64_t expected_size); + +ONNXTensorElementDataType CApiElementTypeFromProtoType(int type); +ONNXTensorElementDataType GetTensorElementType(const onnx::TensorProto& tensor_proto); +} // namespace server +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/server/server_configuration.h b/onnxruntime/server/server_configuration.h index cfc9287683fe8..93d6983d33e51 100644 --- a/onnxruntime/server/server_configuration.h +++ b/onnxruntime/server/server_configuration.h @@ -8,7 +8,7 @@ #include #include "boost/program_options.hpp" -#include "core/common/logging/logging.h" +#include "core/session/onnxruntime_cxx_api.h" namespace onnxruntime { namespace server { @@ -26,12 +26,12 @@ enum class Result { ContinueSuccess }; -static std::unordered_map supported_log_levels{ - {"verbose", onnxruntime::logging::Severity::kVERBOSE}, - {"info", onnxruntime::logging::Severity::kINFO}, - {"warning", onnxruntime::logging::Severity::kWARNING}, - {"error", onnxruntime::logging::Severity::kERROR}, - {"fatal", onnxruntime::logging::Severity::kFATAL}}; +static std::unordered_map supported_log_levels{ + {"verbose", ORT_LOGGING_LEVEL_VERBOSE}, + {"info", ORT_LOGGING_LEVEL_INFO}, + {"warning", ORT_LOGGING_LEVEL_WARNING}, + {"error", ORT_LOGGING_LEVEL_ERROR}, + {"fatal", ORT_LOGGING_LEVEL_FATAL}}; // Wrapper around Boost program_options and should provide all the functionality for options parsing // Provides sane default values @@ -41,8 +41,9 @@ class ServerConfiguration { std::string model_path; std::string address = "0.0.0.0"; unsigned short http_port = 8001; + unsigned short grpc_port = 50051; int num_http_threads = std::thread::hardware_concurrency(); - 
onnxruntime::logging::Severity logging_level{}; + OrtLoggingLevel logging_level{}; ServerConfiguration() { desc.add_options()("help,h", "Shows a help message and exits"); @@ -51,6 +52,7 @@ class ServerConfiguration { desc.add_options()("address", po::value(&address)->default_value(address), "The base HTTP address"); desc.add_options()("http_port", po::value(&http_port)->default_value(http_port), "HTTP port to listen to requests"); desc.add_options()("num_http_threads", po::value(&num_http_threads)->default_value(num_http_threads), "Number of http threads"); + desc.add_options()("grpc_port", po::value(&grpc_port)->default_value(grpc_port), "GRPC port to listen to requests"); } // Parses argc and argv and sets the values for the class diff --git a/onnxruntime/server/util.cc b/onnxruntime/server/util.cc index 579c2e81bcc3c..83ef6521d1de0 100644 --- a/onnxruntime/server/util.cc +++ b/onnxruntime/server/util.cc @@ -12,13 +12,14 @@ namespace server { namespace protobufutil = google::protobuf::util; -protobufutil::Status GenerateProtobufStatus(const onnxruntime::common::Status& onnx_status, const std::string& message) { +protobufutil::Status GenerateProtobufStatus(const int& onnx_status, const std::string& message) { protobufutil::error::Code code = protobufutil::error::Code::UNKNOWN; - switch (onnx_status.Code()) { + switch (onnx_status) { case onnxruntime::common::StatusCode::OK: case onnxruntime::common::StatusCode::MODEL_LOADED: code = protobufutil::error::Code::OK; break; + case onnxruntime::common::StatusCode::FAIL: case onnxruntime::common::StatusCode::INVALID_ARGUMENT: case onnxruntime::common::StatusCode::INVALID_PROTOBUF: case onnxruntime::common::StatusCode::INVALID_GRAPH: @@ -31,7 +32,6 @@ protobufutil::Status GenerateProtobufStatus(const onnxruntime::common::Status& o case onnxruntime::common::StatusCode::NOT_IMPLEMENTED: code = protobufutil::error::Code::UNIMPLEMENTED; break; - case onnxruntime::common::StatusCode::FAIL: case onnxruntime::common::StatusCode::RUNTIME_EXCEPTION: code = protobufutil::error::Code::INTERNAL; break; @@ -40,7 +40,7 @@ protobufutil::Status GenerateProtobufStatus(const onnxruntime::common::Status& o } std::ostringstream oss; - oss << "ONNX Runtime Status Code: " << onnx_status.Code() << ". " << message; + oss << "ONNX Runtime Status Code: " << onnx_status << ". 
" << message; return protobufutil::Status(code, oss.str()); } diff --git a/onnxruntime/server/util.h b/onnxruntime/server/util.h index 10b7aeb158d0e..f136324e195ca 100644 --- a/onnxruntime/server/util.h +++ b/onnxruntime/server/util.h @@ -38,9 +38,9 @@ class MemBufferArray { } }; +google::protobuf::util::Status GenerateProtobufStatus(const int& onnx_status, const std::string& message); // Generate protobuf status from ONNX Runtime status google::protobuf::util::Status GenerateProtobufStatus(const onnxruntime::common::Status& onnx_status, const std::string& message); } // namespace server } // namespace onnxruntime - diff --git a/onnxruntime/test/framework/allocation_planner_test.cc b/onnxruntime/test/framework/allocation_planner_test.cc index 65843d7a7b38a..c555c9ba21900 100644 --- a/onnxruntime/test/framework/allocation_planner_test.cc +++ b/onnxruntime/test/framework/allocation_planner_test.cc @@ -196,7 +196,7 @@ class PlannerTest : public ::testing::Test { void BindKernel(onnxruntime::Node* p_node, ::onnxruntime::KernelDef& kernel_def) { auto info = std::make_unique(*p_node, kernel_def, *execution_providers_.Get(*p_node), state_.GetInitializedTensors(), state_.GetOrtValueNameIdxMap(), - state_.GetFuncMgr()); + state_.GetFuncMgr(), state_.GetDataTransferMgr()); auto dummy = std::make_unique(*info); op_kernel_infos_.push_back(std::move(info)); state_.AddKernel(p_node->Index(), std::move(dummy)); diff --git a/onnxruntime/test/framework/allocator_test.cc b/onnxruntime/test/framework/allocator_test.cc index c0594514f2127..694b826267a92 100644 --- a/onnxruntime/test/framework/allocator_test.cc +++ b/onnxruntime/test/framework/allocator_test.cc @@ -49,7 +49,7 @@ class TestAllocator : public IAllocator { } virtual const OrtAllocatorInfo& Info() const override { - static OrtAllocatorInfo info("test", OrtDeviceAllocator, 0); + static OrtAllocatorInfo info("test", OrtDeviceAllocator); return info; } diff --git a/onnxruntime/test/framework/dummy_provider.h b/onnxruntime/test/framework/dummy_provider.h index f7ff420a8d970..8ee907a2ca9df 100644 --- a/onnxruntime/test/framework/dummy_provider.h +++ b/onnxruntime/test/framework/dummy_provider.h @@ -18,23 +18,6 @@ class DummyExecutionProvider : public IExecutionProvider { InsertAllocator(std::make_unique()); } - Status CopyTensor(const Tensor& src, Tensor& dst) const override { - // we can 'copy' from anything we allocated to/from CPU - ORT_ENFORCE(strcmp(dst.Location().name, DummyAllocator::kDummyAllocator) == 0 || - strcmp(dst.Location().name, CPU) == 0); - ORT_ENFORCE(strcmp(src.Location().name, DummyAllocator::kDummyAllocator) == 0 || - strcmp(src.Location().name, CPU) == 0); - - // no really copy needed. 
- const void* src_data = src.DataRaw(); - void* dst_data = dst.MutableDataRaw(); - - // copying between cpu memory - memcpy(dst_data, src_data, src.Size()); - - return Status::OK(); - } - std::shared_ptr GetKernelRegistry() const override; }; diff --git a/onnxruntime/test/framework/execution_frame_test.cc b/onnxruntime/test/framework/execution_frame_test.cc index a77ce02805190..48e2ce8c3b0e6 100644 --- a/onnxruntime/test/framework/execution_frame_test.cc +++ b/onnxruntime/test/framework/execution_frame_test.cc @@ -6,8 +6,12 @@ #include "core/framework/session_state.h" #include "core/graph/model.h" #include "core/providers/cpu/cpu_execution_provider.h" +#include "core/session/inference_session.h" #include "test_utils.h" +#include "test/test_environment.h" + #include "gtest/gtest.h" +#include "gmock/gmock.h" using namespace ONNX_NAMESPACE; using namespace std; @@ -259,5 +263,43 @@ TEST(ExecutionFrameTest, MemPatternTest) { EXPECT_EQ(p->GetBlock(3)->offset_, 0); EXPECT_EQ(p->GetBlock(4)->offset_, 64); } + +TEST(ExecutionFrameTest, BadModelInvalidDimParamUsage) { + // load model with 2 Scan ops that both incorrectly use shapes of { 'None', 'None' } for their outputs. + // as 'None' is not a special value it's treated as a variable name, leading to a runtime error when we + // attempt to re-use the output from the first Scan node for the second. validate we detect this and error out. + SessionOptions so; + so.session_logid = "BadModelInvalidDimParamUsage"; + + InferenceSession session_object{so, &DefaultLoggingManager()}; + Status st; + ASSERT_TRUE((st = session_object.Load("testdata/invalid_dim_param_value_repetition.onnx")).IsOK()) << st; + ASSERT_TRUE((st = session_object.Initialize()).IsOK()) << st; + + std::vector dims_X = {10, 6}; + std::vector values_X; + values_X.reserve(60); + for (int i = 0; i < 60; ++i) { + values_X.push_back(float(i)); + } + + OrtValue ml_value; + CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_X, values_X, &ml_value); + NameMLValMap feeds; + feeds.insert(std::make_pair("X", ml_value)); + + // prepare outputs + std::vector output_names; + output_names.push_back("Y"); + std::vector fetches; + + // Now run + RunOptions run_options; + st = session_object.Run(run_options, feeds, output_names, &fetches); + + EXPECT_FALSE(st.IsOK()) << st; + EXPECT_THAT(st.ErrorMessage(), testing::HasSubstr("Shape mismatch attempting to re-use buffer.")); +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/framework/float_16_test.cc b/onnxruntime/test/framework/float_16_test.cc index 399ac6882d1e3..c141ea817c456 100644 --- a/onnxruntime/test/framework/float_16_test.cc +++ b/onnxruntime/test/framework/float_16_test.cc @@ -96,7 +96,7 @@ ONNX_NAMESPACE::OpSchema GetMulFP16Schema() { return schema; } -static const std::string MUL_MODEL_URI = "testdata/mul_16.pb"; +static const std::string MUL_MODEL_URI = "testdata/mul_16.onnx"; void RunSession(InferenceSession& session_object, RunOptions& run_options, diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc index 07291b5ee9834..1c4cdad4f867e 100644 --- a/onnxruntime/test/framework/inference_session_test.cc +++ b/onnxruntime/test/framework/inference_session_test.cc @@ -14,6 +14,7 @@ #include "core/common/logging/logging.h" #include "core/common/profiler.h" #include "core/framework/compute_capability.h" +#include "core/framework/data_transfer_manager.h" #include "core/framework/execution_provider.h" #include 
"core/framework/kernel_registry.h" #include "core/framework/op_kernel.h" @@ -25,6 +26,9 @@ #include "core/platform/env.h" #include "core/providers/cpu/cpu_execution_provider.h" #include "core/providers/cpu/math/element_wise_ops.h" +#ifdef USE_CUDA +#include "core/providers/cuda/gpu_data_transfer.h" +#endif #include "core/session/IOBinding.h" #include "dummy_provider.h" #include "test_utils.h" @@ -112,19 +116,13 @@ class FuseExecutionProvider : public IExecutionProvider { static std::shared_ptr kernel_registry = GetFusedKernelRegistry(); return kernel_registry; } - - common::Status CopyTensor(const Tensor& src, Tensor& dst) const override { - ORT_UNUSED_PARAMETER(src); - ORT_UNUSED_PARAMETER(dst); - return Status::OK(); - } }; namespace test { static void VerifyOutputs(const std::vector& fetches, const std::vector& expected_dims, const std::vector& expected_values); -static const std::string MODEL_URI = "testdata/mul_1.pb"; -static const std::string MODEL_URI_NO_OPSET = "testdata/mul_1.pb.noopset"; +static const std::string MODEL_URI = "testdata/mul_1.onnx"; +static const std::string MODEL_URI_NO_OPSET = "testdata/mul_1.noopset.onnx"; //static const std::string MODEL_URI = "./testdata/squeezenet/model.onnx"; // TODO enable this after we've weights? static void CreateMatMulModel(std::unique_ptr& p_model, ProviderType provider_type) { @@ -284,7 +282,7 @@ void RunModelWithBindingMatMul(InferenceSession& session_object, std::unique_ptr cpu_tensor = std::make_unique(element_type, shape, cpu_allocator); - st = TestCudaExecutionProvider()->CopyTensor(rtensor, *cpu_tensor.get()); + st = GPUDataTransfer().CopyTensor(rtensor, *cpu_tensor.get(), 0); ASSERT_TRUE(st.IsOK()); OrtValue ml_value; ml_value.Init(cpu_tensor.release(), @@ -348,8 +346,8 @@ static bool Compare(const InputDefList& f_arg, const InputDefList& s_arg) { if (!x->Shape()) { continue; } - vector x_shape = utils::GetTensorShapeFromTensorShapeProto(*x->Shape()); - vector y_shape = utils::GetTensorShapeFromTensorShapeProto(*y->Shape()); + auto x_shape = utils::GetTensorShapeFromTensorShapeProto(*x->Shape()); + auto y_shape = utils::GetTensorShapeFromTensorShapeProto(*y->Shape()); if (x->Name() == y->Name() && x_shape == y_shape && *x->Type() == *y->Type()) { continue; } @@ -828,56 +826,19 @@ TEST(InferenceSessionTests, ModelWithoutOpset) { } } -static ONNX_NAMESPACE::ModelProto CreateModelWithOptionalInputs() { - Model model("ModelWithOptionalInputs"); - auto& graph = model.MainGraph(); - - // create an initializer, which is an optional input that can be overridden - ONNX_NAMESPACE::TensorProto tensor_proto; - tensor_proto.add_dims(1); - tensor_proto.set_data_type(TensorProto_DataType_FLOAT); - tensor_proto.add_float_data(1.f); - tensor_proto.set_name("optional_input"); - - graph.AddInitializedTensor(tensor_proto); - - TypeProto single_float; - single_float.mutable_tensor_type()->set_elem_type(TensorProto_DataType_FLOAT); - single_float.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1); - - auto& required_input = graph.GetOrCreateNodeArg("required_input", &single_float); - auto& optional_input = graph.GetOrCreateNodeArg("optional_input", nullptr); - auto& add_output = graph.GetOrCreateNodeArg("add_output", &single_float); - - EXPECT_TRUE(optional_input.Shape() != nullptr) << "AddInitializedTensor should have created the NodeArg with shape."; - - graph.AddNode("add", "Add", "Add required and optional inputs", {&required_input, &optional_input}, {&add_output}); - - auto status = graph.Resolve(); - EXPECT_TRUE(status.IsOK()) 
<< status.ErrorMessage(); - - auto model_proto = model.ToProto(); - - return model_proto; -} - static common::Status RunOptionalInputTest(bool add_required_input, bool add_optional_input, - bool add_invalid_input) { - auto model_proto = CreateModelWithOptionalInputs(); - + bool add_invalid_input, + int model_ir_version) { SessionOptions so; - so.session_logid = "InferenceSessionTests.TestOptionalInputs"; + so.session_logid = "RunOptionalInputTest"; InferenceSession session_object{so, &DefaultLoggingManager()}; + Status status; + std::string model_path = "testdata/optional_inputs_ir" + std::to_string(model_ir_version) + ".onnx"; - std::string s1; - model_proto.SerializeToString(&s1); - std::stringstream sstr(s1); - auto status = session_object.Load(sstr); - EXPECT_TRUE(status.IsOK()) << status.ErrorMessage(); - status = session_object.Initialize(); - EXPECT_TRUE(status.IsOK()) << status.ErrorMessage(); + ORT_RETURN_IF_ERROR(session_object.Load(model_path)); + ORT_RETURN_IF_ERROR(session_object.Initialize()); RunOptions run_options; run_options.run_tag = so.session_logid; @@ -885,6 +846,7 @@ static common::Status RunOptionalInputTest(bool add_required_input, // prepare inputs std::vector dims = {1}; std::vector required_input_val = {1.f}; + std::vector other_required_input_val = {0.f}; std::vector optional_input_val = {10.f}; // override initializer value of 1 std::vector unknown_input_val = {20.f}; @@ -892,6 +854,10 @@ static common::Status RunOptionalInputTest(bool add_required_input, CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims, required_input_val, &required_input_mlvalue); + OrtValue other_required_input_mlvalue; + CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), + dims, other_required_input_val, &other_required_input_mlvalue); + OrtValue optional_input_mlvalue; CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims, optional_input_val, &optional_input_mlvalue); @@ -905,6 +871,9 @@ static common::Status RunOptionalInputTest(bool add_required_input, if (add_required_input) feeds.insert(std::make_pair("required_input", required_input_mlvalue)); + // always add this one + feeds.insert(std::make_pair("other_required_input", other_required_input_mlvalue)); + if (add_optional_input) feeds.insert(std::make_pair("optional_input", optional_input_mlvalue)); @@ -933,24 +902,37 @@ static common::Status RunOptionalInputTest(bool add_required_input, return status; } +// test the change in handling of graph inputs that match initializers between IR version 3 and 4 +// in V3 disallow overriding an initializer via the feeds +// for V4 allow it TEST(InferenceSessionTests, TestOptionalInputs) { - // required input only - auto status = RunOptionalInputTest(true, false, false); - ASSERT_TRUE(status.IsOK()) << status.ErrorMessage(); - - // required and optional input - status = RunOptionalInputTest(true, true, false); - ASSERT_TRUE(status.IsOK()) << status.ErrorMessage(); - - // required, optional and invalid input - status = RunOptionalInputTest(true, true, true); - ASSERT_FALSE(status.IsOK()); - EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("Invalid Feed Input Name")); - - // missing required - status = RunOptionalInputTest(false, true, false); - ASSERT_FALSE(status.IsOK()); - EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("Missing Input:")); + std::vector ir_versions{3, 4}; + for (auto version : ir_versions) { + // required input only + auto status = RunOptionalInputTest(true, false, false, 
version); + ASSERT_TRUE(status.IsOK()) << status.ErrorMessage(); + + // required and optional input + status = RunOptionalInputTest(true, true, false, version); + if (version == 3) { + ASSERT_FALSE(status.IsOK()) << status.ErrorMessage(); + } else { + ASSERT_TRUE(status.IsOK()) << status.ErrorMessage(); + } + // required, optional and invalid input + status = RunOptionalInputTest(true, true, true, version); + ASSERT_FALSE(status.IsOK()); + EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("Invalid Feed Input Name")); + + // missing required + status = RunOptionalInputTest(false, true, false, version); + ASSERT_FALSE(status.IsOK()); + if (version == 3) { + EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("Invalid Feed Input Name")); + } else { + EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("Missing Input:")); + } + } } TEST(ExecutionProviderTest, FunctionTest) { @@ -1138,7 +1120,8 @@ TEST(ExecutionProviderTest, FunctionInlineTest) { TEST(InferenceSessionTests, TestTruncatedSequence) { // model/data generated by /onnxruntime/test/testdata/CNTK/gen.py GenScan() - static const std::string LSTM_MODEL_URI = "testdata/scan_1.pb"; + // Manually updated to have IR version of 4. + static const std::string LSTM_MODEL_URI = "testdata/scan_1.onnx"; // This model is a 4x forward LSTM. Parse it to find out mapping between init_state input/output ONNX_NAMESPACE::ModelProto model_proto; int model_fd; diff --git a/onnxruntime/test/framework/local_kernel_registry_test.cc b/onnxruntime/test/framework/local_kernel_registry_test.cc index 7cd6d3b6865c3..09352e92b0bcd 100644 --- a/onnxruntime/test/framework/local_kernel_registry_test.cc +++ b/onnxruntime/test/framework/local_kernel_registry_test.cc @@ -184,11 +184,11 @@ OpKernel* CreateOptionalOpKernel(const OpKernelInfo& kernel_info) { return new OptionalOpKernel(kernel_info); } -static const std::string MUL_MODEL_URI = "testdata/mul_1.pb"; -static const std::string FOO_MODEL_URI = "testdata/foo_1.pb"; -static const std::string FOO_TRUNCATE_MODEL_URI = "testdata/foo_2.pb"; +static const std::string MUL_MODEL_URI = "testdata/mul_1.onnx"; +static const std::string FOO_MODEL_URI = "testdata/foo_1.onnx"; +static const std::string FOO_TRUNCATE_MODEL_URI = "testdata/foo_2.onnx"; -static const std::string OPTIONAL_MODEL1_URI = "testdata/optional_1.pb"; +static const std::string OPTIONAL_MODEL1_URI = "testdata/optional_1.onnx"; void RunSession(InferenceSession& session_object, RunOptions& run_options, diff --git a/onnxruntime/test/framework/memcpy_transformer_test.cc b/onnxruntime/test/framework/memcpy_transformer_test.cc index f8ef7b35622c5..8eea68976e24d 100644 --- a/onnxruntime/test/framework/memcpy_transformer_test.cc +++ b/onnxruntime/test/framework/memcpy_transformer_test.cc @@ -177,6 +177,112 @@ TEST(TransformerTest, MemcpyTransformerTestCudaFirst) { ExpectSame(node2, node4, 0); ExpectSame(node2, node4, 1); } +TEST(TransformerTest, TestCopyNodeInsertionInitializerInSubgraph) { + // In this test, we are going to create a subgraph consuming an implicit input + // which is an initializer in the outer scope, and this implicit input to the subgraph + // is consumed by nodes on multiple devices + TensorProto value_tensor; + value_tensor.add_dims(1); + value_tensor.add_float_data(1.f); + value_tensor.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + + TypeProto tensor_float_type; + tensor_float_type.mutable_tensor_type()->set_elem_type(TensorProto_DataType_FLOAT); + + TypeProto tensor_bool_type; + 
tensor_bool_type.mutable_tensor_type()->set_elem_type(TensorProto_DataType_BOOL); + + onnxruntime::NodeArg i1_def("I1", &tensor_bool_type), + o1_def("O1", &tensor_float_type), + o2_def("O2", &tensor_float_type); + + // main graph + // this will only contain one 'If' node + std::unordered_map domain_to_version; + domain_to_version[kOnnxDomain] = 7; + auto model = std::make_shared("test", + false, + ModelMetaData(), + IOnnxRuntimeOpSchemaRegistryList(), + domain_to_version); + onnxruntime::Graph& graph = model->MainGraph(); + + TensorProto parent_constant(value_tensor); + parent_constant.set_name("parent_constant"); + graph.AddInitializedTensor(parent_constant); + + // subgraph + // this will contain 2 'Add' nodes - one on CPU and one of GPU + // one of the inputs to the 'Add' nodes is an implicit input to the subgraph + // which is an initializer in the main graph + std::unordered_map subgraph_domain_to_version; + subgraph_domain_to_version[kOnnxDomain] = 7; + auto sub_model = std::make_shared("test_subgraph", + false, + ModelMetaData(), + IOnnxRuntimeOpSchemaRegistryList(), + subgraph_domain_to_version); + onnxruntime::Graph& subgraph = sub_model->MainGraph(); + + TensorProto local_constant(value_tensor); + local_constant.set_name("local_constant"); + subgraph.AddInitializedTensor(local_constant); + + subgraph.AddOuterScopeNodeArg("parent_constant"); + subgraph.AddNode("node1", "Add", "operator1", + ArgMap{&subgraph.GetOrCreateNodeArg("local_constant", &tensor_float_type), + &graph.GetOrCreateNodeArg("parent_constant", &tensor_float_type)}, + ArgMap{&o1_def}); + + subgraph.AddNode("node2", "Add", "operator2", + ArgMap{&subgraph.GetOrCreateNodeArg("local_constant", &tensor_float_type), + &graph.GetOrCreateNodeArg("parent_constant", &tensor_float_type)}, + ArgMap{&o2_def}); + + auto status = subgraph.Resolve(); + ASSERT_TRUE(status.IsOK()) << status.ErrorMessage(); + + // main graph continued + // create the 'If' node + auto& if_node = graph.AddNode("node3", "If", "cpu operator2", ArgMap{&i1_def}, ArgMap{&o1_def, &o2_def}); + if_node.AddAttribute("then_branch", {subgraph.ToGraphProto()}); + if_node.AddAttribute("else_branch", {subgraph.ToGraphProto()}); + + onnxruntime::Graph* subgraph_1 = if_node.GetMutableGraphAttribute("then_branch"); + for (auto& node : subgraph_1->Nodes()) { + if (node.Name() == "node2") { + // only this node is on GPU + node.SetExecutionProviderType(onnxruntime::kCudaExecutionProvider); + } else { + node.SetExecutionProviderType(onnxruntime::kCpuExecutionProvider); + } + } + + onnxruntime::Graph* subgraph_2 = if_node.GetMutableGraphAttribute("else_branch"); + for (auto& node : subgraph_2->Nodes()) { + node.SetExecutionProviderType(onnxruntime::kCpuExecutionProvider); + } + + status = graph.Resolve(); + ASSERT_TRUE(status.IsOK()) << status.ErrorMessage(); + + KernelRegistryManager kernel_registry_manager; + ExecutionProviders execution_providers; + execution_providers.Add(onnxruntime::kCudaExecutionProvider, + std::make_unique(CUDAExecutionProviderInfo())); + execution_providers.Add(onnxruntime::kCpuExecutionProvider, + std::make_unique(CPUExecutionProviderInfo())); + KernelRegistryManager test_registry_manager; + test_registry_manager.RegisterKernels(execution_providers); + + MemcpyTransformer transformer({onnxruntime::kCudaExecutionProvider}, test_registry_manager); + + bool modified = false; + status = transformer.Apply(graph, modified); + EXPECT_TRUE(status.IsOK()) << status.ErrorMessage(); + EXPECT_TRUE(modified); +} + #endif } // namespace test diff --git 
a/onnxruntime/test/framework/op_kernel_test.cc b/onnxruntime/test/framework/op_kernel_test.cc index 3a6fb5efa98a5..707495b388da6 100644 --- a/onnxruntime/test/framework/op_kernel_test.cc +++ b/onnxruntime/test/framework/op_kernel_test.cc @@ -21,13 +21,6 @@ namespace test { class XPUExecutionProvider : public IExecutionProvider { public: XPUExecutionProvider() : IExecutionProvider{onnxruntime::kCpuExecutionProvider} {} - - Status CopyTensor(const Tensor& src, Tensor& dst) const override { - ORT_UNUSED_PARAMETER(src); - ORT_UNUSED_PARAMETER(dst); - return Status::OK(); - } - }; } // namespace test diff --git a/onnxruntime/test/framework/parallel_executor_test.cc b/onnxruntime/test/framework/parallel_executor_test.cc index a4ccd7f72d3cb..fc6d885d67b92 100644 --- a/onnxruntime/test/framework/parallel_executor_test.cc +++ b/onnxruntime/test/framework/parallel_executor_test.cc @@ -46,7 +46,7 @@ struct TestOp { // success Tensor* Y = ctx->Output(0, action_tensor.Shape()); void* target = Y->MutableData(); - memcpy(target, action, action_tensor.Size()); + memcpy(target, action, action_tensor.SizeInBytes()); break; } case 1: { diff --git a/onnxruntime/test/framework/session_state_test.cc b/onnxruntime/test/framework/session_state_test.cc index 5404463ee9343..6064b74630897 100644 --- a/onnxruntime/test/framework/session_state_test.cc +++ b/onnxruntime/test/framework/session_state_test.cc @@ -4,8 +4,11 @@ #include #include "core/framework/execution_providers.h" +#include "core/framework/graph_partitioner.h" #include "core/framework/op_kernel.h" #include "core/framework/session_state.h" +#include "core/framework/session_state_initializer.h" +#include "core/graph/graph_utils.h" #include "core/graph/graph_viewer.h" #include "core/graph/model.h" #include "core/graph/op.h" @@ -53,8 +56,8 @@ TEST(SessionStateTest, AddGetKernelTest) { KernelDef kernel_def; CPUExecutionProvider execution_provider{CPUExecutionProviderInfo{"CPUExecutionProvider"}}; - OpKernelInfo p_info(node, kernel_def, execution_provider, s.GetInitializedTensors(), s.GetOrtValueNameIdxMap(), - s.GetFuncMgr()); + OpKernelInfo p_info(node, kernel_def, execution_provider, s.GetConstantInitializedTensors(), + s.GetOrtValueNameIdxMap(), s.GetFuncMgr(), s.GetDataTransferMgr()); unique_ptr p_kernel; p_kernel.reset(new TestOpKernel(p_info)); size_t orig_num_outputs = p_kernel->Node().OutputDefs().size(); @@ -66,5 +69,69 @@ TEST(SessionStateTest, AddGetKernelTest) { std::cout << "orig: " << orig_num_outputs << " new: " << test_kernel->Node().OutputDefs().size() << std::endl; EXPECT_EQ(orig_num_outputs, test_kernel->Node().OutputDefs().size()); } + +// Test that we separate out constant and non-constant initializers correctly +TEST(SessionStateTest, TestInitializerProcessing) { + std::vector ir_versions = {3, 4}; + for (auto ir_version : ir_versions) { + std::string model_path = "testdata/optional_inputs_ir" + std::to_string(ir_version) + ".onnx"; + Status status; + std::shared_ptr model; + ASSERT_TRUE((status = Model::Load(model_path, model)).IsOK()) << status; + Graph& graph = model->MainGraph(); + // take a copy as this gets cleared during session state initialization + InitializedTensorSet initializers = graph.GetAllInitializedTensors(); + + const bool enable_mem_pattern = false; + ExecutionProviders execution_providers; + CPUExecutionProviderInfo epi{false}; + status = execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique(epi)); + ASSERT_TRUE(status.IsOK()) << status; + + KernelRegistryManager krm; + status = 
krm.RegisterKernels(execution_providers); + ASSERT_TRUE(status.IsOK()) << status; + + SessionState session_state(execution_providers, enable_mem_pattern); + SessionStateInitializer session_initializer(enable_mem_pattern, ToWideString(model_path), graph, + session_state, execution_providers, krm); + + GraphPartitioner partitioner(krm, execution_providers); + status = partitioner.Partition(graph, session_state.ExportDll(), session_state.GetMutableFuncMgr()); + ASSERT_TRUE(status.IsOK()) << status; + + status = session_initializer.CreatePlan(nullptr, nullptr, true); + ASSERT_TRUE(status.IsOK()) << status; + + status = session_initializer.InitializeAndSave(nullptr); + ASSERT_TRUE(status.IsOK()) << status; + + const auto& initialized_tensors = session_state.GetInitializedTensors(); + const auto& const_initialized_tensors = session_state.GetConstantInitializedTensors(); + + ASSERT_EQ(initializers.size(), initialized_tensors.size()) + << "SessionState should have an entry for all initializers in Graph."; + + if (ir_version < 4) { + ASSERT_EQ(initialized_tensors.size(), const_initialized_tensors.size()) + << "All initializers should be considered constant if IR version < 4."; + } else { + const auto& name_to_idx = session_state.GetOrtValueNameIdxMap(); + + for (auto entry : initializers) { + int idx; + name_to_idx.GetIdx(entry.first, idx); + + bool found = initialized_tensors.find(idx) != initialized_tensors.cend(); + ASSERT_TRUE(found) << "Missing entry for " << entry.first << " in session state initialized tensors"; + + if (graph_utils::IsConstantInitializer(graph, entry.first, false)) { + found = const_initialized_tensors.find(idx) != const_initialized_tensors.cend(); + ASSERT_TRUE(found) << "Missing entry for " << entry.first << " in session state const initialized tensors"; + } + } + } + } +} } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/framework/sparse_kernels_test.cc b/onnxruntime/test/framework/sparse_kernels_test.cc index 80ede37bce554..02923b569ac40 100644 --- a/onnxruntime/test/framework/sparse_kernels_test.cc +++ b/onnxruntime/test/framework/sparse_kernels_test.cc @@ -196,7 +196,7 @@ This operator applies the Abs op element-wise to the input sparse-tensor. // So, we copy indices/shape from input to output. // TODO: Extend allocation-planner to enable such sharing. const auto& input_indices = input->Indices(); - memcpy(output->MutableIndices().MutableData(), input_indices.Data(), input_indices.Size()); + memcpy(output->MutableIndices().MutableData(), input_indices.Data(), input_indices.SizeInBytes()); return Status::OK(); } }; diff --git a/onnxruntime/test/framework/test_utils.cc b/onnxruntime/test/framework/test_utils.cc index f39498999c76c..7604e15ce7429 100644 --- a/onnxruntime/test/framework/test_utils.cc +++ b/onnxruntime/test/framework/test_utils.cc @@ -35,15 +35,37 @@ IExecutionProvider* TestOpenVINOExecutionProvider() { } #endif +#ifdef USE_NNAPI +IExecutionProvider* TestNnapiExecutionProvider() { + static NnapiExecutionProvider nnapi_provider; + return &nnapi_provider; +} +#endif + +static void CountOpsInGraphImpl(const Graph& graph, std::map& ops) { + for (auto& node : graph.Nodes()) { + auto pos = ops.find(node.OpType()); + if (pos == ops.end()) { + ops[node.OpType()] = 1; + } else { + ++pos->second; + } + + if (node.ContainsSubgraph()) { + for (auto& subgraph : node.GetSubgraphs()) { + CountOpsInGraphImpl(*subgraph, ops); + } + } + } +} + // Returns a map with the number of occurrences of each operator in the graph. 
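+// Ops inside subgraphs (e.g. the bodies of control-flow nodes such as If or Scan) are
+// counted as well, since CountOpsInGraphImpl recurses via node.GetSubgraphs().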
// Helper function to check that the graph transformations have been successfully applied. std::map CountOpsInGraph(const Graph& graph) { - std::map op_to_count; - for (auto& node : graph.Nodes()) { - op_to_count[node.OpType()] = - op_to_count.count(node.OpType()) == 0 ? 1 : ++op_to_count[node.OpType()]; - } - return op_to_count; + std::map ops; + CountOpsInGraphImpl(graph, ops); + + return ops; } } // namespace test diff --git a/onnxruntime/test/framework/test_utils.h b/onnxruntime/test/framework/test_utils.h index 82dc3294067a8..9b36d8bd72902 100644 --- a/onnxruntime/test/framework/test_utils.h +++ b/onnxruntime/test/framework/test_utils.h @@ -10,6 +10,8 @@ #include "core/providers/cpu/cpu_execution_provider.h" #include "core/framework/ml_value.h" +#include "gsl/gsl_algorithm" + #ifdef USE_CUDA #include "core/providers/cuda/cuda_execution_provider.h" #endif @@ -19,6 +21,9 @@ #ifdef USE_OPENVINO #include "core/providers/openvino/openvino_execution_provider.h" #endif +#ifdef USE_NNAPI +#include "core/providers/nnapi/nnapi_execution_provider.h" +#endif namespace onnxruntime { class Graph; @@ -41,6 +46,24 @@ IExecutionProvider* TestTensorrtExecutionProvider(); IExecutionProvider* TestOpenVINOExecutionProvider(); #endif +#ifdef USE_NNAPI +IExecutionProvider* TestNnapiExecutionProvider(); +#endif + +template +inline void CopyVectorToTensor(const std::vector& value, Tensor& tensor) { + gsl::copy(gsl::make_span(value), tensor.MutableDataAsSpan()); +} + +// vector is specialized so we need to handle it separately +template <> +inline void CopyVectorToTensor(const std::vector& value, Tensor& tensor) { + auto output_span = tensor.MutableDataAsSpan(); + for (size_t i = 0, end = value.size(); i < end; ++i) { + output_span[i] = value[i]; + } +} + template void CreateMLValue(AllocatorPtr alloc, const std::vector& dims, const std::vector& value, OrtValue* p_mlvalue) { @@ -50,8 +73,9 @@ void CreateMLValue(AllocatorPtr alloc, const std::vector& dims, const s shape, alloc); if (value.size() > 0) { - memcpy(p_tensor->MutableData(), &value[0], element_type->Size() * shape.Size()); + CopyVectorToTensor(value, *p_tensor); } + p_mlvalue->Init(p_tensor.release(), DataTypeImpl::GetType(), DataTypeImpl::GetType()->GetDeleteFunc()); diff --git a/onnxruntime/test/ir/graph_test.cc b/onnxruntime/test/ir/graph_test.cc index 992c88b7a3eed..70f2bfe3069eb 100644 --- a/onnxruntime/test/ir/graph_test.cc +++ b/onnxruntime/test/ir/graph_test.cc @@ -322,9 +322,9 @@ TEST(ResolvingGraphTest, GraphConstruction_CheckIsAcyclic) { auto status = graph.Resolve(); EXPECT_TRUE(status.IsOK()) << status.ErrorMessage(); - EXPECT_TRUE(Model::Save(model, "graph_1.pb").IsOK()); + EXPECT_TRUE(Model::Save(model, "graph_1.onnx").IsOK()); std::shared_ptr model2; - EXPECT_TRUE(Model::Load("graph_1.pb", model2).IsOK()); + EXPECT_TRUE(Model::Load("graph_1.onnx", model2).IsOK()); auto model_proto = model.ToProto(); auto model_proto2 = model2->ToProto(); @@ -709,9 +709,9 @@ TEST(ResolvingGraphTest, GraphConstruction_TypeInference) { EXPECT_EQ("node_4_out_1", graph.GetOutputs()[0]->Name()); EXPECT_EQ(2, graph.GetInputs().size()); - EXPECT_TRUE(Model::Save(model, "model_x.pb").IsOK()); + EXPECT_TRUE(Model::Save(model, "model_x.onnx").IsOK()); std::shared_ptr loaded_model; - EXPECT_TRUE(Model::Load("model_x.pb", loaded_model).IsOK()); + EXPECT_TRUE(Model::Load("model_x.onnx", loaded_model).IsOK()); EXPECT_EQ(2, loaded_model->MainGraph().GetInputs().size()); auto& graph_proto = graph.ToGraphProto(); diff --git a/onnxruntime/test/mlas/unittest.cpp 
b/onnxruntime/test/mlas/unittest.cpp index 28bc23c69e3d2..36c96c819c87d 100644 --- a/onnxruntime/test/mlas/unittest.cpp +++ b/onnxruntime/test/mlas/unittest.cpp @@ -1678,6 +1678,101 @@ class MlasPool3DTest : public MlasTestBase } }; +class MlasActivationTest : public MlasTestBase +{ +public: + void + ExecuteShort( + void + ) override + { + union AliasedValue { + unsigned u; + float f; + }; + + // N.B. The test data includes values at the edge of Tanh/Logistic boundaries. + // Identity, Relu, LeakyRelu, Tanh, Logistic, Clip, + static const AliasedValue TestData[20][6] = { + { {0x00000001}, {0x00000001}, {0x00000001}, {0x00000000}, {0x3f000000}, {0x00000001}, }, // positive denormal + { {0x80000001}, {0x00000000}, {0x80000000}, {0x80000000}, {0x3f000000}, {0x00000000}, }, // negative denormal + { {0x7ff00002}, {0x7ff00002}, {0x7ff00002}, {0x7ff00002}, {0x7ff00002}, {0x7ff00002}, }, // positive NaN + { {0xfff00002}, {0xfff00002}, {0xfff00002}, {0xfff00002}, {0xfff00002}, {0xfff00002}, }, // negative NaN + { {0x00000000}, {0x00000000}, {0x00000000}, {0x00000000}, {0x3f000000}, {0x00000000}, }, // 0.0f + { {0x80000000}, {0x80000000}, {0x80000000}, {0x80000000}, {0x3f000000}, {0x80000000}, }, // -0.0f + { {0x3e800000}, {0x3e800000}, {0x3e800000}, {0x3e7acbf5}, {0x3f0feacc}, {0x3e800000}, }, // 0.25f + { {0xbe800000}, {0x00000000}, {0xbd4ccccd}, {0xbe7acbf5}, {0x3ee02a67}, {0x00000000}, }, // -0.25f + { {0x40800000}, {0x40800000}, {0x40800000}, {0x3f7fd40a}, {0x3f7b6541}, {0x40800000}, }, // 4.0f + { {0xc0800000}, {0x00000000}, {0xbf4ccccd}, {0xbf7fd40a}, {0x3c9357e0}, {0x00000000}, }, // -4.0f + { {0x41200000}, {0x41200000}, {0x41200000}, {0x3f800000}, {0x3f7ffd06}, {0x40c00000}, }, // 10.0f + { {0xc1200000}, {0x00000000}, {0xc0000000}, {0xbf800000}, {0x383e6000}, {0x00000000}, }, // -10.0f + { {0xc18866eb}, {0x00000000}, {0xc05a3e45}, {0xbf800000}, {0x33000000}, {0x00000000}, }, // -17.0502529144f + { {0xc18869bb}, {0x00000000}, {0xc05a42c5}, {0xbf800000}, {0x33c00000}, {0x00000000}, }, // -17.0516262054f + { {0xc18852a8}, {0x00000000}, {0xc05a1dda}, {0xbf800000}, {0x00000000}, {0x00000000}, }, // -17.0403594971f + { {0xc18844aa}, {0x00000000}, {0xc05a0777}, {0xbf800000}, {0x00000000}, {0x00000000}, }, // -17.0335273743f + { {0x418866eb}, {0x418866eb}, {0x418866eb}, {0x3f800000}, {0x3f800000}, {0x40c00000}, }, // +17.0502529144f + { {0x418869bb}, {0x418869bb}, {0x418869bb}, {0x3f800000}, {0x3f7ffffe}, {0x40c00000}, }, // +17.0516262054f + { {0x418852a8}, {0x418852a8}, {0x418852a8}, {0x3f800000}, {0x3f800000}, {0x40c00000}, }, // +17.0403594971f + { {0x418844aa}, {0x418844aa}, {0x418844aa}, {0x3f800000}, {0x3f800000}, {0x40c00000}, }, // +17.0335273743f + }; + + MLAS_ACTIVATION Activation; + AliasedValue Buffer[_countof(TestData)]; + + for (unsigned kind = 0; kind < unsigned(MlasClipActivation); kind++) { + + Activation.ActivationKind = MLAS_ACTIVATION_KIND(kind); + + if (Activation.ActivationKind == MlasLeakyReluActivation) { + Activation.Parameters.LeakyRelu.alpha = 0.2f; + } else if (Activation.ActivationKind == MlasClipActivation) { + Activation.Parameters.Clip.minimum = 0.0f; + Activation.Parameters.Clip.maximum = 6.0f; + } + + // + // Test the vectorized activations. + // + + for (unsigned i = 0; i < _countof(TestData); i++) { + Buffer[i].u = TestData[i][0].u; + } + + MlasActivation(&Activation, &Buffer[0].f, nullptr, _countof(Buffer), 1, 1); + + for (unsigned i = 0; i < _countof(TestData); i++) { + // Sensitive to comparing positive/negative zero and NaNs. 
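+            // A mismatch is reported only when both the bit pattern and the float value differ,
+            // so +0.0f vs -0.0f (equal as floats) and bit-identical NaN encodings are accepted.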
+ if (Buffer[i].u != TestData[i][kind].u && Buffer[i].f != TestData[i][kind].f) { + printf("mismatch activation kind=%d i=%d value=%08x expected=%08x\n", kind, i, Buffer[i].u, TestData[i][kind].u); + } + } + + // + // Test the scalar activations. + // + + for (unsigned i = 0; i < _countof(TestData); i++) { + Buffer[i].u = TestData[i][0].u; + MlasActivation(&Activation, &Buffer[i].f, nullptr, 1, 1, 1); + } + + for (unsigned i = 0; i < _countof(TestData); i++) { + // Sensitive to comparing positive/negative zero and NaNs. + if (Buffer[i].u != TestData[i][kind].u && Buffer[i].f != TestData[i][kind].f) { + printf("mismatch activation kind=%d i=%d value=%08x expected=%08x\n", kind, i, Buffer[i].u, TestData[i][kind].u); + } + } + } + } + + void + ExecuteLong( + void + ) override + { + } +}; + int #if defined(_WIN32) __cdecl @@ -1691,15 +1786,22 @@ main( printf("Conv2D tests.\n"); std::make_unique()->ExecuteShort(); - std::make_unique()->ExecuteShort(); + if (MlasNchwcGetBlockSize() > 1) { + std::make_unique()->ExecuteShort(); + } printf("Pool2D tests.\n"); std::make_unique()->ExecuteShort(); - std::make_unique()->ExecuteShort(); + if (MlasNchwcGetBlockSize() > 1) { + std::make_unique()->ExecuteShort(); + } printf("Pool3D tests.\n"); std::make_unique()->ExecuteShort(); + printf("Activation tests.\n"); + std::make_unique()->ExecuteShort(); + printf("Done.\n"); return 0; diff --git a/onnxruntime/test/onnx/TestCase.cc b/onnxruntime/test/onnx/TestCase.cc index df59746e208b5..16b2e409f6214 100644 --- a/onnxruntime/test/onnx/TestCase.cc +++ b/onnxruntime/test/onnx/TestCase.cc @@ -92,7 +92,7 @@ OrtValue* CreateTensorWithDataAsOrtValue(OrtAllocatorInfo* info, std::vector& template OrtValue* PbMapToOrtValue(const google::protobuf::Map& map) { OrtAllocatorInfo* info; - ORT_THROW_ON_ERROR(OrtCreateAllocatorInfo("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault, &info)); + ORT_THROW_ON_ERROR(OrtCreateCpuAllocatorInfo(OrtDeviceAllocator, OrtMemTypeDefault, &info)); std::unique_ptr rel_info(info, OrtReleaseAllocatorInfo); const size_t ele_count = map.size(); std::vector dims(1, ele_count); @@ -122,7 +122,7 @@ OrtValue* PbMapToOrtValue(const google::protobuf::Map& map template void VectorProtoToOrtValue(const RepeatedPtrField& input, ORT_VALUE_HOLDER& output) { OrtAllocatorInfo* info; - ORT_THROW_ON_ERROR(OrtCreateAllocatorInfo("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault, &info)); + ORT_THROW_ON_ERROR(OrtCreateCpuAllocatorInfo(OrtDeviceAllocator, OrtMemTypeDefault, &info)); std::unique_ptr rel_info(info, OrtReleaseAllocatorInfo); OrtValueArray in(input.size()); size_t j = 0; diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc index 7d31f6e37764d..86218dc6e0c9f 100644 --- a/onnxruntime/test/onnx/main.cc +++ b/onnxruntime/test/onnx/main.cc @@ -19,6 +19,7 @@ #include #include "core/framework/path_lib.h" #include "core/session/onnxruntime_cxx_api.h" +#include "core/optimizer/graph_transformer_level.h" using namespace onnxruntime; @@ -37,10 +38,11 @@ void usage() { "\t-e [EXECUTION_PROVIDER]: EXECUTION_PROVIDER could be 'cpu', 'cuda', 'mkldnn', 'tensorrt', 'ngraph' or 'openvino'. " "Default: 'cpu'.\n" "\t-x: Use parallel executor, default (without -x): sequential executor.\n" - "\t-o [optimization level]: Specifies the graph optimization level to enable. Valid values are 0, 1 or 2. Default is 1.\n" + "\t-o [optimization level]: Specifies the graph optimization level to enable. Valid values are 0 through 3. 
Default is 1.\n" "\t\t0 -> Disable all optimizations\n" "\t\t1 -> Enable basic optimizations\n" - "\t\t2 -> Enable all optimizations\n" + "\t\t2 -> Enable extended optimizations\n" + "\t\t3 -> Enable extended+layout optimizations\n" "\t-h: help\n" "\n" "onnxruntime version: %s\n", @@ -95,6 +97,7 @@ int real_main(int argc, char* argv[], Ort::Env& env) { bool enable_tensorrt = false; bool enable_mem_pattern = true; bool enable_openvino = false; + bool enable_nnapi = false; uint32_t graph_optimization_level{}; bool user_graph_optimization_level_set = false; @@ -153,6 +156,8 @@ int real_main(int argc, char* argv[], Ort::Env& env) { enable_tensorrt = true; } else if (!CompareCString(optarg, ORT_TSTR("openvino"))) { enable_openvino = true; + } else if (!CompareCString(optarg, ORT_TSTR("nnapi"))) { + enable_nnapi = true; } else { usage(); return -1; @@ -163,7 +168,7 @@ int real_main(int argc, char* argv[], Ort::Env& env) { break; case 'o': graph_optimization_level = static_cast(OrtStrtol(optarg, nullptr)); - if (graph_optimization_level > 2) { + if (graph_optimization_level >= static_cast(TransformerLevel::MaxTransformerLevel)) { fprintf(stderr, "See usage for valid values of graph optimization level\n"); usage(); return -1; @@ -207,9 +212,8 @@ int real_main(int argc, char* argv[], Ort::Env& env) { { double per_sample_tolerance = 1e-3; // when cuda is enabled, set it to a larger value for resolving random MNIST test failure - double relative_per_sample_tolerance = enable_cuda ? 0.017 : 1e-3; // when openvino is enabled, set it to a larger value for resolving MNIST accuracy mismatch - relative_per_sample_tolerance = enable_openvino ? 0.009 : 1e-3; + double relative_per_sample_tolerance = enable_cuda ? 0.017 : enable_openvino ? 0.009 : 1e-3; Ort::SessionOptions sf; @@ -225,6 +229,7 @@ int real_main(int argc, char* argv[], Ort::Env& env) { sf.EnableSequentialExecution(); else sf.DisableSequentialExecution(); + if (enable_tensorrt) { #ifdef USE_TENSORRT ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_Tensorrt(sf)); @@ -275,6 +280,14 @@ int real_main(int argc, char* argv[], Ort::Env& env) { return -1; #endif } + if (enable_nnapi) { +#ifdef USE_NNAPI + ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_Nnapi(sf)); +#else + fprintf(stderr, "DNNLibrary/NNAPI is not supported in this build"); + return -1; +#endif + } if (user_graph_optimization_level_set) { sf.SetGraphOptimizationLevel(graph_optimization_level); @@ -341,28 +354,29 @@ int real_main(int argc, char* argv[], Ort::Env& env) { {"constantofshape_float_ones", "test data bug", {"onnx141","onnx150"}}, {"constantofshape_int_zeros", "test data bug", {"onnx141","onnx150"}}, {"convtranspose_1d", "disable reason"}, - {"convtranspose_3d", "disable reason"}, + {"convtranspose_3d", "disable reason"}, {"cast_STRING_to_FLOAT", "Cast opset 9 not supported yet"}, {"cast_FLOAT_to_STRING", "Cast opset 9 not supported yet"}, - {"tf_inception_resnet_v2", "Cast opset 9 not supported yet"}, - {"tf_inception_v4", "Cast opset 9 not supported yet"}, {"tf_nasnet_large", "disable temporarily"}, {"tf_nasnet_mobile", "disable temporarily"}, {"tf_pnasnet_large", "disable temporarily"}, {"shrink", "test case is wrong", {"onnx141"}}, {"maxpool_with_argmax_2d_precomputed_strides", "ShapeInferenceError"}, {"tf_inception_v2", "result mismatch"}, - {"tf_mobilenet_v2_1.0_224", "result mismatch"}, - {"tf_mobilenet_v2_1.4_224", "result mismatch"}, - {"tf_mobilenet_v1_1.0_224", "result mismatch"}, - {"mobilenetv2-1.0", "result mismatch"}, {"mxnet_arcface", 
"result mismatch"} }; #ifdef USE_NGRAPH broken_tests.insert({"dequantizelinear", "ambiguity in scalar dimensions [] vs [1]", {"onnx150"}}); broken_tests.insert({"qlinearconv", "ambiguity in scalar dimensions [] vs [1]"}); - broken_tests.insert({"quantizelinear", "ambiguity in scalar dimensions [] vs [1]", {"onnx150"}}); + broken_tests.insert({"quantizelinear", "ambiguity in scalar dimensions [] vs [1]", {"onnx150"}}); +#endif + +#ifdef USE_MKLDNN + broken_tests.insert({"tf_mobilenet_v2_1.0_224", "result mismatch"}); + broken_tests.insert({"tf_mobilenet_v2_1.4_224", "result mismatch"}); + broken_tests.insert({"tf_mobilenet_v1_1.0_224", "result mismatch"}); + broken_tests.insert({"mobilenetv2-1.0", "result mismatch"}); #endif #ifdef USE_OPENVINO @@ -378,6 +392,8 @@ int real_main(int argc, char* argv[], Ort::Env& env) { #ifdef USE_CUDA broken_tests.insert({"mxnet_arcface", "result mismatch"}); + broken_tests.insert({"mlperf_ssd_mobilenet_300", "unknown error"}); + broken_tests.insert({"mlperf_ssd_resnet34_1200", "unknown error"}); broken_tests.insert({"tf_inception_v1", "flaky test"}); //TODO: Investigate cause for flakiness #endif // clang-format on diff --git a/onnxruntime/test/onnx/runner.cc b/onnxruntime/test/onnx/runner.cc index 3bb3648f71c09..575292fbc6dd3 100644 --- a/onnxruntime/test/onnx/runner.cc +++ b/onnxruntime/test/onnx/runner.cc @@ -362,7 +362,7 @@ EXECUTE_RESULT DataRunner::RunTaskImpl(size_t task_id) { output_names[i] = output_name; default_allocator->Free(output_name); } - if (feeds.size() > std::numeric_limits::max()) { + if (feeds.size() > static_cast(std::numeric_limits::max())) { ORT_THROW("length overflow"); } std::vector input_names(feeds.size()); diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc index 043e0133d22f7..1052f1ea6378b 100644 --- a/onnxruntime/test/optimizer/graph_transform_test.cc +++ b/onnxruntime/test/optimizer/graph_transform_test.cc @@ -115,6 +115,80 @@ TEST(GraphTransformationTests, ConstantFolding) { ASSERT_TRUE(op_to_count["Unsqueeze"] == 0); } +TEST(GraphTransformationTests, ConstantFoldingSubgraph) { + TensorProto value_tensor; + value_tensor.add_dims(1); + value_tensor.add_float_data(1.f); + value_tensor.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + + TypeProto float_tensor_type; + float_tensor_type.mutable_tensor_type()->set_elem_type(TensorProto_DataType_FLOAT); + float_tensor_type.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1); + + auto create_subgraph = [&](GraphProto& graph_proto) { + // create subgraph that has an Add node to add a local and parent graph initializer + Model model("ConstantFoldingSubgraphTest_subgraph"); + auto& graph = model.MainGraph(); + + TensorProto local_constant(value_tensor); + local_constant.set_name("local_constant"); + graph.AddInitializedTensor(local_constant); + + auto& local_constant_arg = graph.GetOrCreateNodeArg("local_constant", &float_tensor_type); + auto& parent_constant_arg = graph.GetOrCreateNodeArg("parent_constant", &float_tensor_type); + graph.AddOuterScopeNodeArg("parent_constant"); + + auto& add_out = graph.GetOrCreateNodeArg("add_out", &float_tensor_type); + graph.AddNode("add", "Add", "Add two inputs.", {&parent_constant_arg, &local_constant_arg}, {&add_out}); + + auto& subgraph_out = graph.GetOrCreateNodeArg("subgraph_out", &float_tensor_type); + graph.AddNode("identity", "Identity", "So Add isn't providing graph output.", {&add_out}, {&subgraph_out}); + + auto status = graph.Resolve(); + 
ASSERT_TRUE(status.IsOK()) << status; + graph_proto = graph.ToGraphProto(); + }; + + Model model("ConstantFoldingSubgraphTest_main_graph"); + auto& graph = model.MainGraph(); + + // add initializer at parent level + TensorProto parent_value_tensor(value_tensor); + parent_value_tensor.set_name("parent_constant"); + graph.AddInitializedTensor(parent_value_tensor); + + // put the subgraph in an If node + TypeProto if_cond_type; + if_cond_type.mutable_tensor_type()->set_elem_type(TensorProto_DataType_BOOL); + if_cond_type.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1); + auto& if_cond_input = graph.GetOrCreateNodeArg("if_in", &if_cond_type); + auto& if_output = graph.GetOrCreateNodeArg("if_out", &float_tensor_type); + + auto& if_node = graph.AddNode("if", "If", "If node", {&if_cond_input}, {&if_output}); + + GraphProto subgraph; + create_subgraph(subgraph); + + if_node.AddAttribute("then_branch", {subgraph}); + if_node.AddAttribute("else_branch", {subgraph}); + + auto status = graph.Resolve(); + ASSERT_TRUE(status.IsOK()) << status; + + std::map op_to_count = CountOpsInGraph(graph); + ASSERT_TRUE(op_to_count["Add"] == 2); // one in each subgraph + + onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; + graph_transformation_mgr.Register(std::make_unique(), TransformerLevel::Level1); + + status = graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1); + ASSERT_TRUE(status.IsOK()) << status; + + op_to_count = CountOpsInGraph(graph); + ASSERT_TRUE(op_to_count["Add"] == 0) + << "Constant folding should have been able to remove the Add node in both subgraphs"; +} + TEST(GraphTransformationTests, ShapeToInitializer) { string model_uri = MODEL_FOLDER + "shape-add.onnx"; std::shared_ptr model; @@ -214,6 +288,7 @@ TEST(GraphTransformationTests, FuseConvBNMulAddUnsqueeze) { #ifndef DISABLE_CONTRIB_OPS TEST(GraphTransformationTests, FuseConvActivation) { std::unordered_map model_to_op_name{{"fusion/conv_relu.onnx", "Relu"}, + {"fusion/conv_clip.onnx", "Clip"}, {"fusion/conv_sigmoid.onnx", "Sigmoid"}, {"fusion/conv_tanh.onnx", "Tanh"}, {"fusion/conv_leakyrelu.onnx", "LeakyRelu"}}; @@ -286,6 +361,8 @@ TEST(GraphTransformationTests, FuseConvAddNoBias) { ASSERT_TRUE(op_to_count["Unsqueeze"] == 0); } +// if IR version is 4 or higher the weights can be overridden if there's a matching graph input. +// check that we don't fuse if that is the case TEST(GraphTransformationTests, NegativeFuseConvAddNoBias) { string model_uri = MODEL_FOLDER + "fusion/negative-fuse-conv-add-no-bias.onnx"; @@ -306,9 +383,10 @@ TEST(GraphTransformationTests, NegativeFuseConvAddNoBias) { ASSERT_TRUE(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2).IsOK()); // Nodes are not fused because the weights to conv/add are not constants (they appear in the graph inputs). + // Unsqueeze is also not eliminated as the initializer that is its input is also not constant std::map op_to_count = CountOpsInGraph(graph); ASSERT_TRUE(op_to_count["Add"] != 0); - ASSERT_TRUE(op_to_count["Unsqueeze"] == 0); + ASSERT_TRUE(op_to_count["Unsqueeze"] != 0); } TEST(GraphTransformationTests, FuseConvAddMul3D) { diff --git a/onnxruntime/test/optimizer/nchwc_optimizer_test.cc b/onnxruntime/test/optimizer/nchwc_optimizer_test.cc new file mode 100644 index 0000000000000..68e4821e8f114 --- /dev/null +++ b/onnxruntime/test/optimizer/nchwc_optimizer_test.cc @@ -0,0 +1,891 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
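// These tests exercise the NCHWc layout transformations enabled at graph optimization
// level 3 ("extended+layout optimizations"). On CPUs where MlasNchwcGetBlockSize() > 1,
// eligible Conv/Pool nodes are rewritten into their blocked-layout counterparts in the
// kMSNchwcDomain (counted below as "nchwc.Conv", "nchwc.MaxPool", ...), with
// ReorderInput/ReorderOutput nodes inserted at the NCHW <-> NCHWc boundaries. Each test
// builds a small model, runs it at TransformerLevel::Level2 and Level3, and verifies both
// the expected node counts and that the two runs produce matching outputs.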
+ +#include "core/session/inference_session.h" +#include "core/graph/model.h" +#include "test/test_environment.h" +#include "test/framework/test_utils.h" +#include "test/compare_ortvalue.h" +#include "gtest/gtest.h" +#include "core/mlas/inc/mlas.h" + +namespace onnxruntime { +namespace test { + +// InferenceSession wrapper in order to gain access to the loaded graph. +class NchwcInferenceSession : public InferenceSession { + public: + explicit NchwcInferenceSession(const SessionOptions& session_options, + logging::LoggingManager* logging_manager) : InferenceSession(session_options, logging_manager) { + } + + std::unordered_map CountOpsInGraph() { + std::unordered_map op_to_count; + if (model_.get() != nullptr) { + for (auto& node : model_->MainGraph().Nodes()) { + std::string key = node.OpType(); + if (node.Domain() == kMSNchwcDomain) { + key = "nchwc." + key; + } + op_to_count[key] = op_to_count[key] + 1; + } + } + return op_to_count; + } + + const Graph& GetGraph() { + return model_->MainGraph(); + } +}; + +struct NchwcTestHelper { + NchwcTestHelper(Graph& graph) : graph_(graph), fill_value_(0) { + } + + NodeArg* MakeInput(const std::vector& shape, const ONNX_NAMESPACE::TypeProto& type_proto) { + int64_t num_elements = 1; + for (auto& dim : shape) { + num_elements *= dim; + } + + OrtValue input_value; + CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), shape, + FillRandomData(static_cast(num_elements)), &input_value); + std::string name = graph_.GenerateNodeArgName("input"); + feeds_.insert(std::make_pair(name, input_value)); + + return &graph_.GetOrCreateNodeArg(name, &type_proto); + } + + NodeArg* MakeInput(const std::vector& shape) { + ONNX_NAMESPACE::TypeProto type_proto; + type_proto.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + + for (auto& dim : shape) { + type_proto.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(dim); + } + + return MakeInput(shape, type_proto); + } + + NodeArg* MakeOutput() { + std::string name = graph_.GenerateNodeArgName("output"); + output_names_.push_back(name); + return &graph_.GetOrCreateNodeArg(name, nullptr); + } + + NodeArg* MakeIntermediate() { + std::string name = graph_.GenerateNodeArgName("node"); + return &graph_.GetOrCreateNodeArg(name, nullptr); + } + + NodeArg* MakeInitializer(const std::vector& shape) { + std::string name = graph_.GenerateNodeArgName("constant"); + ONNX_NAMESPACE::TensorProto tensor_proto; + tensor_proto.set_name(name); + tensor_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + + int64_t num_elements = 1; + for (auto& dim : shape) { + tensor_proto.add_dims(dim); + num_elements *= dim; + } + + auto random_data = FillRandomData(static_cast(num_elements)); + tensor_proto.mutable_float_data()->Resize(static_cast(num_elements), 0.0f); + memcpy(tensor_proto.mutable_float_data()->mutable_data(), random_data.data(), random_data.size() * sizeof(float)); + + graph_.AddInitializedTensor(tensor_proto); + + return &graph_.GetOrCreateNodeArg(name, nullptr); + } + + Node& AddNode(const std::string& op_type, + const std::vector& input_args, + const std::vector& output_args) { + return graph_.AddNode(graph_.GenerateNodeName("node"), + op_type, + "description", + input_args, + output_args); + } + + Node& AddConvNode(NodeArg* input_arg, NodeArg* output_arg, const std::vector& weights_shape) { + auto* weights_arg = MakeInitializer(weights_shape); + auto* biases_arg = MakeInitializer({weights_shape[0]}); + return AddNode("Conv", {input_arg, 
weights_arg, biases_arg}, {output_arg}); + } + + std::vector FillRandomData(size_t count) { + constexpr int min_fill_value = -23; + constexpr int max_fill_value = 23; + + std::vector random_data; + random_data.resize(count); + for (size_t n = 0; n < count; n++) { + random_data[n] = static_cast(fill_value_); + fill_value_++; + if (fill_value_ == max_fill_value) { + fill_value_ = min_fill_value; + } + } + return random_data; + } + + Graph& graph_; + NameMLValMap feeds_; + std::vector output_names_; + int fill_value_; +}; + +void NchwcOptimizerTester(const std::function& build_test_case, + const std::function& check_nchwc_graph, + int opset_version = 10) { + // Ignore the test if NCHWc is not supported by the platform. + if (MlasNchwcGetBlockSize() <= 1) { + return; + } + + // Build the model for this test. + std::unordered_map domain_to_version; + domain_to_version[kOnnxDomain] = opset_version; + Model model("nchwc", false, ModelMetaData(), IOnnxRuntimeOpSchemaRegistryList(), domain_to_version); + NchwcTestHelper helper(model.MainGraph()); + build_test_case(helper); + ASSERT_TRUE(model.MainGraph().Resolve().IsOK()); + + // Serialize the model to a string. + std::string model_data; + model.ToProto().SerializeToString(&model_data); + + auto run_model = [&](TransformerLevel level, std::vector& fetches) { + SessionOptions session_options; + session_options.graph_optimization_level = level; + session_options.session_logid = "NchwcOptimizerTests"; + NchwcInferenceSession session{session_options, &DefaultLoggingManager()}; + ASSERT_TRUE(session.Load(model_data.data(), static_cast(model_data.size())).IsOK()); + ASSERT_TRUE(session.Initialize().IsOK()); + + RunOptions run_options; + auto status = session.Run(run_options, helper.feeds_, helper.output_names_, &fetches); + if (!status.IsOK()) { + std::cout << "Run failed with status message: " << status.ErrorMessage() << std::endl; + } + ASSERT_TRUE(status.IsOK()); + + if (level == TransformerLevel::Level3) { + check_nchwc_graph(session); + } + }; + + std::vector level2_fetches; + run_model(TransformerLevel::Level2, level2_fetches); + + std::vector level3_fetches; + run_model(TransformerLevel::Level3, level3_fetches); + + size_t num_outputs = level2_fetches.size(); + ASSERT_TRUE(num_outputs == level3_fetches.size()); + + for (size_t i = 0; i < num_outputs; i++) { + double per_sample_tolerance = 0.0; + double relative_per_sample_tolerance = 0.0; + std::pair ret = + CompareOrtValue(level3_fetches[i], + level2_fetches[i], + per_sample_tolerance, + relative_per_sample_tolerance, + false); + EXPECT_EQ(ret.first, COMPARE_RESULT::SUCCESS); + } +} + +#ifndef DISABLE_CONTRIB_OPS + +TEST(NchwcOptimizerTests, ConvNchw) { + auto test_case = [&](const std::string& activation_op_type) { + auto build_test_case = [&](NchwcTestHelper& helper) { + auto* input_arg = helper.MakeInput({16, 3, 112, 112}); + auto* output_arg = helper.MakeOutput(); + + auto* conv_output_arg = output_arg; + if (!activation_op_type.empty()) { + conv_output_arg = helper.MakeIntermediate(); + auto& act_node = helper.AddNode(activation_op_type, {conv_output_arg}, {output_arg}); + if (activation_op_type == "Clip") { + act_node.AddAttribute("min", 0.0f); + act_node.AddAttribute("max", 6.0f); + } + } + + auto& conv_node = helper.AddConvNode(input_arg, conv_output_arg, {130, 3, 3, 3}); + conv_node.AddAttribute("pads", std::vector{1, 1, 1, 1}); + conv_node.AddAttribute("strides", std::vector{2, 2}); + }; + + auto check_nchwc_graph = [&](NchwcInferenceSession& session) { + auto op_to_count = 
session.CountOpsInGraph(); + EXPECT_EQ(op_to_count["nchwc.Conv"], 1); + EXPECT_EQ(op_to_count["nchwc.ReorderInput"], 0); + EXPECT_EQ(op_to_count["nchwc.ReorderOutput"], 1); + if (!activation_op_type.empty()) { + EXPECT_EQ(op_to_count[activation_op_type], 0); + } + }; + + NchwcOptimizerTester(build_test_case, check_nchwc_graph); + }; + + std::vector activation_op_types = {"", "Relu", "LeakyRelu", "Clip"}; + for (auto& activation_op_type : activation_op_types) { + test_case(activation_op_type); + } +} + +TEST(NchwcOptimizerTests, ConvNchwc) { + auto test_case = [&](const std::string& activation_op_type) { + auto build_test_case = [&](NchwcTestHelper& helper) { + auto* input_arg = helper.MakeInput({16, 64, 28, 28}); + auto* output_arg = helper.MakeOutput(); + + auto* conv_output_arg = output_arg; + if (!activation_op_type.empty()) { + conv_output_arg = helper.MakeIntermediate(); + auto& act_node = helper.AddNode(activation_op_type, {conv_output_arg}, {output_arg}); + if (activation_op_type == "Clip") { + act_node.AddAttribute("min", -6.0f); + act_node.AddAttribute("max", 6.0f); + } + } + + helper.AddConvNode(input_arg, conv_output_arg, {127, 64, 3, 3}); + }; + + auto check_nchwc_graph = [&](NchwcInferenceSession& session) { + auto op_to_count = session.CountOpsInGraph(); + EXPECT_EQ(op_to_count["nchwc.Conv"], 1); + EXPECT_EQ(op_to_count["nchwc.ReorderInput"], 1); + EXPECT_EQ(op_to_count["nchwc.ReorderOutput"], 1); + if (!activation_op_type.empty()) { + EXPECT_EQ(op_to_count[activation_op_type], 0); + } + }; + + NchwcOptimizerTester(build_test_case, check_nchwc_graph); + }; + + std::vector activation_op_types = {"", "Relu", "LeakyRelu", "Clip"}; + for (auto& activation_op_type : activation_op_types) { + test_case(activation_op_type); + } +} + +TEST(NchwcOptimizerTests, ConvNchwcGrouped) { + auto test_case = [&](const std::string& activation_op_type) { + auto build_test_case = [&](NchwcTestHelper& helper) { + auto* input_arg = helper.MakeInput({16, 48, 28, 28}); + auto* output_arg = helper.MakeOutput(); + + auto* conv_output_arg = output_arg; + if (!activation_op_type.empty()) { + conv_output_arg = helper.MakeIntermediate(); + helper.AddNode(activation_op_type, {conv_output_arg}, {output_arg}); + } + + auto& conv_node = helper.AddConvNode(input_arg, conv_output_arg, {192, 16, 3, 3}); + conv_node.AddAttribute("group", static_cast(3)); + }; + + auto check_nchwc_graph = [&](NchwcInferenceSession& session) { + auto op_to_count = session.CountOpsInGraph(); + EXPECT_EQ(op_to_count["nchwc.Conv"], 1); + EXPECT_EQ(op_to_count["nchwc.ReorderInput"], 1); + EXPECT_EQ(op_to_count["nchwc.ReorderOutput"], 1); + if (!activation_op_type.empty()) { + EXPECT_EQ(op_to_count[activation_op_type], 0); + } + }; + + NchwcOptimizerTester(build_test_case, check_nchwc_graph); + }; + + std::vector activation_op_types = {"", "Relu", "LeakyRelu"}; + for (auto& activation_op_type : activation_op_types) { + test_case(activation_op_type); + } +} + +TEST(NchwcOptimizerTests, ConvDepthwise) { + auto test_case = [&](const std::string& activation_op_type) { + auto build_test_case = [&](NchwcTestHelper& helper) { + auto* input_arg = helper.MakeInput({16, 96, 28, 28}); + auto* output_arg = helper.MakeOutput(); + + auto* conv_output_arg = output_arg; + if (!activation_op_type.empty()) { + conv_output_arg = helper.MakeIntermediate(); + helper.AddNode(activation_op_type, {conv_output_arg}, {output_arg}); + } + + auto& conv_node = helper.AddConvNode(input_arg, conv_output_arg, {96, 1, 3, 3}); + conv_node.AddAttribute("group", 
static_cast(96)); + }; + + auto check_nchwc_graph = [&](NchwcInferenceSession& session) { + auto op_to_count = session.CountOpsInGraph(); + EXPECT_EQ(op_to_count["nchwc.Conv"], 1); + EXPECT_EQ(op_to_count["nchwc.ReorderInput"], 1); + EXPECT_EQ(op_to_count["nchwc.ReorderOutput"], 1); + if (!activation_op_type.empty()) { + EXPECT_EQ(op_to_count[activation_op_type], 0); + } + }; + + NchwcOptimizerTester(build_test_case, check_nchwc_graph); + }; + + std::vector activation_op_types = {"", "Relu", "LeakyRelu"}; + for (auto& activation_op_type : activation_op_types) { + test_case(activation_op_type); + } +} + +TEST(NchwcOptimizerTests, ConvPointwise) { + auto test_case = [&](const std::string& activation_op_type) { + auto build_test_case = [&](NchwcTestHelper& helper) { + auto* input_arg = helper.MakeInput({16, 64, 28, 42}); + auto* output_arg = helper.MakeOutput(); + + auto* conv_output_arg = output_arg; + if (!activation_op_type.empty()) { + conv_output_arg = helper.MakeIntermediate(); + helper.AddNode(activation_op_type, {conv_output_arg}, {output_arg}); + } + + helper.AddConvNode(input_arg, conv_output_arg, {128, 64, 1, 1}); + }; + + auto check_nchwc_graph = [&](NchwcInferenceSession& session) { + auto op_to_count = session.CountOpsInGraph(); + EXPECT_EQ(op_to_count["nchwc.Conv"], 1); + EXPECT_EQ(op_to_count["nchwc.ReorderInput"], 1); + EXPECT_EQ(op_to_count["nchwc.ReorderOutput"], 1); + if (!activation_op_type.empty()) { + EXPECT_EQ(op_to_count[activation_op_type], 0); + } + }; + + NchwcOptimizerTester(build_test_case, check_nchwc_graph); + }; + + std::vector activation_op_types = {"", "Relu", "LeakyRelu"}; + for (auto& activation_op_type : activation_op_types) { + test_case(activation_op_type); + } +} + +TEST(NchwcOptimizerTests, ConvMaxPool) { + auto build_test_case = [&](NchwcTestHelper& helper) { + auto* input_arg = helper.MakeInput({1, 48, 34, 34}); + auto* conv_output_arg = helper.MakeIntermediate(); + auto* output_arg = helper.MakeOutput(); + + helper.AddConvNode(input_arg, conv_output_arg, {160, 48, 5, 5}); + + auto& pool_node = helper.AddNode("MaxPool", {conv_output_arg}, {output_arg}); + pool_node.AddAttribute("pads", std::vector{1, 1, 1, 1}); + pool_node.AddAttribute("kernel_shape", std::vector{5, 5}); + }; + + auto check_nchwc_graph = [&](NchwcInferenceSession& session) { + auto op_to_count = session.CountOpsInGraph(); + EXPECT_EQ(op_to_count["nchwc.Conv"], 1); + EXPECT_EQ(op_to_count["nchwc.MaxPool"], 1); + EXPECT_EQ(op_to_count["nchwc.ReorderInput"], 1); + EXPECT_EQ(op_to_count["nchwc.ReorderOutput"], 1); + }; + + NchwcOptimizerTester(build_test_case, check_nchwc_graph); +} + +TEST(NchwcOptimizerTests, ConvMaxPoolDilations) { + auto build_test_case = [&](NchwcTestHelper& helper) { + auto* input_arg = helper.MakeInput({1, 48, 66, 77}); + auto* conv_output_arg = helper.MakeIntermediate(); + auto* output_arg = helper.MakeOutput(); + + helper.AddConvNode(input_arg, conv_output_arg, {160, 48, 5, 5}); + + auto& pool_node = helper.AddNode("MaxPool", {conv_output_arg}, {output_arg}); + pool_node.AddAttribute("kernel_shape", std::vector{3, 3}); + pool_node.AddAttribute("dilations", std::vector{2, 2}); + }; + + auto check_nchwc_graph = [&](NchwcInferenceSession& session) { + auto op_to_count = session.CountOpsInGraph(); + EXPECT_EQ(op_to_count["nchwc.Conv"], 1); + EXPECT_EQ(op_to_count["nchwc.MaxPool"], 1); + EXPECT_EQ(op_to_count["nchwc.ReorderInput"], 1); + EXPECT_EQ(op_to_count["nchwc.ReorderOutput"], 1); + }; + + NchwcOptimizerTester(build_test_case, check_nchwc_graph); +} + 
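For readers unfamiliar with the blocked layout these tests target, the standalone sketch below illustrates what a ReorderInput-style repack is assumed to do: split the channel dimension into blocks of MlasNchwcGetBlockSize() elements and store each block contiguously per spatial position, zero-padding any channel remainder. The function name and the exact padding behaviour are illustrative assumptions, not the optimizer's actual implementation.

#include <cstddef>
#include <vector>

// Repack an NCHW float tensor into an assumed NCHWc layout of shape
// [n][c_blocks][h][w][block], zero-filling any channels beyond `c`.
static std::vector<float> ReorderNchwToNchwc(const std::vector<float>& src,
                                             size_t n, size_t c, size_t h, size_t w,
                                             size_t block) {
  const size_t c_blocks = (c + block - 1) / block;
  std::vector<float> dst(n * c_blocks * block * h * w, 0.0f);
  for (size_t ni = 0; ni < n; ++ni) {
    for (size_t ci = 0; ci < c; ++ci) {
      for (size_t hi = 0; hi < h; ++hi) {
        for (size_t wi = 0; wi < w; ++wi) {
          const size_t src_index = ((ni * c + ci) * h + hi) * w + wi;
          const size_t dst_index =
              (((ni * c_blocks + ci / block) * h + hi) * w + wi) * block + (ci % block);
          dst[dst_index] = src[src_index];
        }
      }
    }
  }
  return dst;
}

ReorderOutput is assumed to apply the inverse mapping back to NCHW, which is why the tests count one reorder node at each layout boundary.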
+TEST(NchwcOptimizerTests, ConvAveragePool) { + auto test_case = [&](bool count_include_pad) { + auto build_test_case = [&](NchwcTestHelper& helper) { + auto* input_arg = helper.MakeInput({1, 48, 34, 34}); + auto* conv_output_arg = helper.MakeIntermediate(); + auto* output_arg = helper.MakeOutput(); + + helper.AddConvNode(input_arg, conv_output_arg, {128, 48, 5, 5}); + + auto& pool_node = helper.AddNode("AveragePool", {conv_output_arg}, {output_arg}); + pool_node.AddAttribute("auto_pad", "SAME_UPPER"); + pool_node.AddAttribute("kernel_shape", std::vector{4, 4}); + if (count_include_pad) { + pool_node.AddAttribute("count_include_pad", static_cast(1)); + } + }; + + auto check_nchwc_graph = [&](NchwcInferenceSession& session) { + auto op_to_count = session.CountOpsInGraph(); + EXPECT_EQ(op_to_count["nchwc.Conv"], 1); + EXPECT_EQ(op_to_count["nchwc.AveragePool"], 1); + EXPECT_EQ(op_to_count["nchwc.ReorderInput"], 1); + EXPECT_EQ(op_to_count["nchwc.ReorderOutput"], 1); + }; + + NchwcOptimizerTester(build_test_case, check_nchwc_graph); + }; + + test_case(false); + test_case(true); +} + +TEST(NchwcOptimizerTests, ConvGlobalPool) { + auto test_case = [&](const std::string& op_type) { + auto build_test_case = [&](NchwcTestHelper& helper) { + auto* input_arg = helper.MakeInput({1, 96, 54, 54}); + auto* conv_output_arg = helper.MakeIntermediate(); + auto* output_arg = helper.MakeOutput(); + + auto& conv_node = helper.AddConvNode(input_arg, conv_output_arg, {160, 96, 3, 3}); + conv_node.AddAttribute("dilations", std::vector{2, 2}); + + helper.AddNode(op_type, {conv_output_arg}, {output_arg}); + }; + + auto check_nchwc_graph = [&](NchwcInferenceSession& session) { + auto op_to_count = session.CountOpsInGraph(); + EXPECT_EQ(op_to_count["nchwc.Conv"], 1); + EXPECT_EQ(op_to_count["nchwc." + op_type], 1); + EXPECT_EQ(op_to_count["nchwc.ReorderInput"], 1); + EXPECT_EQ(op_to_count["nchwc.ReorderOutput"], 1); + }; + + NchwcOptimizerTester(build_test_case, check_nchwc_graph); + }; + + std::vector op_types = {"GlobalMaxPool", "GlobalAveragePool"}; + for (auto& op_type : op_types) { + test_case(op_type); + } +} + +TEST(NchwcOptimizerTests, ConvAddFusion) { + auto test_case = [&](const std::string& op_type, int opset_version, bool do_relu) { + auto build_test_case = [&](NchwcTestHelper& helper) { + auto* input_arg = helper.MakeInput({1, 32, 28, 28}); + auto* conv1_output_arg = helper.MakeIntermediate(); + auto* conv2_output_arg = helper.MakeIntermediate(); + auto* output_arg = helper.MakeOutput(); + + helper.AddConvNode(input_arg, conv1_output_arg, {32, 32, 3, 3}); + helper.AddConvNode(input_arg, conv2_output_arg, {32, 32, 3, 3}); + + if (do_relu) { + auto* add_output_arg = helper.MakeIntermediate(); + helper.AddNode(op_type, {conv1_output_arg, conv2_output_arg}, {add_output_arg}); + helper.AddNode("Relu", {add_output_arg}, {output_arg}); + } else { + helper.AddNode(op_type, {conv1_output_arg, conv2_output_arg}, {output_arg}); + } + }; + + auto check_nchwc_graph = [&](NchwcInferenceSession& session) { + auto op_to_count = session.CountOpsInGraph(); + EXPECT_EQ(op_to_count["nchwc.Conv"], 2); + EXPECT_EQ(op_to_count["nchwc.ReorderInput"], 1); + EXPECT_EQ(op_to_count["nchwc.ReorderOutput"], 1); + EXPECT_EQ(op_to_count[op_type], 0); + EXPECT_EQ(op_to_count["Relu"], 0); + }; + + NchwcOptimizerTester(build_test_case, check_nchwc_graph, opset_version); + }; + + // Verify that Add or Sum can be fused into a preceding NCHWc Conv node, + // with an optional Relu node following. 
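  // Before: two Conv nodes feed an Add/Sum, optionally followed by Relu.
  // After (presumably): one nchwc.Conv takes the other nchwc.Conv's output as its fused
  // sum operand and the activation is folded in as well, so check_nchwc_graph expects
  // zero standalone Add/Sum/Relu nodes while both Conv nodes remain.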
+ std::vector op_types = {"Add", "Sum"}; + static const int opset_versions[] = {7, 10}; + for (auto& op_type : op_types) { + for (auto opset_version : opset_versions) { + test_case(op_type, opset_version, false); + test_case(op_type, opset_version, true); + } + } +} + +TEST(NchwcOptimizerTests, FusedConvAddFusion) { + auto test_case = [&](bool do_relu1, bool do_relu2, int add_count) { + auto build_test_case = [&](NchwcTestHelper& helper) { + auto* input_arg = helper.MakeInput({1, 32, 28, 28}); + auto* add1_input_arg = helper.MakeIntermediate(); + auto* add2_input_arg = helper.MakeIntermediate(); + auto* output_arg = helper.MakeOutput(); + + helper.AddConvNode(input_arg, add1_input_arg, {32, 32, 3, 3}); + if (do_relu1) { + auto* relu_output_arg = helper.MakeIntermediate(); + helper.AddNode("Relu", {add1_input_arg}, {relu_output_arg}); + add1_input_arg = relu_output_arg; + } + + helper.AddConvNode(input_arg, add2_input_arg, {32, 32, 3, 3}); + if (do_relu2) { + auto* relu_output_arg = helper.MakeIntermediate(); + helper.AddNode("Relu", {add2_input_arg}, {relu_output_arg}); + add2_input_arg = relu_output_arg; + } + + helper.AddNode("Add", {add1_input_arg, add2_input_arg}, {output_arg}); + }; + + auto check_nchwc_graph = [&](NchwcInferenceSession& session) { + auto op_to_count = session.CountOpsInGraph(); + EXPECT_EQ(op_to_count["nchwc.Conv"], 2); + EXPECT_EQ(op_to_count["nchwc.ReorderInput"], 1); + EXPECT_EQ(op_to_count["nchwc.ReorderOutput"], 1); + EXPECT_EQ(op_to_count["Add"], add_count); + EXPECT_EQ(op_to_count["Relu"], 0); + }; + + NchwcOptimizerTester(build_test_case, check_nchwc_graph); + }; + + // More variations of Conv/Add fusion: one or more of the inputs to the Add + // may already have a fused activation and cannot take the place of the Add + // node, but can be an input to another Conv node that doesn't have a fused + // activation. + test_case(false, false, 0); + test_case(false, true, 0); + test_case(true, false, 0); + test_case(true, true, 1); +} + +TEST(NchwcOptimizerTests, ConvConcat) { + auto test_case = [&](int axis, int channel_count, int reorder_output_count) { + auto build_test_case = [&](NchwcTestHelper& helper) { + auto* input_arg = helper.MakeInput({1, 48, 17, 34}); + auto* conv1_output_arg = helper.MakeIntermediate(); + auto* conv2_output_arg = helper.MakeIntermediate(); + auto* conv3_output_arg = helper.MakeIntermediate(); + auto* output_arg = helper.MakeOutput(); + + helper.AddConvNode(input_arg, conv1_output_arg, {64, 48, 5, 5}); + helper.AddConvNode(input_arg, conv2_output_arg, {channel_count, 48, 5, 5}); + helper.AddConvNode(input_arg, conv3_output_arg, {64, 48, 5, 5}); + + auto& concat_node = helper.AddNode("Concat", {conv1_output_arg, conv2_output_arg, conv3_output_arg}, {output_arg}); + concat_node.AddAttribute("axis", static_cast(axis)); + }; + + auto check_nchwc_graph = [&](NchwcInferenceSession& session) { + auto op_to_count = session.CountOpsInGraph(); + EXPECT_EQ(op_to_count["nchwc.Conv"], 3); + EXPECT_EQ(op_to_count["nchwc.ReorderInput"], 1); + EXPECT_EQ(op_to_count["nchwc.ReorderOutput"], reorder_output_count); + }; + + NchwcOptimizerTester(build_test_case, check_nchwc_graph); + }; + + // Concat along channel axis with aligned channel counts (stays in NCHWc format). + test_case(1, 96, 1); + + // Concat along channel axis with unaligned channel counts (reorders back to NCHW). + test_case(1, 98, 3); + + // Concat along non-channel axis (reorders back to NCHW). 
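  // (axis 0 here; in NCHW the channel axis is 1, so only a channel-axis Concat whose
  // inputs all have block-aligned channel counts can stay in the blocked layout)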
+ test_case(0, 64, 3); +} + +TEST(NchwcOptimizerTests, ConvReuseWeightsOIHWBiBo) { + auto build_test_case = [&](NchwcTestHelper& helper) { + auto* input_arg = helper.MakeInput({1, 64, 7, 7}); + auto* output1_arg = helper.MakeOutput(); + auto* output2_arg = helper.MakeOutput(); + auto* output3_arg = helper.MakeOutput(); + + std::vector weights_shape = {60, 64, 3, 3}; + auto* weights_arg = helper.MakeInitializer(weights_shape); + auto* biases_arg = helper.MakeInitializer({weights_shape[0]}); + + helper.AddNode("Conv", {input_arg, weights_arg, biases_arg}, {output1_arg}); + helper.AddNode("Conv", {input_arg, weights_arg, biases_arg}, {output2_arg}); + helper.AddNode("Conv", {input_arg, weights_arg, biases_arg}, {output3_arg}); + }; + + auto check_nchwc_graph = [&](NchwcInferenceSession& session) { + auto op_to_count = session.CountOpsInGraph(); + EXPECT_EQ(op_to_count["nchwc.Conv"], 3); + EXPECT_EQ(op_to_count["nchwc.ReorderInput"], 1); + EXPECT_EQ(op_to_count["nchwc.ReorderOutput"], 3); + + // Verify that the weights and biases were converted once and reused. + std::unordered_set weight_args; + std::unordered_set bias_args; + const auto& graph = session.GetGraph(); + for (auto& node : graph.Nodes()) { + if (node.Domain() == kMSNchwcDomain && node.OpType() == "Conv") { + EXPECT_EQ(node.InputDefs().size(), 3); + weight_args.emplace(node.InputDefs()[1]); + bias_args.emplace(node.InputDefs()[2]); + } + } + EXPECT_EQ(weight_args.size(), 1); + EXPECT_EQ(bias_args.size(), 1); + }; + + // Verify that a single weight tensor is reordered once. + NchwcOptimizerTester(build_test_case, check_nchwc_graph); +} + +TEST(NchwcOptimizerTests, ConvReuseWeightsOIHWBo) { + auto build_test_case = [&](NchwcTestHelper& helper) { + auto* input1_arg = helper.MakeInput({1, 64, 7, 7}); + auto* input2_arg = helper.MakeInput({1, 64, 7, 7}); + auto* input3_arg = helper.MakeInput({1, 1, 7, 7}); + auto* input4_arg = helper.MakeInput({1, 1, 7, 7}); + auto* output1_arg = helper.MakeOutput(); + auto* output2_arg = helper.MakeOutput(); + auto* output3_arg = helper.MakeOutput(); + auto* output4_arg = helper.MakeOutput(); + + std::vector weights_shape = {64, 1, 3, 3}; + auto* weights_arg = helper.MakeInitializer(weights_shape); + auto* biases_arg = helper.MakeInitializer({weights_shape[0]}); + + auto& conv1_node = helper.AddNode("Conv", {input1_arg, weights_arg, biases_arg}, {output1_arg}); + conv1_node.AddAttribute("group", static_cast(64)); + + auto& conv2_node = helper.AddNode("Conv", {input2_arg, weights_arg, biases_arg}, {output2_arg}); + conv2_node.AddAttribute("group", static_cast(64)); + + helper.AddNode("Conv", {input3_arg, weights_arg, biases_arg}, {output3_arg}); + helper.AddNode("Conv", {input4_arg, weights_arg, biases_arg}, {output4_arg}); + }; + + auto check_nchwc_graph = [&](NchwcInferenceSession& session) { + auto op_to_count = session.CountOpsInGraph(); + EXPECT_EQ(op_to_count["nchwc.Conv"], 4); + EXPECT_EQ(op_to_count["nchwc.ReorderInput"], 2); + EXPECT_EQ(op_to_count["nchwc.ReorderOutput"], 4); + + // Verify that the weights and biases were converted once and reused. 
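    // The reordered weight/bias initializers should be created once and shared by all of
    // the converted Conv nodes, so collecting the distinct NodeArg pointers used as the
    // second and third inputs below should yield exactly one of each.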
+ std::unordered_set weight_args; + std::unordered_set bias_args; + const auto& graph = session.GetGraph(); + for (auto& node : graph.Nodes()) { + if (node.Domain() == kMSNchwcDomain && node.OpType() == "Conv") { + EXPECT_EQ(node.InputDefs().size(), 3); + weight_args.emplace(node.InputDefs()[1]); + bias_args.emplace(node.InputDefs()[2]); + } + } + EXPECT_EQ(weight_args.size(), 1); + EXPECT_EQ(bias_args.size(), 1); + }; + + // Verify that a single weight tensor is reordered once. + NchwcOptimizerTester(build_test_case, check_nchwc_graph); +} + +TEST(NchwcOptimizerTests, ShapeInferencing) { + auto build_test_case = [&](NchwcTestHelper& helper) { + ONNX_NAMESPACE::TypeProto type_proto; + type_proto.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + type_proto.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1); + type_proto.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(3); + type_proto.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_param("input_height"); + type_proto.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_param("input_width"); + + auto* input_arg = helper.MakeInput({1, 3, 50, 100}, type_proto); + auto* output_arg = helper.MakeOutput(); + + // With these padding and kernel arguments, the shape along each spatial + // dimension is unchanged. + auto* conv1_output_arg = helper.MakeIntermediate(); + auto& conv1_node = helper.AddConvNode(input_arg, conv1_output_arg, {48, 3, 3, 3}); + conv1_node.AddAttribute("pads", std::vector{1, 1, 1, 1}); + + auto* pool2a_output_arg = helper.MakeIntermediate(); + auto& pool2a_node = helper.AddNode("MaxPool", {conv1_output_arg}, {pool2a_output_arg}); + pool2a_node.AddAttribute("kernel_shape", std::vector{3, 3}); + pool2a_node.AddAttribute("pads", std::vector{1, 1, 1, 1}); + + auto* pool2b_output_arg = helper.MakeIntermediate(); + auto& pool2b_node = helper.AddNode("MaxPool", {conv1_output_arg}, {pool2b_output_arg}); + pool2b_node.AddAttribute("kernel_shape", std::vector{3, 3}); + pool2b_node.AddAttribute("auto_pad", "SAME_LOWER"); + + auto* conv3a_output_arg = helper.MakeIntermediate(); + auto& conv3a_node = helper.AddConvNode(pool2a_output_arg, conv3a_output_arg, {64, 48, 3, 3}); + conv3a_node.AddAttribute("pads", std::vector{1, 1, 1, 1}); + + auto* conv3b_output_arg = helper.MakeIntermediate(); + auto& conv3b_node = helper.AddConvNode(pool2b_output_arg, conv3b_output_arg, {64, 48, 3, 3}); + conv3b_node.AddAttribute("auto_pad", "SAME_UPPER"); + + helper.AddNode("Add", {conv3a_output_arg, conv3b_output_arg}, {output_arg}); + }; + + auto check_nchwc_graph = [&](NchwcInferenceSession& session) { + auto op_to_count = session.CountOpsInGraph(); + EXPECT_EQ(op_to_count["nchwc.Conv"], 3); + EXPECT_EQ(op_to_count["nchwc.MaxPool"], 2); + EXPECT_EQ(op_to_count["nchwc.ReorderInput"], 0); + EXPECT_EQ(op_to_count["nchwc.ReorderOutput"], 1); + EXPECT_EQ(op_to_count["Add"], 0); + }; + + // The NCHWc optimizer does a limited amount of symbolic shape inferencing to + // handle models such as YoloV3 which can have variable height/width. Without + // shape inferencing, the transformer would be unable to detect that the inputs + // to the Add node have identical shapes and thus is eligble for Conv/Add + // fusion. 
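  // With stride 1 the spatial output size is in + pad_begin + pad_end - (kernel - 1), so
  // the 3x3 kernels with pads {1, 1, 1, 1} (or SAME_* auto_pad) leave "input_height" and
  // "input_width" unchanged, which is what lets the optimizer prove the two Add inputs
  // share a shape even though those dimensions are only known symbolically.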
+ NchwcOptimizerTester(build_test_case, check_nchwc_graph); +} + +TEST(NchwcOptimizerTests, ShapeInferencing2) { + auto build_test_case = [&](NchwcTestHelper& helper) { + ONNX_NAMESPACE::TypeProto type_proto; + type_proto.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + type_proto.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1); + type_proto.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1); + type_proto.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_param("input_height"); + type_proto.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_param("input_width"); + + auto* input_arg = helper.MakeInput({1, 1, 49, 98}, type_proto); + auto* output_arg = helper.MakeOutput(); + + auto* conv1_output_arg = helper.MakeIntermediate(); + helper.AddConvNode(input_arg, conv1_output_arg, {16, 1, 1, 1}); + + auto* conv2a1_output_arg = helper.MakeIntermediate(); + auto& conv2a1_node = helper.AddConvNode(conv1_output_arg, conv2a1_output_arg, {16, 16, 2, 2}); + conv2a1_node.AddAttribute("pads", std::vector{1, 1, 0, 0}); + conv2a1_node.AddAttribute("strides", std::vector{2, 2}); + + auto* conv2a_output_arg = helper.MakeIntermediate(); + auto& conv2a2_node = helper.AddConvNode(conv2a1_output_arg, conv2a_output_arg, {16, 16, 2, 2}); + conv2a2_node.AddAttribute("auto_pad", "SAME_UPPER"); + + auto* conv2b_output_arg = helper.MakeIntermediate(); + auto& conv2b_node = helper.AddConvNode(conv1_output_arg, conv2b_output_arg, {16, 16, 1, 1}); + conv2b_node.AddAttribute("strides", std::vector{2, 2}); + + helper.AddNode("Add", {conv2a_output_arg, conv2b_output_arg}, {output_arg}); + }; + + auto check_nchwc_graph = [&](NchwcInferenceSession& session) { + auto op_to_count = session.CountOpsInGraph(); + EXPECT_EQ(op_to_count["nchwc.Conv"], 4); + EXPECT_EQ(op_to_count["nchwc.ReorderInput"], 0); + EXPECT_EQ(op_to_count["nchwc.ReorderOutput"], 1); + EXPECT_EQ(op_to_count["Add"], 0); + }; + + // Verify that convolutions using strides of 2 and variable height/width are + // recognized as eligible for Conv/Add fusion. This pattern occurs in models + // such as Faster-RCNN. + NchwcOptimizerTester(build_test_case, check_nchwc_graph); +} + +TEST(NchwcOptimizerTests, MixedOutputUsage) { + auto build_test_case = [&](NchwcTestHelper& helper) { + auto* input_arg = helper.MakeInput({6, 5, 11, 11}); + auto* output_arg = helper.MakeOutput(); + + auto* conv1_output_arg = helper.MakeIntermediate(); + helper.AddConvNode(input_arg, conv1_output_arg, {96, 5, 2, 2}); + + // Use conv1_output_arg as NCHWc. + auto* conv2_output_arg = helper.MakeIntermediate(); + auto& conv2_node = helper.AddConvNode(conv1_output_arg, conv2_output_arg, {96, 96, 3, 3}); + conv2_node.AddAttribute("auto_pad", "SAME_LOWER"); + + // Use conv1_output_arg as NCHW. + auto* neg_output_arg = helper.MakeIntermediate(); + helper.AddNode("Neg", {conv1_output_arg}, {neg_output_arg}); + + helper.AddNode("Add", {conv2_output_arg, neg_output_arg}, {output_arg}); + }; + + auto check_nchwc_graph = [&](NchwcInferenceSession& session) { + auto op_to_count = session.CountOpsInGraph(); + EXPECT_EQ(op_to_count["nchwc.Conv"], 2); + EXPECT_EQ(op_to_count["nchwc.ReorderInput"], 0); + EXPECT_EQ(op_to_count["nchwc.ReorderOutput"], 2); + }; + + // Verify that mixed NCHWc/NCHW usages of NCHWc nodes. 
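  // conv1's output is consumed both in blocked form (by the second Conv) and as plain
  // NCHW (by the Neg node), so the optimizer cannot keep everything in NCHWc; the check
  // above expects two ReorderOutput nodes to service the NCHW consumers.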
+ NchwcOptimizerTester(build_test_case, check_nchwc_graph); +} + +TEST(NchwcOptimizerTests, TensorAlignment) { + auto build_test_case = [&](NchwcTestHelper& helper) { + // Input channel count must currently be a multiple of the NCHWc block size. + auto* input1_arg = helper.MakeInput({1, 60, 28, 42}); + auto* output1_arg = helper.MakeOutput(); + helper.AddConvNode(input1_arg, output1_arg, {128, 60, 1, 1}); + + // Grouped input channel count must be a multiple of the NCHWc block size. + auto* input2_arg = helper.MakeInput({1, 48, 28, 42}); + auto* output2_arg = helper.MakeOutput(); + auto& conv2_node = helper.AddConvNode(input2_arg, output2_arg, {128, 12, 3, 3}); + conv2_node.AddAttribute("group", static_cast(4)); + + // Grouped output channel count must be a multiple of the NCHWc block size. + auto* input3_arg = helper.MakeInput({1, 64, 28, 42}); + auto* output3_arg = helper.MakeOutput(); + auto& conv3_node = helper.AddConvNode(input3_arg, output3_arg, {48, 16, 3, 3}); + conv3_node.AddAttribute("group", static_cast(4)); + + // Channel count must currently be a multiple of the NCHWc block size. + auto* input4_arg = helper.MakeInput({1, 60, 12, 12}); + auto* output4_arg = helper.MakeOutput(); + auto& pool_node = helper.AddNode("MaxPool", {input4_arg}, {output4_arg}); + pool_node.AddAttribute("kernel_shape", std::vector{2, 2}); + }; + + auto check_nchwc_graph = [&](NchwcInferenceSession& session) { + auto op_to_count = session.CountOpsInGraph(); + EXPECT_EQ(op_to_count["Conv"], 3); + EXPECT_EQ(op_to_count["MaxPool"], 1); + EXPECT_EQ(op_to_count["nchwc.Conv"], 0); + EXPECT_EQ(op_to_count["nchwc.MaxPool"], 0); + EXPECT_EQ(op_to_count["nchwc.ReorderInput"], 0); + EXPECT_EQ(op_to_count["nchwc.ReorderOutput"], 0); + }; + + // Verify that convolutions with unaligned inputs are not transformed. + NchwcOptimizerTester(build_test_case, check_nchwc_graph); +} + +#endif + +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/optimizer/optimizer_test.cc b/onnxruntime/test/optimizer/optimizer_test.cc index fce027650fb5d..b8422cd1d9b3c 100644 --- a/onnxruntime/test/optimizer/optimizer_test.cc +++ b/onnxruntime/test/optimizer/optimizer_test.cc @@ -20,8 +20,6 @@ using namespace std; using namespace ONNX_NAMESPACE; -using namespace onnx; - namespace onnxruntime { namespace test { @@ -48,7 +46,7 @@ TEST(OptimizerTest, Basic) { initializer_tensor[i].set_name(inputs[i]->Name()); initializer_tensor[i].add_dims(tensor_dim); - initializer_tensor[i].set_data_type(onnx::TensorProto_DataType_INT32); + initializer_tensor[i].set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT32); for (int j = 0; j < tensor_dim; j++) { initializer_tensor[i].add_int32_data((i + 1) * j); } diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc index ac3a921ffe944..849e4f1723431 100644 --- a/onnxruntime/test/perftest/command_args_parser.cc +++ b/onnxruntime/test/perftest/command_args_parser.cc @@ -16,6 +16,7 @@ #include #include +#include #include "test_configuration.h" @@ -31,7 +32,7 @@ namespace perftest { "\t-M: Disable memory pattern.\n" "\t-A: Disable memory arena\n" "\t-c [parallel runs]: Specifies the (max) number of runs to invoke simultaneously. Default:1.\n" - "\t-e [cpu|cuda|mkldnn|tensorrt|ngraph]: Specifies the provider 'cpu','cuda','mkldnn','tensorrt', 'ngraph' or 'openvino'. " + "\t-e [cpu|cuda|mkldnn|tensorrt|ngraph|openvino]: Specifies the provider 'cpu','cuda','mkldnn','tensorrt', 'ngraph' or 'openvino'. 
" "Default:'cpu'.\n" "\t-b [tf|ort]: backend to use. Default:ort\n" "\t-r [repeated_times]: Specifies the repeated times if running in 'times' test mode.Default:1000.\n" @@ -41,7 +42,7 @@ namespace perftest { "\t-v: Show verbose information.\n" "\t-x [thread_size]: Session thread pool size.\n" "\t-P: Use parallel executor instead of sequential executor.\n" - "\t-o [optimization level]: 0: No transformer optimization, 1:basic optimization, 2: full optimization. \n" + "\t-o [optimization level]: 0: disable optimization, 1: basic optimization, 2: extended optimization, 3: extended+layout optimization. \n" "\t-h: help\n"); } @@ -85,6 +86,8 @@ namespace perftest { test_config.machine_config.provider_type_name = onnxruntime::kTensorrtExecutionProvider; } else if (!CompareCString(optarg, ORT_TSTR("openvino"))) { test_config.machine_config.provider_type_name = onnxruntime::kOpenVINOExecutionProvider; + } else if (!CompareCString(optarg, ORT_TSTR("nnapi"))) { + test_config.machine_config.provider_type_name = onnxruntime::kNnapiExecutionProvider; } else { return false; } @@ -127,8 +130,7 @@ namespace perftest { break; case 'o': test_config.run_config.optimization_level = static_cast(OrtStrtol(optarg, nullptr)); - // Valid values are: 0, 1, 2. - if (test_config.run_config.optimization_level > 2) { + if (test_config.run_config.optimization_level >= static_cast(TransformerLevel::MaxTransformerLevel)) { return false; } break; diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index 906ba5693ea70..f863c61a179e6 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -66,6 +66,12 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_OpenVINO(session_options, "CPU")); #else ORT_THROW("OpenVINO is not supported in this build\n"); +#endif + } else if (provider_name == onnxruntime::kNnapiExecutionProvider) { +#ifdef USE_NNAPI + ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_Nnapi(session_options)); +#else + ORT_THROW("NNAPI is not supported in this build\n"); #endif } else if (!provider_name.empty() && provider_name != onnxruntime::kCpuExecutionProvider) { ORT_THROW("This backend is not included in perf test runner.\n"); diff --git a/onnxruntime/test/providers/cpu/activation/activation_op_test.cc b/onnxruntime/test/providers/cpu/activation/activation_op_test.cc index 9064e3791ef62..cd8b284056205 100644 --- a/onnxruntime/test/providers/cpu/activation/activation_op_test.cc +++ b/onnxruntime/test/providers/cpu/activation/activation_op_test.cc @@ -34,8 +34,8 @@ void TestUnaryElementwiseOp(const char* szOp, std::vector& input_vals, excluded_providers.insert(kTensorrtExecutionProvider); } -//Disabled because of accuracy issues for MYRIAD FP16 and VAD_R -#if defined(OPENVINO_CONFIG_MYRIAD) || defined(OPENVINO_CONFIG_VAD_R) +//Disabled because of accuracy issues for MYRIAD FP16 and VAD_M +#if defined(OPENVINO_CONFIG_MYRIAD) || defined(OPENVINO_CONFIG_VAD_M) int relu = strcmp(szOp, "Relu"); int leaky = strcmp(szOp, "LeakyRelu"); if(relu == 0 || leaky == 0){ diff --git a/onnxruntime/test/providers/cpu/controlflow/loop_test.cc b/onnxruntime/test/providers/cpu/controlflow/loop_test.cc index d4293450f17ab..36a62bf4b90fe 100644 --- a/onnxruntime/test/providers/cpu/controlflow/loop_test.cc +++ b/onnxruntime/test/providers/cpu/controlflow/loop_test.cc @@ -11,6 +11,7 @@ #include 
"test/providers/provider_test_utils.h" #include "test/util/include/default_providers.h" +#include "test/framework/test_utils.h" using namespace ONNX_NAMESPACE; @@ -573,6 +574,62 @@ TEST(Loop, InfiniteLoopTermination) { terminator_thread.join(); } +// Regression test that a subgraph input overrides an outer scope value of the same name. +// Replicate issue from https://github.com/onnx/onnx/issues/2082 +TEST(Loop, SubgraphInputShadowsOuterScopeValue) { + SessionOptions so; + so.session_logid = "SubgraphInputShadowsOuterScopeValue"; + + InferenceSession session_object{so, &DefaultLoggingManager()}; + Status st; + ASSERT_TRUE((st = session_object.Load("testdata/subgraph_input_shadows_outer_scope_value.onnx")).IsOK()) << st; + ASSERT_TRUE((st = session_object.Initialize()).IsOK()) << st; + + // prepare inputs + std::vector scalar = {1}; + std::vector a = {3.f}, b = {6.f}; + std::vector trip_count = {10}; + std::vector keep_going = {true}; + + NameMLValMap feeds; + OrtValue ml_value; + + CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), scalar, a, &ml_value); + feeds.insert(std::make_pair("a", ml_value)); + CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), scalar, b, &ml_value); + feeds.insert(std::make_pair("b", ml_value)); + CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), scalar, trip_count, &ml_value); + feeds.insert(std::make_pair("max_trip_count", ml_value)); + CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), scalar, keep_going, &ml_value); + feeds.insert(std::make_pair("keep_going_inp", ml_value)); + + // prepare outputs + std::vector output_names{"b", "user_defined_vals"}; + std::vector fetches; + + // Now run + onnxruntime::RunOptions run_options; + st = session_object.Run(run_options, feeds, output_names, &fetches); + ASSERT_TRUE(st.IsOK()) << st; + ASSERT_EQ(2, fetches.size()); + + // prepare expected outputs + float expected_value_b = 6.f; + std::vector expected_dims_user_defined_vals = {2, 1}; + std::vector expected_user_defined_vals = {-6.f, 12.f}; + + auto& b_out = fetches[0].Get(); + TensorShape expected_shape(scalar); + ASSERT_EQ(expected_shape, b_out.Shape()); + ASSERT_EQ(b_out.DataAsSpan()[0], expected_value_b); + + auto user_defined_vals_out = fetches[1].Get().DataAsSpan(); + ASSERT_EQ(expected_user_defined_vals.size(), static_cast(user_defined_vals_out.size())); + for (size_t i = 0, end = expected_user_defined_vals.size(); i < end; ++i) { + ASSERT_THAT(user_defined_vals_out[i], testing::FloatEq(expected_user_defined_vals[i])); + } +} + #ifdef USE_CUDA // test that when part of the subgraph run on CUDA it executes successfully TEST(Loop, MixedExecutionProviders) { diff --git a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc index d917e5757685f..80770af33a06b 100644 --- a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc +++ b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc @@ -26,7 +26,7 @@ TEST(MathOpTest, Add_int64) { test.Run(); } -TEST(MathOpTest, Add) { +TEST(MathOpTest, Add_float) { OpTester test("Add"); std::vector dims{3, 3}; test.AddInput("A", dims, @@ -41,7 +41,31 @@ TEST(MathOpTest, Add) { {0.0f, 6.4f, 431.3f, 0.0f, 5.0f, -36.0f, -10.8f, 18.6f, 0.0f}); + +#if defined(OPENVINO_CONFIG_MYRIAD) || defined(OPENVINO_CONFIG_GPU_FP16) || defined(OPENVINO_CONFIG_VAD_M) + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider}); 
//OpenVINO: Disabled due to accuracy mismatch for FP16 +#else test.Run(); +#endif +} + +TEST(MathOpTest, Add_double) { + OpTester test("Add"); + std::vector dims{3, 3}; + test.AddInput("A", dims, + {1.0, 2.0, -1.0, + 0.0, 1.5, -100.0, + -5.4, 9.3, -10'000.0}); + test.AddInput("B", dims, + {-1.0, 4.4, 432.3, + 0.0, 3.5, 64.0, + -5.4, 9.3, 10'000.0}); + test.AddOutput("C", dims, + {0.0, 6.4, 431.3, + 0.0, 5.0, -36.0, + -10.8, 18.6, 0.0}); + + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider}); // Disabling OpenVINO as this type is not supported } TEST(MathOpTest, Add_Broadcast_Axis) { @@ -134,7 +158,14 @@ TEST(MathOpTest, Add_Broadcast_2x1x4_1x3x1) { 211.0f, 212.0f, 213.0f, 214.0f, 221.0f, 222.0f, 223.0f, 224.0f, 231.0f, 232.0f, 233.0f, 234.0f}); + +#if defined(OPENVINO_CONFIG_MYRIAD) || defined(OPENVINO_CONFIG_VAD_M) + //OpenVINO: Disabled due to software limitation for VPU Plugin. + //This test runs fine on CPU and GPU Plugins + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider,kOpenVINOExecutionProvider}); +#else test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: Input batch size is inconsistent +#endif } TEST(MathOpTest, Add_Broadcast_2x1x1_3x4) { @@ -154,7 +185,13 @@ TEST(MathOpTest, Add_Broadcast_2x1x1_3x4) { 211.0f, 212.0f, 213.0f, 214.0f, 221.0f, 222.0f, 223.0f, 224.0f, 231.0f, 232.0f, 233.0f, 234.0f}); +#if defined(OPENVINO_CONFIG_MYRIAD) || defined(OPENVINO_CONFIG_VAD_M) + //OpenVINO: Disabled due to software limitation for VPU Plugin. + //This test runs fine on CPU and GPU Plugins + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider,kOpenVINOExecutionProvider}); +#else test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: Input batch size is inconsistent +#endif } TEST(MathOpTest, Sub_int32) { @@ -238,7 +275,7 @@ TEST(MathOpTest, Mul) { 0.0f, 5.25f, -6'400.0f, 29.16f, 86.49f, -100'000'000.0f}); -#if defined(OPENVINO_CONFIG_MYRIAD) || defined(OPENVINO_CONFIG_GPU_FP16) || defined(OPENVINO_CONFIG_VAD_R) +#if defined(OPENVINO_CONFIG_MYRIAD) || defined(OPENVINO_CONFIG_GPU_FP16) || defined(OPENVINO_CONFIG_VAD_M) test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider}); //OpenVINO: Disabled due to accuracy issues for MYRIAD FP16 #else test.Run(); @@ -366,7 +403,7 @@ TEST(MathOpTest, Reciprocal) { test.Run(); } -TEST(MathOpTest, Sqrt) { +TEST(MathOpTest, Sqrt_Float) { OpTester test("Sqrt"); std::vector dims{2, 2}; test.AddInput("X", dims, @@ -378,7 +415,19 @@ TEST(MathOpTest, Sqrt) { test.Run(); } -TEST(MathOpTest, Pow) { +TEST(MathOpTest, Sqrt_Double) { + OpTester test("Sqrt"); + std::vector dims{2, 2}; + test.AddInput("X", dims, + {1.0, 4.0, + 0.0, 9.0}); + test.AddOutput("Y", dims, + {1.0, 2.0, + 0.0, 3.0}); + test.Run(); +} + +TEST(MathOpTest, Pow_Float) { OpTester test("Pow"); std::vector dims{2, 2}; test.AddInput("X", dims, @@ -393,6 +442,21 @@ TEST(MathOpTest, Pow) { test.Run(); } +TEST(MathOpTest, Pow_Double) { + OpTester test("Pow"); + std::vector dims{2, 2}; + test.AddInput("X", dims, + {2.0, 2.0, + std::sqrt(2.0), 1.0}); + test.AddInput("Y", dims, + {0.0, 8.0, + 2.0, 9.0}); + test.AddOutput("Z", dims, + {1.0, 256.0, + 2.0, 1.0}); + test.Run(); +} + TEST(MathOpTest, Pow_Broadcast_Scalar0) { OpTester test("Pow"); @@ -413,7 +477,7 @@ TEST(MathOpTest, Pow_Broadcast_Scalar1) { test.Run(); } -TEST(MathOpTest, Exp) { +TEST(MathOpTest, Exp_float) { OpTester test("Exp"); std::vector 
dims{2, 2}; test.AddInput("X", dims, @@ -426,6 +490,21 @@ TEST(MathOpTest, Exp) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: result differs } +TEST(MathOpTest, Exp_double) { + OpTester test("Exp"); + std::vector dims{2, 2}; + test.AddInput("X", dims, + {0.0, 1.0, + 2.0, 10.0}); + test.AddOutput("Y", dims, + {1.0, std::exp(1.0), + std::exp(2.0), std::exp(10.0)}); + test.SetOutputRelErr("Y", 1e-7f); + // TODO: Check if this test's result really differs for tensorRT + // For now basing this exclusion based on this test's float counterpart - Exp_float + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + TEST(MathOpTest, Log) { OpTester test("Log"); std::vector dims{2, 2}; @@ -457,7 +536,12 @@ TEST(MathOpTest, Sum_6) { {3.0f, 0.0f, 6.0f, -6.0f, 6.6f, 28.0f, -1.0f, 0.06f, 0.25f}); + +#if defined(OPENVINO_CONFIG_MYRIAD) || defined(OPENVINO_CONFIG_GPU_FP16) || defined(OPENVINO_CONFIG_VAD_M) + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider}); //OpenVINO: Disabled due to accuracy mismatch for FP16 +#else test.Run(); +#endif } TEST(MathOpTest, Sum_8_Test1) { @@ -477,7 +561,13 @@ TEST(MathOpTest, Sum_8_Test1) { 311.0f, 312.0f, 313.0f, 321.0f, 322.0f, 323.0f, 331.0f, 332.0f, 333.0f}); +#if defined(OPENVINO_CONFIG_MYRIAD) || defined(OPENVINO_CONFIG_VAD_M) + //OpenVINO: Disabled due to software limitation for VPU Plugin. + //This test runs fine on CPU and GPU Plugins + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider,kOpenVINOExecutionProvider}); +#else test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: Expected output shape [{3,3,3}] did not match run output shape [{3,1,1}] for sum +#endif } TEST(MathOpTest, Sum_8_Test2) { @@ -506,7 +596,13 @@ TEST(MathOpTest, Sum_8_Test2) { 3.3f, 4.4f, -94.7f, 59.6f, 64.01f, -8.0f}); +#if defined(OPENVINO_CONFIG_MYRIAD) || defined(OPENVINO_CONFIG_VAD_M) + //OpenVINO: Disabled due to software limitation for VPU Plugin. 
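+  //('VPU Plugin' here covers the MYRIAD and VAD_M build configurations checked above.)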
+ //This test runs fine on CPU and GPU Plugins + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider,kOpenVINOExecutionProvider}); +#else test.Run(OpTester::ExpectResult::kExpectSuccess, "Sum is not correct", {kTensorrtExecutionProvider}); //TensorRT: result differs +#endif } TEST(MathOpTest, Min_6) { @@ -575,7 +671,7 @@ TEST(MathOpTest, Max_6) { test.Run(); } -TEST(MathOpTest, Max_8) { +TEST(MathOpTest, Max_8_Float) { OpTester test("Max", 8); test.AddInput("data_0", {1, 3}, {1.0f, 2.0f, 3.0f}); @@ -592,6 +688,23 @@ TEST(MathOpTest, Max_8) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: Input batch size is inconsistent } +TEST(MathOpTest, Max_8_Double) { + OpTester test("Max", 8); + test.AddInput("data_0", {1, 3}, + {1.0, 2.0, 3.0}); + test.AddInput("data_2", {3, 3}, + {10.0, 20.0, 30.0, + 40.0, 50.0, 60.0, + 70.0, 80.0, 90.0}); + test.AddInput("data_1", {3, 1}, + {-1.0, -2.0, 300.0}); + test.AddOutput("max", {3, 3}, + {10.0, 20.0, 30.0, + 40.0, 50.0, 60.0, + 300.0, 300.0, 300.0}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: Input batch size is inconsistent +} + TEST(MathOpTest, Max_8_2inputbroadcast) { OpTester test("Max", 8); test.AddInput("data_0", {1, 3}, @@ -693,7 +806,7 @@ TEST(MathOpTest, Less_Scalar1) { test.Run(); } -TEST(MathOpTest, Greater) { +TEST(MathOpTest, Greater_7) { OpTester test("Greater"); std::vector dims{4}; test.AddInput("A", dims, {1.0f, 0.0f, -1.0f, -1.0f}); @@ -702,6 +815,36 @@ TEST(MathOpTest, Greater) { test.Run(); } +TEST( MathOpTest, Greater_9_float ) +{ + OpTester test( "Greater", 9 ); + std::vector dims { 4 }; + test.AddInput( "A", dims, { 1.0f, 0.0f, -1.0f, -1.0f } ); + test.AddInput( "B", dims, { 1.0f, 1.0f, 2.0f, -1.0f } ); + test.AddOutput( "C", dims, { false, false, false, false } ); + test.Run(); +} + +TEST( MathOpTest, Greater_9_int32 ) +{ + OpTester test( "Greater", 9 ); + std::vector dims { 4 }; + test.AddInput( "A", dims, { 10, 11, 12, 13 } ); + test.AddInput( "B", dims, { 15, 7, 12, 9 } ); + test.AddOutput( "C", dims, { false, true, false, true } ); + test.Run(); +} + +TEST( MathOpTest, Greater_9_int64 ) +{ + OpTester test( "Greater", 9 ); + std::vector dims { 4 }; + test.AddInput( "A", dims, { 10, 11, 12, 13 } ); + test.AddInput( "B", dims, { 15, 7, 12, 9 } ); + test.AddOutput( "C", dims, { false, true, false, true } ); + test.Run(); +} + TEST(MathOpTest, Equal_bool) { OpTester test("Equal"); std::vector dims{4}; @@ -745,6 +888,15 @@ TEST(MathOpTest, Equal_int64) { test.Run(); } +TEST(MathOpTest, Equal_float) { + OpTester test("Equal", 11); + std::vector dims{4}; + test.AddInput("A", dims, {1.0f, 0.0f, -1.0f, -1.0f}); + test.AddInput("B", dims, {1.0f, 1.0f, 2.0f, -1.0f}); + test.AddOutput("C", dims, {true, false, false, true}); + test.Run(); +} + TEST(MathOpTest, Mean_6) { OpTester test("Mean", 6); std::vector dims{3, 3}; @@ -784,7 +936,7 @@ TEST(MathOpTest, Mean_8) { } template -void TrigTest(OpTester& test, std::initializer_list input) { +void TrigFloatTest(OpTester& test, std::initializer_list input) { std::vector dims{static_cast(input.size())}; std::vector output; @@ -796,59 +948,77 @@ void TrigTest(OpTester& test, std::initializer_list input) { test.Run(); } -TEST(MathOpTest, Sin) { +template +void TrigDoubleTest(OpTester& test, std::initializer_list input) { + std::vector dims{static_cast(input.size())}; + + std::vector output; + for (auto v : input) + output.push_back(op(v)); + + test.AddInput("X", dims, 
input); + test.AddOutput("Y", dims, output); + test.Run(); +} + +TEST(MathOpTest, SinFloat) { + OpTester test("Sin"); + TrigFloatTest(test, {1.1f, -1.1f, 2.2f, -2.2f}); +} + +TEST(MathOpTest, SinDouble) { OpTester test("Sin"); - TrigTest(test, {1.1f, -1.1f, 2.2f, -2.2f}); + TrigDoubleTest(test, {1.1, -1.1, 2.2, -2.2}); } TEST(MathOpTest, Cos) { OpTester test("Cos"); - TrigTest(test, {1.1f, -1.1f, 2.2f, -2.2f}); + TrigFloatTest(test, {1.1f, -1.1f, 2.2f, -2.2f}); } TEST(MathOpTest, Tan) { OpTester test("Tan"); - TrigTest(test, {-100.0f, -50.0f, 0.0f, 50.0f, 100.0f}); + TrigFloatTest(test, {-100.0f, -50.0f, 0.0f, 50.0f, 100.0f}); } TEST(MathOpTest, Asin) { OpTester test("Asin"); - TrigTest(test, {-1.0f, -0.5f, 0.0f, 0.5f, 1.0f}); + TrigFloatTest(test, {-1.0f, -0.5f, 0.0f, 0.5f, 1.0f}); } TEST(MathOpTest, Acos) { OpTester test("Acos"); - TrigTest(test, {-1.0f, -0.5f, 0.0f, 0.5f, 1.0f}); + TrigFloatTest(test, {-1.0f, -0.5f, 0.0f, 0.5f, 1.0f}); } TEST(MathOpTest, Atan) { OpTester test("Atan"); - TrigTest(test, {-10.0f, -5.0f, 0.0f, 5.0f, 10.0f}); + TrigFloatTest(test, {-10.0f, -5.0f, 0.0f, 5.0f, 10.0f}); } TEST(MathOpTest, Sinh) { OpTester test("Sinh", 9); - TrigTest(test, {-1.0f, -0.5f, 0.0f, 0.5f, 1.0f}); + TrigFloatTest(test, {-1.0f, -0.5f, 0.0f, 0.5f, 1.0f}); } TEST(MathOpTest, Cosh) { OpTester test("Cosh", 9); - TrigTest(test, {-1.0f, -0.5f, 0.0f, 0.5f, 1.0f}); + TrigFloatTest(test, {-1.0f, -0.5f, 0.0f, 0.5f, 1.0f}); } TEST(MathOpTest, Asinh) { OpTester test("Asinh", 9); - TrigTest(test, {-1.0f, -0.5f, 0.0f, 0.5f, 1.0f}); + TrigFloatTest(test, {-1.0f, -0.5f, 0.0f, 0.5f, 1.0f}); } TEST(MathOpTest, Acosh) { OpTester test("Acosh", 9); - TrigTest(test, {1.0f, 1.1f, 3.0f, 10.0f, 100.0f}); + TrigFloatTest(test, {1.0f, 1.1f, 3.0f, 10.0f, 100.0f}); } TEST(MathOpTest, Atanh) { OpTester test("Atanh", 9); - TrigTest(test, {-1.0f, -0.5f, 0.0f, 0.5f, 1.0f}); + TrigFloatTest(test, {-1.0f, -0.5f, 0.0f, 0.5f, 1.0f}); } TEST(MathOpTest, Expand_8_3x3) { @@ -955,9 +1125,9 @@ TEST(MathOpTest, Expand_8_3x1x3x1_int64) { test.AddInput("data_0", {1, 3, 1, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); test.AddInput("data_1", {4}, {3, 1, 3, 1}); test.AddOutput("result", {3, 3, 3, 3}, - {1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 6, 4, 5, 6, 4, 5, 6, 7, 8, 9, 7, 8, 9, 7, 8, 9, - 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 6, 4, 5, 6, 4, 5, 6, 7, 8, 9, 7, 8, 9, 7, 8, 9, - 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 6, 4, 5, 6, 4, 5, 6, 7, 8, 9, 7, 8, 9, 7, 8, 9,}); + {1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 6, 4, 5, 6, 4, 5, 6, 7, 8, 9, 7, 8, 9, 7, 8, 9, + 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 6, 4, 5, 6, 4, 5, 6, 7, 8, 9, 7, 8, 9, 7, 8, 9, + 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 6, 4, 5, 6, 4, 5, 6, 7, 8, 9, 7, 8, 9, 7, 8, 9,}); test.Run(); } diff --git a/onnxruntime/test/providers/cpu/math/gemm_test.cc b/onnxruntime/test/providers/cpu/math/gemm_test.cc index 2fb17717fa552..8673f9cc6b429 100644 --- a/onnxruntime/test/providers/cpu/math/gemm_test.cc +++ b/onnxruntime/test/providers/cpu/math/gemm_test.cc @@ -157,7 +157,7 @@ TEST(GemmOpTest, GemmScalarBroadcast) { test.Run(); } -TEST(MathOpTest, Gemm2DBroadcast) { +TEST(GemmOpTest, Gemm2DBroadcast_1) { OpTester test("Gemm"); test.AddAttribute("transA", (int64_t)0); @@ -176,6 +176,26 @@ TEST(MathOpTest, Gemm2DBroadcast) { test.Run(); } +TEST(GemmOpTest, Gemm2DBroadcast_2) { + OpTester test("Gemm"); + + test.AddAttribute("transA", (int64_t)0); + test.AddAttribute("transB", (int64_t)0); + test.AddAttribute("alpha", 1.0f); + test.AddAttribute("beta", 1.0f); + + // Same as GemmBroadcast, but adding the unnecessary second dimension. 
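+  // With alpha == beta == 1 and no transposes, Gemm computes Y = A * B + C, and the {1, 3} bias C
+  // broadcasts across both rows of A * B. B is all ones, so each row of A * B is that row's sum in A:
+  // row 0 gives 10 + {1, 2, 3} = {11, 12, 13} and row 1 gives -10 + {1, 2, 3} = {-9, -8, -7}.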
+ test.AddInput("A", {2, 4}, + {1.0f, 2.0f, 3.0f, 4.0f, + -1.0f, -2.0f, -3.0f, -4.0f}); + test.AddInput("B", {4, 3}, std::vector(12, 1.0f)); + test.AddInput("C", {1, 3}, std::vector{1.0f, 2.0f, 3.0f}); + test.AddOutput("Y", {2, 3}, + {11.0f, 12.0f, 13.0f, + -9.0f, -8.0f, -7.0f}); + test.Run(); +} + TEST(GemmOpTest, GemmFalseBroadcast) { OpTester test("Gemm"); diff --git a/onnxruntime/test/providers/cpu/ml/onehotencoder_test.cc b/onnxruntime/test/providers/cpu/ml/onehotencoder_test.cc index 05af2af22c218..a4ce3f3f1b509 100644 --- a/onnxruntime/test/providers/cpu/ml/onehotencoder_test.cc +++ b/onnxruntime/test/providers/cpu/ml/onehotencoder_test.cc @@ -41,6 +41,18 @@ void TestIntCategory(std::vector& input) { test_vector.AddAttribute("zeros", int64_t{0}); test_vector.Run(OpTester::ExpectResult::kExpectFailure); + + // Test MultiDimensional [:, :, Labels] + OpTester test_multiD("OneHotEncoder", 1, onnxruntime::kMLDomain); + test_multiD.AddAttribute("cats_int64s", categories); + test_multiD.AddInput("X", {1, 1, 7}, input); + test_multiD.AddOutput("Y", {1, 1, 7, 8}, expected_output); + + test_multiD.AddAttribute("zeros", int64_t{1}); + test_multiD.Run(); + + test_multiD.AddAttribute("zeros", int64_t{0}); + test_multiD.Run(OpTester::ExpectResult::kExpectFailure); } TEST(OneHotEncoderOpTest, IntegerWithInt64) { @@ -49,17 +61,18 @@ TEST(OneHotEncoderOpTest, IntegerWithInt64) { } /* +// TODO: Support int32_t type kernel for the op and uncomment the test TEST(OneHotEncoderOpTest, IntegerWithInt32) { - vector input{ 8, 1, 0, 0, 3, 7, 4 }; - TestIntCategory(input); + vector input{ 8, 1, 0, 0, 3, 7, 4 }; + TestIntCategory(input); } +*/ TEST(OneHotEncoderOpTest, IntegerWithDouble) { vector input{ 8.1f, 1.2f, 0.0f, 0.7f, 3.4f, 7.9f, 4.4f }; TestIntCategory(input); } -*/ TEST(OneHotEncoderOpTest, String) { std::vector categories{"Apple", "Orange", "Watermelon", "Blueberry", "Coconut", "Mango", "Tangerine"}; vector input{"Watermelon", "Orange", "Tangerine", "Apple", "Kit"}; @@ -95,6 +108,18 @@ TEST(OneHotEncoderOpTest, String) { test_vector.AddAttribute("zeros", int64_t{0}); test_vector.Run(OpTester::ExpectResult::kExpectFailure); + + // Test MultiDimensional [:, Labels, :] + OpTester test_multiD("OneHotEncoder", 1, onnxruntime::kMLDomain); + test_multiD.AddAttribute("cats_strings", categories); + test_multiD.AddInput("X", {1, 5, 1}, input); + test_multiD.AddOutput("Y", {1, 5, 1, 7}, expected_output); + + test_multiD.AddAttribute("zeros", int64_t{1}); + test_multiD.Run(); + + test_multiD.AddAttribute("zeros", int64_t{0}); + test_multiD.Run(OpTester::ExpectResult::kExpectFailure); } } // namespace test diff --git a/onnxruntime/test/providers/cpu/ml/tree_ensembler_classifier_test.cc b/onnxruntime/test/providers/cpu/ml/tree_ensembler_classifier_test.cc index f1b7616943430..36318226ef7b6 100644 --- a/onnxruntime/test/providers/cpu/ml/tree_ensembler_classifier_test.cc +++ b/onnxruntime/test/providers/cpu/ml/tree_ensembler_classifier_test.cc @@ -151,120 +151,5 @@ TEST(MLOpTest, TreeEnsembleClassifierBinary) { test.Run(); } -TEST(MLOpTest, TreeEnsembleClassifierBinaryBaseValue) { - OpTester test("TreeEnsembleClassifier", 1, onnxruntime::kMLDomain); - - // The example was generated by the following python script: - // model = GradientBoostingClassifier(n_estimators = 1, max_depth = 2) - // X, y = make_classification(10, n_features = 4, random_state = 42) - // X = X[:, :2] - // model.fit(X, y) - // model.init_.class_prior_ = np.array([0.231, 0.231]) - - std::vector base_values = {-1.202673316001892f, 
-1.202673316001892f}; - std::vector class_ids = {0, 0, 0}; - std::vector class_nodeids = {2, 3, 4}; - std::vector class_treeids = {0, 0, 0}; - std::vector class_weights = {-0.2f, -0.06f, 0.2f}; - std::vector classlabels_int64s = {0, 1}; - std::vector nodes_falsenodeids = {4, 3, 0, 0, 0}; - std::vector nodes_featureids = {0, 0, 0, 0, 0}; - std::vector nodes_hitrates = {1, 1, 1, 1, 1}; - std::vector nodes_missing_value_tracks_true = {0, 0, 0, 0, 0}; - std::vector nodes_modes = {"BRANCH_LEQ", "BRANCH_LEQ", "LEAF", "LEAF", "LEAF"}; - std::vector nodes_nodeids = {0, 1, 2, 3, 4}; - std::vector nodes_treeids = {0, 0, 0, 0, 0}; - std::vector nodes_truenodeids = {1, 2, 0, 0, 0}; - std::vector nodes_values = {0.21111594140529633f, -0.8440752029418945f, 0, 0, 0}; - std::string post_transform = "LOGISTIC"; - - std::vector X = {-0.92533575f, -1.14021544f, -0.46171143f, -0.58723065f, 1.44044386f, 1.77736657f}; - std::vector results = {0, 0, 0}; - std::vector probs = {}; - std::vector log_probs = {}; - std::vector scores{0.802607834f, 0.197392166f, 0.779485941f, 0.220514059f, 0.731583834f, 0.268416166f}; - - //define the context of the operator call - const int N = 3; - test.AddAttribute("base_values", base_values); - test.AddAttribute("class_ids", class_ids); - test.AddAttribute("class_nodeids", class_nodeids); - test.AddAttribute("class_treeids", class_treeids); - test.AddAttribute("class_weights", class_weights); - test.AddAttribute("classlabels_int64s", classlabels_int64s); - test.AddAttribute("nodes_falsenodeids", nodes_falsenodeids); - test.AddAttribute("nodes_featureids", nodes_featureids); - test.AddAttribute("nodes_hitrates", nodes_hitrates); - test.AddAttribute("nodes_modes", nodes_modes); - test.AddAttribute("nodes_nodeids", nodes_nodeids); - test.AddAttribute("nodes_treeids", nodes_treeids); - test.AddAttribute("nodes_truenodeids", nodes_truenodeids); - test.AddAttribute("nodes_values", nodes_values); - test.AddAttribute("post_transform", post_transform); - - test.AddInput("X", {N, 2}, X); - test.AddOutput("Y", {N}, results); - test.AddOutput("Z", {N, 2}, scores); - - test.Run(); -} - -TEST(MLOpTest, TreeEnsembleClassifierBinaryBaseValueNull) { - OpTester test("TreeEnsembleClassifier", 1, onnxruntime::kMLDomain); - - // The example was generated by the following python script: - // model = GradientBoostingClassifier(n_estimators = 1, max_depth = 2) - // X, y = make_classification(10, n_features = 4, random_state = 42) - // X = X[:, :2] - // model.fit(X, y) - - std::vector base_values = {0, 0}; - std::vector class_ids = {0, 0, 0}; - std::vector class_nodeids = {2, 3, 4}; - std::vector class_treeids = {0, 0, 0}; - std::vector class_weights = {-0.2f, -0.0666f, 0.2f}; - std::vector classlabels_int64s = {0, 1}; - std::vector nodes_falsenodeids = {4, 3, 0, 0, 0}; - std::vector nodes_featureids = {0, 0, 0, 0, 0}; - std::vector nodes_hitrates = {1, 1, 1, 1, 1}; - std::vector nodes_missing_value_tracks_true = {0, 0, 0, 0, 0}; - std::vector nodes_modes = {"BRANCH_LEQ", "BRANCH_LEQ", "LEAF", "LEAF", "LEAF"}; - std::vector nodes_nodeids = {0, 1, 2, 3, 4}; - std::vector nodes_treeids = {0, 0, 0, 0, 0}; - std::vector nodes_truenodeids = {1, 2, 0, 0, 0}; - std::vector nodes_values = {0.24055418372154236f, -0.8440752029418945f, 0, 0, 0}; - std::string post_transform = "LOGISTIC"; - - std::vector X = {-0.92533575f, -1.14021544f, -0.46171143f, -0.58723065f, 1.44044386f, 1.77736657f}; - std::vector results = {0, 0, 1}; - std::vector probs = {}; - std::vector log_probs = {}; - std::vector scores{0.549834f, 
0.450166f, 0.5166605f, 0.4833395f, 0.450166f, 0.549834f}; - - //define the context of the operator call - const int N = 3; - test.AddAttribute("base_values", base_values); - test.AddAttribute("class_ids", class_ids); - test.AddAttribute("class_nodeids", class_nodeids); - test.AddAttribute("class_treeids", class_treeids); - test.AddAttribute("class_weights", class_weights); - test.AddAttribute("classlabels_int64s", classlabels_int64s); - test.AddAttribute("nodes_falsenodeids", nodes_falsenodeids); - test.AddAttribute("nodes_featureids", nodes_featureids); - test.AddAttribute("nodes_hitrates", nodes_hitrates); - test.AddAttribute("nodes_modes", nodes_modes); - test.AddAttribute("nodes_nodeids", nodes_nodeids); - test.AddAttribute("nodes_treeids", nodes_treeids); - test.AddAttribute("nodes_truenodeids", nodes_truenodeids); - test.AddAttribute("nodes_values", nodes_values); - test.AddAttribute("post_transform", post_transform); - - test.AddInput("X", {N, 2}, X); - test.AddOutput("Y", {N}, results); - test.AddOutput("Z", {N, 2}, scores); - - test.Run(); -} - } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/nn/dropout_op_test.cc b/onnxruntime/test/providers/cpu/nn/dropout_op_test.cc index 7b2e91b95afdf..802aaa84b310e 100644 --- a/onnxruntime/test/providers/cpu/nn/dropout_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/dropout_op_test.cc @@ -23,5 +23,29 @@ TEST(Dropout, Opset10) { test.Run(); } +TEST(Dropout, WithOptionalOutputOpset10) { + OpTester test("Dropout", 10, kOnnxDomain); + std::vector dims{2, 2}; + test.AddInput("X", dims, {1.0f, 2.0f, 3.0f, 5.0f}); + test.AddOutput("Y", dims, {1.0f, 2.0f, 3.0f, 5.0f}); + test.AddOutput("mask", dims, {false, false, false, false}); + // The NGraph execution provider doesn't seem to support 'Dropout' with optional mask output + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kNGraphExecutionProvider}); +} + +TEST(Dropout, WithOptionalOutputOpset7) { + // Opset 7 differs with Opset 10 in that the type of the 'mask' + // output is tied with the type of the input in Opset 7 whereas + // the type of 'mask' in Opset 10 is 'bool' always + OpTester test("Dropout", 7, kOnnxDomain); + std::vector dims{2, 2}; + test.AddInput("X", dims, {1.0f, 2.0f, 3.0f, 5.0f}); + test.AddOutput("Y", dims, {1.0f, 2.0f, 3.0f, 5.0f}); + test.AddOutput("mask", dims, {0.0f, 0.0f, 0.0f, 0.0f}); + // The NGraph execution provider doesn't seem to support 'Dropout' with optional mask output + // The TensorRT execution provider doesn't seem to support 'Dropout' with non-boolean mask output + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kNGraphExecutionProvider, kTensorrtExecutionProvider}); +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc index 27658f1cc7ed7..73f0ca2eb9da9 100644 --- a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc @@ -51,7 +51,7 @@ TEST(PoolTest, MaxPool) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: result differs + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: result differs } // Only CUDA kernel has float 16 support @@ -104,11 +104,11 @@ TEST(PoolTest, MaxPool_F16) { test.AddInput("X", x_dims, f_X); test.AddOutput("Y", expected_dims, f_Y); - 
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: Assertion `!attrs.count("pads")' failed + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: Assertion `!attrs.count("pads")' failed } #endif -static void MaxPool_8_WithIndexTest(bool has_index, int64_t storage_order=0) { +static void MaxPool_8_WithIndexTest(bool has_index, int64_t storage_order = 0) { OpTester test("MaxPool", 8); test.AddAttribute("auto_pad", ""); @@ -160,7 +160,7 @@ static void MaxPool_8_WithIndexTest(bool has_index, int64_t storage_order=0) { } TEST(PoolTest, MaxPool_8_With_Index) { - MaxPool_8_WithIndexTest(false); // row major + MaxPool_8_WithIndexTest(false); // row major MaxPool_8_WithIndexTest(true, 0 /*storage_order*/); // row major MaxPool_8_WithIndexTest(true, 1 /*storage_order*/); // col major } @@ -229,6 +229,26 @@ TEST(PoolTest, MaxPool_10_Dilation_1d) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } +TEST(PoolTest, MaxPool_10_DilationPadding_1d) { + OpTester test("MaxPool", 10); + + test.AddAttribute("auto_pad", ""); + test.AddAttribute("strides", std::vector{1}); + test.AddAttribute("pads", vector{1, 1}); + test.AddAttribute("kernel_shape", vector{3}); + test.AddAttribute("dilations", vector{3}); + + std::vector x_vals = { + 1, 3, 2, 4, -1, -3, -2, -4, -6, -5, -4, -2}; + std::vector x_dims = {1, 1, 12}; + std::vector expected_dims = {1, 1, 8}; + std::vector expected_vals = {2, 4, 3, 2, 4, -1, -2, -2}; + + test.AddInput("X", x_dims, x_vals); + test.AddOutput("Y", expected_dims, expected_vals); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kTensorrtExecutionProvider}); +} + TEST(PoolTest, MaxPool_10_Dilation_2d) { OpTester test("MaxPool", 10); @@ -239,11 +259,10 @@ TEST(PoolTest, MaxPool_10_Dilation_2d) { test.AddAttribute("dilations", vector{2, 2}); std::vector x_vals = { - 1, 3, 2, 4, -1, - 5, 7, 6, 8, -2, - 9, 11, 10, 12, -3, - 13, 15, 14, 16, -4, - }; + 1, 3, 2, 4, -1, + 5, 7, 6, 8, -2, + 9, 11, 10, 12, -3, + 13, 15, 14, 16, -4}; std::vector x_dims = {1, 1, 4, 5}; std::vector expected_dims = {1, 1, 2, 3}; std::vector expected_vals = {10, 12, 10, 14, 16, 14}; @@ -253,6 +272,33 @@ TEST(PoolTest, MaxPool_10_Dilation_2d) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } +TEST(PoolTest, MaxPool_10_DilationPadding_2d) { + OpTester test("MaxPool", 10); + + test.AddAttribute("auto_pad", ""); + test.AddAttribute("strides", std::vector{1, 1}); + test.AddAttribute("pads", vector{1, 1, 1, 1}); + test.AddAttribute("kernel_shape", vector{2, 2}); + test.AddAttribute("dilations", vector{2, 2}); + + std::vector x_vals = { + 1, 3, 2, 4, -1, + 5, 7, 6, 8, -2, + 9, 11, 10, 12, -3, + 13, 15, 14, 16, -4}; + std::vector x_dims = {1, 1, 4, 5}; + std::vector expected_dims = {1, 1, 4, 5}; + std::vector expected_vals = { + 7, 6, 8, 6, 8, + 11, 10, 12, 10, 12, + 15, 14, 16, 14, 16, + 11, 10, 12, 10, 12}; + + test.AddInput("X", x_dims, x_vals); + test.AddOutput("Y", expected_dims, expected_vals); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kTensorrtExecutionProvider}); +} + TEST(PoolTest, MaxPool_10_Dilation_Ceil0_2d) { OpTester test("MaxPool", 10); @@ -263,11 +309,10 @@ TEST(PoolTest, MaxPool_10_Dilation_Ceil0_2d) { test.AddAttribute("dilations", vector{2, 2}); std::vector x_vals = { - 1, 3, 2, 4, -1, - 5, 7, 6, 8, -2, - 9, 11, 10, 12, -3, - 13, 15, 14, 16, -4, - }; + 1, 3, 2, 4, -1, + 5, 7, 6, 8, 
-2, + 9, 11, 10, 12, -3, + 13, 15, 14, 16, -4}; std::vector x_dims = {1, 1, 4, 5}; std::vector expected_dims = {1, 1, 1, 3}; std::vector expected_vals = {10, 12, 10}; @@ -288,11 +333,10 @@ TEST(PoolTest, MaxPool_10_Dilation_Ceil1_2d) { test.AddAttribute("ceil_mode", (int64_t)1); std::vector x_vals = { - 1, 3, 2, 4, -1, - 5, 7, 6, 8, -2, - 9, 11, 10, 12, -3, - 13, 15, 14, 16, -4, - }; + 1, 3, 2, 4, -1, + 5, 7, 6, 8, -2, + 9, 11, 10, 12, -3, + 13, 15, 14, 16, -4}; std::vector x_dims = {1, 1, 4, 5}; std::vector expected_dims = {1, 1, 2, 3}; std::vector expected_vals = {10, 12, 10, 10, 12, 10}; @@ -302,6 +346,41 @@ TEST(PoolTest, MaxPool_10_Dilation_Ceil1_2d) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } +TEST(PoolTest, MaxPool_10_DilationPadding_3d) { + OpTester test("MaxPool", 10); + + test.AddAttribute("auto_pad", ""); + test.AddAttribute("strides", std::vector{1, 1, 1}); + test.AddAttribute("pads", vector{1, 1, 1, 1, 1, 1}); + test.AddAttribute("kernel_shape", vector{2, 2, 2}); + test.AddAttribute("dilations", vector{2, 2, 2}); + + std::vector x_vals = { + 1, 3, 2, 4, -1, + 5, 7, 6, 8, -2, + 9, 11, 10, 12, -3, + 13, 15, 14, 16, -4, + 1, 3, 2, 4, -1, + 5, 7, 6, 8, -2, + 9, 11, 10, 12, -3, + 13, 15, 14, 16, -4}; + std::vector x_dims = {1, 1, 2, 4, 5}; + std::vector expected_dims = {1, 1, 2, 4, 5}; + std::vector expected_vals = { + 7, 6, 8, 6, 8, + 11, 10, 12, 10, 12, + 15, 14, 16, 14, 16, + 11, 10, 12, 10, 12, + 7, 6, 8, 6, 8, + 11, 10, 12, 10, 12, + 15, 14, 16, 14, 16, + 11, 10, 12, 10, 12}; + + test.AddInput("X", x_dims, x_vals); + test.AddOutput("Y", expected_dims, expected_vals); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kTensorrtExecutionProvider}); +} + TEST(PoolTest, GlobalMaxPool) { OpTester test("GlobalMaxPool"); @@ -566,17 +645,16 @@ TEST(PoolTest, AveragePool_10_ceil1_2d) { test.AddAttribute("strides", std::vector{3, 1}); test.AddAttribute("pads", vector{0, 0, 0, 0}); test.AddAttribute("kernel_shape", vector{2, 2}); - test.AddAttribute("ceil_mode", (int64_t) 1); + test.AddAttribute("ceil_mode", (int64_t)1); std::vector x_vals = { - 1, 3, 2, 4, - 5, 7, 6, 8, - 9, 11, 10, 12, - 13, 15, 14, 16, - }; + 1, 3, 2, 4, + 5, 7, 6, 8, + 9, 11, 10, 12, + 13, 15, 14, 16}; std::vector x_dims = {1, 1, 4, 4}; std::vector expected_dims = {1, 1, 2, 3}; - std::vector expected_vals = {4.0f, 4.5f, 5.0f , 14.0f, 14.5f, 15.0f}; + std::vector expected_vals = {4.0f, 4.5f, 5.0f, 14.0f, 14.5f, 15.0f}; test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); diff --git a/onnxruntime/test/providers/cpu/nn/non_max_suppression_test.cc b/onnxruntime/test/providers/cpu/object_detection/non_max_suppression_test.cc similarity index 99% rename from onnxruntime/test/providers/cpu/nn/non_max_suppression_test.cc rename to onnxruntime/test/providers/cpu/object_detection/non_max_suppression_test.cc index e309bedaa603a..9675612b7e12e 100644 --- a/onnxruntime/test/providers/cpu/nn/non_max_suppression_test.cc +++ b/onnxruntime/test/providers/cpu/object_detection/non_max_suppression_test.cc @@ -267,7 +267,7 @@ TEST(NonMaxSuppressionOpTest, InconsistentBoxAndScoreShapes) { test.AddInput("iou_threshold", {}, {0.5f}); test.AddInput("score_threshold", {}, {0.0f}); test.AddOutput("selected_indices", {0, 3}, {}); - test.Run(OpTester::ExpectResult::kExpectFailure, "boxes and scores should have same spatial_dimention."); + test.Run(OpTester::ExpectResult::kExpectFailure, "boxes and scores should have same 
spatial_dimension."); } TEST(NonMaxSuppressionOpTest, InvalidIOUThreshold) { diff --git a/onnxruntime/test/providers/cpu/object_detection/roialign_test.cc b/onnxruntime/test/providers/cpu/object_detection/roialign_test.cc index 250cc629c8264..0c2660b7a015d 100644 --- a/onnxruntime/test/providers/cpu/object_detection/roialign_test.cc +++ b/onnxruntime/test/providers/cpu/object_detection/roialign_test.cc @@ -258,7 +258,7 @@ TEST(RoiAlignTest, AvgModeNegativeInvalidNumRoiDims) { 35.1354f,56.7708f,56.7708f,56.7708f,56.8021f,58.4375f,58.4375f,58.4375f,58.4688f,60.1042f, 60.1042f,60.1042f,60.1354f}); - test.Run(OpTester::ExpectResult::kExpectFailure, "[ShapeInferenceError] rois input tensor has wrong dimension"); + test.Run(OpTester::ExpectResult::kExpectFailure, "[ShapeInferenceError] Input 1 expected to have rank 2 but has rank 3"); } TEST(RoiAlignTest, AvgModeNegativeInvalidSecondRoiDims) { @@ -341,7 +341,7 @@ TEST(RoiAlignTest, MismatchNumRois) { 35.1354f,56.7708f,56.7708f,56.7708f,56.8021f,58.4375f,58.4375f,58.4375f,58.4688f,60.1042f, 60.1042f,60.1042f,60.1354f}); - test.Run(OpTester::ExpectResult::kExpectFailure, "First dimension (num_rois) of batch_indices and rois don't match"); + test.Run(OpTester::ExpectResult::kExpectFailure, "[ShapeInferenceError] Dimension mismatch in unification between 4 and 5"); } } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc index 9a5424ff1ae09..3a7e0ad762067 100644 --- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc +++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc @@ -676,6 +676,23 @@ TEST(ReductionOpTest, ReduceSum) { test.Run(); } +TEST(ReductionOpTest, ReduceSum_double) { + OpTester test("ReduceSum"); + test.AddAttribute("axes", std::vector{0, 2}); + test.AddAttribute("keepdims", (int64_t)1); + test.AddInput("data", {3, 2, 2}, + {1.0, 2.0, + 3.0, 4.0, + + 5.0, 6.0, + 7.0, 8.0, + + 9.0, 10.0, + 11.0, 12.0}); + test.AddOutput("reduced", {1, 2, 1}, {33.0, 45.0}); + test.Run(); +} + TEST(ReductionOpTest, ReduceSum_axes01) { OpTester test("ReduceSum"); test.AddAttribute("axes", std::vector{2}); @@ -798,6 +815,23 @@ TEST(ReductionOpTest, ReduceSumSquare) { test.Run(); } +TEST(ReductionOpTest, ReduceSumSquare_double) { + OpTester test("ReduceSumSquare"); + test.AddAttribute("axes", std::vector{0, 2}); + test.AddAttribute("keepdims", (int64_t)1); + test.AddInput("data", {3, 2, 2}, + {1.0, 2.0, + 3.0, 4.0, + + 5.0, 6.0, + 7.0, 8.0, + + 9.0, 10.0, + 11.0, 12.0}); + test.AddOutput("reduced", {1, 2, 1}, {247.0, 403.}); + test.Run(); +} + TEST(ReductionOpTest, ReduceSumSquare_int32) { OpTester test("ReduceSumSquare"); test.AddAttribute("axes", std::vector{0, 2}); diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_test_cases.inl b/onnxruntime/test/providers/cpu/reduction/reduction_test_cases.inl index afd0eb730e8e9..e829dca03cf55 100644 --- a/onnxruntime/test/providers/cpu/reduction/reduction_test_cases.inl +++ b/onnxruntime/test/providers/cpu/reduction/reduction_test_cases.inl @@ -1,4 +1,15 @@ // Please don't manually edit this file. 
Generated from reduction_test_cases_generator.py +// Optimizations are disabled in this file to improve build throughput +#if defined(_MSC_VER) || defined(__INTEL_COMPILER) +#pragma optimize ("", off) +#elif defined(__GNUC__) +#if defined(__clang__) + #pragma clang optimize off +#else + #pragma GCC push_options + #pragma GCC optimize ("O0") +#endif +#endif ReductionTestCases testcases = { // input_data { @@ -5355,3 +5366,12 @@ ReductionTestCases testcases = { 1.000000f, })}, }}; +#if defined(_MSC_VER) || defined(__INTEL_COMPILER) +#pragma optimize ("", on) +#elif defined(__GNUC__) +#if defined(__clang__) + #pragma clang optimize on +#else + #pragma GCC pop_options +#endif +#endif diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_test_cases_generator.py b/onnxruntime/test/providers/cpu/reduction/reduction_test_cases_generator.py index b8bf6f126dd6a..6acbd4970d5db 100644 --- a/onnxruntime/test/providers/cpu/reduction/reduction_test_cases_generator.py +++ b/onnxruntime/test/providers/cpu/reduction/reduction_test_cases_generator.py @@ -63,6 +63,30 @@ def PrintResult(op, axes, keepdims, res): print ("})},") +def PrintDisableOptimizations(): + print ("// Optimizations are disabled in this file to improve build throughput") + print ("#if defined(_MSC_VER) || defined(__INTEL_COMPILER)") + print ("#pragma optimize (\"\", off)") + print ("#elif defined(__GNUC__)") + print ("#if defined(__clang__)") + print ("\t#pragma clang optimize off") + print ("#else") + print ("\t#pragma GCC push_options") + print ("\t#pragma GCC optimize (\"O0\")") + print ("#endif") + print ("#endif") + +def PrintReenableOptimizations(): + print ("#if defined(_MSC_VER) || defined(__INTEL_COMPILER)") + print ("t#pragma optimize (\"\", on)") + print ("#elif defined(__GNUC__)") + print ("#if defined(__clang__)") + print ("\t#pragma clang optimize on") + print ("#else") + print ("\t#pragma GCC pop_options") + print ("#endif") + print ("#endif") + if __name__ == "__main__": from itertools import product input_shape = [2,3,2,2,3] @@ -73,6 +97,7 @@ def PrintResult(op, axes, keepdims, res): ops = ["ReduceL1", "ReduceL2", "ReduceLogSum", "ReduceLogSumExp", "ReduceMax", "ReduceMean", "ReduceMin", "ReduceProd", "ReduceSum", "ReduceSumSquare", "ArgMax", "ArgMin"] print ("// Please don't manually edit this file. 
Generated from reduction_test_cases_generator.py") + PrintDisableOptimizations() print ("ReductionTestCases testcases = {") print ("// input_data") print ("{") @@ -101,3 +126,4 @@ def PrintResult(op, axes, keepdims, res): print ("}") print ("};") + PrintReenableOptimizations() diff --git a/onnxruntime/test/providers/cpu/rnn/deep_cpu_gru_op_test.cc b/onnxruntime/test/providers/cpu/rnn/deep_cpu_gru_op_test.cc index 3314a8e627d8a..fe9cf9a389373 100644 --- a/onnxruntime/test/providers/cpu/rnn/deep_cpu_gru_op_test.cc +++ b/onnxruntime/test/providers/cpu/rnn/deep_cpu_gru_op_test.cc @@ -790,6 +790,39 @@ TEST(GRUTest, ONNXRuntime_TestGRUOpSequenceLengthWithBidirectionalLinearBeforeRe ctx.RunTest(X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h, true); } +TEST(GRUTest, ONNXRuntime_TestGRUOpShorterSeqInMiddle) { + const std::string direction = "bidirectional"; + const std::vector activations = {"sigmoid", "tanh", "sigmoid", "tanh"}; + + DeepCpuGruOpTestContext ctx(direction, activations); + + const int batch_size = 3; + const int seq_length = 2; + std::vector X = {-0.455351f, -0.276391f, + 0.855351f, 0.676391f, + -0.185934f, -0.269585f, + -0.585934f, 0.669585f, + -0.351455f, -0.391276f, + 0.670351f, 0.894676f}; + std::vector sequence_length = {2, 1, 2}; + std::vector initial_h = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + std::vector expected_Y = {-0.0325528607f, 0.0774837881f, -0.275918573f, -0.00228558504f, -0.0456649921f, 0.0462125241f, + -0.108452908f, 0.15118938684f, -0.2759185731f, -0.0022855850f, -0.1950065642f, 0.0961040258f, + + -0.1671274304f, 0.1817691028f, 0.0f, 0.0f, -0.3073617219f, 0.0686715841f, + -0.1494070887f, 0.1356348693f, 0.0f, 0.0f, -0.2866500020f, 0.0448506586f}; + std::vector expected_Y_h = {-0.1671274304f, 0.18176910281f, + -0.2759185731f, -0.00228558504f, + -0.3073617219f, 0.0686715841f, + + -0.1084529086f, 0.15118938684f, + -0.2759185731f, -0.00228558504f, + -0.1950065642f, 0.0961040258f}; + + ctx.RunTest(X, batch_size, seq_length, sequence_length, &initial_h, expected_Y, expected_Y_h, true); +} + TEST(GRUTest, ONNXRuntime_TestGRUOpSequenceLengthWithPartialZero) { const std::string direction = "bidirectional"; const std::vector activations = {"sigmoid", "tanh", "sigmoid", "tanh"}; diff --git a/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc b/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc index eaa0cc649f91a..34121981a7271 100644 --- a/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc +++ b/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc @@ -45,7 +45,8 @@ static void RunLstmTest(const std::vector& X_data, // copy the following vectors as we may modify them std::vector activations = {}, std::vector activation_alphas = {}, - std::vector activation_betas = {}) { + std::vector activation_betas = {}, + bool hasClip = true) { OpTester test("LSTM"); int num_directions = (direction == "bidirectional") ? 
2 : 1; @@ -68,7 +69,9 @@ static void RunLstmTest(const std::vector& X_data, test.AddAttribute("hidden_size", hidden_size); // test.AddAttribute("output_sequence", output_sequence); test.AddAttribute("input_forget", input_forget); - test.AddAttribute("clip", clip); + if (hasClip) { + test.AddAttribute("clip", clip); + } std::vector X_dims = {seq_length, batch_size, input_size}; std::vector W_dims = {num_directions, 4 * hidden_size, input_size}; @@ -606,7 +609,8 @@ class LstmOpContext2x1x2x2 { bool use_bias = true, bool use_peepholes = true, float clip = 9999.f, - bool input_forget = false) { + bool input_forget = false, + bool hasClip = true) { // run with and without output_sequence to test UniDirectionalLstm handling when Y isn't returned ::onnxruntime::test::RunLstmTest(X, input_weights_, recurrent_weights_, expected_Y, expected_Y_h, expected_Y_c, @@ -621,7 +625,8 @@ class LstmOpContext2x1x2x2 { input_forget, activation_func_names_, activation_alphas_, - activation_betas_); + activation_betas_, + hasClip); ::onnxruntime::test::RunLstmTest(X, input_weights_, recurrent_weights_, expected_Y, expected_Y_h, expected_Y_c, @@ -636,7 +641,8 @@ class LstmOpContext2x1x2x2 { input_forget, activation_func_names_, activation_alphas_, - activation_betas_); + activation_betas_, + hasClip); } private: @@ -1090,6 +1096,77 @@ TEST(LSTMTest, ONNXRuntime_TestLSTMSequenceLengthShorterThanInputSequenceLength) LstmOpContext2x1x2x2 context(direction); context.RunTest(X_data, batch_size, seq_len, &initial_h, &initial_c, Y_data, Y_h_data, {}, &sequence_length); } + +TEST(LSTMTest, ONNXRuntime_TestLSTMSequenceLengthShorterThanInputSequenceLengthNoP) { + const int seq_len = 2; + const int batch_size = 1; + + std::vector X_data = {-0.455351f, -0.276391f, + -0.185934f, -0.269585f}; + + std::vector sequence_length = {1}; + + std::vector initial_h = {0.0f, 0.0f, + -0.0306872f, 0.028035f}; + + std::vector initial_c = {0.0f, 0.0f, + -0.07243599f, 0.0467052f}; + + std::vector Y_data = {0.0415416f, 0.0196912f, + 0.0295027f, 0.0334400f, + + 0.0f, 0.0f, + 0.0f, 0.0f}; + + std::vector Y_h_data = {0.0415416f, 0.0196912f, + 0.0295027f, 0.0334400f}; + + std::string direction = "bidirectional"; + + LstmOpContext2x1x2x2 context(direction); + // CUDA implementation doesn't support peephole + context.RunTest(X_data, batch_size, seq_len, &initial_h, &initial_c, Y_data, Y_h_data, {}, &sequence_length, false); +} + +TEST(LSTMTest, ONNXRuntime_TestLSTMShorterSeqInMiddle) { + const int seq_len = 2; + int batch_size = 3; + std::vector activations = {"sigmoid", "tanh", "tanh", "sigmoid", "tanh", "tanh"}; + + bool use_bias = true; + bool use_peepholes = false; + + std::vector X_data = {-0.455351f, -0.776391f, + 0.0f, 0.0f, + 0.348763f, 0.678345f, + + -0.185934f, -0.169585f, + 0.0f, 0.0f, + 0.078053f, 0.163457f}; + + std::vector sequence_length = {2, 1, 2}; + + std::vector Y_data = {0.02907280f, 0.01765226f, -0.06724346f, 0.02957184f, -0.15355367f, 0.04701351f, + + 0.01841230f, 0.04093486f, -0.06724346f, 0.02957184f, -0.17994503f, 0.07397783f, + + -0.02912546f, 0.04120104f, 0.0f, 0.0f, -0.12768818f, 0.07457943f, + + -0.04350187f, 0.03531464f, 0.0f, 0.0f, -0.08877515f, 0.03413615f}; + + std::vector Y_h_data = {-0.0291254f, 0.04120104f, -0.06724346f, 0.02957184f, -0.12768818f, 0.07457943f, + + 0.01841230f, 0.04093486f, -0.06724346f, 0.02957184f, -0.17994503f, 0.07397783f}; + + std::vector Y_c_data = {-0.06609819f, 0.06838701f, -0.14596788f, 0.04902556f, -0.26768601f, 0.12119407f, + + 0.04934450f, 0.07126625f, -0.14596788f, 0.04902556f, 
-0.34139895f, 0.11673255f}; + + std::string direction = "bidirectional"; + LstmOpContext2x1x2x2 context(direction, activations); + context.RunTest(X_data, batch_size, seq_len, nullptr, nullptr, Y_data, Y_h_data, Y_c_data, + &sequence_length, use_bias, use_peepholes, 0.0f, false, false); +} #endif // USE_NGRAPH } // namespace test diff --git a/onnxruntime/test/providers/cpu/tensor/concat_op_test.cc b/onnxruntime/test/providers/cpu/tensor/concat_op_test.cc index 086177dea69fd..a0acc3f592f8f 100644 --- a/onnxruntime/test/providers/cpu/tensor/concat_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/concat_op_test.cc @@ -10,7 +10,7 @@ namespace test { // Some of the tests can't run on TensorrtExecutionProvider because of unsupported data types or limits // in its parser: axis >=0 && axis < nbDims. Those Tests will fallback to other EPs -TEST(MathOpTest, Concat1D_string) { +TEST(ConcatOpTest, Concat1D_string) { OpTester test("Concat"); test.AddAttribute("axis", int64_t{0}); @@ -21,7 +21,7 @@ TEST(MathOpTest, Concat1D_string) { test.Run(); } -TEST(MathOpTest, Concat1D_int32) { +TEST(ConcatOpTest, Concat1D_int32) { OpTester test("Concat"); test.AddAttribute("axis", int64_t{0}); @@ -32,7 +32,7 @@ TEST(MathOpTest, Concat1D_int32) { test.Run(); } -TEST(MathOpTest, Concat1D_int32_negative_axis) { +TEST(ConcatOpTest, Concat1D_int32_negative_axis) { OpTester test("Concat"); test.AddAttribute("axis", int64_t{-1}); @@ -43,7 +43,7 @@ TEST(MathOpTest, Concat1D_int32_negative_axis) { test.Run(); } -TEST(MathOpTest, Concat1D_1) { +TEST(ConcatOpTest, Concat1D_1) { OpTester test("Concat"); test.AddAttribute("axis", int64_t{0}); @@ -54,7 +54,7 @@ TEST(MathOpTest, Concat1D_1) { test.Run(); } -TEST(MathOpTest, Concat1D_2) { +TEST(ConcatOpTest, Concat1D_2) { OpTester test("Concat"); test.AddAttribute("axis", int64_t{0}); @@ -65,7 +65,7 @@ TEST(MathOpTest, Concat1D_2) { test.Run(); } -TEST(MathOpTest, Concat2D_1) { +TEST(ConcatOpTest, Concat2D_1) { OpTester test("Concat"); test.AddAttribute("axis", int64_t{0}); @@ -80,7 +80,7 @@ TEST(MathOpTest, Concat2D_1) { test.Run(); } -TEST(MathOpTest, Concat2D_2) { +TEST(ConcatOpTest, Concat2D_2) { OpTester test("Concat"); test.AddAttribute("axis", int64_t{1}); @@ -96,7 +96,7 @@ TEST(MathOpTest, Concat2D_2) { test.Run(); } -TEST(MathOpTest, Concat2D_3) { +TEST(ConcatOpTest, Concat2D_3) { OpTester test("Concat"); test.AddAttribute("axis", int64_t{1}); @@ -107,7 +107,7 @@ TEST(MathOpTest, Concat2D_3) { test.Run(); } -TEST(MathOpTest, Concat3D_1) { +TEST(ConcatOpTest, Concat3D_1) { OpTester test("Concat"); test.AddAttribute("axis", int64_t{0}); @@ -139,7 +139,7 @@ TEST(MathOpTest, Concat3D_1) { test.Run(); } -TEST(MathOpTest, Concat3D_1_negative_axis) { +TEST(ConcatOpTest, Concat3D_1_negative_axis) { OpTester test("Concat"); test.AddAttribute("axis", int64_t{-3}); @@ -171,7 +171,7 @@ TEST(MathOpTest, Concat3D_1_negative_axis) { test.Run(); } -TEST(MathOpTest, Concat3D_2) { +TEST(ConcatOpTest, Concat3D_2) { OpTester test("Concat"); test.AddAttribute("axis", int64_t{1}); @@ -203,7 +203,7 @@ TEST(MathOpTest, Concat3D_2) { test.Run(); } -TEST(MathOpTest, Concat3D_3) { +TEST(ConcatOpTest, Concat3D_3) { OpTester test("Concat"); test.AddAttribute("axis", int64_t{1}); diff --git a/onnxruntime/test/providers/cpu/tensor/onehot_op_test.cc b/onnxruntime/test/providers/cpu/tensor/onehot_op_test.cc index 152cb3bcb9345..c816c7f7b6661 100644 --- a/onnxruntime/test/providers/cpu/tensor/onehot_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/onehot_op_test.cc @@ -51,6 +51,20 @@ 
TEST(OneHotOpTest, DefaultAxis_int64_int32_float /*indices, output, depth*/) { test.Run(); } +TEST(OneHotOpTest, DefaultAxis_int64_float_int64 /*indices, output, depth*/) { + OpTester test("OneHot", 9); + test.AddInput("indices", {2, 3}, {1, 9, 8, 2, 4, 6}); + test.AddInput("depth", {1}, {10}); + test.AddInput("values", {2}, {0, 1}); + test.AddOutput("output", {2, 3, 10}, {0, 1, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, + 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,}); + test.Run(); +} + TEST(OneHotOpTest, Axis_0) { OpTester test("OneHot", 9); int64_t axis = 0; @@ -117,6 +131,26 @@ TEST(OneHotOpTest, Axis_2) { test.Run(); } +TEST(OneHotOpTest, Axis_Negative_NonDefault) { + OpTester test("OneHot", 9); + int64_t axis = -3; + test.AddAttribute("axis", axis); + test.AddInput("indices", {2, 3}, {1, 9, 8, 2, 4, 6}); + test.AddInput("depth", {1}, {10}); + test.AddInput("values", {2}, {0, 1}); + test.AddOutput("output", {10, 2, 3}, { 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 1, + 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 0, 0, + 0, 1, 0, 0, 0, 0,}); + test.Run(); +} + TEST(OneHotOpTest, FloatInt64) { OpTester test("OneHot", 9); test.AddInput("indices", {2, 3}, {1.f, 9.f, 8.f, 2.f, 4.f, 6.f}); diff --git a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc index d731275214416..0611aa2501937 100644 --- a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc @@ -27,28 +27,62 @@ TEST(ResizeOpTest, ResizeOpLineartDownSampleTest) { test.Run(); } -TEST(ResizeOpTest, ResizeOpUpsampleNearestTest) { +TEST(ResizeOpTest, ResizeOpLineartUpSampleTest) { OpTester test("Resize", 10); - std::vector scales{1.0f, 1.0f, 2.0f, 3.0f}; + std::vector scales{1.0f, 1.0f, 2.0f, 4.0f}; + test.AddAttribute("mode", "linear"); - test.AddAttribute("mode", "nearest"); + const int64_t N = 2, C = 1, H = 2, W = 2; + std::vector X = {1.0f, 3.0f, + 4.0f, 8.0f, - const int64_t N = 1, C = 1, H = 2, W = 2; - std::vector X = {1.0f, 2.0f, 3.0f, 4.0f}; + 6.0f, 2.0f, + 7.0f, 11.0f}; test.AddInput("X", {N, C, H, W}, X); test.AddInput("scales", {4}, scales); - std::vector Y = {1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, - 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, - 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f, - 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f}; + std::vector Y = { + 1.0f, 1.5f, 2.0f, 2.5f, 3.0f, 3.0f, 3.0f, 3.0f, + 2.5f, 3.25f, 4.0f, 4.75f, 5.5f, 5.5f, 5.5f, 5.5f, + 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 8.0f, 8.0f, 8.0f, + 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 8.0f, 8.0f, 8.0f, + + 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 2.0f, 2.0f, 2.0f, + 6.5f, 6.5f, 6.5f, 6.5f, 6.5f, 6.5f, 6.5f, 6.5f, + 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 11.0f, 11.0f, 11.0f, + 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 11.0f, 11.0f, 11.0f}; test.AddOutput("Y", {N, C, (int64_t)(H * scales[2]), (int64_t)(W * scales[3])}, Y); test.Run(); } -TEST(ResizeOpTest, ResizeOpNearestTest) { +TEST(ResizeOpTest, ResizeOpLineartNoScaleTest) { + OpTester test("Resize", 10); + std::vector scales{1.0f, 1.0f, 1.0f, 1.0f}; + test.AddAttribute("mode", "linear"); + + const int64_t N = 2, C = 1, H = 2, W = 2; + std::vector X = {1.0f, 3.0f, + 4.0f, 8.0f, + + 6.0f, 2.0f, + 7.0f, 11.0f}; + + test.AddInput("X", {N, C, H, W}, X); + test.AddInput("scales", {4}, scales); + + std::vector Y = {1.0f, 3.0f, + 4.0f, 8.0f, + + 6.0f, 2.0f, + 7.0f, 11.0f}; + + 
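+  // With all scales equal to 1.0 the linear resize should be an identity mapping, so the expected
+  // output Y is exactly the input X.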
test.AddOutput("Y", {N, C, H, W}, Y); + test.Run(); +} + +TEST(ResizeOpTest, ResizeOpNearestDownSampleTest) { OpTester test("Resize", 10); std::vector scales{1.0f, 1.0f, 0.6f, 0.6f}; @@ -68,37 +102,44 @@ TEST(ResizeOpTest, ResizeOpNearestTest) { test.Run(); } -TEST(ResizeOpTest, ResizeOpBilinearTest) { +TEST(ResizeOpTest, ResizeOpNearestUpSampleTest) { OpTester test("Resize", 10); - std::vector scales{1.0f, 1.0f, 0.5f, 0.5f}; - - test.AddAttribute("mode", "linear"); + std::vector scales{1.0f, 1.0f, 2.0f, 3.0f}; - const int64_t N = 2, C = 1, H = 4, W = 8; - std::vector X = { - 1.0f, 1.5f, 2.0f, 2.5f, 3.0f, 3.0f, 3.0f, 3.0f, - 2.0f, 2.5f, 3.0f, 3.5f, 4.0f, 4.0f, 4.0f, 4.0f, - 3.0f, 3.5f, 4.0f, 4.5f, 5.0f, 5.0f, 5.0f, 5.0f, - 3.0f, 3.5f, 4.0f, 4.5f, 5.0f, 5.0f, 5.0f, 5.0f, + test.AddAttribute("mode", "nearest"); - 3.0f, 3.5f, 4.0f, 4.5f, 5.0f, 5.0f, 5.0f, 5.0f, - 5.0f, 5.5f, 6.0f, 6.5f, 7.0f, 7.0f, 7.0f, 7.0f, - 7.0f, 7.5f, 8.0f, 8.5f, 9.0f, 9.0f, 9.0f, 9.0f, - 7.0f, 7.5f, 8.0f, 8.5f, 9.0f, 9.0f, 9.0f, 9.0f}; + const int64_t N = 1, C = 1, H = 2, W = 2; + std::vector X = {1.0f, 2.0f, 3.0f, 4.0f}; test.AddInput("X", {N, C, H, W}, X); test.AddInput("scales", {4}, scales); - std::vector Y = { - 1.0f, 2.0f, 3.0f, 3.0f, - 3.0f, 4.0f, 5.0f, 5.0f, - - 3.0f, 4.0f, 5.0f, 5.0f, - 7.0f, 8.0f, 9.0f, 9.0f}; + std::vector Y = {1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, + 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, + 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f, + 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f}; test.AddOutput("Y", {N, C, (int64_t)(H * scales[2]), (int64_t)(W * scales[3])}, Y); test.Run(); } +TEST(UpsampleOpTest, ResizeOpNearestNoScaleTest) { + OpTester test("Resize", 10); + std::vector scales{1.0f, 1.0f, 1.0f, 1.0f}; + + test.AddAttribute("mode", "nearest"); + + const int64_t N = 1, C = 1, H = 2, W = 2; + std::vector X = {1.0f, 2.0f, 3.0f, 4.0f}; + + test.AddInput("X", {N, C, H, W}, X); + test.AddInput("scales", {4}, scales); + + std::vector Y = {1.0f, 2.0f, 3.0f, 4.0f}; + + test.AddOutput("Y", {N, C, H, W}, Y); + test.Run(); +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/tensor/squeeze_op_test.cc b/onnxruntime/test/providers/cpu/tensor/squeeze_op_test.cc index 979512e8bf739..4287d1369bd65 100644 --- a/onnxruntime/test/providers/cpu/tensor/squeeze_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/squeeze_op_test.cc @@ -18,6 +18,23 @@ TEST(SqueezeOpTest, Squeeze_1) { test.Run(); } +TEST(SqueezeOpTest, Squeeze_Empty_Axes_1) { + OpTester test("Squeeze"); + test.AddInput("data", {1, 1, 4, 1}, std::vector(4, 1.0f)); + test.AddOutput("squeezed", {4}, std::vector(4, 1.0f)); + // TensorRT doesn't seem to support missing 'axes' + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +TEST(SqueezeOpTest, Squeeze_Empty_Axes_2) { + OpTester test("Squeeze"); + // nothing to "squeeze" out in the input shape + test.AddInput("data", {2, 4}, std::vector(8, 1.0f)); + test.AddOutput("squeezed", {2, 4}, std::vector(8, 1.0f)); + // TensorRT doesn't seem to support missing 'axes' + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + TEST(SqueezeOpTest, Squeeze_1_int32) { OpTester test("Squeeze"); test.AddAttribute("axes", std::vector{0}); diff --git a/onnxruntime/test/providers/cpu/tensor/transpose_test.cc b/onnxruntime/test/providers/cpu/tensor/transpose_test.cc index 0aa6d665e47d0..d09d711105cdd 100644 --- a/onnxruntime/test/providers/cpu/tensor/transpose_test.cc +++ 
b/onnxruntime/test/providers/cpu/tensor/transpose_test.cc @@ -43,7 +43,7 @@ TEST(TransposeOpTest, TwoDimNoAttr) { 2.0f, 5.0f, 3.0f, 6.0f}; - TransposeTest(input_shape, input_vals, nullptr, expected_shape, expected_vals, false);//TensorRT: SegFault error + TransposeTest(input_shape, input_vals, nullptr, expected_shape, expected_vals, false); //TensorRT: SegFault error } TEST(TransposeOpTest, TwoDimNoAttrStr) { @@ -113,37 +113,23 @@ TEST(TransposeOpTest, ThreeDim) { std::vector perm = {0, 2, 1}; std::vector expected_shape({4, 3, 2}); auto expected_vals = { - 1.0f, - 4.0f, - 2.0f, - 5.0f, - 3.0f, - 6.0f, - - 1.1f, - 4.1f, - 2.1f, - 5.1f, - 3.1f, - 6.1f, - - 1.2f, - 4.2f, - 2.2f, - 5.2f, - 3.2f, - 6.2f, - - 1.3f, - 4.3f, - 2.3f, - 5.3f, - 3.3f, - 6.3f, - - }; - - TransposeTest(input_shape, input_vals, &perm, expected_shape, expected_vals, false); //TensorRT: illegal error + 1.0f, 4.0f, + 2.0f, 5.0f, + 3.0f, 6.0f, + + 1.1f, 4.1f, + 2.1f, 5.1f, + 3.1f, 6.1f, + + 1.2f, 4.2f, + 2.2f, 5.2f, + 3.2f, 6.2f, + + 1.3f, 4.3f, + 2.3f, 5.3f, + 3.3f, 6.3f}; + + TransposeTest(input_shape, input_vals, &perm, expected_shape, expected_vals, false); //TensorRT: illegal error } TEST(TransposeOpTest, ThreeDimStr) { @@ -164,38 +150,60 @@ TEST(TransposeOpTest, ThreeDimStr) { std::vector perm = {0, 2, 1}; std::vector expected_shape({4, 3, 2}); std::initializer_list expected_vals = { - "1", - "4", - "2", - "5", - "3", - "6", - - "1", - "4", - "2", - "5", - "3", - "6", - - "1", - "4", - "2", - "5", - "3", - "6", - - "1", - "4", - "2", - "5", - "3", - "6" - - }; + "1", "4", + "2", "5", + "3", "6", + + "1", "4", + "2", "5", + "3", "6", + + "1", "4", + "2", "5", + "3", "6", + + "1", "4", + "2", "5", + "3", "6"}; TransposeTest(input_shape, input_vals, &perm, expected_shape, expected_vals); } +TEST(TransposeOpTest, NCHW2NHWC) { + std::vector input_shape({1, 3, 2, 2}); + std::vector input_vals = { + "1", "2", "3", "4", + "5", "6", "7", "8", + "9", "10", "11", "12"}; + + std::vector perm = {0, 2, 3, 1}; + std::vector expected_shape({1, 2, 2, 3}); + std::initializer_list expected_vals = { + "1", "5", "9", + "2", "6", "10", + "3", "7", "11", + "4", "8", "12"}; + + TransposeTest(input_shape, input_vals, &perm, expected_shape, expected_vals, false); +} + +TEST(TransposeOpTest, NHWC2NCHW) { + std::vector input_shape({1, 2, 2, 3}); + std::vector input_vals = { + "1", "2", "3", + "4", "5", "6", + "7", "8", "9", + "10", "11", "12"}; + + std::vector perm = {0, 3, 1, 2}; + std::vector expected_shape({1, 3, 2, 2}); + std::initializer_list expected_vals = { + "1", "4", "7", "10", + "2", "5", "8", "11", + "3", "6", "9", "12"}; + + TransposeTest(input_shape, input_vals, &perm, expected_shape, expected_vals, false); +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/tensor/upsample_op_test.cc b/onnxruntime/test/providers/cpu/tensor/upsample_op_test.cc index 7df01e77a7640..68924aa60b3b0 100644 --- a/onnxruntime/test/providers/cpu/tensor/upsample_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/upsample_op_test.cc @@ -70,7 +70,7 @@ TEST(UpsampleOpTest, UpsampleOpNearestTest_int32) { 7, 7, 7, 9, 9, 9}; test.AddOutput("Y", {N, C, (int64_t)(H * scales[2]), (int64_t)(W * scales[3])}, Y); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: nvinfer1::query::Ports&): Assertion `!formats.empty()' failed + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: nvinfer1::query::Ports&): Assertion `!formats.empty()' 
failed } TEST(UpsampleOpTest, UpsampleOpNearestTest_uint8) { @@ -170,10 +170,9 @@ TEST(UpsampleOpTest, UpsampleOpNearest222XTest) { 3.0f, 3.0f, 5.0f, 5.0f, 3.0f, 3.0f, 5.0f, 5.0f, 7.0f, 7.0f, 9.0f, 9.0f, - 7.0f, 7.0f, 9.0f, 9.0f - }; + 7.0f, 7.0f, 9.0f, 9.0f}; - test.AddOutput("Y", {N*2, C, (int64_t)(H * scales[2]), (int64_t)(W * scales[3])}, Y); + test.AddOutput("Y", {N * 2, C, (int64_t)(H * scales[2]), (int64_t)(W * scales[3])}, Y); test.Run(); } @@ -208,6 +207,32 @@ TEST(UpsampleOpTest, UpsampleOpNearest15XTest) { test.Run(); } +TEST(UpsampleOpTest, UpsampleOpNearestTest_NoScale) { + OpTester test("Upsample"); + + std::vector scales{1.0f, 1.0f, 1.0f, 1.0f}; + test.AddAttribute("mode", "nearest"); + test.AddAttribute("scales", scales); + + const int64_t N = 1, C = 2, H = 2, W = 2; + std::vector X = {1.0f, 3.0f, + 3.0f, 5.0f, + + 3.0f, 5.0f, + 7.0f, 9.0f}; + + test.AddInput("X", {N, C, H, W}, X); + + std::vector Y = {1.0f, 3.0f, + 3.0f, 5.0f, + + 3.0f, 5.0f, + 7.0f, 9.0f}; + + test.AddOutput("Y", {N, C, H, W}, Y); + test.Run(); +} + TEST(UpsampleOpTest, UpsampleOpNearest2XTest_int32) { OpTester test("Upsample"); @@ -236,7 +261,7 @@ TEST(UpsampleOpTest, UpsampleOpNearest2XTest_int32) { 7, 7, 9, 9}; test.AddOutput("Y", {N, C, (int64_t)(H * scales[2]), (int64_t)(W * scales[3])}, Y); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: nvinfer1::query::Ports&): Assertion `!formats.empty()' failed + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: nvinfer1::query::Ports&): Assertion `!formats.empty()' failed } TEST(UpsampleOpTest, UpsampleOpBilinearTest) { @@ -270,34 +295,29 @@ TEST(UpsampleOpTest, UpsampleOpBilinearTest) { test.Run(); } -TEST(UpsampleOpTest, UpsampleOpBilinearTest2) { +TEST(UpsampleOpTest, UpsampleOpBilinearTest_NoScale) { OpTester test("Upsample"); - std::vector scales{1.0f, 1.0f, 2.0f, 4.0f}; + std::vector scales{1.0f, 1.0f, 1.0f, 1.0f}; test.AddAttribute("mode", "linear"); test.AddAttribute("scales", scales); const int64_t N = 2, C = 1, H = 2, W = 2; std::vector X = {1.0f, 3.0f, - 4.0f, 8.0f, + 3.0f, 5.0f, - 6.0f, 2.0f, - 7.0f, 11.0f}; + 3.0f, 5.0f, + 7.0f, 9.0f}; test.AddInput("X", {N, C, H, W}, X); - std::vector Y = { - 1.0f, 1.5f, 2.0f, 2.5f, 3.0f, 3.0f, 3.0f, 3.0f, - 2.5f, 3.25f, 4.0f, 4.75f, 5.5f, 5.5f, 5.5f, 5.5f, - 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 8.0f, 8.0f, 8.0f, - 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 8.0f, 8.0f, 8.0f, + std::vector Y = {1.0f, 3.0f, + 3.0f, 5.0f, - 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 2.0f, 2.0f, 2.0f, - 6.5f, 6.5f, 6.5f, 6.5f, 6.5f, 6.5f, 6.5f, 6.5f, - 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 11.0f, 11.0f, 11.0f, - 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 11.0f, 11.0f, 11.0f}; + 3.0f, 5.0f, + 7.0f, 9.0f}; - test.AddOutput("Y", {N, C, (int64_t)(H * scales[2]), (int64_t)(W * scales[3])}, Y); + test.AddOutput("Y", {N, C, H, W}, Y); test.Run(); } diff --git a/onnxruntime/test/providers/memcpy_test.cc b/onnxruntime/test/providers/memcpy_test.cc index 38f46ba4d4c80..c43779875fb02 100644 --- a/onnxruntime/test/providers/memcpy_test.cc +++ b/onnxruntime/test/providers/memcpy_test.cc @@ -31,8 +31,8 @@ TEST(MemcpyTest, copy1) { KernelRegistryManager kernel_registry_manager; kernel_registry_manager.RegisterKernels(execution_providers); - onnx::ModelProto mp; - std::ifstream model_istream("testdata/matmul_1.pb", std::ifstream::in | std::ifstream::binary); + ONNX_NAMESPACE::ModelProto mp; + std::ifstream model_istream("testdata/matmul_1.onnx", std::ifstream::in | std::ifstream::binary); 
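+  // Parse the ModelProto directly from the stream via protobuf's zero-copy input adapter and
+  // require that the entire file was consumed.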
google::protobuf::io::IstreamInputStream zero_copy_input(&model_istream); const bool result = mp.ParseFromZeroCopyStream(&zero_copy_input) && model_istream.eof(); ASSERT_TRUE(result); diff --git a/onnxruntime/test/providers/nnapi/nnapi_basic_test.cc b/onnxruntime/test/providers/nnapi/nnapi_basic_test.cc new file mode 100644 index 0000000000000..c806de54e37bf --- /dev/null +++ b/onnxruntime/test/providers/nnapi/nnapi_basic_test.cc @@ -0,0 +1,103 @@ +#include "core/session/inference_session.h" +#include "test/providers/provider_test_utils.h" +#include "test/framework/test_utils.h" +#include "gtest/gtest.h" +#include "core/providers/nnapi/nnapi_execution_provider.h" +#include "core/common/logging/logging.h" + +using namespace std; +using namespace ONNX_NAMESPACE; +using namespace ::onnxruntime::logging; + +namespace onnxruntime { + +namespace test { +void VerifyOutputs(const std::vector& fetches, const std::vector& expected_dims, + const std::vector& expected_values) { + ASSERT_EQ(1, fetches.size()); + auto& rtensor = fetches.front().Get(); + TensorShape expected_shape(expected_dims); + ASSERT_EQ(expected_shape, rtensor.Shape()); + const std::vector found(rtensor.template Data(), rtensor.template Data() + expected_values.size()); + ASSERT_EQ(expected_values, found); +} + +TEST(NnapiExecutionProviderTest, FunctionTest) { + onnxruntime::Model model("graph_1"); + auto& graph = model.MainGraph(); + std::vector inputs; + std::vector outputs; + + // FLOAT tensor. + ONNX_NAMESPACE::TypeProto float_tensor; + float_tensor.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + float_tensor.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1); + float_tensor.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1); + float_tensor.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(3); + float_tensor.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(2); + + auto& input_arg_1 = graph.GetOrCreateNodeArg("X", &float_tensor); + auto& input_arg_2 = graph.GetOrCreateNodeArg("Y", &float_tensor); + inputs.push_back(&input_arg_1); + inputs.push_back(&input_arg_2); + auto& output_arg = graph.GetOrCreateNodeArg("node_1_out_1", &float_tensor); + outputs.push_back(&output_arg); + graph.AddNode("node_1", "Add", "node 1.", inputs, outputs); + + auto& input_arg_3 = graph.GetOrCreateNodeArg("Z", &float_tensor); + inputs.clear(); + inputs.push_back(&output_arg); + inputs.push_back(&input_arg_3); + auto& output_arg_2 = graph.GetOrCreateNodeArg("M", &float_tensor); + outputs.clear(); + outputs.push_back(&output_arg_2); + graph.AddNode("node_2", "Add", "node 2.", inputs, outputs); + + auto status = graph.Resolve(); + ASSERT_TRUE(status.IsOK()); + std::string model_file_name = "nnapi_execution_provider_test_graph.onnx"; + status = onnxruntime::Model::Save(model, model_file_name); + + std::vector dims_mul_x = {1, 1, 3, 2}; + std::vector values_mul_x = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + OrtValue ml_value_x; + CreateMLValue(TestNnapiExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, &ml_value_x); + OrtValue ml_value_y; + CreateMLValue(TestNnapiExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, &ml_value_y); + OrtValue ml_value_z; + CreateMLValue(TestNnapiExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, &ml_value_z); + NameMLValMap feeds; + feeds.insert(std::make_pair("X", ml_value_x)); + feeds.insert(std::make_pair("Y", ml_value_y)); + 
feeds.insert(std::make_pair("Z", ml_value_z)); + + // prepare outputs + std::vector output_names; + output_names.push_back("M"); + std::vector fetches; + + // prepare expected inputs and outputs + std::vector expected_dims_mul_m = {1, 1, 3, 2}; + std::vector expected_values_mul_m = {3.0f, 6.0f, 9.0f, 12.0f, 15.0f, 18.0f}; + + SessionOptions so; + so.session_logid = "NnapiExecutionProviderTest.FunctionTest"; + RunOptions run_options; + run_options.run_tag = so.session_logid; + + InferenceSession session_object{so}; + status = session_object.RegisterExecutionProvider(std::make_unique<::onnxruntime::NnapiExecutionProvider>()); + ASSERT_TRUE(status.IsOK()); + status = session_object.Load(model_file_name); + ASSERT_TRUE(status.IsOK()); + status = session_object.Initialize(); + ASSERT_TRUE(status.IsOK()); + + // Now run + status = session_object.Run(run_options, feeds, output_names, &fetches); + ASSERT_TRUE(status.IsOK()); + VerifyOutputs(fetches, expected_dims_mul_m, expected_values_mul_m); +} +} // namespace test +} // namespace onnxruntime + diff --git a/onnxruntime/test/providers/provider_test_utils.cc b/onnxruntime/test/providers/provider_test_utils.cc index b274754b0d93c..4abbc94b827a9 100644 --- a/onnxruntime/test/providers/provider_test_utils.cc +++ b/onnxruntime/test/providers/provider_test_utils.cc @@ -34,6 +34,42 @@ void Check(const OpTester::Data& expected_data, const Tensor& output_tensor, con } } +template <> +void Check(const OpTester::Data& expected_data, const Tensor& output_tensor, const std::string& provider_type) { + auto& expected_tensor = expected_data.data_.Get(); + auto* expected = expected_tensor.template Data(); + auto* output = output_tensor.template Data(); + auto size = output_tensor.Shape().Size(); + + bool has_abs_err = expected_data.absolute_error_.has_value(); + bool has_rel_err = expected_data.relative_error_.has_value(); + + double threshold = 0.001; +#ifdef USE_CUDA + threshold = 0.005; +#endif + + for (int i = 0; i < size; ++i) { + if (std::isinf(expected[i])) { // Test infinity for equality + EXPECT_EQ(expected[i], output[i]); + } else if (std::isnan(expected[i])) { + EXPECT_TRUE(std::isnan(output[i])) << "Expected output " << i << " to be NaN"; + } else { + if (!has_abs_err && !has_rel_err) { + // the default for existing tests + EXPECT_NEAR(expected[i], output[i], threshold) << "provider_type: " << provider_type; + } else { + if (has_abs_err) { + EXPECT_NEAR(expected[i], output[i], expected_data.absolute_error_.value()) << "provider_type: " << provider_type; + } + if (has_rel_err) { + EXPECT_NEAR(expected[i], output[i], expected_data.relative_error_.value() * std::abs(expected[i])) << "provider_type: " << provider_type; + } + } + } + } +} + template <> void Check(const OpTester::Data& expected_data, const Tensor& output_tensor, const std::string& provider_type) { auto& expected_tensor = expected_data.data_.Get(); @@ -338,7 +374,8 @@ void OpTester::ExecuteModel(Model& model, InferenceSession& session_object, Expe if (add_shape_to_tensor_data_) { auto out_shape_proto = expected_data.def_.Shape(); EXPECT_TRUE(out_shape_proto != nullptr); - auto inferred_dims = utils::GetTensorShapeFromTensorShapeProto(*out_shape_proto); + const auto& tensor_shape = utils::GetTensorShapeFromTensorShapeProto(*out_shape_proto); + const auto& inferred_dims = tensor_shape.GetDims(); const auto& expected_shape = expected_data.data_.Get().Shape(); EXPECT_TRUE(inferred_dims.size() == expected_shape.NumDimensions()); for (size_t d = 0; d < inferred_dims.size(); ++d) { @@ -463,6 +500,8 
@@ void OpTester::Run(ExpectResult expect_result, execution_provider = DefaultTensorrtExecutionProvider(); else if (provider_type == onnxruntime::kOpenVINOExecutionProvider) execution_provider = DefaultOpenVINOExecutionProvider(); + else if (provider_type == onnxruntime::kNnapiExecutionProvider) + execution_provider = DefaultNnapiExecutionProvider(); // skip if execution provider is disabled if (execution_provider == nullptr) continue; diff --git a/onnxruntime/test/python/onnx_backend_test_series.py b/onnxruntime/test/python/onnx_backend_test_series.py index ae977e57b6a49..703a879fdc1c3 100644 --- a/onnxruntime/test/python/onnx_backend_test_series.py +++ b/onnxruntime/test/python/onnx_backend_test_series.py @@ -103,7 +103,13 @@ def create_backend_test(testname=None): '^test_bitshift_left_uint32_cpu.*', '^test_bitshift_left_uint64_cpu.*', '^test_bitshift_left_uint8_cpu.*', - '^test_round_cpu.*' + '^test_round_cpu.*', + '^test_cumsum_1d_cpu.*', + '^test_cumsum_1d_exclusive_cpu.*', + '^test_cumsum_1d_reverse_cpu.*', + '^test_cumsum_1d_reverse_exclusive_cpu.*', + '^test_cumsum_2d_axis_0_cpu.*', + '^test_cumsum_2d_axis_1_cpu.*' ) # Example of how to disable tests for a specific provider. diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py index 9bb4b3488f877..ce4da1a0a6964 100644 --- a/onnxruntime/test/python/onnxruntime_test_python.py +++ b/onnxruntime/test/python/onnxruntime_test_python.py @@ -10,7 +10,7 @@ class TestInferenceSession(unittest.TestCase): - + def get_name(self, name): if os.path.exists(name): return name @@ -22,17 +22,20 @@ def get_name(self, name): res = os.path.join(data, name) if os.path.exists(res): return res - raise FileNotFoundError("Unable to find '{0}' or '{1}' or '{2}'".format(name, rel, res)) + raise FileNotFoundError( + "Unable to find '{0}' or '{1}' or '{2}'".format(name, rel, res)) def run_model(self, session_object, run_options): x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) input_name = session_object.get_inputs()[0].name res = session_object.run([], {input_name: x}, run_options=run_options) - output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + output_expected = np.array( + [[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) + np.testing.assert_allclose( + output_expected, res[0], rtol=1e-05, atol=1e-08) def testRunModel(self): - sess = onnxrt.InferenceSession(self.get_name("mul_1.pb")) + sess = onnxrt.InferenceSession(self.get_name("mul_1.onnx")) x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) input_name = sess.get_inputs()[0].name self.assertEqual(input_name, "X") @@ -43,11 +46,13 @@ def testRunModel(self): output_shape = sess.get_outputs()[0].shape self.assertEqual(output_shape, [3, 2]) res = sess.run([output_name], {input_name: x}) - output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + output_expected = np.array( + [[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) + np.testing.assert_allclose( + output_expected, res[0], rtol=1e-05, atol=1e-08) def testRunModelFromBytes(self): - with open(self.get_name("mul_1.pb"), "rb") as f: + with open(self.get_name("mul_1.onnx"), "rb") as f: content = f.read() sess = onnxrt.InferenceSession(content) x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) @@ -60,11 
+65,13 @@ def testRunModelFromBytes(self): output_shape = sess.get_outputs()[0].shape self.assertEqual(output_shape, [3, 2]) res = sess.run([output_name], {input_name: x}) - output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + output_expected = np.array( + [[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) + np.testing.assert_allclose( + output_expected, res[0], rtol=1e-05, atol=1e-08) def testRunModel2(self): - sess = onnxrt.InferenceSession(self.get_name("matmul_1.pb")) + sess = onnxrt.InferenceSession(self.get_name("matmul_1.onnx")) x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) input_name = sess.get_inputs()[0].name self.assertEqual(input_name, "X") @@ -76,19 +83,21 @@ def testRunModel2(self): self.assertEqual(output_shape, [3, 1]) res = sess.run([output_name], {input_name: x}) output_expected = np.array([[5.0], [11.0], [17.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose( + output_expected, res[0], rtol=1e-05, atol=1e-08) def testRunModelMultipleThreads(self): so = onnxrt.SessionOptions() so.session_log_verbosity_level = 1 so.session_logid = "MultiThreadsTest" - sess = onnxrt.InferenceSession(self.get_name("mul_1.pb"), sess_options=so) + sess = onnxrt.InferenceSession( + self.get_name("mul_1.onnx"), sess_options=so) ro1 = onnxrt.RunOptions() - ro1.run_tag = "thread1" - t1 = threading.Thread(target=self.run_model, args = (sess, ro1)) + ro1.run_tag = "thread1" + t1 = threading.Thread(target=self.run_model, args=(sess, ro1)) ro2 = onnxrt.RunOptions() ro2.run_tag = "thread2" - t2 = threading.Thread(target=self.run_model, args = (sess, ro2)) + t2 = threading.Thread(target=self.run_model, args=(sess, ro2)) t1.start() t2.start() t1.join() @@ -99,29 +108,30 @@ def testRunDevice(self): self.assertTrue('CPU' in device or 'GPU' in device) def testRunModelSymbolicInput(self): - sess = onnxrt.InferenceSession(self.get_name("matmul_2.pb")) + sess = onnxrt.InferenceSession(self.get_name("matmul_2.onnx")) x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) input_name = sess.get_inputs()[0].name self.assertEqual(input_name, "X") input_shape = sess.get_inputs()[0].shape # Input X has an unknown dimension. - self.assertEqual(input_shape, [None, 2]) + self.assertEqual(input_shape, ['None', 2]) output_name = sess.get_outputs()[0].name self.assertEqual(output_name, "Y") output_shape = sess.get_outputs()[0].shape # Output X has an unknown dimension. - self.assertEqual(output_shape, [None, 1]) + self.assertEqual(output_shape, ['None', 1]) res = sess.run([output_name], {input_name: x}) output_expected = np.array([[5.0], [11.0], [17.0]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose( + output_expected, res[0], rtol=1e-05, atol=1e-08) def testBooleanInputs(self): - sess = onnxrt.InferenceSession(self.get_name("logicaland.pb")) + sess = onnxrt.InferenceSession(self.get_name("logicaland.onnx")) a = np.array([[True, True], [False, False]], dtype=np.bool) b = np.array([[True, False], [True, False]], dtype=np.bool) # input1:0 is first in the protobuf, and input:0 is second - # and we maintain the original order. + # and we maintain the original order. 
a_name = sess.get_inputs()[0].name self.assertEqual(a_name, "input1:0") a_shape = sess.get_inputs()[0].shape @@ -143,13 +153,15 @@ def testBooleanInputs(self): output_type = sess.get_outputs()[0].type self.assertEqual(output_type, 'tensor(bool)') - output_expected = np.array([[True, False], [False, False]], dtype=np.bool) + output_expected = np.array( + [[True, False], [False, False]], dtype=np.bool) res = sess.run([output_name], {a_name: a, b_name: b}) np.testing.assert_equal(output_expected, res[0]) def testStringInput1(self): - sess = onnxrt.InferenceSession(self.get_name("identity_string.pb")) - x = np.array(['this', 'is', 'identity', 'test'], dtype=np.str).reshape((2,2)) + sess = onnxrt.InferenceSession(self.get_name("identity_string.onnx")) + x = np.array(['this', 'is', 'identity', 'test'], + dtype=np.str).reshape((2, 2)) x_name = sess.get_inputs()[0].name self.assertEqual(x_name, "input:0") @@ -169,8 +181,9 @@ def testStringInput1(self): np.testing.assert_equal(x, res[0]) def testStringInput2(self): - sess = onnxrt.InferenceSession(self.get_name("identity_string.pb")) - x = np.array(['Olá', '你好', '여보세요', 'hello'], dtype=np.unicode).reshape((2,2)) + sess = onnxrt.InferenceSession(self.get_name("identity_string.onnx")) + x = np.array(['Olá', '你好', '여보세요', 'hello'], + dtype=np.unicode).reshape((2, 2)) x_name = sess.get_inputs()[0].name self.assertEqual(x_name, "input:0") @@ -188,10 +201,10 @@ def testStringInput2(self): res = sess.run([output_name], {x_name: x}) np.testing.assert_equal(x, res[0]) - + def testInputBytes(self): - sess = onnxrt.InferenceSession(self.get_name("identity_string.pb")) - x = np.array([b'this', b'is', b'identity', b'test']).reshape((2,2)) + sess = onnxrt.InferenceSession(self.get_name("identity_string.onnx")) + x = np.array([b'this', b'is', b'identity', b'test']).reshape((2, 2)) x_name = sess.get_inputs()[0].name self.assertEqual(x_name, "input:0") @@ -208,11 +221,12 @@ def testInputBytes(self): self.assertEqual(output_type, 'tensor(string)') res = sess.run([output_name], {x_name: x}) - np.testing.assert_equal(x, res[0].astype('|S8')) + np.testing.assert_equal(x, res[0].astype('|S8')) def testInputObject(self): - sess = onnxrt.InferenceSession(self.get_name("identity_string.pb")) - x = np.array(['this', 'is', 'identity', 'test'], object).reshape((2,2)) + sess = onnxrt.InferenceSession(self.get_name("identity_string.onnx")) + x = np.array(['this', 'is', 'identity', 'test'], + object).reshape((2, 2)) x_name = sess.get_inputs()[0].name self.assertEqual(x_name, "input:0") @@ -229,11 +243,12 @@ def testInputObject(self): self.assertEqual(output_type, 'tensor(string)') res = sess.run([output_name], {x_name: x}) - np.testing.assert_equal(x, res[0]) + np.testing.assert_equal(x, res[0]) def testInputVoid(self): - sess = onnxrt.InferenceSession(self.get_name("identity_string.pb")) - x = np.array([b'this', b'is', b'identity', b'test'], np.void).reshape((2,2)) + sess = onnxrt.InferenceSession(self.get_name("identity_string.onnx")) + x = np.array([b'this', b'is', b'identity', b'test'], + np.void).reshape((2, 2)) x_name = sess.get_inputs()[0].name self.assertEqual(x_name, "input:0") @@ -250,14 +265,14 @@ def testInputVoid(self): self.assertEqual(output_type, 'tensor(string)') res = sess.run([output_name], {x_name: x}) - + expr = np.array([['this\x00\x00\x00\x00', 'is\x00\x00\x00\x00\x00\x00'], ['identity', 'test\x00\x00\x00\x00']], dtype=object) np.testing.assert_equal(expr, res[0]) def testConvAutoPad(self): - sess = onnxrt.InferenceSession(self.get_name("conv_autopad.pb")) 
- x = np.array(25 * [1.0], dtype=np.float32).reshape((1,1,5,5)) + sess = onnxrt.InferenceSession(self.get_name("conv_autopad.onnx")) + x = np.array(25 * [1.0], dtype=np.float32).reshape((1, 1, 5, 5)) x_name = sess.get_inputs()[0].name self.assertEqual(x_name, "Input4") @@ -282,8 +297,10 @@ def testConvAutoPad(self): np.testing.assert_allclose(output_expected, res[0]) def testZipMapStringFloat(self): - sess = onnxrt.InferenceSession(self.get_name("zipmap_stringfloat.pb")) - x = np.array([1.0, 0.0, 3.0, 44.0, 23.0, 11.0], dtype=np.float32).reshape((2,3)) + sess = onnxrt.InferenceSession( + self.get_name("zipmap_stringfloat.onnx")) + x = np.array([1.0, 0.0, 3.0, 44.0, 23.0, 11.0], + dtype=np.float32).reshape((2, 3)) x_name = sess.get_inputs()[0].name self.assertEqual(x_name, "X") @@ -301,8 +318,9 @@ def testZipMapStringFloat(self): self.assertEqual(output_expected, res[0]) def testZipMapInt64Float(self): - sess = onnxrt.InferenceSession(self.get_name("zipmap_int64float.pb")) - x = np.array([1.0, 0.0, 3.0, 44.0, 23.0, 11.0], dtype=np.float32).reshape((2,3)) + sess = onnxrt.InferenceSession(self.get_name("zipmap_int64float.onnx")) + x = np.array([1.0, 0.0, 3.0, 44.0, 23.0, 11.0], + dtype=np.float32).reshape((2, 3)) x_name = sess.get_inputs()[0].name self.assertEqual(x_name, "X") @@ -314,13 +332,14 @@ def testZipMapInt64Float(self): output_type = sess.get_outputs()[0].type self.assertEqual(output_type, 'seq(map(int64,tensor(float)))') - output_expected = [{10: 1.0, 20: 0.0, 30: 3.0}, {10: 44.0, 20: 23.0, 30: 11.0}] + output_expected = [{10: 1.0, 20: 0.0, 30: 3.0}, + {10: 44.0, 20: 23.0, 30: 11.0}] res = sess.run([output_name], {x_name: x}) self.assertEqual(output_expected, res[0]) def testRaiseWrongNumInputs(self): with self.assertRaises(ValueError) as context: - sess = onnxrt.InferenceSession(self.get_name("logicaland.pb")) + sess = onnxrt.InferenceSession(self.get_name("logicaland.onnx")) a = np.array([[True, True], [False, False]], dtype=np.bool) res = sess.run([], {'input:0': a}) @@ -340,7 +359,8 @@ def testModelMeta(self): def testProfilerWithSessionOptions(self): so = onnxrt.SessionOptions() so.enable_profiling = True - sess = onnxrt.InferenceSession(self.get_name("mul_1.pb"), sess_options=so) + sess = onnxrt.InferenceSession( + self.get_name("mul_1.onnx"), sess_options=so) x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) sess.run([], {'X': x}) profile_file = sess.end_profiling() @@ -355,7 +375,8 @@ def testProfilerWithSessionOptions(self): self.assertTrue(']' in lines[8]) def testDictVectorizer(self): - sess = onnxrt.InferenceSession(self.get_name("pipeline_vectorize.onnx")) + sess = onnxrt.InferenceSession( + self.get_name("pipeline_vectorize.onnx")) input_name = sess.get_inputs()[0].name self.assertEqual(input_name, "float_input") input_type = str(sess.get_inputs()[0].type) @@ -368,38 +389,43 @@ def testDictVectorizer(self): self.assertEqual(output_type, "tensor(float)") output_shape = sess.get_outputs()[0].shape self.assertEqual(output_shape, [1, 1]) - + # Python type x = {0: 25.0, 1: 5.13, 2: 0.0, 3: 0.453, 4: 5.966} res = sess.run([output_name], {input_name: x}) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) - + np.testing.assert_allclose( + output_expected, res[0], rtol=1e-05, atol=1e-08) + xwrong = x.copy() xwrong["a"] = 5.6 try: res = sess.run([output_name], {input_name: xwrong}) except RuntimeError as e: - self.assertIn("Unexpected key type , it cannot be linked to C 
type int64_t", str(e)) + self.assertIn( + "Unexpected key type , it cannot be linked to C type int64_t", str(e)) # numpy type x = {np.int64(k): np.float32(v) for k, v in x.items()} res = sess.run([output_name], {input_name: x}) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) - + np.testing.assert_allclose( + output_expected, res[0], rtol=1e-05, atol=1e-08) + x = {np.int64(k): np.float64(v) for k, v in x.items()} res = sess.run([output_name], {input_name: x}) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) - + np.testing.assert_allclose( + output_expected, res[0], rtol=1e-05, atol=1e-08) + x = {np.int32(k): np.float64(v) for k, v in x.items()} res = sess.run([output_name], {input_name: x}) output_expected = np.array([[49.752754]], dtype=np.float32) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose( + output_expected, res[0], rtol=1e-05, atol=1e-08) def testLabelEncoder(self): - sess = onnxrt.InferenceSession(self.get_name("LabelEncoder.pb")) + sess = onnxrt.InferenceSession(self.get_name("LabelEncoder.onnx")) input_name = sess.get_inputs()[0].name self.assertEqual(input_name, "input") input_type = str(sess.get_inputs()[0].type) @@ -412,50 +438,54 @@ def testLabelEncoder(self): self.assertEqual(output_type, "tensor(int64)") output_shape = sess.get_outputs()[0].shape self.assertEqual(output_shape, [1, 1]) - + # Array x = np.array([['4']]) res = sess.run([output_name], {input_name: x}) output_expected = np.array([[3]], dtype=np.int64) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + np.testing.assert_allclose( + output_expected, res[0], rtol=1e-05, atol=1e-08) # Python type - x = np.array(['4']) + x = np.array(['4'], ndmin=2) res = sess.run([output_name], {input_name: x}) - output_expected = np.array([3], dtype=np.int64) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + output_expected = np.array([3], ndmin=2, dtype=np.int64) + np.testing.assert_allclose( + output_expected, res[0], rtol=1e-05, atol=1e-08) - x = np.array(['4'], dtype=np.object) + x = np.array(['4'], ndmin=2, dtype=np.object) res = sess.run([output_name], {input_name: x}) - output_expected = np.array([3], dtype=np.int64) - np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + output_expected = np.array([3], ndmin=2, dtype=np.int64) + np.testing.assert_allclose( + output_expected, res[0], rtol=1e-05, atol=1e-08) def test_run_model_mlnet(self): sess = onnxrt.InferenceSession(self.get_name("mlnet_encoder.onnx")) names = [_.name for _ in sess.get_outputs()] self.assertEqual(['C00', 'C12'], names) - c0 = np.array([5.], dtype=np.float32).reshape(1, 1); - + c0 = np.array([5.], dtype=np.float32).reshape(1, 1) + c1 = np.array([b'A\0A\0', b"B\0B\0", b"C\0C\0"], np.void).reshape(1, 3) res = sess.run(None, {'C0': c0, 'C1': c1}) mat = res[1] total = mat.sum() self.assertEqual(total, 2) - self.assertEqual(list(mat.ravel()), + self.assertEqual(list(mat.ravel()), list(np.array([[[0., 0., 0., 0.], [1., 0., 0., 0.], [0., 0., 1., 0.]]]).ravel())) - - # In memory, the size of each element is fixed and equal to the + + # In memory, the size of each element is fixed and equal to the # longest element. We cannot use bytes because numpy is trimming # every final 0 for strings and bytes before creating the array # (to save space). 
It does not have this behaviour for void # but as a result, numpy does not know anymore the size # of each element, they all have the same size. - c1 = np.array([b'A\0A\0\0', b"B\0B\0", b"C\0C\0"], np.void).reshape(1, 3) + c1 = np.array([b'A\0A\0\0', b"B\0B\0", b"C\0C\0"], + np.void).reshape(1, 3) res = sess.run(None, {'C0': c0, 'C1': c1}) mat = res[1] total = mat.sum() self.assertEqual(total, 0) - + if __name__ == '__main__': unittest.main() diff --git a/onnxruntime/test/python/onnxruntime_test_python_backend.py b/onnxruntime/test/python/onnxruntime_test_python_backend.py index a2e6864f5112b..d1ed383cf8a7b 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_backend.py +++ b/onnxruntime/test/python/onnxruntime_test_python_backend.py @@ -29,7 +29,7 @@ def get_name(self, name): raise FileNotFoundError("Unable to find '{0}' or '{1}' or '{2}'".format(name, rel, res)) def testRunModel(self): - name = self.get_name("mul_1.pb") + name = self.get_name("mul_1.onnx") rep = backend.prepare(name) x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) res = rep.run(x) diff --git a/onnxruntime/test/server/integration_tests/README.MD b/onnxruntime/test/server/integration_tests/README.MD index 141d531d1a9bc..291e44e67bbe3 100644 --- a/onnxruntime/test/server/integration_tests/README.MD +++ b/onnxruntime/test/server/integration_tests/README.MD @@ -40,4 +40,14 @@ If those models are in different folder but in the same structure as the test da /usr/bin/python3 ./model_zoo_tests.py /some/where/server_app /home/my/models/ /home/foo/bar/model_zoo_test/ ``` -All tests are running in sequential order. \ No newline at end of file +All tests are running in sequential order. + +## Generating python GRPC clients + +Protoc needs absolute paths +``` +protoc --python_out=/home/colin/onnxserving -I . *.proto +protoc --grpc_out=/home/colin/onnxserving --plugin=protoc-gen-grpc=/usr/local/bin/grpc_python_plugin -I . 
prediction_service.proto +``` + +need to pip install `grpcio` and `protobuf` \ No newline at end of file diff --git a/onnxruntime/test/server/integration_tests/function_tests.py b/onnxruntime/test/server/integration_tests/function_tests.py index 9baa095569184..e91a0a26dfd0b 100644 --- a/onnxruntime/test/server/integration_tests/function_tests.py +++ b/onnxruntime/test/server/integration_tests/function_tests.py @@ -12,6 +12,8 @@ import test_util import onnx_ml_pb2 import predict_pb2 +import prediction_service_pb2_grpc +import grpc class HttpJsonPayloadTests(unittest.TestCase): server_ip = '127.0.0.1' @@ -358,6 +360,69 @@ def test_health_endpoint(self): self.assertEqual(r.status_code, 200) self.assertEqual(r.content.decode('utf-8'), 'Healthy') +class GRPCTests(unittest.TestCase): + server_ip = '127.0.0.1' + server_port = 54321 + server_app_path = '' + test_data_path = '' + model_path = '' + log_level = 'verbose' + server_app_proc = None + wait_server_ready_in_seconds = 1 + + @classmethod + def setUpClass(cls): + onnx_model = os.path.join(cls.model_path, 'mnist.onnx') + test_util.prepare_mnist_model(onnx_model) + cmd = [cls.server_app_path, '--grpc_port', str(cls.server_port), '--model_path', onnx_model, '--log_level', cls.log_level] + test_util.test_log('Launching server app: [{0}]'.format(' '.join(cmd))) + cls.server_app_proc = subprocess.Popen(cmd) + test_util.test_log('Server app PID: {0}'.format(cls.server_app_proc.pid)) + test_util.test_log('Sleep {0} second(s) to wait for server initialization'.format(cls.wait_server_ready_in_seconds)) + time.sleep(cls.wait_server_ready_in_seconds) + + + @classmethod + def tearDownClass(cls): + test_util.test_log('Shutdown server app') + cls.server_app_proc.kill() + test_util.test_log('PID {0} has been killed: {1}'.format(cls.server_app_proc.pid, test_util.is_process_killed(cls.server_app_proc.pid))) + + + def test_mnist_happy_path(self): + input_data_file = os.path.join(self.test_data_path, 'mnist_test_data_set_0_input.pb') + output_data_file = os.path.join(self.test_data_path, 'mnist_test_data_set_0_output.pb') + + with open(input_data_file, 'rb') as f: + request_payload = f.read() + + request = predict_pb2.PredictRequest() + request.ParseFromString(request_payload) + uri = "{}:{}".format(self.server_ip, self.server_port) + test_util.test_log(uri) + with grpc.insecure_channel(uri) as channel: + stub = prediction_service_pb2_grpc.PredictionServiceStub(channel) + actual_result = stub.Predict(request) + + expected_result = predict_pb2.PredictResponse() + with open(output_data_file, 'rb') as f: + expected_result.ParseFromString(f.read()) + + for k in expected_result.outputs.keys(): + self.assertEqual(actual_result.outputs[k].data_type, expected_result.outputs[k].data_type) + + count = 1 + for i in range(0, len(expected_result.outputs['Plus214_Output_0'].dims)): + self.assertEqual(actual_result.outputs['Plus214_Output_0'].dims[i], expected_result.outputs['Plus214_Output_0'].dims[i]) + count = count * int(actual_result.outputs['Plus214_Output_0'].dims[i]) + + actual_array = numpy.frombuffer(actual_result.outputs['Plus214_Output_0'].raw_data, dtype=numpy.float32) + expected_array = numpy.frombuffer(expected_result.outputs['Plus214_Output_0'].raw_data, dtype=numpy.float32) + self.assertEqual(len(actual_array), len(expected_array)) + self.assertEqual(len(actual_array), count) + for i in range(0, count): + self.assertTrue(test_util.compare_floats(actual_array[i], expected_array[i], rel_tol=0.001)) + if __name__ == '__main__': unittest.main() diff --git 
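The new GRPCTests above (together with the gRPC client notes added to the integration-test README) talk to the server through stubs generated from prediction_service.proto. A standalone sketch of that flow is shown below; it assumes the generated modules predict_pb2 and prediction_service_pb2_grpc are importable and that a server is listening on the given host and port (the names and port mirror the tests; this is an illustration, not part of the test suite):

```python
import grpc
import predict_pb2
import prediction_service_pb2_grpc

def grpc_predict(request_pb_path, host="127.0.0.1", port=54321):
    """Send a serialized PredictRequest to a running ONNX Runtime server over gRPC."""
    request = predict_pb2.PredictRequest()
    with open(request_pb_path, "rb") as f:
        # Parse into a PredictRequest first; the stub takes a message, not raw bytes.
        request.ParseFromString(f.read())
    with grpc.insecure_channel("{}:{}".format(host, port)) as channel:
        stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
        return stub.Predict(request)  # a predict_pb2.PredictResponse

# e.g. inspect the MNIST output tensor shape:
# response = grpc_predict("mnist_test_data_set_0_input.pb")
# print(list(response.outputs["Plus214_Output_0"].dims))
```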
a/onnxruntime/test/server/integration_tests/model_zoo_tests.py b/onnxruntime/test/server/integration_tests/model_zoo_tests.py index ab1e5514c560c..8f1e79d5aba72 100644 --- a/onnxruntime/test/server/integration_tests/model_zoo_tests.py +++ b/onnxruntime/test/server/integration_tests/model_zoo_tests.py @@ -10,6 +10,7 @@ class ModelZooTests(unittest.TestCase): server_ip = '127.0.0.1' server_port = 54321 + grpc_port = 56789 url_pattern = 'http://{0}:{1}/v1/models/{2}/versions/{3}:predict' server_app_path = '' # Required log_level = 'verbose' @@ -45,7 +46,7 @@ def __test_model(self, model_path, data_paths): if onnx_file_path == '': raise FileNotFoundError('Could not find any *.onnx file in {0}'.format(model_path)) - cmd = [self.server_app_path, '--http_port', str(self.server_port), '--model_path', onnx_file_path, '--log_level', self.log_level] + cmd = [self.server_app_path, '--http_port', str(self.server_port), '--model_path', onnx_file_path, '--log_level', self.log_level, '--grpc_port', str(self.grpc_port)] test_util.test_log(cmd) server_app_proc = test_util.launch_server_app(cmd, self.server_ip, self.server_port, self.server_ready_in_seconds) @@ -67,6 +68,15 @@ def __test_model(self, model_path, data_paths): request_payload = f.read() resp = test_util.make_http_request(url, pb_request_headers, request_payload) test_util.pb_response_validation(self, resp, os.path.join(test, 'response.pb')) + + test_util.test_log('[{0}] GRPC testing ....'.format(model_path)) + uri = ("{}:{}".format(self.server_ip, self.grpc_port)) + with open(os.path.join(test, 'request.pb'), 'rb') as f: + request_payload = f.read() + with grpc.insecure_channel(uri) as channel: + stub = prediction_service_pb2_grpc.PredictionServiceStub(channel) + resp = stub.Predict(request_payload) + test_util.pb_response_validation(self, resp, os.path.join(test, 'response.pb')) finally: test_util.shutdown_server_app(server_app_proc, self.server_off_in_seconds) @@ -95,6 +105,7 @@ def test_models_from_model_zoo(self): test_util.test_log('-----------------------') self.server_port = random.randint(30000, 40000) + self.grpc_port = self.server_port + 1 for model_path, data_paths in model_data_map.items(): self.__test_model(model_path, data_paths) diff --git a/onnxruntime/test/server/integration_tests/test_main.py b/onnxruntime/test/server/integration_tests/test_main.py index 39706f040b829..45f72c9491ce7 100644 --- a/onnxruntime/test/server/integration_tests/test_main.py +++ b/onnxruntime/test/server/integration_tests/test_main.py @@ -13,7 +13,7 @@ loader = unittest.TestLoader() - test_classes = [function_tests.HttpJsonPayloadTests, function_tests.HttpProtobufPayloadTests, function_tests.HttpEndpointTests] + test_classes = [function_tests.HttpJsonPayloadTests, function_tests.HttpProtobufPayloadTests, function_tests.HttpEndpointTests, function_tests.GRPCTests] test_suites = [] for tests in test_classes: diff --git a/onnxruntime/test/server/unit_tests/converter_tests.cc b/onnxruntime/test/server/unit_tests/converter_tests.cc index 021acf09f49e9..4b192dd0d08c7 100644 --- a/onnxruntime/test/server/unit_tests/converter_tests.cc +++ b/onnxruntime/test/server/unit_tests/converter_tests.cc @@ -5,16 +5,30 @@ #include "gmock/gmock.h" #include "core/framework/tensor.h" +#include "core/graph/basic_types.h" #include "core/framework/allocatormgr.h" #include "test/framework/test_utils.h" #include "test/test_environment.h" #include "server/converter.h" +#include "server/serializing/tensorprotoutils.h" +#include +#include +#include +#include "onnx-ml.pb.h" 
namespace onnxruntime { namespace server { namespace test { -void CreateMLValueBool(AllocatorPtr alloc, const std::vector& dims, const bool* value, OrtValue* p_mlvalue); +void CreateMLValueBool(AllocatorPtr alloc, const std::vector& dims, const bool* value, Ort::Value& p_mlvalue); + +template +void CreateMLValue(AllocatorPtr alloc, const std::vector& dims, const std::vector& value, + Ort::Value& p_mlvalue) { + OrtValue* ml_value = new OrtValue{}; + onnxruntime::test::CreateMLValue(alloc, dims, value, ml_value); + p_mlvalue = Ort::Value{ml_value}; +} IExecutionProvider* TestCPUExecutionProvider() { static CPUExecutionProviderInfo info; @@ -23,80 +37,63 @@ IExecutionProvider* TestCPUExecutionProvider() { } TEST(MLDataTypeToTensorProtoDataTypeTests, MLDataTypeToTensorProtoDataTypeTests) { - auto logger = std::make_unique(::onnxruntime::test::DefaultLoggingManager().DefaultLogger()); - - MLDataType ml_data_type = DataTypeImpl::GetType(); - onnx::TensorProto_DataType result = onnxruntime::server::MLDataTypeToTensorProtoDataType(ml_data_type); + onnx::TensorProto_DataType result = onnxruntime::server::MLDataTypeToTensorProtoDataType(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT); EXPECT_EQ(result, onnx::TensorProto_DataType_FLOAT); - ml_data_type = DataTypeImpl::GetType(); - result = onnxruntime::server::MLDataTypeToTensorProtoDataType(ml_data_type); + result = onnxruntime::server::MLDataTypeToTensorProtoDataType(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16); EXPECT_EQ(result, onnx::TensorProto_DataType_FLOAT16); - ml_data_type = DataTypeImpl::GetType(); - result = onnxruntime::server::MLDataTypeToTensorProtoDataType(ml_data_type); + result = onnxruntime::server::MLDataTypeToTensorProtoDataType(ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16); EXPECT_EQ(result, onnx::TensorProto_DataType_BFLOAT16); - ml_data_type = DataTypeImpl::GetType(); - result = onnxruntime::server::MLDataTypeToTensorProtoDataType(ml_data_type); + result = onnxruntime::server::MLDataTypeToTensorProtoDataType(ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE); EXPECT_EQ(result, onnx::TensorProto_DataType_DOUBLE); - ml_data_type = DataTypeImpl::GetType(); - result = onnxruntime::server::MLDataTypeToTensorProtoDataType(ml_data_type); + result = onnxruntime::server::MLDataTypeToTensorProtoDataType(ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8); EXPECT_EQ(result, onnx::TensorProto_DataType_UINT8); - ml_data_type = DataTypeImpl::GetType(); - result = onnxruntime::server::MLDataTypeToTensorProtoDataType(ml_data_type); + result = onnxruntime::server::MLDataTypeToTensorProtoDataType(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8); EXPECT_EQ(result, onnx::TensorProto_DataType_INT8); - ml_data_type = DataTypeImpl::GetType(); - result = onnxruntime::server::MLDataTypeToTensorProtoDataType(ml_data_type); + result = onnxruntime::server::MLDataTypeToTensorProtoDataType(ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16); EXPECT_EQ(result, onnx::TensorProto_DataType_UINT16); - ml_data_type = DataTypeImpl::GetType(); - result = onnxruntime::server::MLDataTypeToTensorProtoDataType(ml_data_type); + result = onnxruntime::server::MLDataTypeToTensorProtoDataType(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16); EXPECT_EQ(result, onnx::TensorProto_DataType_INT16); - ml_data_type = DataTypeImpl::GetType(); - result = onnxruntime::server::MLDataTypeToTensorProtoDataType(ml_data_type); + result = onnxruntime::server::MLDataTypeToTensorProtoDataType(ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32); EXPECT_EQ(result, onnx::TensorProto_DataType_UINT32); - ml_data_type = DataTypeImpl::GetType(); - result = 
onnxruntime::server::MLDataTypeToTensorProtoDataType(ml_data_type); + result = onnxruntime::server::MLDataTypeToTensorProtoDataType(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32); EXPECT_EQ(result, onnx::TensorProto_DataType_INT32); - ml_data_type = DataTypeImpl::GetType(); - result = onnxruntime::server::MLDataTypeToTensorProtoDataType(ml_data_type); + result = onnxruntime::server::MLDataTypeToTensorProtoDataType(ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64); EXPECT_EQ(result, onnx::TensorProto_DataType_UINT64); - ml_data_type = DataTypeImpl::GetType(); - result = onnxruntime::server::MLDataTypeToTensorProtoDataType(ml_data_type); + result = onnxruntime::server::MLDataTypeToTensorProtoDataType(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64); EXPECT_EQ(result, onnx::TensorProto_DataType_INT64); - ml_data_type = DataTypeImpl::GetType(); - result = onnxruntime::server::MLDataTypeToTensorProtoDataType(ml_data_type); + result = onnxruntime::server::MLDataTypeToTensorProtoDataType(ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING); EXPECT_EQ(result, onnx::TensorProto_DataType_STRING); - ml_data_type = DataTypeImpl::GetType(); - result = onnxruntime::server::MLDataTypeToTensorProtoDataType(ml_data_type); + result = onnxruntime::server::MLDataTypeToTensorProtoDataType(ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL); EXPECT_EQ(result, onnx::TensorProto_DataType_BOOL); - ml_data_type = DataTypeImpl::GetTensorType(); - result = onnxruntime::server::MLDataTypeToTensorProtoDataType(ml_data_type); + result = onnxruntime::server::MLDataTypeToTensorProtoDataType(ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED); + EXPECT_EQ(result, onnx::TensorProto_DataType_UNDEFINED); + + result = onnxruntime::server::MLDataTypeToTensorProtoDataType(static_cast(17)); EXPECT_EQ(result, onnx::TensorProto_DataType_UNDEFINED); } TEST(MLValueToTensorProtoTests, FloatToRaw) { - auto logger = std::make_unique(::onnxruntime::test::DefaultLoggingManager().DefaultLogger()); - std::vector dims_mul_x = {3, 2}; std::vector values_mul_x = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; - OrtValue ml_value; - onnxruntime::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, &ml_value); + Ort::Value ml_value{nullptr}; + onnxruntime::server::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, ml_value); onnx::TensorProto tp; - common::Status status = onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ true, std::move(logger), tp); - EXPECT_TRUE(status.IsOK()); + onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ true, spdlog::default_logger(), tp); // Verify data type EXPECT_TRUE(tp.has_data_type()); @@ -126,16 +123,13 @@ TEST(MLValueToTensorProtoTests, FloatToRaw) { } TEST(MLValueToTensorProtoTests, FloatToFloatData) { - auto logger = std::make_unique(::onnxruntime::test::DefaultLoggingManager().DefaultLogger()); - std::vector dims_mul_x = {3, 2}; std::vector values_mul_x = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; - OrtValue ml_value; - onnxruntime::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, &ml_value); + Ort::Value ml_value{nullptr}; + onnxruntime::server::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, ml_value); onnx::TensorProto tp; - common::Status status = onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ false, std::move(logger), tp); - EXPECT_TRUE(status.IsOK()); + 
onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ false, spdlog::default_logger(), tp); // Verify data type EXPECT_TRUE(tp.has_data_type()); @@ -161,16 +155,13 @@ TEST(MLValueToTensorProtoTests, FloatToFloatData) { } TEST(MLValueToTensorProtoTests, Int32ToRaw) { - auto logger = std::make_unique(::onnxruntime::test::DefaultLoggingManager().DefaultLogger()); - std::vector dims_mul_x = {3, 2}; std::vector values_mul_x = {1, 2, 3, 4, 5, 6}; - OrtValue ml_value; - onnxruntime::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, &ml_value); + Ort::Value ml_value{nullptr}; + onnxruntime::server::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, ml_value); onnx::TensorProto tp; - common::Status status = onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ true, std::move(logger), tp); - EXPECT_TRUE(status.IsOK()); + onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ true, spdlog::default_logger(), tp); // Verify data type EXPECT_TRUE(tp.has_data_type()); @@ -200,16 +191,13 @@ TEST(MLValueToTensorProtoTests, Int32ToRaw) { } TEST(MLValueToTensorProtoTests, Int32ToInt32Data) { - auto logger = std::make_unique(::onnxruntime::test::DefaultLoggingManager().DefaultLogger()); - std::vector dims_mul_x = {3, 2}; std::vector values_mul_x = {1, 2, 3, 4, 5, 6}; - OrtValue ml_value; - onnxruntime::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, &ml_value); + Ort::Value ml_value{nullptr}; + onnxruntime::server::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, ml_value); onnx::TensorProto tp; - common::Status status = onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ false, std::move(logger), tp); - EXPECT_TRUE(status.IsOK()); + onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ false, spdlog::default_logger(), tp); // Verify data type EXPECT_TRUE(tp.has_data_type()); @@ -235,16 +223,13 @@ TEST(MLValueToTensorProtoTests, Int32ToInt32Data) { } TEST(MLValueToTensorProtoTests, UInt8ToRaw) { - auto logger = std::make_unique(::onnxruntime::test::DefaultLoggingManager().DefaultLogger()); - std::vector dims_mul_x = {3, 2}; std::vector values_mul_x = {1, 2, 3, 4, 5, 6}; - OrtValue ml_value; - onnxruntime::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, &ml_value); + Ort::Value ml_value{nullptr}; + onnxruntime::server::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, ml_value); onnx::TensorProto tp; - common::Status status = onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ true, std::move(logger), tp); - EXPECT_TRUE(status.IsOK()); + onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ true, spdlog::default_logger(), tp); // Verify data type EXPECT_TRUE(tp.has_data_type()); @@ -274,16 +259,13 @@ TEST(MLValueToTensorProtoTests, UInt8ToRaw) { } TEST(MLValueToTensorProtoTests, UInt8ToInt32Data) { - auto logger = std::make_unique(::onnxruntime::test::DefaultLoggingManager().DefaultLogger()); - std::vector dims_mul_x = {3, 2}; std::vector values_mul_x = {1, 2, 3, 4, 5, 6}; - OrtValue ml_value; - onnxruntime::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, 
&ml_value); + Ort::Value ml_value{nullptr}; + onnxruntime::server::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, ml_value); onnx::TensorProto tp; - common::Status status = onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ false, std::move(logger), tp); - EXPECT_TRUE(status.IsOK()); + onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ false, spdlog::default_logger(), tp); // Verify data type EXPECT_TRUE(tp.has_data_type()); @@ -301,26 +283,70 @@ TEST(MLValueToTensorProtoTests, UInt8ToInt32Data) { // Verify data EXPECT_FALSE(tp.has_raw_data()); - auto count = tp.int32_data().size() * (sizeof(int32_t) / sizeof(uint8_t)); - EXPECT_EQ(count, 8); - auto data = tp.int32_data().data(); - const auto* data8 = reinterpret_cast(data); + auto count = tp.int32_data().size(); + EXPECT_EQ(count, 6); for (int x = 0; x < 6; ++x) { - EXPECT_EQ(data8[x], values_mul_x[x]); + EXPECT_EQ(tp.int32_data()[x], values_mul_x[x]); } } -TEST(MLValueToTensorProtoTests, Int8ToRaw) { - auto logger = std::make_unique(::onnxruntime::test::DefaultLoggingManager().DefaultLogger()); +TEST(MLValueToTensorProtoTests, UInt8ProtoRoundTrip) { + std::vector dims_mul_x = {3, 2}; + std::vector values_mul_x = {1, 2, 3, 4, 5, 6}; + onnx::TensorProto tp; + for (auto const& val : values_mul_x) { + tp.add_int32_data(val); + } + for (auto const& dim : dims_mul_x) { + tp.add_dims(dim); + } + tp.set_data_type(onnx::TensorProto_DataType_UINT8); + Ort::Value ml_value{nullptr}; + char buf[1000]; + auto allocator = Ort::Allocator::CreateDefault(); + auto info = allocator.GetInfo(); + MemBuffer buffer((void*)&buf, tp.ByteSizeLong(), *info); + onnxruntime::server::TensorProtoToMLValue(tp, buffer, ml_value); + + onnx::TensorProto tp_out; + + onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ false, spdlog::default_logger(), tp_out); + + // Verify data type + EXPECT_TRUE(tp_out.has_data_type()); + EXPECT_EQ(tp_out.data_type(), onnx::TensorProto_DataType_UINT8); + + // Verify data location + EXPECT_FALSE(tp_out.has_data_location()); + + // Verify dimensions + const auto& dims = tp_out.dims(); + std::vector tensor_shape_vec(static_cast(dims.size())); + for (int i = 0; i < dims.size(); ++i) { + EXPECT_EQ(dims[i], dims_mul_x[i]); + } + + // Verify data + EXPECT_FALSE(tp_out.has_raw_data()); + + EXPECT_EQ(tp_out.int32_data_size(), tp.int32_data_size()); + auto in_data = tp.int32_data(); + auto out_data = tp_out.int32_data(); + for (auto x = 0; x < tp_out.int32_data_size(); ++x) { + EXPECT_EQ(values_mul_x[x], in_data[x]); + EXPECT_EQ(static_cast(out_data[x]), in_data[x]); + } +} + +TEST(MLValueToTensorProtoTests, Int8ToRaw) { std::vector dims_mul_x = {3, 2}; std::vector values_mul_x = {1, 2, 3, 4, 5, 6}; - OrtValue ml_value; - onnxruntime::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, &ml_value); + Ort::Value ml_value{nullptr}; + onnxruntime::server::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, ml_value); onnx::TensorProto tp; - common::Status status = onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ true, std::move(logger), tp); - EXPECT_TRUE(status.IsOK()); + onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ true, spdlog::default_logger(), tp); // Verify data type EXPECT_TRUE(tp.has_data_type()); @@ -350,16 +376,13 @@ TEST(MLValueToTensorProtoTests, 
Int8ToRaw) { } TEST(MLValueToTensorProtoTests, Int8ToInt32Data) { - auto logger = std::make_unique(::onnxruntime::test::DefaultLoggingManager().DefaultLogger()); - std::vector dims_mul_x = {3, 2}; std::vector values_mul_x = {1, 2, 3, 4, 5, 6}; - OrtValue ml_value; - onnxruntime::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, &ml_value); + Ort::Value ml_value{nullptr}; + onnxruntime::server::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, ml_value); onnx::TensorProto tp; - common::Status status = onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ false, std::move(logger), tp); - EXPECT_TRUE(status.IsOK()); + onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ false, spdlog::default_logger(), tp); // Verify data type EXPECT_TRUE(tp.has_data_type()); @@ -378,25 +401,21 @@ TEST(MLValueToTensorProtoTests, Int8ToInt32Data) { // Verify data EXPECT_FALSE(tp.has_raw_data()); auto count = tp.int32_data().size(); - EXPECT_EQ(count, 2); - auto data = tp.int32_data().data(); - const auto* data8 = reinterpret_cast(data); + EXPECT_EQ(count, 6); + auto data = tp.int32_data(); for (int x = 0; x < 6; ++x) { - EXPECT_EQ(data8[x], values_mul_x[x]); + EXPECT_EQ(tp.int32_data()[x], values_mul_x[x]); } } TEST(MLValueToTensorProtoTests, UInt16ToRaw) { - auto logger = std::make_unique(::onnxruntime::test::DefaultLoggingManager().DefaultLogger()); - std::vector dims_mul_x = {3, 3}; std::vector values_mul_x = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - OrtValue ml_value; - onnxruntime::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, &ml_value); + Ort::Value ml_value{nullptr}; + onnxruntime::server::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, ml_value); onnx::TensorProto tp; - common::Status status = onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ true, std::move(logger), tp); - EXPECT_TRUE(status.IsOK()); + onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ true, spdlog::default_logger(), tp); // Verify data type EXPECT_TRUE(tp.has_data_type()); @@ -426,16 +445,13 @@ TEST(MLValueToTensorProtoTests, UInt16ToRaw) { } TEST(MLValueToTensorProtoTests, UInt16ToInt32Data) { - auto logger = std::make_unique(::onnxruntime::test::DefaultLoggingManager().DefaultLogger()); - std::vector dims_mul_x = {3, 3}; std::vector values_mul_x = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - OrtValue ml_value; - onnxruntime::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, &ml_value); + Ort::Value ml_value{nullptr}; + onnxruntime::server::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, ml_value); onnx::TensorProto tp; - common::Status status = onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ false, std::move(logger), tp); - EXPECT_TRUE(status.IsOK()); + onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ false, spdlog::default_logger(), tp); // Verify data type EXPECT_TRUE(tp.has_data_type()); @@ -454,25 +470,21 @@ TEST(MLValueToTensorProtoTests, UInt16ToInt32Data) { // Verify data EXPECT_FALSE(tp.has_raw_data()); auto count = tp.int32_data().size(); - EXPECT_EQ(count, 5); - auto data = tp.int32_data().data(); - const auto* data16 = reinterpret_cast(data); + 
EXPECT_EQ(count, 9); + auto data = tp.int32_data(); for (int x = 0; x < 9; ++x) { - EXPECT_EQ(data16[x], values_mul_x[x]); + EXPECT_EQ(tp.int32_data()[x], values_mul_x[x]); } } TEST(MLValueToTensorProtoTests, Int16ToRaw) { - auto logger = std::make_unique(::onnxruntime::test::DefaultLoggingManager().DefaultLogger()); - std::vector dims_mul_x = {3, 2}; std::vector values_mul_x = {1, 2, 3, 4, 5, 6}; - OrtValue ml_value; - onnxruntime::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, &ml_value); + Ort::Value ml_value{nullptr}; + onnxruntime::server::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, ml_value); onnx::TensorProto tp; - common::Status status = onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ true, std::move(logger), tp); - EXPECT_TRUE(status.IsOK()); + onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ true, spdlog::default_logger(), tp); // Verify data type EXPECT_TRUE(tp.has_data_type()); @@ -502,16 +514,13 @@ TEST(MLValueToTensorProtoTests, Int16ToRaw) { } TEST(MLValueToTensorProtoTests, Int16ToInt32Data) { - auto logger = std::make_unique(::onnxruntime::test::DefaultLoggingManager().DefaultLogger()); - std::vector dims_mul_x = {3, 2}; std::vector values_mul_x = {1, 2, 3, 4, 5, 6}; - OrtValue ml_value; - onnxruntime::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, &ml_value); + Ort::Value ml_value{nullptr}; + onnxruntime::server::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, ml_value); onnx::TensorProto tp; - common::Status status = onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ false, std::move(logger), tp); - EXPECT_TRUE(status.IsOK()); + onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ false, spdlog::default_logger(), tp); // Verify data type EXPECT_TRUE(tp.has_data_type()); @@ -529,26 +538,22 @@ TEST(MLValueToTensorProtoTests, Int16ToInt32Data) { // Verify data EXPECT_FALSE(tp.has_raw_data()); - auto count = tp.int32_data().size() * (sizeof(int32_t) / sizeof(int16_t)); + auto count = tp.int32_data().size(); EXPECT_EQ(count, 6); - auto data = tp.int32_data().data(); - const auto* data16 = reinterpret_cast(data); + auto data = tp.int32_data(); for (int x = 0; x < 6; ++x) { - EXPECT_EQ(data16[x], values_mul_x[x]); + EXPECT_EQ(tp.int32_data()[x], values_mul_x[x]); } } TEST(MLValueToTensorProtoTests, BoolToRaw) { - auto logger = std::make_unique(::onnxruntime::test::DefaultLoggingManager().DefaultLogger()); - std::vector dims_mul_x = {3, 2}; bool values_mul_x[] = {true, false, false, true, true, false}; - OrtValue ml_value; - CreateMLValueBool(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, &ml_value); + Ort::Value ml_value{nullptr}; + CreateMLValueBool(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, ml_value); onnx::TensorProto tp; - common::Status status = onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ true, std::move(logger), tp); - EXPECT_TRUE(status.IsOK()); + onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ true, spdlog::default_logger(), tp); // Verify data type EXPECT_TRUE(tp.has_data_type()); @@ -578,16 +583,13 @@ TEST(MLValueToTensorProtoTests, BoolToRaw) { } TEST(MLValueToTensorProtoTests, 
BoolToInt32Data) { - auto logger = std::make_unique(::onnxruntime::test::DefaultLoggingManager().DefaultLogger()); - std::vector dims_mul_x = {3, 2}; bool values_mul_x[] = {true, false, false, true, true, false}; - OrtValue ml_value; - CreateMLValueBool(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, &ml_value); + Ort::Value ml_value{nullptr}; + CreateMLValueBool(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, ml_value); onnx::TensorProto tp; - common::Status status = onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ false, std::move(logger), tp); - EXPECT_TRUE(status.IsOK()); + onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ false, spdlog::default_logger(), tp); // Verify data type EXPECT_TRUE(tp.has_data_type()); @@ -606,17 +608,14 @@ TEST(MLValueToTensorProtoTests, BoolToInt32Data) { // Verify data EXPECT_FALSE(tp.has_raw_data()); auto count = tp.int32_data().size(); - EXPECT_EQ(count, 2); - auto data = tp.int32_data().data(); - const auto* data16 = reinterpret_cast(data); + EXPECT_EQ(count, 6); + auto data = tp.int32_data(); for (int x = 0; x < 6; ++x) { - EXPECT_EQ(data16[x], values_mul_x[x]); + EXPECT_EQ(tp.int32_data()[x], values_mul_x[x]); } } TEST(MLValueToTensorProtoTests, Float16ToRaw) { - auto logger = std::make_unique(::onnxruntime::test::DefaultLoggingManager().DefaultLogger()); - std::vector dims_mul_x = {3, 2}; std::vector values_mul_x{ onnxruntime::MLFloat16(1), @@ -625,12 +624,11 @@ TEST(MLValueToTensorProtoTests, Float16ToRaw) { onnxruntime::MLFloat16(4), onnxruntime::MLFloat16(5), onnxruntime::MLFloat16(6)}; - OrtValue ml_value; - onnxruntime::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, &ml_value); + Ort::Value ml_value{nullptr}; + onnxruntime::server::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, ml_value); onnx::TensorProto tp; - common::Status status = onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ true, std::move(logger), tp); - EXPECT_TRUE(status.IsOK()); + onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ true, spdlog::default_logger(), tp); // Verify data type EXPECT_TRUE(tp.has_data_type()); @@ -660,8 +658,6 @@ TEST(MLValueToTensorProtoTests, Float16ToRaw) { } TEST(MLValueToTensorProtoTests, FloatToInt32Data) { - auto logger = std::make_unique(::onnxruntime::test::DefaultLoggingManager().DefaultLogger()); - std::vector dims_mul_x = {3, 2}; std::vector values_mul_x{ onnxruntime::MLFloat16(1), @@ -670,12 +666,11 @@ TEST(MLValueToTensorProtoTests, FloatToInt32Data) { onnxruntime::MLFloat16(4), onnxruntime::MLFloat16(5), onnxruntime::MLFloat16(6)}; - OrtValue ml_value; - onnxruntime::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, &ml_value); + Ort::Value ml_value{nullptr}; + onnxruntime::server::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, ml_value); onnx::TensorProto tp; - common::Status status = onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ false, std::move(logger), tp); - EXPECT_TRUE(status.IsOK()); + onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ false, spdlog::default_logger(), tp); // Verify data type EXPECT_TRUE(tp.has_data_type()); @@ -694,17 +689,16 @@ 
TEST(MLValueToTensorProtoTests, FloatToInt32Data) { // Verify data EXPECT_FALSE(tp.has_raw_data()); auto count = tp.int32_data().size(); - EXPECT_EQ(count, 3); + EXPECT_EQ(count, 6); auto data = tp.int32_data().data(); - const auto* data16 = reinterpret_cast(data); for (int x = 0; x < 6; ++x) { - EXPECT_EQ(data16[x], values_mul_x[x]); + const u_int16_t data16 = data[x]; + const auto data_float_16 = static_cast(data16); + EXPECT_EQ(data_float_16, values_mul_x[x]); } } TEST(MLValueToTensorProtoTests, BFloat16ToRaw) { - auto logger = std::make_unique(::onnxruntime::test::DefaultLoggingManager().DefaultLogger()); - std::vector dims_mul_x = {3, 2}; std::vector values_mul_x{ onnxruntime::BFloat16(1.0f), @@ -713,12 +707,11 @@ TEST(MLValueToTensorProtoTests, BFloat16ToRaw) { onnxruntime::BFloat16(4.0f), onnxruntime::BFloat16(5.0f), onnxruntime::BFloat16(6.0f)}; - OrtValue ml_value; - onnxruntime::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, &ml_value); + Ort::Value ml_value{nullptr}; + onnxruntime::server::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, ml_value); onnx::TensorProto tp; - common::Status status = onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ true, std::move(logger), tp); - EXPECT_TRUE(status.IsOK()); + onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ true, spdlog::default_logger(), tp); // Verify data type EXPECT_TRUE(tp.has_data_type()); @@ -748,8 +741,6 @@ TEST(MLValueToTensorProtoTests, BFloat16ToRaw) { } TEST(MLValueToTensorProtoTests, BFloatToInt32Data) { - auto logger = std::make_unique(::onnxruntime::test::DefaultLoggingManager().DefaultLogger()); - std::vector dims_mul_x = {3, 2}; std::vector values_mul_x{ onnxruntime::BFloat16(1.0f), @@ -758,12 +749,11 @@ TEST(MLValueToTensorProtoTests, BFloatToInt32Data) { onnxruntime::BFloat16(4.0f), onnxruntime::BFloat16(5.0f), onnxruntime::BFloat16(6.0f)}; - OrtValue ml_value; - onnxruntime::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, &ml_value); + Ort::Value ml_value{nullptr}; + onnxruntime::server::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, ml_value); onnx::TensorProto tp; - common::Status status = onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ false, std::move(logger), tp); - EXPECT_TRUE(status.IsOK()); + onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ false, spdlog::default_logger(), tp); // Verify data type EXPECT_TRUE(tp.has_data_type()); @@ -782,31 +772,29 @@ TEST(MLValueToTensorProtoTests, BFloatToInt32Data) { // Verify data EXPECT_FALSE(tp.has_raw_data()); auto count = tp.int32_data().size(); - EXPECT_EQ(count, 3); + EXPECT_EQ(count, 6); auto data = tp.int32_data().data(); - const auto* data16 = reinterpret_cast(data); for (int x = 0; x < 6; ++x) { - EXPECT_EQ(data16[x], values_mul_x[x].val); + const u_int16_t data16 = data[x]; + const auto data_float_16 = static_cast(data16); + EXPECT_EQ(data_float_16, values_mul_x[x]); } } TEST(MLValueToTensorProtoTests, StringToStringData) { - auto logger = std::make_unique(::onnxruntime::test::DefaultLoggingManager().DefaultLogger()); - std::vector dims_mul_x = {3, 2}; std::vector values_mul_x{"A", "BC", "DEF", "123", "45", "6"}; - OrtValue ml_value; - onnxruntime::test::AllocateMLValue(TestCPUExecutionProvider()->GetAllocator(0, 
OrtMemTypeDefault), dims_mul_x, &ml_value); + OrtValue* p_mlValue = new OrtValue{}; + onnxruntime::test::AllocateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, p_mlValue); - Tensor* mutable_tensor = ml_value.GetMutable(); + Tensor* mutable_tensor = p_mlValue->GetMutable(); std::string* mutable_data = mutable_tensor->MutableData(); for (size_t i = 0; i < values_mul_x.size(); ++i) { mutable_data[i] = values_mul_x[i]; } - + Ort::Value value{p_mlValue}; onnx::TensorProto tp; - common::Status status = onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ false, std::move(logger), tp); - EXPECT_TRUE(status.IsOK()); + onnxruntime::server::MLValueToTensorProto(value, /* using_raw_data */ false, spdlog::default_logger(), tp); // Verify data type EXPECT_TRUE(tp.has_data_type()); @@ -833,16 +821,13 @@ TEST(MLValueToTensorProtoTests, StringToStringData) { } TEST(MLValueToTensorProtoTests, Int64ToRaw) { - auto logger = std::make_unique(::onnxruntime::test::DefaultLoggingManager().DefaultLogger()); - std::vector dims_mul_x = {3, 2}; std::vector values_mul_x = {1, 2, 3, 4, 5, 6}; - OrtValue ml_value; - onnxruntime::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, &ml_value); + Ort::Value ml_value{nullptr}; + onnxruntime::server::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, ml_value); onnx::TensorProto tp; - common::Status status = onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ true, std::move(logger), tp); - EXPECT_TRUE(status.IsOK()); + onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ true, spdlog::default_logger(), tp); // Verify data type EXPECT_TRUE(tp.has_data_type()); @@ -872,16 +857,13 @@ TEST(MLValueToTensorProtoTests, Int64ToRaw) { } TEST(MLValueToTensorProtoTests, Int64ToInt64Data) { - auto logger = std::make_unique(::onnxruntime::test::DefaultLoggingManager().DefaultLogger()); - std::vector dims_mul_x = {3, 2}; std::vector values_mul_x = {1, 2, 3, 4, 5, 6}; - OrtValue ml_value; - onnxruntime::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, &ml_value); + Ort::Value ml_value{nullptr}; + onnxruntime::server::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, ml_value); onnx::TensorProto tp; - common::Status status = onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ false, std::move(logger), tp); - EXPECT_TRUE(status.IsOK()); + onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ false, spdlog::default_logger(), tp); // Verify data type EXPECT_TRUE(tp.has_data_type()); @@ -907,16 +889,13 @@ TEST(MLValueToTensorProtoTests, Int64ToInt64Data) { } TEST(MLValueToTensorProtoTests, UInt32ToRaw) { - auto logger = std::make_unique(::onnxruntime::test::DefaultLoggingManager().DefaultLogger()); - std::vector dims_mul_x = {3, 2}; std::vector values_mul_x = {1, 2, 3, 4, 5, 6}; - OrtValue ml_value; - onnxruntime::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, &ml_value); + Ort::Value ml_value{nullptr}; + onnxruntime::server::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, ml_value); onnx::TensorProto tp; - common::Status status = onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data 
*/ true, std::move(logger), tp); - EXPECT_TRUE(status.IsOK()); + onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ true, spdlog::default_logger(), tp); // Verify data type EXPECT_TRUE(tp.has_data_type()); @@ -946,16 +925,13 @@ TEST(MLValueToTensorProtoTests, UInt32ToRaw) { } TEST(MLValueToTensorProtoTests, UInt32ToUint64Data) { - auto logger = std::make_unique(::onnxruntime::test::DefaultLoggingManager().DefaultLogger()); - std::vector dims_mul_x = {3, 2}; std::vector values_mul_x = {1, 2, 3, 4, 5, 6}; - OrtValue ml_value; - onnxruntime::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, &ml_value); + Ort::Value ml_value{nullptr}; + onnxruntime::server::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, ml_value); onnx::TensorProto tp; - common::Status status = onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ false, std::move(logger), tp); - EXPECT_TRUE(status.IsOK()); + onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ false, spdlog::default_logger(), tp); // Verify data type EXPECT_TRUE(tp.has_data_type()); @@ -973,27 +949,23 @@ TEST(MLValueToTensorProtoTests, UInt32ToUint64Data) { // Verify data EXPECT_FALSE(tp.has_raw_data()); - auto count = tp.uint64_data().size() * (sizeof(uint64_t) / sizeof(uint32_t)); + auto count = tp.uint64_data().size(); EXPECT_EQ(count, 6); auto data = tp.uint64_data().data(); - const auto* data32 = reinterpret_cast(data); for (size_t x = 0; x < count; ++x) { - EXPECT_EQ(data32[x], values_mul_x[x]); + EXPECT_EQ(data[x], values_mul_x[x]); } } TEST(MLValueToTensorProtoTests, UInt64ToRaw) { - auto logger = std::make_unique(::onnxruntime::test::DefaultLoggingManager().DefaultLogger()); - std::vector dims_mul_x = {3, 2}; std::vector values_mul_x = {1, 2, 3, 4, 5, 6}; - OrtValue ml_value; - onnxruntime::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, &ml_value); + Ort::Value ml_value{nullptr}; + onnxruntime::server::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, ml_value); onnx::TensorProto tp; - common::Status status = onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ true, std::move(logger), tp); - EXPECT_TRUE(status.IsOK()); + onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ true, spdlog::default_logger(), tp); // Verify data type EXPECT_TRUE(tp.has_data_type()); @@ -1023,16 +995,13 @@ TEST(MLValueToTensorProtoTests, UInt64ToRaw) { } TEST(MLValueToTensorProtoTests, UInt64ToInt64Data) { - auto logger = std::make_unique(::onnxruntime::test::DefaultLoggingManager().DefaultLogger()); - std::vector dims_mul_x = {3, 2}; std::vector values_mul_x = {1, 2, 3, 4, 5, 6}; - OrtValue ml_value; - onnxruntime::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, &ml_value); + Ort::Value ml_value{nullptr}; + onnxruntime::server::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, ml_value); onnx::TensorProto tp; - common::Status status = onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ false, std::move(logger), tp); - EXPECT_TRUE(status.IsOK()); + onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ false, spdlog::default_logger(), tp); // Verify data type 
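
The updated expectations in these conversion tests all follow one convention: when `using_raw_data` is false, each element gets its own entry in the widened repeated field (a 3x2 tensor yields a count of 6), with bools stored as 0/1 and FLOAT16/BFLOAT16 stored as their raw 16-bit pattern in the low bits of an `int32_data` entry, and uint32 likewise widened one-per-entry into `uint64_data`. A minimal read-back sketch of that convention, based on the test expectations rather than the converter's implementation; the protobuf include path is an assumption:

```
// Sketch: read back one-element-per-entry payloads, as asserted by the tests
// above and below (an illustration, not the converter's actual code).
#include <cstdint>
#include <vector>
#include "onnx/onnx-ml.pb.h"  // onnx::TensorProto (include path assumed)

// FLOAT16 / BFLOAT16 elements occupy one int32 entry each; the value lives in
// the low 16 bits of that entry.
std::vector<uint16_t> ReadHalfBits(const onnx::TensorProto& tp) {
  std::vector<uint16_t> out;
  out.reserve(tp.int32_data_size());
  for (int i = 0; i < tp.int32_data_size(); ++i)
    out.push_back(static_cast<uint16_t>(tp.int32_data(i)));
  return out;
}

// BOOL elements are stored as 0/1, also one per int32 entry.
std::vector<bool> ReadBools(const onnx::TensorProto& tp) {
  std::vector<bool> out;
  for (int i = 0; i < tp.int32_data_size(); ++i)
    out.push_back(tp.int32_data(i) != 0);
  return out;
}
```
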
EXPECT_TRUE(tp.has_data_type()); @@ -1058,16 +1027,13 @@ TEST(MLValueToTensorProtoTests, UInt64ToInt64Data) { } TEST(MLValueToTensorProtoTests, DoubleToRaw) { - auto logger = std::make_unique(::onnxruntime::test::DefaultLoggingManager().DefaultLogger()); - std::vector dims_mul_x = {3, 2}; std::vector values_mul_x = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; - OrtValue ml_value; - onnxruntime::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, &ml_value); + Ort::Value ml_value{nullptr}; + onnxruntime::server::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, ml_value); onnx::TensorProto tp; - common::Status status = onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ true, std::move(logger), tp); - EXPECT_TRUE(status.IsOK()); + onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ true, spdlog::default_logger(), tp); // Verify data type EXPECT_TRUE(tp.has_data_type()); @@ -1097,16 +1063,13 @@ TEST(MLValueToTensorProtoTests, DoubleToRaw) { } TEST(MLValueToTensorProtoTests, DoubleToInt64Data) { - auto logger = std::make_unique(::onnxruntime::test::DefaultLoggingManager().DefaultLogger()); - std::vector dims_mul_x = {3, 2}; std::vector values_mul_x = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; - OrtValue ml_value; - onnxruntime::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, &ml_value); + Ort::Value ml_value{nullptr}; + onnxruntime::server::test::CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), dims_mul_x, values_mul_x, ml_value); onnx::TensorProto tp; - common::Status status = onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ false, std::move(logger), tp); - EXPECT_TRUE(status.IsOK()); + onnxruntime::server::MLValueToTensorProto(ml_value, /* using_raw_data */ false, spdlog::default_logger(), tp); // Verify data type EXPECT_TRUE(tp.has_data_type()); @@ -1131,8 +1094,9 @@ TEST(MLValueToTensorProtoTests, DoubleToInt64Data) { } } -void CreateMLValueBool(AllocatorPtr alloc, const std::vector& dims, const bool* value, OrtValue* p_mlvalue) { +void CreateMLValueBool(AllocatorPtr alloc, const std::vector& dims, const bool* value, Ort::Value& p_value) { TensorShape shape(dims); + OrtValue* p_mlvalue = new OrtValue{}; auto element_type = DataTypeImpl::GetType(); std::unique_ptr p_tensor = std::make_unique(element_type, shape, @@ -1141,6 +1105,7 @@ void CreateMLValueBool(AllocatorPtr alloc, const std::vector& dims, con p_mlvalue->Init(p_tensor.release(), DataTypeImpl::GetType(), DataTypeImpl::GetType()->GetDeleteFunc()); + p_value = Ort::Value{p_mlvalue}; } } // namespace test diff --git a/onnxruntime/test/server/unit_tests/executor_test.cc b/onnxruntime/test/server/unit_tests/executor_test.cc index 75ab5d6817bc7..e069360eba91a 100644 --- a/onnxruntime/test/server/unit_tests/executor_test.cc +++ b/onnxruntime/test/server/unit_tests/executor_test.cc @@ -7,26 +7,24 @@ #include "server/executor.h" #include "server/http/json_handling.h" -#include "test/test_environment.h" +#include +#include +#include +#include "test_server_environment.h" namespace onnxruntime { namespace server { namespace test { TEST(ExecutorTests, TestMul_1) { - const static auto model_file = "testdata/mul_1.pb"; + const static auto model_file = "testdata/mul_1.onnx"; const static auto input_json = 
R"({"inputs":{"X":{"dims":[3,2],"dataType":1,"floatData":[1,2,3,4,5,6]}},"outputFilter":["Y"]})"; const static auto expected = R"({"outputs":{"Y":{"dims":["3","2"],"dataType":1,"floatData":[1,4,9,16,25,36]}}})"; - onnxruntime::server::ServerEnvironment env(logging::Severity::kWARNING, logging::LoggingManager::InstanceType::Temporal, false); + onnxruntime::server::ServerEnvironment* env = ServerEnv(); + env->InitializeModel(model_file); - auto status = env.InitializeModel(model_file); - EXPECT_TRUE(status.IsOK()); - - status = env.GetSession()->Initialize(); - EXPECT_TRUE(status.IsOK()); - - onnxruntime::server::Executor executor(&env, "RequestId"); + onnxruntime::server::Executor executor(env, "RequestId"); onnxruntime::server::PredictRequest request{}; onnxruntime::server::PredictResponse response{}; diff --git a/onnxruntime/test/server/unit_tests/external/server_context_test_spouse.h b/onnxruntime/test/server/unit_tests/external/server_context_test_spouse.h new file mode 100644 index 0000000000000..4e6dcf8798b5b --- /dev/null +++ b/onnxruntime/test/server/unit_tests/external/server_context_test_spouse.h @@ -0,0 +1,65 @@ +/* + * + * Copyright 2016 gRPC authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#ifndef GRPCPP_TEST_SERVER_CONTEXT_TEST_SPOUSE_H +#define GRPCPP_TEST_SERVER_CONTEXT_TEST_SPOUSE_H + +#include + +#include + +namespace grpc { +namespace testing { + +/// A test-only class to access private members and methods of ServerContext. +class ServerContextTestSpouse { + public: + explicit ServerContextTestSpouse(ServerContext* ctx) : ctx_(ctx) {} + + /// Inject client metadata to the ServerContext for the test. The test spouse + /// must be alive when \a ServerContext::client_metadata is called. 
+ void AddClientMetadata(const grpc::string& key, const grpc::string& value) { + client_metadata_storage_.insert( + std::pair(key, value)); + ctx_->client_metadata_.map()->clear(); + for (auto iter = client_metadata_storage_.begin(); + iter != client_metadata_storage_.end(); ++iter) { + ctx_->client_metadata_.map()->insert( + std::pair( + iter->first.c_str(), + grpc::string_ref(iter->second.data(), iter->second.size()))); + } + } + + std::multimap GetInitialMetadata() const { + return ctx_->initial_metadata_; + } + + std::multimap GetTrailingMetadata() const { + return ctx_->trailing_metadata_; + } + + private: + ServerContext* ctx_; // not owned + std::multimap client_metadata_storage_; +}; + +} // namespace testing +} // namespace grpc + +#endif // GRPCPP_TEST_SERVER_CONTEXT_TEST_SPOUSE_H \ No newline at end of file diff --git a/onnxruntime/test/server/unit_tests/prediction_service_impl_test.cc b/onnxruntime/test/server/unit_tests/prediction_service_impl_test.cc new file mode 100644 index 0000000000000..9b38c3f577352 --- /dev/null +++ b/onnxruntime/test/server/unit_tests/prediction_service_impl_test.cc @@ -0,0 +1,119 @@ +#include + +#include "gtest/gtest.h" + +#include "server/executor.h" +#include "server/grpc/prediction_service_impl.h" +#include "test/test_environment.h" +#include "test_server_environment.h" +#include "external/server_context_test_spouse.h" +#include + +namespace onnxruntime { +namespace server { +namespace grpc { +namespace test { +static ::grpc::internal::GrpcLibraryInitializer g_initializer; + +PredictRequest GetRequest() { + PredictRequest req{}; + req.add_output_filter("Y"); + onnx::TensorProto proto{}; + proto.add_dims(3); + proto.add_dims(2); + proto.set_data_type(1); + std::vector floats = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f}; + std::for_each(floats.begin(), floats.end(), [&proto](float f) { proto.add_float_data(f); }); + (*req.mutable_inputs())["X"] = proto; + return req; +} + +std::shared_ptr GetEnvironment() { + return std::shared_ptr(onnxruntime::server::test::ServerEnv(), [](onnxruntime::server::ServerEnvironment *){}); +} + +TEST(PredictionServiceImplTests, HappyPath) { + auto env = GetEnvironment(); + PredictionServiceImpl test{env}; + auto request = GetRequest(); + PredictResponse resp{}; + ::grpc::ServerContext context; + auto status = test.Predict(&context, &request, &resp); + EXPECT_TRUE(status.ok()); +} + +TEST(PredictionServiceImplTests, InvalidInput) { + auto env = GetEnvironment(); + PredictionServiceImpl test{env}; + auto request = GetRequest(); + (*request.mutable_inputs())["X"].add_dims(1); + PredictResponse resp{}; + ::grpc::ServerContext context{}; + auto status = test.Predict(&context, &request, &resp); + EXPECT_EQ(status.error_code(), ::grpc::INVALID_ARGUMENT); +} + +TEST(PredictionServiceImplTests, SuccessRequestID) { + auto env = GetEnvironment(); + PredictionServiceImpl test{env}; + auto request = GetRequest(); + PredictResponse resp{}; + ::grpc::ServerContext context; + ::grpc::testing::ServerContextTestSpouse spouse(&context); + auto status = test.Predict(&context, &request, &resp); + auto metadata = spouse.GetInitialMetadata(); + EXPECT_NE(metadata.find("x-ms-request-id"), metadata.end()); + EXPECT_TRUE(status.ok()); +} + +TEST(PredictionServiceImplTests, InvalidInputRequestID) { + auto env = GetEnvironment(); + PredictionServiceImpl test{env}; + auto request = GetRequest(); + request.clear_inputs(); + + PredictResponse resp{}; + ::grpc::ServerContext context; + ::grpc::testing::ServerContextTestSpouse spouse(&context); + auto status = 
test.Predict(&context, &request, &resp); + auto metadata = spouse.GetInitialMetadata(); + EXPECT_NE(metadata.find("x-ms-request-id"), metadata.end()); + EXPECT_FALSE(status.ok()); +} + +TEST(PredictionServiceImplTests, SuccessClientID) { + auto env = GetEnvironment(); + PredictionServiceImpl test{env}; + auto request = GetRequest(); + PredictResponse resp{}; + ::grpc::ServerContext context; + ::grpc::testing::ServerContextTestSpouse spouse(&context); + spouse.AddClientMetadata("x-ms-client-request-id", "client-id"); + auto status = test.Predict(&context, &request, &resp); + auto metadata = spouse.GetInitialMetadata(); + EXPECT_NE(metadata.find("x-ms-client-request-id"), metadata.end()); + EXPECT_EQ(metadata.find("x-ms-client-request-id")->second, "client-id"); + EXPECT_TRUE(status.ok()); +} + +TEST(PredictionServiceImplTests, InvalidInputClientID) { + auto env = GetEnvironment(); + PredictionServiceImpl test{env}; + auto request = GetRequest(); + request.clear_inputs(); + + PredictResponse resp{}; + ::grpc::ServerContext context; + ::grpc::testing::ServerContextTestSpouse spouse(&context); + spouse.AddClientMetadata("x-ms-client-request-id", "client-id"); + auto status = test.Predict(&context, &request, &resp); + auto metadata = spouse.GetInitialMetadata(); + EXPECT_NE(metadata.find("x-ms-client-request-id"), metadata.end()); + EXPECT_EQ(metadata.find("x-ms-client-request-id")->second, "client-id"); + EXPECT_FALSE(status.ok()); +} + +} // namespace test +} // namespace grpc +} // namespace server +} // namespace onnxruntime diff --git a/onnxruntime/test/server/unit_tests/server_configuration_test.cc b/onnxruntime/test/server/unit_tests/server_configuration_test.cc index 77a979e27f593..00e772d902274 100644 --- a/onnxruntime/test/server/unit_tests/server_configuration_test.cc +++ b/onnxruntime/test/server/unit_tests/server_configuration_test.cc @@ -13,7 +13,7 @@ namespace test { TEST(ConfigParsingTests, AllArgs) { char* test_argv[] = { const_cast("/path/to/binary"), - const_cast("--model_path"), const_cast("testdata/mul_1.pb"), + const_cast("--model_path"), const_cast("testdata/mul_1.onnx"), const_cast("--address"), const_cast("4.4.4.4"), const_cast("--http_port"), const_cast("80"), const_cast("--num_http_threads"), const_cast("1"), @@ -22,27 +22,27 @@ TEST(ConfigParsingTests, AllArgs) { onnxruntime::server::ServerConfiguration config{}; Result res = config.ParseInput(11, test_argv); EXPECT_EQ(res, Result::ContinueSuccess); - EXPECT_EQ(config.model_path, "testdata/mul_1.pb"); + EXPECT_EQ(config.model_path, "testdata/mul_1.onnx"); EXPECT_EQ(config.address, "4.4.4.4"); EXPECT_EQ(config.http_port, 80); EXPECT_EQ(config.num_http_threads, 1); - EXPECT_EQ(config.logging_level, onnxruntime::logging::Severity::kINFO); + EXPECT_EQ(config.logging_level, ORT_LOGGING_LEVEL_INFO); } TEST(ConfigParsingTests, Defaults) { char* test_argv[] = { const_cast("/path/to/binary"), - const_cast("--model"), const_cast("testdata/mul_1.pb"), + const_cast("--model"), const_cast("testdata/mul_1.onnx"), const_cast("--num_http_threads"), const_cast("3")}; onnxruntime::server::ServerConfiguration config{}; Result res = config.ParseInput(5, test_argv); EXPECT_EQ(res, Result::ContinueSuccess); - EXPECT_EQ(config.model_path, "testdata/mul_1.pb"); + EXPECT_EQ(config.model_path, "testdata/mul_1.onnx"); EXPECT_EQ(config.address, "0.0.0.0"); EXPECT_EQ(config.http_port, 8001); EXPECT_EQ(config.num_http_threads, 3); - EXPECT_EQ(config.logging_level, onnxruntime::logging::Severity::kINFO); + EXPECT_EQ(config.logging_level, 
ORT_LOGGING_LEVEL_INFO); } TEST(ConfigParsingTests, Help) { @@ -82,7 +82,7 @@ TEST(ConfigParsingTests, WrongLoggingLevel) { char* test_argv[] = { const_cast("/path/to/binary"), const_cast("--log_level"), const_cast("not a logging level"), - const_cast("--model_path"), const_cast("testdata/mul_1.pb"), + const_cast("--model_path"), const_cast("testdata/mul_1.onnx"), const_cast("--address"), const_cast("4.4.4.4"), const_cast("--http_port"), const_cast("80"), const_cast("--num_http_threads"), const_cast("1")}; diff --git a/onnxruntime/test/server/unit_tests/test_main.cc b/onnxruntime/test/server/unit_tests/test_main.cc index c62b90fdb2efb..09a5304d5f8d7 100644 --- a/onnxruntime/test/server/unit_tests/test_main.cc +++ b/onnxruntime/test/server/unit_tests/test_main.cc @@ -2,15 +2,15 @@ // Licensed under the MIT License. #include "gtest/gtest.h" -#include "test/test_environment.h" +#include "server/environment.h" +#include "test_server_environment.h" GTEST_API_ int main(int argc, char** argv) { int status = 0; try { - const bool create_default_logger = true; - onnxruntime::test::TestEnvironment environment{argc, argv, create_default_logger}; - + onnxruntime::server::test::TestServerEnvironment server_env{}; + onnxruntime::test::TestEnvironment env{argc, argv, false}; status = RUN_ALL_TESTS(); } catch (const std::exception& ex) { std::cerr << ex.what(); diff --git a/onnxruntime/test/server/unit_tests/test_server_environment.cc b/onnxruntime/test/server/unit_tests/test_server_environment.cc new file mode 100644 index 0000000000000..75d21ec5c1ddd --- /dev/null +++ b/onnxruntime/test/server/unit_tests/test_server_environment.cc @@ -0,0 +1,31 @@ +#include "test_server_environment.h" +#include +#include +#include + +namespace onnxruntime { +namespace server { +namespace test { +static std::unique_ptr s_env; + +ServerEnvironment* ServerEnv() { + ORT_ENFORCE(s_env != nullptr, + "Need a TestServerEnvironment instance to provide the server environment."); + + return s_env.get(); +} + +TestServerEnvironment::TestServerEnvironment() { + auto console = spdlog::stdout_logger_mt("console"); + spdlog::set_default_logger(console); + spdlog::sink_ptr ptr = std::make_shared(); + s_env = std::make_unique(ORT_LOGGING_LEVEL_WARNING, spdlog::sinks_init_list{ptr}); +} +TestServerEnvironment::~TestServerEnvironment() { + //destruct env to make sure the default logger is destoryed before the logger mutex. 
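
The server tests now route logging through spdlog instead of the ONNX Runtime logging manager. A minimal sketch of equivalent wiring using only public spdlog APIs follows; the concrete sink type constructed in test_server_environment.cc is not fully visible in this capture, so the null sink below is purely illustrative:

```
// Illustrative only: spdlog wiring in the spirit of TestServerEnvironment.
// The sink chosen here (null_sink_mt) is an assumption for a quiet test run;
// swap in an ostream or file sink to capture output for assertions.
#include <spdlog/spdlog.h>
#include <spdlog/sinks/stdout_sinks.h>
#include <spdlog/sinks/null_sink.h>

void ConfigureTestLogging() {
  // A console logger becomes the process-wide default, mirroring the test setup.
  auto console = spdlog::stdout_logger_mt("console");
  spdlog::set_default_logger(console);

  // Additional sink list of the kind handed to the ServerEnvironment constructor.
  spdlog::sink_ptr sink = std::make_shared<spdlog::sinks::null_sink_mt>();
  spdlog::sinks_init_list sinks{sink};
  (void)sinks;  // in the test this is passed alongside ORT_LOGGING_LEVEL_WARNING
}
```
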
+ s_env = nullptr; +} + +} // namespace test +} // namespace server +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/test/server/unit_tests/test_server_environment.h b/onnxruntime/test/server/unit_tests/test_server_environment.h new file mode 100644 index 0000000000000..bfd8072f13d4b --- /dev/null +++ b/onnxruntime/test/server/unit_tests/test_server_environment.h @@ -0,0 +1,20 @@ +#pragma once + +#include "server/environment.h" +#include "test/test_environment.h" + +namespace onnxruntime { +namespace server { +namespace test { +ServerEnvironment* ServerEnv(); +class TestServerEnvironment { + public: + TestServerEnvironment(); + ~TestServerEnvironment(); + + TestServerEnvironment(const TestServerEnvironment&) = delete; + TestServerEnvironment(TestServerEnvironment&&) = default; +}; +} // namespace test +} // namespace server +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/test/shared_lib/test_allocator.cc b/onnxruntime/test/shared_lib/test_allocator.cc index 3ba04d10659c9..372594fa8f16c 100644 --- a/onnxruntime/test/shared_lib/test_allocator.cc +++ b/onnxruntime/test/shared_lib/test_allocator.cc @@ -10,7 +10,7 @@ using namespace onnxruntime; TEST_F(CApiTest, allocation_info) { OrtAllocatorInfo *info1, *info2; - ORT_THROW_ON_ERROR(OrtCreateAllocatorInfo("Cpu", OrtArenaAllocator, 0, OrtMemTypeDefault, &info1)); + ORT_THROW_ON_ERROR(OrtCreateCpuAllocatorInfo(OrtArenaAllocator, OrtMemTypeDefault, &info1)); ORT_THROW_ON_ERROR(OrtCreateCpuAllocatorInfo(OrtArenaAllocator, OrtMemTypeDefault, &info2)); int result; ORT_THROW_ON_ERROR(OrtCompareAllocatorInfo(info1, info2, &result)); diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc index 840ce0aed175a..2833803ac9ad2 100644 --- a/onnxruntime/test/shared_lib/test_inference.cc +++ b/onnxruntime/test/shared_lib/test_inference.cc @@ -114,10 +114,10 @@ void TestInference(Ort::Env& env, T model_uri, &value_y); } -static constexpr PATH_TYPE MODEL_URI = TSTR("testdata/mul_1.pb"); -static constexpr PATH_TYPE CUSTOM_OP_MODEL_URI = TSTR("testdata/foo_1.pb"); +static constexpr PATH_TYPE MODEL_URI = TSTR("testdata/mul_1.onnx"); +static constexpr PATH_TYPE CUSTOM_OP_MODEL_URI = TSTR("testdata/foo_1.onnx"); #ifdef ENABLE_LANGUAGE_INTEROP_OPS -static constexpr PATH_TYPE PYOP_FLOAT_MODEL_URI = TSTR("testdata/pyop_1.pb"); +static constexpr PATH_TYPE PYOP_FLOAT_MODEL_URI = TSTR("testdata/pyop_1.onnx"); #endif class CApiTestWithProvider : public CApiTest, diff --git a/onnxruntime/test/shared_lib/test_model_loading.cc b/onnxruntime/test/shared_lib/test_model_loading.cc index 4da741a7c627b..151dabb7e232d 100644 --- a/onnxruntime/test/shared_lib/test_model_loading.cc +++ b/onnxruntime/test/shared_lib/test_model_loading.cc @@ -170,7 +170,7 @@ TEST_F(CApiTest, model_with_external_data) { } TEST_F(CApiTest, model_from_array) { - const char* model_path = "testdata/matmul_1.pb"; + const char* model_path = "testdata/matmul_1.onnx"; std::vector buffer; { std::ifstream file(model_path, std::ios::binary | std::ios::ate); diff --git a/onnxruntime/test/shared_lib/test_run_options.cc b/onnxruntime/test/shared_lib/test_run_options.cc index 5a07d663546e4..f1f05bb6cc6b5 100644 --- a/onnxruntime/test/shared_lib/test_run_options.cc +++ b/onnxruntime/test/shared_lib/test_run_options.cc @@ -10,5 +10,5 @@ TEST_F(CApiTest, run_options) { options.SetRunLogVerbosityLevel(1); options.SetRunTag("abc"); ASSERT_STREQ(options.GetRunTag(), "abc"); - 
ASSERT_EQ(options.GetRunLogVerbosityLevel(), unsigned(1)); + ASSERT_EQ(options.GetRunLogVerbosityLevel(), 1); } diff --git a/onnxruntime/test/testdata/LabelEncoder.pb b/onnxruntime/test/testdata/LabelEncoder.onnx similarity index 100% rename from onnxruntime/test/testdata/LabelEncoder.pb rename to onnxruntime/test/testdata/LabelEncoder.onnx diff --git a/onnxruntime/test/testdata/conv_autopad.pb b/onnxruntime/test/testdata/conv_autopad.onnx similarity index 100% rename from onnxruntime/test/testdata/conv_autopad.pb rename to onnxruntime/test/testdata/conv_autopad.onnx diff --git a/onnxruntime/test/testdata/foo_1.pb b/onnxruntime/test/testdata/foo_1.onnx similarity index 100% rename from onnxruntime/test/testdata/foo_1.pb rename to onnxruntime/test/testdata/foo_1.onnx diff --git a/onnxruntime/test/testdata/foo_2.pb b/onnxruntime/test/testdata/foo_2.onnx similarity index 100% rename from onnxruntime/test/testdata/foo_2.pb rename to onnxruntime/test/testdata/foo_2.onnx diff --git a/onnxruntime/test/testdata/fuse_add_1.pb b/onnxruntime/test/testdata/fuse_add_1.pb deleted file mode 100644 index 801147fa4e277..0000000000000 Binary files a/onnxruntime/test/testdata/fuse_add_1.pb and /dev/null differ diff --git a/onnxruntime/test/testdata/fuse_mul_1.onnx b/onnxruntime/test/testdata/fuse_mul_1.onnx new file mode 100644 index 0000000000000..a638551249f64 Binary files /dev/null and b/onnxruntime/test/testdata/fuse_mul_1.onnx differ diff --git a/onnxruntime/test/testdata/gru_1.pb b/onnxruntime/test/testdata/gru_1.pb deleted file mode 100644 index 1f2c684b306b6..0000000000000 Binary files a/onnxruntime/test/testdata/gru_1.pb and /dev/null differ diff --git a/onnxruntime/test/testdata/identity_string.pb b/onnxruntime/test/testdata/identity_string.onnx similarity index 100% rename from onnxruntime/test/testdata/identity_string.pb rename to onnxruntime/test/testdata/identity_string.onnx diff --git a/onnxruntime/test/testdata/invalid_dim_param_value_repetition.onnx b/onnxruntime/test/testdata/invalid_dim_param_value_repetition.onnx new file mode 100644 index 0000000000000..64db93e30ea93 Binary files /dev/null and b/onnxruntime/test/testdata/invalid_dim_param_value_repetition.onnx differ diff --git a/onnxruntime/test/testdata/logicaland.pb b/onnxruntime/test/testdata/logicaland.onnx similarity index 100% rename from onnxruntime/test/testdata/logicaland.pb rename to onnxruntime/test/testdata/logicaland.onnx diff --git a/onnxruntime/test/testdata/lstm_1.onnx b/onnxruntime/test/testdata/lstm_1.onnx deleted file mode 100644 index 238712affe30c..0000000000000 Binary files a/onnxruntime/test/testdata/lstm_1.onnx and /dev/null differ diff --git a/onnxruntime/test/testdata/matmul_1.pb b/onnxruntime/test/testdata/matmul_1.onnx similarity index 100% rename from onnxruntime/test/testdata/matmul_1.pb rename to onnxruntime/test/testdata/matmul_1.onnx diff --git a/onnxruntime/test/testdata/matmul_2.pb b/onnxruntime/test/testdata/matmul_2.onnx similarity index 100% rename from onnxruntime/test/testdata/matmul_2.pb rename to onnxruntime/test/testdata/matmul_2.onnx diff --git a/onnxruntime/test/testdata/model_optional_inputs.pb b/onnxruntime/test/testdata/model_optional_inputs.pb deleted file mode 100644 index 638c87c14856b..0000000000000 Binary files a/onnxruntime/test/testdata/model_optional_inputs.pb and /dev/null differ diff --git a/onnxruntime/test/testdata/mul_1.pb.noopset b/onnxruntime/test/testdata/mul_1.noopset.onnx similarity index 100% rename from onnxruntime/test/testdata/mul_1.pb.noopset rename to 
onnxruntime/test/testdata/mul_1.noopset.onnx diff --git a/onnxruntime/test/testdata/mul_1.pb b/onnxruntime/test/testdata/mul_1.onnx similarity index 100% rename from onnxruntime/test/testdata/mul_1.pb rename to onnxruntime/test/testdata/mul_1.onnx diff --git a/onnxruntime/test/testdata/mul_16.pb b/onnxruntime/test/testdata/mul_16.onnx similarity index 100% rename from onnxruntime/test/testdata/mul_16.pb rename to onnxruntime/test/testdata/mul_16.onnx diff --git a/onnxruntime/test/testdata/optional_1.pb b/onnxruntime/test/testdata/optional_1.onnx similarity index 100% rename from onnxruntime/test/testdata/optional_1.pb rename to onnxruntime/test/testdata/optional_1.onnx diff --git a/onnxruntime/test/testdata/optional_inputs_ir3.onnx b/onnxruntime/test/testdata/optional_inputs_ir3.onnx new file mode 100644 index 0000000000000..2419059734bc9 Binary files /dev/null and b/onnxruntime/test/testdata/optional_inputs_ir3.onnx differ diff --git a/onnxruntime/test/testdata/optional_inputs_ir4.onnx b/onnxruntime/test/testdata/optional_inputs_ir4.onnx new file mode 100644 index 0000000000000..9943f230e8a78 Binary files /dev/null and b/onnxruntime/test/testdata/optional_inputs_ir4.onnx differ diff --git a/onnxruntime/test/testdata/pyop_1.pb b/onnxruntime/test/testdata/pyop_1.onnx similarity index 100% rename from onnxruntime/test/testdata/pyop_1.pb rename to onnxruntime/test/testdata/pyop_1.onnx diff --git a/onnxruntime/test/testdata/scan_1.pb b/onnxruntime/test/testdata/scan_1.onnx similarity index 99% rename from onnxruntime/test/testdata/scan_1.pb rename to onnxruntime/test/testdata/scan_1.onnx index a1ae04f243195..cbe10155675a3 100644 Binary files a/onnxruntime/test/testdata/scan_1.pb and b/onnxruntime/test/testdata/scan_1.onnx differ diff --git a/onnxruntime/test/testdata/subgraph_input_shadows_outer_scope_value.onnx b/onnxruntime/test/testdata/subgraph_input_shadows_outer_scope_value.onnx new file mode 100644 index 0000000000000..1821b393d0481 --- /dev/null +++ b/onnxruntime/test/testdata/subgraph_input_shadows_outer_scope_value.onnx @@ -0,0 +1,73 @@ +: + +max_trip_count +keep_going_inp +bb_loopmy_localuser_defined_vals"Loop* +body2 + +a +bmy_local"Add + +a +bb_loop"Sub +' +my_local +b_loop +keep_going"Greater +( +b_loop +b_loopuser_defined_vals"AddbodyZ + iteration_num + + +Z +keep_going_inp + +  +Z +b + + +b + +keep_going + +  +b +b_loop + + +b +my_local + + +b +user_defined_vals + + +outerZ +a + + +Z +b + + +Z +keep_going_inp + +  +Z +max_trip_count + + +b +b + + +b# +user_defined_vals +  + + +B diff --git a/onnxruntime/test/testdata/transform/fusion/conv_clip.onnx b/onnxruntime/test/testdata/transform/fusion/conv_clip.onnx new file mode 100644 index 0000000000000..850f13447ad8b Binary files /dev/null and b/onnxruntime/test/testdata/transform/fusion/conv_clip.onnx differ diff --git a/onnxruntime/test/testdata/transform/fusion/negative-fuse-conv-add-no-bias.onnx b/onnxruntime/test/testdata/transform/fusion/negative-fuse-conv-add-no-bias.onnx index f2df9e1022435..a11fc08e7138b 100644 Binary files a/onnxruntime/test/testdata/transform/fusion/negative-fuse-conv-add-no-bias.onnx and b/onnxruntime/test/testdata/transform/fusion/negative-fuse-conv-add-no-bias.onnx differ diff --git a/onnxruntime/test/testdata/zipmap_int64float.pb b/onnxruntime/test/testdata/zipmap_int64float.onnx similarity index 100% rename from onnxruntime/test/testdata/zipmap_int64float.pb rename to onnxruntime/test/testdata/zipmap_int64float.onnx diff --git a/onnxruntime/test/testdata/zipmap_stringfloat.pb 
b/onnxruntime/test/testdata/zipmap_stringfloat.onnx similarity index 100% rename from onnxruntime/test/testdata/zipmap_stringfloat.pb rename to onnxruntime/test/testdata/zipmap_stringfloat.onnx diff --git a/onnxruntime/test/tvm/tvm_basic_test.cc b/onnxruntime/test/tvm/tvm_basic_test.cc index 24a0b0748bda0..32ac5454844d2 100644 --- a/onnxruntime/test/tvm/tvm_basic_test.cc +++ b/onnxruntime/test/tvm/tvm_basic_test.cc @@ -2,49 +2,31 @@ // Licensed under the MIT License. #include "gtest/gtest.h" -#include -#include "core/codegen/tvm/tvm_kernel.h" -#include "core/framework/execution_provider.h" +#include "core/common/logging/logging.h" #include "core/framework/compute_capability.h" +#include "core/framework/execution_provider.h" +#include "core/framework/kernel_registry.h" +#include "core/framework/op_kernel.h" #include "core/graph/graph_viewer.h" #include "core/providers/cpu/cpu_execution_provider.h" #include "core/session/inference_session.h" #include "core/session/onnxruntime_cxx_api.h" -#include "core/common/logging/logging.h" #include "test/framework/test_utils.h" #include "test/test_environment.h" -#include "core/framework/op_kernel.h" -#include "core/framework/kernel_registry.h" +#include "test/tvm/tvm_demo/demo_compiler.h" -namespace onnxruntime { +#include -tvm::Schedule DefaultTVMScheduleGenerator(const TVMGraph& tvm_graph) { - std::vector args; - for (auto& tensor : tvm_graph.outputs_) - args.push_back(tensor.tvm_tensor_->op); - return tvm::create_schedule(args); -} +namespace onnxruntime { -tvm::runtime::Module BuildStackVMDefaultModule(tvm::Schedule schedule, - tvm::BuildConfig config, - tvm::Array tvm_args, - std::vector& target_func_names) { - auto target = tvm::target::stackvm(); - std::string func_name = "func"; - auto args = tvm::Array(tvm_args); - std::unordered_map binds; - auto lowered = lower(schedule, args, "func", binds, config); - target_func_names.push_back(func_name); - return build(lowered, target, tvm::Target(), config); -} +using namespace tvm_demo; -template -class TVMFuseAddKernels : public TVMKernel { +class TVMDemoKernel : public OpKernel { public: - explicit TVMFuseAddKernels(const OpKernelInfo& info) : TVMKernel(info) {} + explicit TVMDemoKernel(const OpKernelInfo& info) : OpKernel(info) {} protected: - virtual const TensorShape& GetOutputShape(OpKernelContext* context, int /*i*/) const override { + const TensorShape& GetOutputShape(OpKernelContext* context, int /*i*/) const { return context->Input(0)->Shape(); } }; @@ -100,28 +82,28 @@ class FuseExecutionProviderX : public CPUExecutionProvider { GetCapability(const onnxruntime::GraphViewer& graph_viewer, const std::vector& /*kernel_registries*/) const override { std::vector> result; - std::vector add_nodes; + std::vector fused_nodes; for (auto& node : graph_viewer.Nodes()) { - if (node.OpType() == "Add") { - add_nodes.push_back(node.Index()); + if (node.OpType() == "Mul") { + fused_nodes.push_back(node.Index()); } } - UnionSet set(static_cast(add_nodes.size())); - for (int i = 0; i < add_nodes.size(); ++i) { - auto node = graph_viewer.GetNode(add_nodes[i]); + UnionSet set(static_cast(fused_nodes.size())); + for (int i = 0; i < fused_nodes.size(); ++i) { + auto node = graph_viewer.GetNode(fused_nodes[i]); for (auto it = node->InputNodesBegin(); it != node->InputNodesEnd(); ++it) { - auto index_it = std::find(add_nodes.begin(), add_nodes.end(), (*it).Index()); - if (index_it != add_nodes.end()) { - set.merge(i, static_cast(index_it - add_nodes.begin())); + auto index_it = std::find(fused_nodes.begin(), 
fused_nodes.end(), (*it).Index()); + if (index_it != fused_nodes.end()) { + set.merge(i, static_cast(index_it - fused_nodes.begin())); } } } std::vector> groups; - groups.resize(add_nodes.size()); + groups.resize(fused_nodes.size()); for (int i = 0; i < set.farthers_.size(); ++i) { - groups[set.get(i)].push_back(add_nodes[i]); + groups[set.get(i)].push_back(fused_nodes[i]); } for (auto& group : groups) { @@ -150,7 +132,7 @@ class FuseExecutionProviderX : public CPUExecutionProvider { } auto meta_def = std::make_unique<::onnxruntime::IndexedSubGraph::MetaDef>(); - meta_def->name = "TVMFuseAdd"; + meta_def->name = "TVMFuse"; meta_def->domain = "FuseTest"; for (auto input : fused_inputs) { meta_def->inputs.push_back(input->Name()); @@ -177,22 +159,22 @@ class FuseExecutionProviderX : public CPUExecutionProvider { auto func_body = fused_node->GetFunctionBody(); if (!func_body) return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Function body is empty"); - //1. compile the onnxruntime Graph to tvm graph. - auto tvm_graph_ = CompileToTVM(func_body->Body(), kCpuExecutionProvider); - //2. create schedule for tvm graph, this step is depends on the execution provider/hardware. - auto s = DefaultTVMScheduleGenerator(tvm_graph_); - //3. Build module + //1. Build tvm IR based on the Ort graph + auto demo_tvm_tensor_ctx = BuildTVMIR(func_body->Body()); + //2. Create schedule for the built tvm IRs + auto s = CreateSchedule(demo_tvm_tensor_ctx); + //3. Build tvm module std::vector tvm_args; - for (auto& t : tvm_graph_.inputs_) { - tvm_args.push_back(t.tvm_tensor_); + for (auto& t : demo_tvm_tensor_ctx.inputs) { + tvm_args.push_back(t); } - for (auto& t : tvm_graph_.outputs_) { - tvm_args.push_back(t.tvm_tensor_); + for (auto& t : demo_tvm_tensor_ctx.outputs) { + tvm_args.push_back(t); } std::vector func_names; auto module_ptr = std::make_shared(); - *module_ptr = BuildStackVMDefaultModule(s, tvm::build_config(), tvm_args, func_names); + *module_ptr = BuildStackVMModule(s, tvm::build_config(), tvm_args, func_names); modules_[fused_node->Name()] = module_ptr; NodeComputeInfo compute_info; @@ -307,7 +289,9 @@ static void RunSession(InferenceSession& session_object, // Now run common::Status st = session_object.Run(run_options, feeds, output_names, &fetches); - std::cout << "Run returned status: " << st.ErrorMessage() << std::endl; + if (!st.IsOK()) { + std::cout << "Run returned status: " << st.ErrorMessage() << std::endl; + } EXPECT_TRUE(st.IsOK()); ASSERT_EQ(1, fetches.size()); auto& rtensor = fetches.front().Get(); @@ -319,9 +303,9 @@ static void RunSession(InferenceSession& session_object, ASSERT_EQ(found[i], values_y[i]); } -static const std::string MODEL_URI = "testdata/fuse_add_1.pb"; +static const std::string MODEL_URI = "testdata/fuse_mul_1.onnx"; -TEST(TVMTest, Fuse_Add_Test) { +TEST(TVMTest, CodeGen_Demo_for_Fuse_Mul) { SessionOptions so; so.session_logid = "InferenceSessionTests.NoTimeout"; @@ -346,8 +330,8 @@ TEST(TVMTest, Fuse_Add_Test) { std::vector expected_dims_y = { 6, }; - // now the expected value should be Add's result. - std::vector expected_values_y = {5.0, 10.0, 15.0, 20.0, 25.0, 30.0}; + // now the expected value should be Mul's result. 
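
The Mul expectations that follow ({1, 32, 243, 1024, 3125, 7776}) correspond to raising each input element to the fifth power, just as the removed Add expectations (5, 10, ..., 30) were five times each input, so the renamed fuse_mul_1.onnx presumably chains Mul nodes where fuse_add_1.pb chained Adds. A quick check of that reading; the chain length is inferred from these numbers, not stated in the model file:

```
// Sanity-check the fused-Mul demo expectations: inputs {1,...,6} raised to the
// fifth power give {1, 32, 243, 1024, 3125, 7776} (chain length inferred).
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  std::vector<float> values_x = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
  for (float x : values_x) {
    std::printf("%g ", std::pow(x, 5.0f));
  }
  std::printf("\n");
  return 0;
}
```
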
+ std::vector expected_values_y = {1.0, 32.0, 243.0, 1024.0, 3125.0, 7776.0}; // Now run RunSession(session_object, run_options, dims_x, values_x, expected_dims_y, expected_values_y); @@ -356,7 +340,7 @@ TEST(TVMTest, Fuse_Add_Test) { } // namespace onnxruntime -TEST(TVMTest, Basic) { +TEST(TVMTest, Native_TVM) { using namespace tvm; auto n = var("n"); Array shape; diff --git a/onnxruntime/test/tvm/tvm_demo/demo_compiler.cc b/onnxruntime/test/tvm/tvm_demo/demo_compiler.cc new file mode 100644 index 0000000000000..c978675b06d8e --- /dev/null +++ b/onnxruntime/test/tvm/tvm_demo/demo_compiler.cc @@ -0,0 +1,226 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "test/tvm/tvm_demo/demo_compiler.h" + +#include "core/codegen/passes/scheduler/schedule_utils.h" +#include "core/codegen/passes/utils/ort_tvm_utils.h" +#include "core/codegen/passes/op_ir_creator/tvm_ir_builder.h" +#include "core/codegen/passes/scheduler/tvm_scheduler.h" +#include "core/codegen/passes/scheduler/tvm_schedule_builder.h" + +#include +#include + +namespace onnxruntime { +namespace tvm_demo { + +// Create a dummy demo handle +static codegen::CodeGenHandle demo_handle; +// Create a dummy demo codegen context +static tvm_codegen::CodeGenContext demo_codegen_ctx(&demo_handle); + +// Translate an Ort graph into tvm IR +// Note this function is specific for this demo. +// This function uses specific way for graph traversal or constructing tvm placeholders. +// It may or may not work for a universal Ort graph. +// For a more general example, please check nuphar provider. +DemoTVMTensorCtx BuildTVMIR(const onnxruntime::Graph& graph) { + // Create OpIRRegistry that holds all OpIRCreators + std::unique_ptr op_ir_registry = + std::make_unique(); + + // Register all generic OpIRCreators + tvm_codegen::RegisterAllGenericOpIRCreators(op_ir_registry.get()); + + // Create OpIRBuilder + std::shared_ptr op_ir_builder = + std::make_shared("Demo_Op_IR_Builder"); + + // Attach all generic OpIRCreators from op_ir_registry to op_ir_builder + tvm_codegen::RegisterGenericOrtOpTypeDispatcher(op_ir_builder, op_ir_registry.get()); + + // Create DemoTVMTensorCtx holdings tvm IR + DemoTVMTensorCtx result; + + // Local lookup from name to tvm::Tensor + std::unordered_map tvm_tensors; + + // Note this is a simplified traversal that works specifically for this demo + // but may or may not work for an univerisal model. + // For more general traversal, please check nuphar provider. + for (auto& node : graph.Nodes()) { + tvm::Array inputs; + tvm::Array outputs; + + // Get inputs + for (auto& def : node.InputDefs()) { + const std::string& name = def->Name(); + auto iter = tvm_tensors.find(name); + // Always create placeholder when not finding a tensor + // Note it is for this demo. + // It may or may not work for a universal graph. 
+ if (iter == tvm_tensors.end()) { + tvm_tensors[name] = + tvm::placeholder(ShapeToTvmArray(def, demo_codegen_ctx), + tvm_codegen::ToTvmType(TensorProtoDataType(def)), + name + "_placeholder"); + } + inputs.push_back(tvm_tensors[name]); + } + + // call OpIBuilder's Evaluate to build tvm IR + op_ir_builder->Evaluate(inputs, node, demo_codegen_ctx, outputs); + + // Store outputs + for (size_t def_id = 0; def_id < node.OutputDefs().size(); ++def_id) { + const NodeArg* def = node.OutputDefs()[def_id]; + tvm_tensors[def->Name()] = outputs[def_id]; + } + } + + // put inputs to DemoTVMTensorCtx + for (auto& input : graph.GetInputs()) { + result.inputs.push_back(tvm_tensors[input->Name()]); + } + + // check initializer + for (auto& initializer : graph.GetAllInitializedTensors()) { + result.inputs.push_back(tvm_tensors[initializer.first]); + } + + // Only one output in this demo + auto& output = graph.GetOutputs()[0]; + result.outputs.push_back(tvm_tensors[output->Name()]); + return result; +} + +// Declare a Demo scheduler that always inserts compute_inline +DECLARE_TVM_SCHEDULER_CLASS(AlwaysInline, DemoTVM) + +// Define a Demo scheduler's Evaluate that always inserts compute_inline +bool TVM_SCHEDULER_CLASS(AlwaysInline, DemoTVM)::Evaluate( + const tvm::Tensor& tensor, + const Node*, + tvm_codegen::CodeGenContext&, + tvm_codegen::ScheduleContext& ctx_sched) { + return TryInlineSchedule(tensor, ctx_sched); +} + +// Register the always inline Scheduler to sched_registry +static void RegisterAlwaysInlineScheduler(tvm_codegen::TVMScheduleRegistry* sched_registry) { + sched_registry->Register( + std::move(std::make_unique())); +} + +// Declare a schedule dispatcher that always dispatches the always inline Scheduler +DECLARE_SCHEDULE_DISPATCHER_CLASS(DemoTVM) + +// Use a predefined key as DemoKey to dispatch the scheduler +constexpr auto predefined_key = "DemoKey"; + +// Define the schedule dispatcher's Find function +// that always dispatches the always inline Scheduler +// Note this dispatcher always returning a predefined_key is only for demo purpose. +// In practice, a dispatcher returns a key by checking tvm::Tensor, Node, +// or even meta data stored in CodeGenContext. +// Derived CodeGenContext allows compiler developers to store their specific meta data. +// For more detailed example, please check nuphar provider. +tvm_codegen::Scheduler* SCHEDULE_DISPATCHER_CLASS(DemoTVM)::Find( + const tvm::Tensor&, const Node*, tvm_codegen::CodeGenContext&) { + return DispatcherBase::Get(predefined_key); +} + +// Attach the always inline Scheduler to the above dispatcher +// and then attach the dispatcher to the scheduler builder +static void AttachAlwaysInlineScheduler(const std::shared_ptr& builder, + const tvm_codegen::TVMScheduleRegistry* registry) { + auto dispatcher = std::make_unique("DemoSchedulers"); + + // Using a predefined_key + dispatcher->Register(predefined_key, + registry->Get(TVM_SCHEDULER_STRING(AlwaysInline, DemoTVM))); + + builder->InsertDispatcher(std::move(dispatcher)); +} + +// Traverse tvm::Tensor and then schedule them +// Note this traversal is simplified and specific for this demo. +// For a more general traversal, please check nuphar provider. 
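
Before the traversal helper below, it may help to see how the demo pieces chain together end to end. The sketch mirrors the fused-node Compile() path earlier in this patch and uses only the functions declared in demo_compiler.h; container template parameters are elided in this capture, so the element types here are assumptions:

```
// Sketch of the demo pipeline: build tvm IR from the fused Ort graph, schedule
// it with the always-inline demo scheduler, then lower to a StackVM module.
#include "test/tvm/tvm_demo/demo_compiler.h"

tvm::runtime::Module CompileFusedGraph(const onnxruntime::Graph& fused_body,
                                       std::vector<std::string>& func_names) {
  using namespace onnxruntime::tvm_demo;

  // 1. Ort graph -> tvm tensors (placeholders for inputs, compute ops for outputs).
  DemoTVMTensorCtx ctx = BuildTVMIR(fused_body);

  // 2. Demo schedule: inline everything, then compute_root the outputs.
  tvm::Schedule schedule = CreateSchedule(ctx);

  // 3. Arguments in the order the fused kernel will receive them
  //    (element type assumed; mirrors the call in tvm_basic_test.cc).
  std::vector<tvm::Tensor> args;
  for (auto& t : ctx.inputs) args.push_back(t);
  for (auto& t : ctx.outputs) args.push_back(t);

  // 4. Lower and build against the demo-only StackVM target.
  return BuildStackVMModule(schedule, tvm::build_config(), args, func_names);
}
```
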
+static void TraverseAndSchedule( + std::shared_ptr& schedule_builder, + const tvm::Tensor& tensor, + tvm_codegen::ScheduleContext& ctx_schedule) { + schedule_builder->Evaluate(tensor, nullptr, demo_codegen_ctx, ctx_schedule); + + // Traverse tensor's children (inputs) + for (auto& t : tensor->op->InputTensors()) { + // check whether it is a non-trivial tensor by checking its input size + if (t->op->InputTensors().size() > 0) { + TraverseAndSchedule(schedule_builder, t, ctx_schedule); + } + } +} + +// Create a TVM schedule by always inserting tvm's compute_inline. +// Note this schedule is specific for this demo. +// In practice, always inline might lead to bad performance +// or even illegal loop transformation for some backends. +// For a more general example, please check nuphar provider. +tvm::Schedule CreateSchedule(const DemoTVMTensorCtx& ctx) { + // Create TVMScheduleRegistry that holds all Scheduler + std::unique_ptr schedule_registry = + std::make_unique(); + + // Register the always inline Scheduler to schedule_registry + RegisterAlwaysInlineScheduler(schedule_registry.get()); + + // Create a DemoScheduleBuilder + std::shared_ptr schedule_builder = + std::make_shared("Demo_Schedule_Builder"); + + // Attach the demo inline scheduler to the schedule_builder + AttachAlwaysInlineScheduler(schedule_builder, schedule_registry.get()); + + // Create scheudule object + tvm::Array out_ops; + for (auto& t : ctx.outputs) { + out_ops.push_back(t->op); + } + + // Create scheudule context + tvm_codegen::ScheduleContext ctx_schedule(out_ops); + + // Traverse tvm::Tensor in a DFS way, and then schedule + for (auto& t : ctx.outputs) { + TraverseAndSchedule(schedule_builder, t, ctx_schedule); + } + + // Make sure all outputs compute_root (tvm's requirement) + for (auto& t : ctx.outputs) { + tvm_codegen::InsertRootSchedule(t, ctx_schedule); + } + + return ctx_schedule.schedule; +} + +// Build TVM Module with a schedule using tvm's stackvm. +// Note in real practice, please change stackvm to other backends. +// For a more detailed example, please check nuphar provider. +tvm::runtime::Module BuildStackVMModule(tvm::Schedule schedule, + tvm::BuildConfig config, + tvm::Array tvm_args, + std::vector& target_func_names) { + auto target = tvm::target::stackvm(); + std::string func_name = "func"; + auto args = tvm::Array(tvm_args); + std::unordered_map binds; + auto lowered = lower(schedule, args, "func", binds, config); + // Uncomment the following line to dump lowered func + // std::cout << "Dumping lowered func: " << lowered[0]->body; + target_func_names.push_back(func_name); + return build(lowered, target, tvm::Target(), config); +} + +} // namespace tvm_demo +} // namespace onnxruntime diff --git a/onnxruntime/test/tvm/tvm_demo/demo_compiler.h b/onnxruntime/test/tvm/tvm_demo/demo_compiler.h new file mode 100644 index 0000000000000..3905995293152 --- /dev/null +++ b/onnxruntime/test/tvm/tvm_demo/demo_compiler.h @@ -0,0 +1,31 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once +#include "core/common/common.h" +#include "core/graph/graph_viewer.h" +#include +#include + +namespace onnxruntime { +namespace tvm_demo { +// A Demo data structure to hold tvm IR and context +struct DemoTVMTensorCtx { + tvm::Array inputs; + tvm::Array outputs; +}; + +// Translate an Ort graph into tvm IR +DemoTVMTensorCtx BuildTVMIR(const onnxruntime::Graph& graph); + +// Create a demo schedule for the tvm IR +tvm::Schedule CreateSchedule(const DemoTVMTensorCtx& ctx); + +// Build a demo tvm module with the tvm IR and schedule +tvm::runtime::Module BuildStackVMModule(tvm::Schedule schedule, + tvm::BuildConfig config, + tvm::Array tvm_args, + std::vector& target_func_names); + +} // namespace tvm_demo +} // namespace onnxruntime diff --git a/onnxruntime/test/util/default_providers.cc b/onnxruntime/test/util/default_providers.cc index 35e87c1b4c655..65c7bf4ee97eb 100644 --- a/onnxruntime/test/util/default_providers.cc +++ b/onnxruntime/test/util/default_providers.cc @@ -14,6 +14,7 @@ std::shared_ptr CreateExecutionProviderFactory_Mkldnn std::shared_ptr CreateExecutionProviderFactory_NGraph(const char* ng_backend_type); std::shared_ptr CreateExecutionProviderFactory_Nuphar(int device_id, const char*); std::shared_ptr CreateExecutionProviderFactory_BrainSlice(uint32_t ip, int, int, bool, const char*, const char*, const char*); +std::shared_ptr CreateExecutionProviderFactory_Nnapi(); std::shared_ptr CreateExecutionProviderFactory_Tensorrt(); std::shared_ptr CreateExecutionProviderFactory_OpenVINO(const char* device_id); @@ -80,5 +81,13 @@ std::unique_ptr DefaultBrainSliceExecutionProvider() { #endif } +std::unique_ptr DefaultNnapiExecutionProvider() { +#ifdef USE_NNAPI + return CreateExecutionProviderFactory_Nnapi()->CreateProvider(); +#else + return nullptr; +#endif +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/util/include/default_providers.h b/onnxruntime/test/util/include/default_providers.h index 77db086b8a77e..c5369b4def58a 100644 --- a/onnxruntime/test/util/include/default_providers.h +++ b/onnxruntime/test/util/include/default_providers.h @@ -15,6 +15,7 @@ std::unique_ptr DefaultNupharExecutionProvider(); std::unique_ptr DefaultBrainSliceExecutionProvider(); std::unique_ptr DefaultTensorrtExecutionProvider(); std::unique_ptr DefaultOpenVINOExecutionProvider(); +std::unique_ptr DefaultNnapiExecutionProvider(); } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/util/include/providers.h b/onnxruntime/test/util/include/providers.h index 296897cca7eca..9cc04a360acd4 100644 --- a/onnxruntime/test/util/include/providers.h +++ b/onnxruntime/test/util/include/providers.h @@ -24,4 +24,7 @@ #endif #ifdef USE_OPENVINO #include "core/providers/openvino/openvino_provider_factory.h" -#endif \ No newline at end of file +#endif +#ifdef USE_NNAPI +#include "core/providers/nnapi/nnapi_provider_factory.h" +#endif diff --git a/onnxruntime/test/util/test_allocator.cc b/onnxruntime/test/util/test_allocator.cc index 97f8fd5c4de99..3a91d9d52fae7 100644 --- a/onnxruntime/test/util/test_allocator.cc +++ b/onnxruntime/test/util/test_allocator.cc @@ -8,7 +8,7 @@ MockedOrtAllocator::MockedOrtAllocator() { OrtAllocator::Alloc = [](OrtAllocator* this_, size_t size) { return static_cast(this_)->Alloc(size); }; OrtAllocator::Free = [](OrtAllocator* this_, void* p) { static_cast(this_)->Free(p); }; OrtAllocator::Info = [](const OrtAllocator* this_) { return static_cast(this_)->Info(); }; - ORT_THROW_ON_ERROR(OrtCreateAllocatorInfo("Cpu", 
OrtDeviceAllocator, 0, OrtMemTypeDefault, &cpuAllocatorInfo)); + ORT_THROW_ON_ERROR(OrtCreateCpuAllocatorInfo(OrtDeviceAllocator, OrtMemTypeDefault, &cpuAllocatorInfo)); } MockedOrtAllocator::~MockedOrtAllocator() { diff --git a/package/rpm/onnxruntime.spec b/package/rpm/onnxruntime.spec index 6fbdec8851c9b..cbf7013ca5f0f 100644 --- a/package/rpm/onnxruntime.spec +++ b/package/rpm/onnxruntime.spec @@ -1,5 +1,5 @@ Name: onnxruntime -Version: 0.4.0 +Version: 0.5.0 Release: 1%{?dist} Summary: onnxruntime diff --git a/rename_manylinux.sh b/rename_manylinux.sh deleted file mode 100755 index 5634a5ef442e4..0000000000000 --- a/rename_manylinux.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env bash - -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. - -# hack script to modify whl as manylinux whl -whl=(*whl) -renamed_whl=`echo $whl | sed --expression='s/linux/manylinux1/g'` -basename=`echo $whl | awk -F'-cp3' '{print $1}'` -unzip $whl -sed -i 's/linux/manylinux1/g' ${basename}.dist-info/WHEEL -# explicitly set file perms -chmod 664 ${basename}.dist-info/* -zip -r $renamed_whl ${basename}.data ${basename}.dist-info diff --git a/samples/c_cxx/CMakeLists.txt b/samples/c_cxx/CMakeLists.txt new file mode 100644 index 0000000000000..e477bffe79b49 --- /dev/null +++ b/samples/c_cxx/CMakeLists.txt @@ -0,0 +1,82 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +cmake_minimum_required(VERSION 3.13) + +# Project +project(onnxruntime_samples C CXX) +if (WIN32) + string(APPEND CMAKE_CXX_FLAGS " /W4") +else() + string(APPEND CMAKE_CXX_FLAGS " -Wall -Wextra") + string(APPEND CMAKE_C_FLAGS " -Wall -Wextra") +endif() + +#onnxruntime providers +option(onnxruntime_USE_CUDA "Build with CUDA support" OFF) +option(onnxruntime_USE_OPENVINO "Build with OpenVINO support" OFF) +option(onnxruntime_USE_NNAPI "Build with DNNLibrary for Android NNAPI support" OFF) +option(onnxruntime_USE_MKLDNN "Build with MKL-DNN support" OFF) +option(onnxruntime_USE_NGRAPH "Build with nGraph support" OFF) +option(onnxruntime_USE_NUPHAR "Build with Nuphar" OFF) +option(onnxruntime_USE_BRAINSLICE "Build with BrainSlice" OFF) +option(onnxruntime_USE_TENSORRT "Build with TensorRT support" OFF) +option(LIBPNG_ROOTDIR "libpng root dir") + +#if JPEG lib is available, we'll use it for image decoding, otherwise we'll use WIC +find_package(JPEG) +if(LIBPNG_ROOTDIR) + set(PNG_FOUND true) + if(WIN32) + set(PNG_LIBRARIES debug libpng16_d optimized libpng16) + else() + set(PNG_LIBRARIES png16) + endif() + set(PNG_INCLUDE_DIRS "${LIBPNG_ROOTDIR}/include") + set(PNG_LIBDIR "${LIBPNG_ROOTDIR}/lib") +else() + find_package(PNG) +endif() + +if(onnxruntime_USE_CUDA) + add_definitions(-DUSE_CUDA) +endif() +if(onnxruntime_USE_OPENVINO) + add_definitions(-DUSE_OPENVINO) +endif() +if(onnxruntime_USE_NNAPI) + add_definitions(-DUSE_NNAPI) +endif() +if(onnxruntime_USE_NNAPI) + add_definitions(-DUSE_NNAPI) +endif() +if(onnxruntime_USE_MKLDNN) + add_definitions(-DUSE_MKLDNN) +endif() +if(onnxruntime_USE_NGRAPH) + add_definitions(-DUSE_NGRAPH) +endif() +if(onnxruntime_USE_NUPHAR) + add_definitions(-DUSE_NUPHAR) +endif() +if(onnxruntime_USE_BRAINSLICE) + add_definitions(-DUSE_BRAINSLICE) +endif() +if(onnxruntime_USE_TENSORRT) + add_definitions(-DUSE_TENSORRT) +endif() + +#TODO: remove this part(need to talk to Ryan and find a solution for this) +if(UNIX) + include_directories("/usr/local/include/onnxruntime") +else() + include_directories("C:\\Program Files 
(x86)\\onnxruntime\\include" "C:\\Program Files (x86)\\onnxruntime\\include\\onnxruntime" "C:\\Program Files (x86)\\onnxruntime\\include\\onnxruntime\\core\\session") + link_directories("C:\\Program Files (x86)\\onnxruntime\\lib") +endif() + + +add_subdirectory(imagenet) +if(PNG_FOUND) + add_subdirectory(fns_candy_style_transfer) +endif() +add_subdirectory(MNIST) diff --git a/samples/c_cxx/MNIST/CMakeLists.txt b/samples/c_cxx/MNIST/CMakeLists.txt new file mode 100644 index 0000000000000..ef22be798ed3c --- /dev/null +++ b/samples/c_cxx/MNIST/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +cmake_minimum_required(VERSION 3.13) + +add_executable(mnist MNIST.cpp) + +target_link_options(mnist PRIVATE "/SUBSYSTEM:WINDOWS") \ No newline at end of file diff --git a/samples/c_cxx/MNIST/MNIST.cpp b/samples/c_cxx/MNIST/MNIST.cpp new file mode 100644 index 0000000000000..2df76d4999bf2 --- /dev/null +++ b/samples/c_cxx/MNIST/MNIST.cpp @@ -0,0 +1,229 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +#define UNICODE +#include +#include +#include + +#pragma comment(lib, "user32.lib") +#pragma comment(lib, "gdi32.lib") +#pragma comment(lib, "onnxruntime.lib") + +Ort::Env env{ORT_LOGGING_LEVEL_WARNING, "test"}; + +// This is the structure to interface with the MNIST model +// After instantiation, set the input_image_ data to be the 28x28 pixel image of the number to recognize +// Then call Run() to fill in the results_ data with the probabilities of each +// result_ holds the index with highest probability (aka the number the model thinks is in the image) +struct MNIST { + MNIST() { + auto allocator_info = Ort::AllocatorInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU); + input_tensor_ = Ort::Value::CreateTensor(allocator_info, input_image_.data(), input_image_.size(), input_shape_.data(), input_shape_.size()); + output_tensor_ = Ort::Value::CreateTensor(allocator_info, results_.data(), results_.size(), output_shape_.data(), output_shape_.size()); + } + + std::ptrdiff_t Run() { + const char* input_names[] = {"Input3"}; + const char* output_names[] = {"Plus214_Output_0"}; + + session_.Run(Ort::RunOptions{nullptr}, input_names, &input_tensor_, 1, output_names, &output_tensor_, 1); + + result_ = std::distance(results_.begin(), std::max_element(results_.begin(), results_.end())); + return result_; + } + + static constexpr const int width_ = 28; + static constexpr const int height_ = 28; + + std::array input_image_{}; + std::array results_{}; + int result_{0}; + + private: + Ort::Session session_{env, L"model.onnx", Ort::SessionOptions{nullptr}}; + + Ort::Value input_tensor_{nullptr}; + std::array input_shape_{1, 1, width_, height_}; + + Ort::Value output_tensor_{nullptr}; + std::array output_shape_{1, 10}; +}; + +const constexpr int drawing_area_inset_{4}; // Number of pixels to inset the top left of the drawing area +const constexpr int drawing_area_scale_{4}; // Number of times larger to make the drawing area compared to the shape inputs +const constexpr int drawing_area_width_{MNIST::width_ * drawing_area_scale_}; +const constexpr int drawing_area_height_{MNIST::height_ * drawing_area_scale_}; + +MNIST mnist_; +HBITMAP dib_; +HDC hdc_dib_; +bool painting_{}; + +HBRUSH brush_winner_{CreateSolidBrush(RGB(128, 255, 128))}; +HBRUSH brush_bars_{CreateSolidBrush(RGB(128, 128, 255))}; + +struct DIBInfo : DIBSECTION { + DIBInfo(HBITMAP hBitmap) noexcept { ::GetObject(hBitmap, 
sizeof(DIBSECTION), this); } + + int Width() const noexcept { return dsBm.bmWidth; } + int Height() const noexcept { return dsBm.bmHeight; } + + void* Bits() const noexcept { return dsBm.bmBits; } + int Pitch() const noexcept { return dsBmih.biSizeImage / abs(dsBmih.biHeight); } +}; + +// We need to convert the true-color data in the DIB into the model's floating point format +// TODO: (also scales down the image and smooths the values, but this is not working properly) +void ConvertDibToMnist() { + DIBInfo info{dib_}; + + const DWORD* input = reinterpret_cast(info.Bits()); + float* output = mnist_.input_image_.data(); + + std::fill(mnist_.input_image_.begin(), mnist_.input_image_.end(), 0.f); + + for (unsigned y = 0; y < MNIST::height_; y++) { + for (unsigned x = 0; x < MNIST::width_; x++) { + output[x] += input[x] == 0 ? 1.0f : 0.0f; + } + input = reinterpret_cast(reinterpret_cast(input) + info.Pitch()); + output += MNIST::width_; + } +} + +LRESULT CALLBACK WndProc(HWND, UINT, WPARAM, LPARAM); + +// The Windows entry point function +int APIENTRY wWinMain(_In_ HINSTANCE hInstance, _In_opt_ HINSTANCE hPrevInstance, _In_ LPTSTR lpCmdLine, + _In_ int nCmdShow) { + { + WNDCLASSEX wc{}; + wc.cbSize = sizeof(WNDCLASSEX); + wc.style = CS_HREDRAW | CS_VREDRAW; + wc.lpfnWndProc = WndProc; + wc.hInstance = hInstance; + wc.hCursor = LoadCursor(NULL, IDC_ARROW); + wc.hbrBackground = (HBRUSH)(COLOR_WINDOW + 1); + wc.lpszClassName = L"ONNXTest"; + RegisterClassEx(&wc); + } + { + BITMAPINFO bmi{}; + bmi.bmiHeader.biSize = sizeof(bmi.bmiHeader); + bmi.bmiHeader.biWidth = MNIST::width_; + bmi.bmiHeader.biHeight = -MNIST::height_; + bmi.bmiHeader.biPlanes = 1; + bmi.bmiHeader.biBitCount = 32; + bmi.bmiHeader.biPlanes = 1; + bmi.bmiHeader.biCompression = BI_RGB; + + void* bits; + dib_ = CreateDIBSection(nullptr, &bmi, DIB_RGB_COLORS, &bits, nullptr, 0); + } + if (dib_ == nullptr) return -1; + hdc_dib_ = CreateCompatibleDC(nullptr); + SelectObject(hdc_dib_, dib_); + SelectObject(hdc_dib_, CreatePen(PS_SOLID, 2, RGB(0, 0, 0))); + FillRect(hdc_dib_, &RECT{0, 0, MNIST::width_, MNIST::height_}, (HBRUSH)GetStockObject(WHITE_BRUSH)); + + HWND hWnd = CreateWindow(L"ONNXTest", L"ONNX Runtime Sample - MNIST", WS_OVERLAPPEDWINDOW, CW_USEDEFAULT, CW_USEDEFAULT, 512, 256, nullptr, nullptr, hInstance, nullptr); + if (!hWnd) + return FALSE; + + ShowWindow(hWnd, nCmdShow); + + MSG msg; + while (GetMessage(&msg, NULL, 0, 0)) { + TranslateMessage(&msg); + DispatchMessage(&msg); + } + return (int)msg.wParam; +} + +LRESULT CALLBACK WndProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam) { + switch (message) { + case WM_PAINT: { + PAINTSTRUCT ps; + HDC hdc = BeginPaint(hWnd, &ps); + + // Draw the image + StretchBlt(hdc, drawing_area_inset_, drawing_area_inset_, drawing_area_width_, drawing_area_height_, hdc_dib_, 0, 0, MNIST::width_, MNIST::height_, SRCCOPY); + SelectObject(hdc, GetStockObject(BLACK_PEN)); + SelectObject(hdc, GetStockObject(NULL_BRUSH)); + Rectangle(hdc, drawing_area_inset_, drawing_area_inset_, drawing_area_inset_ + drawing_area_width_, drawing_area_inset_ + drawing_area_height_); + + constexpr int graphs_left = drawing_area_inset_ + drawing_area_width_ + 5; + constexpr int graph_width = 64; + SelectObject(hdc, brush_bars_); + + auto least = *std::min_element(mnist_.results_.begin(), mnist_.results_.end()); + auto greatest = mnist_.results_[mnist_.result_]; + auto range = greatest - least; + + auto graphs_zero = graphs_left - least * graph_width / range; + + // Hilight the winner + RECT rc{graphs_left, 
mnist_.result_ * 16, graphs_left + graph_width + 128, (mnist_.result_ + 1) * 16}; + FillRect(hdc, &rc, brush_winner_); + + // For every entry, draw the odds and the graph for it + SetBkMode(hdc, TRANSPARENT); + wchar_t value[80]; + for (unsigned i = 0; i < 10; i++) { + int y = 16 * i; + float result = mnist_.results_[i]; + + auto length = wsprintf(value, L"%2d: %d.%02d", i, int(result), abs(int(result * 100) % 100)); + TextOut(hdc, graphs_left + graph_width + 5, y, value, length); + + Rectangle(hdc, graphs_zero, y + 1, graphs_zero + result * graph_width / range, y + 14); + } + + // Draw the zero line + MoveToEx(hdc, graphs_zero, 0, nullptr); + LineTo(hdc, graphs_zero, 16 * 10); + + EndPaint(hWnd, &ps); + return 0; + } + + case WM_LBUTTONDOWN: { + SetCapture(hWnd); + painting_ = true; + int x = (GET_X_LPARAM(lParam) - drawing_area_inset_) / drawing_area_scale_; + int y = (GET_Y_LPARAM(lParam) - drawing_area_inset_) / drawing_area_scale_; + MoveToEx(hdc_dib_, x, y, nullptr); + return 0; + } + + case WM_MOUSEMOVE: + if (painting_) { + int x = (GET_X_LPARAM(lParam) - drawing_area_inset_) / drawing_area_scale_; + int y = (GET_Y_LPARAM(lParam) - drawing_area_inset_) / drawing_area_scale_; + LineTo(hdc_dib_, x, y); + InvalidateRect(hWnd, nullptr, false); + } + return 0; + + case WM_CAPTURECHANGED: + painting_ = false; + return 0; + + case WM_LBUTTONUP: + ReleaseCapture(); + ConvertDibToMnist(); + mnist_.Run(); + InvalidateRect(hWnd, nullptr, true); + return 0; + + case WM_RBUTTONDOWN: // Erase the image + FillRect(hdc_dib_, &RECT{0, 0, MNIST::width_, MNIST::height_}, (HBRUSH)GetStockObject(WHITE_BRUSH)); + InvalidateRect(hWnd, nullptr, false); + return 0; + + case WM_DESTROY: + PostQuitMessage(0); + return 0; + } + return DefWindowProc(hWnd, message, wParam, lParam); +} diff --git a/samples/c_cxx/MNIST/ReadMe.md b/samples/c_cxx/MNIST/ReadMe.md new file mode 100644 index 0000000000000..043a63dbc9e17 --- /dev/null +++ b/samples/c_cxx/MNIST/ReadMe.md @@ -0,0 +1,66 @@ +# MNIST Sample - Number recognition + +This sample uses the MNIST model from the Model Zoo: https://github.com/onnx/models/tree/master/mnist + +![Screenshot](Screenshot.png) + +## Requirements + +Compiled Onnxruntime.dll / lib (link to instructions on how to build dll) +Windows Visual Studio Compiler (cl.exe) + +## Build + +Run 'build.bat' in this directory to call cl.exe to generate MNIST.exe +Then just run MNIST.exe + +## How to use it + +Just draw a number with the left mouse button (or use touch) in the box on the left side. After releasing the mouse button the model will be run and the outputs of the model will be displayed. Note that when drawing numbers requiring multiple drawing strokes, the model will be run at the end of each stroke with probably wrong predictions (but it's amusing to see and avoids needing to press a 'run model' button). + +To clear the image, click the right mouse button anywhere. + +## How it works + +A single Ort::Env is created globally to initialize the runtime. +https://github.com/microsoft/onnxruntime/blob/521dc757984fbf9770d0051997178fbb9565cd52/samples/c_cxx/MNIST/MNIST.cpp#L12 + +The MNIST structure abstracts away all of the interaction with the Onnx Runtime, creating the tensors, and running the model. + +WWinMain is the Windows entry point, it creates the main window. 
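+
+As a quick orientation, here is a minimal, hypothetical sketch (not part of the sample itself) of how the MNIST helper defined in MNIST.cpp is driven; the member and method names are the ones declared in that struct:
+
+```cpp
+MNIST mnist;                                                           // constructs the Ort::Session and binds the input/output tensors
+std::fill(mnist.input_image_.begin(), mnist.input_image_.end(), 0.0f); // clear the 28x28 input image
+// ... rasterize the user's strokes into mnist.input_image_ here ...
+std::ptrdiff_t best = mnist.Run();                                     // runs the model; returns the index of the highest score
+float score = mnist.results_[best];                                    // raw (unnormalized) weight of the winning digit
+```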
+
+WndProc is the window procedure for the window, handling the mouse input and drawing the graphics.
+
+### Preprocessing the data
+
+MNIST's input is a {1,1,28,28} shaped float tensor, which is basically a 28x28 floating point grayscale image (0.0 = background, 1.0 = foreground).
+
+The sample stores the image in a 32-bit-per-pixel Windows DIB section, since that's easy to draw into and to draw to the screen on Windows. The DIB is created here:
+https://github.com/microsoft/onnxruntime/blob/521dc757984fbf9770d0051997178fbb9565cd52/samples/c_cxx/MNIST/MNIST.cpp#L109-L121
+
+The function that converts the DIB data and writes it into the model's input tensor is here:
+https://github.com/microsoft/onnxruntime/blob/521dc757984fbf9770d0051997178fbb9565cd52/samples/c_cxx/MNIST/MNIST.cpp#L77-L92
+
+### Postprocessing the output
+
+MNIST's output is a simple {1,10} float tensor that holds the likelihood weights per number. The number with the highest value is the model's best guess.
+
+The MNIST structure uses std::max_element to find it and stores it in result_:
+https://github.com/microsoft/onnxruntime/blob/521dc757984fbf9770d0051997178fbb9565cd52/samples/c_cxx/MNIST/MNIST.cpp#L31
+
+To make things more interesting, the window painting handler graphs the probabilities and shows the weights here:
+https://github.com/microsoft/onnxruntime/blob/521dc757984fbf9770d0051997178fbb9565cd52/samples/c_cxx/MNIST/MNIST.cpp#L164-L183
+
+### The Ort::Session
+
+1. Creation: The Ort::Session is created inside the MNIST structure here:
+https://github.com/microsoft/onnxruntime/blob/521dc757984fbf9770d0051997178fbb9565cd52/samples/c_cxx/MNIST/MNIST.cpp#L43
+
+2. Setup inputs & outputs: The input & output tensors are created here:
+https://github.com/microsoft/onnxruntime/blob/521dc757984fbf9770d0051997178fbb9565cd52/samples/c_cxx/MNIST/MNIST.cpp#L19-L23
+In this usage, we're providing the memory location for the data instead of having Ort allocate the buffers. This is simpler in this case since the buffers are small and can just be fixed members of the MNIST struct.
+
+3. Run: Running the session is done in the Run() method:
+https://github.com/microsoft/onnxruntime/blob/521dc757984fbf9770d0051997178fbb9565cd52/samples/c_cxx/MNIST/MNIST.cpp#L25-L33
+
+
diff --git a/samples/c_cxx/MNIST/Screenshot.png b/samples/c_cxx/MNIST/Screenshot.png
new file mode 100644
index 0000000000000..4c4ea23007e54
Binary files /dev/null and b/samples/c_cxx/MNIST/Screenshot.png differ
diff --git a/samples/c_cxx/MNIST/build.bat b/samples/c_cxx/MNIST/build.bat
new file mode 100644
index 0000000000000..eba19ffbd1926
--- /dev/null
+++ b/samples/c_cxx/MNIST/build.bat
@@ -0,0 +1 @@
+cl MNIST.cpp /Zi /EHsc /I..\..\..\include\onnxruntime\core\session /link /LIBPATH:..\..\..\build\Windows\Debug\Debug
\ No newline at end of file
diff --git a/samples/c_cxx/README.md b/samples/c_cxx/README.md
new file mode 100644
index 0000000000000..801311a3bde9b
--- /dev/null
+++ b/samples/c_cxx/README.md
@@ -0,0 +1,39 @@
+This directory contains a few C/C++ sample applications that demo onnxruntime usage:
+
+1. fns_candy_style_transfer: A C application that uses the FNS-Candy style transfer model to re-style images.
+2. MNIST: A Windows GUI application for handwriting recognition.
+3. imagenet: An end-to-end sample for the [ImageNet Large Scale Visual Recognition Challenge 2012](http://www.image-net.org/challenges/LSVRC/2012/)
+
+# How to build
+
+## Prerequisites
+1. Visual Studio 2015/2017/2019
+2. cmake (version >= 3.13)
+3. 
(optional) [libpng 1.6](http://www.libpng.org/pub/png/libpng.html)
+
+You may get a precompiled libpng library from [https://onnxruntimetestdata.blob.core.windows.net/models/libpng.zip](https://onnxruntimetestdata.blob.core.windows.net/models/libpng.zip)
+
+## Install ONNX Runtime
+You may either get a prebuilt onnxruntime from nuget.org, or build it from source by following the [BUILD.md document](../../../BUILD.md).
+If you build it yourself, you must append the "--build_shared_lib" flag to your build command, like this:
+
+```
+build.bat --config RelWithDebInfo --build_shared_lib --parallel
+```
+When the build is done, run Visual Studio as administrator and open the onnxruntime.sln file in your build directory.
+![vs.png](vs.png)
+
+When the solution is loaded, change the build configuration to "RelWithDebInfo" (which must match your previous build command), then select the "INSTALL" project and build it. It will install onnxruntime to "C:\Program Files (x86)\onnxruntime".
+
+## Build the samples
+Open cmd.exe, change your current directory to samples\c_cxx, then run:
+```bat
+mkdir build
+cd build
+cmake .. -A x64 -T host=x64 -DLIBPNG_ROOTDIR=C:\path\to\your\libpng\binary
+```
+You may omit the "-DLIBPNG_ROOTDIR=..." argument if you don't have the libpng library.
+You may append "-Donnxruntime_USE_CUDA=ON" to the cmake arguments if your onnxruntime binary was built with CUDA support.
+
+Then you can open the onnxruntime_samples.sln file in the "build" directory and build the solution.
+
+
diff --git a/samples/c_cxx/fns_candy_style_transfer/CMakeLists.txt b/samples/c_cxx/fns_candy_style_transfer/CMakeLists.txt
new file mode 100644
index 0000000000000..8f4edc59b9954
--- /dev/null
+++ b/samples/c_cxx/fns_candy_style_transfer/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+add_executable(fns_candy_style_transfer "fns_candy_style_transfer.c")
+target_include_directories(fns_candy_style_transfer PRIVATE ${PROJECT_SOURCE_DIR}/include ${PNG_INCLUDE_DIRS})
+target_link_libraries(fns_candy_style_transfer PRIVATE onnxruntime ${PNG_LIBRARIES})
+if(PNG_LIBDIR)
+  target_link_directories(fns_candy_style_transfer PRIVATE ${PNG_LIBDIR})
+endif()
\ No newline at end of file
diff --git a/samples/c_cxx/fns_candy_style_transfer/README.md b/samples/c_cxx/fns_candy_style_transfer/README.md
new file mode 100644
index 0000000000000..4211aa8d1ec59
--- /dev/null
+++ b/samples/c_cxx/fns_candy_style_transfer/README.md
@@ -0,0 +1,19 @@
+# Build
+See [../README.md](../README.md)
+
+# Prepare data
+Please download the model from [candy.onnx](https://raw.githubusercontent.com/microsoft/Windows-Machine-Learning/master/Samples/FNSCandyStyleTransfer/UWP/cs/Assets/candy.onnx)
+
+Then prepare an input image:
+1. In PNG format
+2. With dimensions of 720x720
+
+# Run
+```
+fns_candy_style_transfer.exe <model_path> <input_png_file> <output_png_file>
+```
+
+
+
+
+
diff --git a/samples/c_cxx/fns_candy_style_transfer/fns_candy_style_transfer.c b/samples/c_cxx/fns_candy_style_transfer/fns_candy_style_transfer.c
new file mode 100644
index 0000000000000..5c191ca540286
--- /dev/null
+++ b/samples/c_cxx/fns_candy_style_transfer/fns_candy_style_transfer.c
@@ -0,0 +1,257 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+#include "onnxruntime/core/session/onnxruntime_c_api.h" +#include "providers.h" +#include +#include +#include +#ifdef _WIN32 +#include +#endif +#define ORT_ABORT_ON_ERROR(expr) \ + do { \ + OrtStatus* onnx_status = (expr); \ + if (onnx_status != NULL) { \ + const char* msg = OrtGetErrorMessage(onnx_status); \ + fprintf(stderr, "%s\n", msg); \ + OrtReleaseStatus(onnx_status); \ + abort(); \ + } \ + } while (0); + +/** + * convert input from HWC format to CHW format + * \param input A single image. The byte array has length of 3*h*w + * \param h image height + * \param w image width + * \param output A float array. should be freed by caller after use + * \param output_count Array length of the `output` param + */ +static void hwc_to_chw(const png_byte* input, size_t h, size_t w, float** output, size_t* output_count) { + size_t stride = h * w; + *output_count = stride * 3; + float* output_data = (float*)malloc(*output_count * sizeof(float)); + for (size_t i = 0; i != stride; ++i) { + for (size_t c = 0; c != 3; ++c) { + output_data[c * stride + i] = input[i * 3 + c]; + } + } + *output = output_data; +} + +/** + * convert input from CHW format to HWC format + * \param input A single image. This float array has length of 3*h*w + * \param h image height + * \param w image width + * \param output A byte array. should be freed by caller after use + */ +static void chw_to_hwc(const float* input, size_t h, size_t w, png_bytep* output) { + size_t stride = h * w; + png_bytep output_data = (png_bytep)malloc(stride * 3); + for (int c = 0; c != 3; ++c) { + size_t t = c * stride; + for (size_t i = 0; i != stride; ++i) { + float f = input[t + i]; + if (f < 0.f || f > 255.0f) f = 0; + output_data[i * 3 + c] = (png_byte)f; + } + } + *output = output_data; +} + +/** + * \param out should be freed by caller after use + * \param output_count Array length of the `out` param + */ +static int read_png_file(const char* input_file, size_t* height, size_t* width, float** out, size_t* output_count) { + png_image image; /* The control structure used by libpng */ + /* Initialize the 'png_image' structure. 
*/ + memset(&image, 0, (sizeof image)); + image.version = PNG_IMAGE_VERSION; + if (png_image_begin_read_from_file(&image, input_file) == 0) { + return -1; + } + png_bytep buffer; + image.format = PNG_FORMAT_BGR; + size_t input_data_length = PNG_IMAGE_SIZE(image); + if (input_data_length != 720 * 720 * 3) { + printf("input_data_length:%zd\n", input_data_length); + return -1; + } + buffer = (png_bytep)malloc(input_data_length); + memset(buffer, 0, input_data_length); + if (png_image_finish_read(&image, NULL /*background*/, buffer, 0 /*row_stride*/, NULL /*colormap*/) == 0) { + return -1; + } + hwc_to_chw(buffer, image.height, image.width, out, output_count); + free(buffer); + *width = image.width; + *height = image.height; + return 0; +} + +/** + * \param tensor should be a float tensor in [N,C,H,W] format + */ +static int write_tensor_to_png_file(OrtValue* tensor, const char* output_file) { + struct OrtTensorTypeAndShapeInfo* shape_info; + ORT_ABORT_ON_ERROR(OrtGetTensorTypeAndShape(tensor, &shape_info)); + size_t dim_count; + ORT_ABORT_ON_ERROR(OrtGetDimensionsCount(shape_info, &dim_count)); + if (dim_count != 4) { + printf("output tensor must have 4 dimensions"); + return -1; + } + int64_t dims[4]; + ORT_ABORT_ON_ERROR(OrtGetDimensions(shape_info, dims, sizeof(dims) / sizeof(dims[0]))); + if (dims[0] != 1 || dims[1] != 3) { + printf("output tensor shape error"); + return -1; + } + float* f; + ORT_ABORT_ON_ERROR(OrtGetTensorMutableData(tensor, (void**)&f)); + png_bytep model_output_bytes; + png_image image; + memset(&image, 0, (sizeof image)); + image.version = PNG_IMAGE_VERSION; + image.format = PNG_FORMAT_BGR; + image.height = dims[2]; + image.width = dims[3]; + chw_to_hwc(f, image.height, image.width, &model_output_bytes); + int ret = 0; + if (png_image_write_to_file(&image, output_file, 0 /*convert_to_8bit*/, model_output_bytes, 0 /*row_stride*/, + NULL /*colormap*/) == 0) { + printf("write to '%s' failed:%s\n", output_file, image.message); + ret = -1; + } + free(model_output_bytes); + return ret; +} + +static void usage() { printf("usage: \n"); } + +static char* convert_string(const wchar_t* input) { + size_t src_len = wcslen(input) + 1; + if (src_len > INT_MAX) { + printf("size overflow\n"); + abort(); + } + const int len = WideCharToMultiByte(CP_ACP, 0, input, (int)src_len, NULL, 0, NULL, NULL); + assert(len > 0); + char* ret = (char*)malloc(len); + assert(ret != NULL); + const int r = WideCharToMultiByte(CP_ACP, 0, input, (int)src_len, ret, len, NULL, NULL); + assert(len == r); + return ret; +} + +int run_inference(OrtSession* session, const ORTCHAR_T* input_file, const ORTCHAR_T* output_file) { + size_t input_height; + size_t input_width; + float* model_input; + size_t model_input_ele_count; +#ifdef _WIN32 + char* output_file_p = convert_string(output_file); + char* input_file_p = convert_string(input_file); +#else + char* input_file_p = input_file; +#endif + if (read_png_file(input_file_p, &input_height, &input_width, &model_input, &model_input_ele_count) != 0) { + return -1; + } + if (input_height != 720 || input_width != 720) { + printf("please resize to image to 720x720\n"); + free(model_input); + return -1; + } + OrtAllocatorInfo* allocator_info; + ORT_ABORT_ON_ERROR(OrtCreateCpuAllocatorInfo(OrtArenaAllocator, OrtMemTypeDefault, &allocator_info)); + const int64_t input_shape[] = {1, 3, 720, 720}; + const size_t input_shape_len = sizeof(input_shape) / sizeof(input_shape[0]); + const size_t model_input_len = model_input_ele_count * sizeof(float); + + OrtValue* input_tensor = 
NULL; + ORT_ABORT_ON_ERROR(OrtCreateTensorWithDataAsOrtValue(allocator_info, model_input, model_input_len, input_shape, + input_shape_len, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, + &input_tensor)); + assert(input_tensor != NULL); + int is_tensor; + ORT_ABORT_ON_ERROR(OrtIsTensor(input_tensor, &is_tensor)); + assert(is_tensor); + OrtReleaseAllocatorInfo(allocator_info); + const char* input_names[] = {"inputImage"}; + const char* output_names[] = {"outputImage"}; + OrtValue* output_tensor = NULL; + ORT_ABORT_ON_ERROR( + OrtRun(session, NULL, input_names, (const OrtValue* const*)&input_tensor, 1, output_names, 1, &output_tensor)); + assert(output_tensor != NULL); + ORT_ABORT_ON_ERROR(OrtIsTensor(output_tensor, &is_tensor)); + assert(is_tensor); + int ret = 0; + if (write_tensor_to_png_file(output_tensor, output_file_p) != 0) { + ret = -1; + } + OrtReleaseValue(output_tensor); + OrtReleaseValue(input_tensor); + free(model_input); +#ifdef _WIN32 + free(input_file_p); + free(output_file_p); +#endif // _WIN32 + return ret; +} + +void verify_input_output_count(OrtSession* session) { + size_t count; + ORT_ABORT_ON_ERROR(OrtSessionGetInputCount(session, &count)); + assert(count == 1); + ORT_ABORT_ON_ERROR(OrtSessionGetOutputCount(session, &count)); + assert(count == 1); +} + +#ifdef USE_CUDA +void enable_cuda(OrtSessionOptions* session_options) { + ORT_ABORT_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0)); +} +#endif + +#ifdef _WIN32 +int wmain(int argc, wchar_t* argv[]) { +#else +int main(int argc, char* argv[]) { +#endif + if (argc < 4) { + usage(); + return -1; + } +#ifdef _WIN32 + //CoInitializeEx is only needed if Windows Image Component will be used in this program for image loading/saving. + HRESULT hr = CoInitializeEx(NULL, COINIT_MULTITHREADED); + if (!SUCCEEDED(hr)) return -1; +#endif + ORTCHAR_T* model_path = argv[1]; + ORTCHAR_T* input_file = argv[2]; + ORTCHAR_T* output_file = argv[3]; + OrtEnv* env; + ORT_ABORT_ON_ERROR(OrtCreateEnv(ORT_LOGGING_LEVEL_WARNING, "test", &env)); + OrtSessionOptions* session_options; + ORT_ABORT_ON_ERROR(OrtCreateSessionOptions(&session_options)); +#ifdef USE_CUDA + enable_cuda(session_options); +#endif + OrtSession* session; + ORT_ABORT_ON_ERROR(OrtCreateSession(env, model_path, session_options, &session)); + verify_input_output_count(session); + int ret = run_inference(session, input_file, output_file); + OrtReleaseSessionOptions(session_options); + OrtReleaseSession(session); + OrtReleaseEnv(env); + if (ret != 0) { + fprintf(stderr, "fail\n"); + } +#ifdef _WIN32 + CoUninitialize(); +#endif + return ret; +} diff --git a/samples/c_cxx/imagenet/CMakeLists.txt b/samples/c_cxx/imagenet/CMakeLists.txt new file mode 100644 index 0000000000000..7b2e371144e7c --- /dev/null +++ b/samples/c_cxx/imagenet/CMakeLists.txt @@ -0,0 +1,34 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+ +set(FS_SOURCES local_filesystem.h sync_api.h controller.h controller.cc) +if(WIN32) + LIST(APPEND FS_SOURCES local_filesystem_win.cc sync_api_win.cc) +else() + LIST(APPEND FS_SOURCES local_filesystem_posix.cc sync_api_posix.cc) +endif() +add_library(slim_fs_lib ${FS_SOURCES}) +if(WIN32) + target_compile_definitions(slim_fs_lib PRIVATE WIN32_LEAN_AND_MEAN NOMINMAX) +endif() + +if(JPEG_FOUND) + SET(IMAGE_SRC jpeg_handle.cc jpeg_handle.h jpeg_mem.cc jpeg_mem.h image_loader_libjpeg.cc) +elseif(WIN32) + SET(IMAGE_SRC image_loader_wic.cc) +endif() +add_executable(image_classifier main.cc runnable_task.h data_processing.h ${IMAGE_SRC} + async_ring_buffer.h image_loader.cc image_loader.h cached_interpolation.h single_consumer.h) +if(JPEG_FOUND) + target_compile_definitions(image_classifier PRIVATE HAVE_JPEG) + SET(IMAGE_HEADERS ${JPEG_INCLUDE_DIR}) + SET(IMAGE_LIBS ${JPEG_LIBRARIES}) +endif() +target_include_directories(image_classifier PRIVATE ${PROJECT_SOURCE_DIR}/include ${IMAGE_HEADERS}) +if(WIN32) + target_compile_definitions(image_classifier PRIVATE WIN32_LEAN_AND_MEAN NOMINMAX) +endif() +target_link_libraries(image_classifier PRIVATE onnxruntime slim_fs_lib ${IMAGE_LIBS}) + + + diff --git a/samples/c_cxx/imagenet/README.md b/samples/c_cxx/imagenet/README.md new file mode 100644 index 0000000000000..ca494dc5ff5ca --- /dev/null +++ b/samples/c_cxx/imagenet/README.md @@ -0,0 +1,66 @@ +# Overview + + + +![taskflow](taskflow.png) + +WARNING: If you want to train the model by yourself, you need at least 500GB disk space and a powerful NVIDIA GPU. + +# Install tensorflow +Install Python 3.x from [python.org](https://www.python.org/), then execute +``` +pip install --upgrade tensorflow +``` +For more information, see [Install Tensorflow](https://www.tensorflow.org/install) + +# Get the Imagenet dataset +We need the [ILSVRC-2012-CLS](http://www.image-net.org/challenges/LSVRC/2012/) image classification dataset from http://www.image-net.org/. + +If you're going to train the model by yourself, then you need the full dataset, which is about 500GB. Otherwise, you only need the +validation data set, which is just about 3GB. + +For how to get the data, see [ImageNet Download faq](http://image-net.org/download-faq). Once you get an account, visit http://www.image-net.org/download-images. You will find "Download links to ILSVRC2012 image data" on that page + +And also, please download the "[imagenet_lsvrc_2015_synsets.txt](https://raw.githubusercontent.com/tensorflow/models/master/research/slim/datasets/imagenet_lsvrc_2015_synsets.txt)" and "[imagenet_2012_validation_synset_labels.txt](https://raw.githubusercontent.com/tensorflow/models/master/research/slim/datasets/imagenet_2012_validation_synset_labels.txt)" from tensorflow models repo. + +# Get the model +Please check [https://github.com/tensorflow/models/tree/master/research/slim/](https://github.com/tensorflow/models/tree/master/research/slim/). +You may either train the model by yourself, or just download a pretrained model provided by Google. +If you don't know which one to download and try, we suggest you choose the [Inception V4](http://download.tensorflow.org/models/inception_v4_2016_09_09.tar.gz) model as a starting point. + +After downloading, please uncompress it. +``` +tar -zxvf inception_v4_2016_09_09.tar.gz +``` + +The [Inception V4] zip file only contains a single checkpoint file: inception_v4.ckpt. It can't be directly used for inferencing. +You need to combine the network definition and the checkpoint. 
Please follow the steps below: + +1. Export the graph +Create an new folder. At there, execute +``` +git clone https://github.com/tensorflow/models . +python research\slim\export_inference_graph.py --model_name=inception_v4 --output_file=grpah.pb +``` + +2. Freeze the graph +Run +``` +freeze_graph.exe --input_graph=graph.pb --input_checkpoint=inception_v4.ckpt --output_graph=inception_v4.pb --output_node_names=InceptionV4/Logits/Predictions --input_binary=true +``` + +# Convert the model to ONNX + +``` +pip install --upgrade tf2onnx +python -m tf2onnx.convert --input inception_v4.pb --inputs input:0 --outputs InceptionV4/Logits/Predictions:0 --opset 10 --output inception_v4.onnx +``` + +# Run the inferencing +In your build dir of onnxruntime_samples, search for "image_classifier.exe" and run +``` +image_classifier.exe C:\tools\imagnet_validation_data inception_v4.onnx imagenet_lsvrc_2015_synsets.txt imagenet_2012_validation_synset_labels.txt 32 +``` +Please replace the file names with the corresponding file paths. + +The last parameter is batch size, you may need to adjust it according to your GPU memory size. diff --git a/samples/c_cxx/imagenet/async_ring_buffer.h b/samples/c_cxx/imagenet/async_ring_buffer.h new file mode 100644 index 0000000000000..6de96f5c09775 --- /dev/null +++ b/samples/c_cxx/imagenet/async_ring_buffer.h @@ -0,0 +1,320 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include +#include +#include +#include "controller.h" +#include "onnxruntime/core/session/onnxruntime_cxx_api.h" +#include "single_consumer.h" +#include "runnable_task.h" + +template +class AsyncRingBuffer { + private: + static VOID NTAPI ThreadPoolEntry(_Inout_ PTP_CALLBACK_INSTANCE pci, _Inout_opt_ PVOID data, _Inout_ PTP_WORK work) { + CloseThreadpoolWork(work); + (*(RunnableTask*)data)(pci); + } + + template + static size_t CalcItemSize(const std::vector& tensor_shape) { + int64_t r = 1; + for (int64_t i : tensor_shape) r *= i; + return static_cast(r) * sizeof(T); + } + + enum class BufferState { EMPTY, + FILLING, + FULL, + TAKEN }; + const size_t batch_size_; + using InputType = typename InputIterator::value_type; + DataProcessing* p_; + OutputCollector* c_; + size_t capacity_; + struct QueueItem { + OrtValue* value = nullptr; + std::vector taskid_list; + + QueueItem() = default; + ~QueueItem() { OrtReleaseValue(value); } + QueueItem(const QueueItem&) = delete; + QueueItem& operator=(const QueueItem&) = delete; + }; + //A list of tensors with equal tensor shape + SingleConsumerFIFO queue_; + using TensorListEntry = typename SingleConsumerFIFO::ListEntry; + Controller& threadpool_; + std::vector CreateTensorShapeWithBatchSize(const std::vector& input, size_t batch_size) { + std::vector shape(input.size() + 1); + shape[0] = batch_size; + size_t len = shape.size(); + for (size_t i = 1; i != len; ++i) { + shape[i] = input[i - 1]; + } + return shape; + } + std::mutex m; + + /** + * A collection of buffers with equal size. + */ + struct BufferManager { + size_t capacity_; + size_t item_size_in_bytes_; + size_t write_index_ = 0; + std::vector buffer_state; + std::vector input_task_id_for_buffers_; + + // TODO: if there is an alignment requirement, this buffer need do padding between the tensors. 
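+    // Backing storage: `capacity_` fixed-size slots of `item_size_in_bytes_` bytes each, laid out
+    // contiguously so that slot i starts at buffer_[i * item_size_in_bytes_] and `batch_size_`
+    // consecutive slots can be exposed directly as one input tensor.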
+ std::vector buffer_; + + BufferManager(size_t capacity, size_t item_size_in_bytes) + : capacity_(capacity), + item_size_in_bytes_(item_size_in_bytes), + buffer_state(capacity, BufferState::EMPTY), + input_task_id_for_buffers_(capacity), + buffer_(item_size_in_bytes * capacity) {} + + size_t GetId(_In_ const uint8_t* p) const { return (p - buffer_.data()) / item_size_in_bytes_; } + size_t GetItemSizeInBytes() const { return item_size_in_bytes_; } + bool CompareAndSet(size_t i, BufferState old, BufferState new_state) { + if (buffer_state[i] != old) return false; + buffer_state[i] = new_state; + return true; + } + + bool CompareAndSet(size_t index, size_t index_end, BufferState old, BufferState new_state) { + assert(index_end >= index); + for (size_t i = index; i != index_end; ++i) { + if (buffer_state[i] != old) return false; + } + for (size_t i = index; i != index_end; ++i) { + buffer_state[i] = new_state; + } + return true; + } + + bool TakeRange(size_t index, size_t index_end, std::vector& task_id_list) { + assert(index_end >= index); + if (!CompareAndSet(index, index_end, BufferState::FULL, BufferState::TAKEN)) { + return false; + } + auto* p = &input_task_id_for_buffers_[index]; + auto* p_end = p + (index_end - index); + task_id_list.assign(p, p_end); + return true; + } + + _Success_(return ) bool TakeAllRemain(_Out_ uint8_t** begin, std::vector& task_id_list) { + auto iter = + std::find_if(buffer_state.begin(), buffer_state.end(), [](BufferState s) { return s == BufferState::FULL; }); + if (iter == buffer_state.end()) return false; + auto iter_end = std::find_if(iter, buffer_state.end(), [](BufferState s) { return s != BufferState::FULL; }); + + *begin = &buffer_[iter - buffer_state.begin()]; + if (!TakeRange(iter - buffer_state.begin(), iter_end - buffer_state.begin(), task_id_list)) { + throw std::runtime_error("internal error"); + } + size_t remain = std::count_if(buffer_state.begin(), buffer_state.end(), + [](BufferState s) { return s != BufferState::TAKEN && s != BufferState::EMPTY; }); + if (remain != 0) { + throw std::runtime_error("the buffer contains multiple non-contiguous region"); + } + return true; + } + + uint8_t* Begin() { return buffer_.data(); } + + /* + * Get a buffer pointer and set its state to FILLING + * \param taskid + * \return Pointer to the buffer + */ + uint8_t* Next(InputType taskid) { + for (size_t i = 0; i != capacity_; ++i) { + size_t index = (write_index_ + i) % capacity_; + if (buffer_state[i] == BufferState::EMPTY) { + buffer_state[i] = BufferState::FILLING; + input_task_id_for_buffers_[i] = taskid; + return &buffer_[index * item_size_in_bytes_]; + } + } + return nullptr; + } + }; + BufferManager buffer_; + InputIterator input_begin_; + const InputIterator input_end_; + // unsafe + bool is_input_eof() const { return input_end_ == input_begin_; } + size_t parallelism = 8; + size_t current_running_downloders = 0; + + void ReturnAndTake(TensorListEntry*& input_tensor) { + std::lock_guard g(m); + if (input_tensor != nullptr) { + size_t tensor_id = queue_.Return(input_tensor); + size_t buffer_id = tensor_id * batch_size_; + if (!buffer_.CompareAndSet(buffer_id, buffer_id + batch_size_, BufferState::TAKEN, BufferState::EMPTY)) { + throw std::runtime_error("ReturnAndTake: internal state error"); + } + } + input_tensor = queue_.Take(); + } + + void OnDownloadFinished(_Inout_opt_ ONNXRUNTIME_CALLBACK_INSTANCE pci, const uint8_t* dest) { + size_t buffer_id = buffer_.GetId(dest); + TensorListEntry* input_tensor = nullptr; + { + std::lock_guard g(m); + 
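+      // Under the lock: mark this slot FULL and, if that completes a contiguous batch of
+      // `batch_size_` slots, publish the batch's pre-created tensor to the queue and take a
+      // ready tensor so it can be handed to the output collector below.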
--current_running_downloders; + if (!buffer_.CompareAndSet(buffer_id, BufferState::FILLING, BufferState::FULL)) { + throw std::runtime_error("ReturnAndTake: internal state error"); + } + size_t tensor_id = buffer_id / batch_size_; + std::vector task_id_list; + buffer_id = tensor_id * batch_size_; + if (buffer_.TakeRange(buffer_id, buffer_id + batch_size_, task_id_list)) { + queue_.Put(tensor_id, [&task_id_list](QueueItem& i) { + i.taskid_list = task_id_list; + }); + input_tensor = queue_.Take(); + } + } + + bool eof = false; + while (threadpool_.IsRunning()) { + if (!eof) { + int tasks = StartDownloadTasks(); + if (tasks < 0) { + threadpool_.SetFailBit(pci, "Schedule download task failed"); + return; + } + if (tasks == 0) { + threadpool_.SetEof(pci); + eof = true; + } + } + if (input_tensor == nullptr) { + break; + } + (*c_)(input_tensor->value.taskid_list, input_tensor->value.value); + ReturnAndTake(input_tensor); + } + } + + void Fail(_Inout_opt_ ONNXRUNTIME_CALLBACK_INSTANCE pci, const char* errmsg) { + threadpool_.SetFailBit(pci, errmsg); + } + + public: + AsyncRingBuffer(size_t batch_size, size_t capacity, Controller& threadpool, const InputIterator& input_begin, + const InputIterator& input_end, DataProcessing* p, OutputCollector* c) + : batch_size_(batch_size), + p_(p), + c_(c), + capacity_((capacity + batch_size_ - 1) / batch_size_ * batch_size_), + queue_(capacity_ / batch_size_), + threadpool_(threadpool), + buffer_(capacity_, CalcItemSize(p->GetOutputShape(1))), + input_begin_(input_begin), + input_end_(input_end) { + OrtAllocatorInfo* allocator_info; + ORT_THROW_ON_ERROR(OrtCreateCpuAllocatorInfo(OrtArenaAllocator, OrtMemTypeDefault, &allocator_info)); + uint8_t* output_data = buffer_.Begin(); + std::vector input_shape = p_->GetOutputShape(batch_size_); + size_t off = CalcItemSize(input_shape); + queue_.Init([allocator_info, off, &output_data, &input_shape](QueueItem& e) { + ORT_THROW_ON_ERROR(OrtCreateTensorWithDataAsOrtValue(allocator_info, output_data, off, input_shape.data(), + input_shape.size(), ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, + &e.value)); + output_data += off; + }); + OrtReleaseAllocatorInfo(allocator_info); + } + + void ProcessRemain() { + queue_.Release(); + c_->ResetCache(); + + uint8_t* output_data; + std::vector task_id_list; + if (!buffer_.TakeAllRemain(&output_data, task_id_list)) return; + OrtAllocatorInfo* allocator_info; + ORT_THROW_ON_ERROR(OrtCreateCpuAllocatorInfo(OrtArenaAllocator, OrtMemTypeDefault, &allocator_info)); + size_t count = task_id_list.size(); + assert(count != 0); + std::vector input_shape = p_->GetOutputShape(count); + size_t len = CalcItemSize(input_shape); + OrtValue* input_tensor = nullptr; + ORT_THROW_ON_ERROR(OrtCreateTensorWithDataAsOrtValue(allocator_info, output_data, len, input_shape.data(), + input_shape.size(), ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, + &input_tensor)); + (*c_)(task_id_list, input_tensor); + OrtReleaseAllocatorInfo(allocator_info); + OrtReleaseValue(input_tensor); + } + + /** + * call this function when a download task is just finished or any buffer became FREE. + * \return 0 EOF. 
No more download task to schedule + * 1 OK + * -1 ERROR + */ + int StartDownloadTasks() { + class DownloadTask : public RunnableTask { + public: + AsyncRingBuffer* requester; + InputType source; + uint8_t* dest; + DownloadTask(AsyncRingBuffer* r, const InputType& s, uint8_t* d) : requester(r), source(s), dest(d) {} + + void operator()(_In_opt_ ONNXRUNTIME_CALLBACK_INSTANCE pci) noexcept override { + AsyncRingBuffer* r = requester; + InputType s = source; + uint8_t* d = dest; + delete this; + try { + (*r->p_)(&s, d, r->buffer_.GetItemSizeInBytes()); + r->OnDownloadFinished(pci, d); + } catch (const std::exception& ex) { + fprintf(stderr, "%s\n", ex.what()); + r->Fail(pci, ex.what()); + } + } + }; + + // search empty slots, launch a download task for each of them + std::vector tasks_to_launch; + bool is_eof = false; + { + std::lock_guard g(m); + // if we have + // 1. cpu (current_running_downloders < parallelism) + // 2. memory (buffer available) + // 3. input_task + // then schedule a download task to the thread pool + for (; current_running_downloders + tasks_to_launch.size() < parallelism && !is_input_eof(); + ++input_begin_, ++current_running_downloders) { + uint8_t* b = buffer_.Next(*input_begin_); + if (b == nullptr) break; // no empty buffer + tasks_to_launch.push_back(new DownloadTask(this, *input_begin_, b)); + } + is_eof = is_input_eof(); + } + + for (DownloadTask* p : tasks_to_launch) { + if (!threadpool_.RunAsync(ThreadPoolEntry, p)) { + return -1; + } + } + + if (is_eof) { + return 0; + } + return 1; + } +}; \ No newline at end of file diff --git a/samples/c_cxx/imagenet/cached_interpolation.h b/samples/c_cxx/imagenet/cached_interpolation.h new file mode 100644 index 0000000000000..43f0a9223f7b7 --- /dev/null +++ b/samples/c_cxx/imagenet/cached_interpolation.h @@ -0,0 +1,26 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#pragma once + +#include +// Compute the interpolation indices only once. +struct CachedInterpolation { + int64_t lower; // Lower source index used in the interpolation + int64_t upper; // Upper source index used in the interpolation + // 1-D linear iterpolation scale (see: + // https://en.wikipedia.org/wiki/Bilinear_interpolation) + float lerp; +}; diff --git a/samples/c_cxx/imagenet/controller.cc b/samples/c_cxx/imagenet/controller.cc new file mode 100644 index 0000000000000..0081334b396fb --- /dev/null +++ b/samples/c_cxx/imagenet/controller.cc @@ -0,0 +1,48 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "controller.h" + +Controller::Controller() : cleanup_group_(CreateThreadpoolCleanupGroup()), event_(CreateOnnxRuntimeEvent()) { + InitializeThreadpoolEnvironment(&env_); + SetThreadpoolCallbackPool(&env_, nullptr); + SetThreadpoolCallbackCleanupGroup(&env_, cleanup_group_, nullptr); +} + +Controller::~Controller() noexcept { free(errmsg_); } + +bool Controller::RunAsync(_Inout_ ONNXRUNTIME_CALLBACK_FUNCTION callback, _In_ void* data) { + std::lock_guard g(m_); + if (state_ == State::RUNNING) { + ::CreateAndSubmitThreadpoolWork(callback, data, &env_); + return true; + } + return false; +} + +std::string Controller::Wait() { + WaitAndCloseEvent(event_); + CloseThreadpoolCleanupGroupMembers(cleanup_group_, errmsg_ == nullptr ? FALSE : TRUE, nullptr); + CloseThreadpoolCleanupGroup(cleanup_group_); + return errmsg_ == nullptr ? std::string() : errmsg_; +} + +void Controller::SetFailBit(_Inout_opt_ ONNXRUNTIME_CALLBACK_INSTANCE pci, _In_ const char* err_msg) { + std::lock_guard g(m_); + if (state_ == State::RUNNING || state_ == State::SHUTDOWN) { + state_ = State::STOPPED; + is_running_ = false; + errmsg_ = my_strdup(err_msg); + ::OnnxRuntimeSetEventWhenCallbackReturns(pci, event_); + } +} + +bool Controller::SetEof(ONNXRUNTIME_CALLBACK_INSTANCE pci) { + std::lock_guard g(m_); + if (state_ == State::RUNNING) { + state_ = State::SHUTDOWN; + ::OnnxRuntimeSetEventWhenCallbackReturns(pci, event_); + return true; + } + return false; +} diff --git a/samples/c_cxx/imagenet/controller.h b/samples/c_cxx/imagenet/controller.h new file mode 100644 index 0000000000000..9205d6b0318c5 --- /dev/null +++ b/samples/c_cxx/imagenet/controller.h @@ -0,0 +1,34 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "sync_api.h" +#include +#include + +class Controller { + private: + PTP_CLEANUP_GROUP const cleanup_group_; + TP_CALLBACK_ENVIRON env_; + ONNXRUNTIME_EVENT event_; + std::atomic is_running_ = true; + std::mutex m_; + enum class State { RUNNING, SHUTDOWN, STOPPED } state_ = State::RUNNING; + char* errmsg_ = nullptr; + + public: + Controller(); + ~Controller() noexcept; + Controller(const Controller&) = delete; + Controller& operator=(const Controller&) = delete; + // return true if SetFailBit has not been called + bool IsRunning() const { return is_running_; } + + void SetFailBit(_Inout_opt_ ONNXRUNTIME_CALLBACK_INSTANCE pci, _In_ const char* err_msg); + bool SetEof(_Inout_opt_ ONNXRUNTIME_CALLBACK_INSTANCE pci); + + // Wait the state becoming stopped, and all the submitted work has been finished(or cancelled if error happened) + std::string Wait(); + bool RunAsync(_Inout_ ONNXRUNTIME_CALLBACK_FUNCTION callback, _In_ void* data); +}; \ No newline at end of file diff --git a/samples/c_cxx/imagenet/data_processing.h b/samples/c_cxx/imagenet/data_processing.h new file mode 100644 index 0000000000000..de060659b682c --- /dev/null +++ b/samples/c_cxx/imagenet/data_processing.h @@ -0,0 +1,13 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once +#include +#include + +class DataProcessing { + public: + virtual void operator()(_In_ const void* input_data, _Out_writes_bytes_all_(output_len) void* output_data, size_t output_len) const = 0; + virtual std::vector GetOutputShape(size_t batch_size) const = 0; + virtual ~DataProcessing() = default; +}; \ No newline at end of file diff --git a/samples/c_cxx/imagenet/image_loader.cc b/samples/c_cxx/imagenet/image_loader.cc new file mode 100644 index 0000000000000..afb2473cab974 --- /dev/null +++ b/samples/c_cxx/imagenet/image_loader.cc @@ -0,0 +1,189 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include +#include "image_loader.h" +#include "cached_interpolation.h" +#include "local_filesystem.h" + +namespace { +/** + * CalculateResizeScale determines the float scaling factor. + * @param in_size + * @param out_size + * @param align_corners If true, the centers of the 4 corner pixels of the input and output tensors are aligned, + * preserving the values at the corner pixels + * @return + */ +inline float CalculateResizeScale(int64_t in_size, int64_t out_size, bool align_corners) { + return (align_corners && out_size > 1) ? (in_size - 1) / static_cast(out_size - 1) + : in_size / static_cast(out_size); +} + +inline void compute_interpolation_weights(const int64_t out_size, const int64_t in_size, const float scale, + CachedInterpolation* interpolation) { + interpolation[out_size].lower = 0; + interpolation[out_size].upper = 0; + for (int64_t i = out_size - 1; i >= 0; --i) { + const float in = i * scale; + interpolation[i].lower = static_cast(in); + interpolation[i].upper = std::min(interpolation[i].lower + 1, in_size - 1); + interpolation[i].lerp = in - interpolation[i].lower; + } +} + +/** + * Computes the bilinear interpolation from the appropriate 4 float points + * and the linear interpolation weights. + */ +inline float compute_lerp(const float top_left, const float top_right, const float bottom_left, + const float bottom_right, const float x_lerp, const float y_lerp) { + const float top = top_left + (top_right - top_left) * x_lerp; + const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp; + return top + (bottom - top) * y_lerp; +} + +} // namespace +template +void ResizeImageInMemory(const T* input_data, float* output_data, int in_height, int in_width, int out_height, + int out_width, int channels) { + float height_scale = CalculateResizeScale(in_height, out_height, false); + float width_scale = CalculateResizeScale(in_width, out_width, false); + + std::vector ys(out_height + 1); + std::vector xs(out_width + 1); + + // Compute the cached interpolation weights on the x and y dimensions. 
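+  // Each output row/column maps to a fractional source coordinate; the lower/upper source indices
+  // and the blend weight (lerp) are computed once per output index here, so the per-pixel loops
+  // below only do table lookups and compute_lerp() blends.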
+ compute_interpolation_weights(out_height, in_height, height_scale, ys.data()); + compute_interpolation_weights(out_width, in_width, width_scale, xs.data()); + + // Scale x interpolation weights to avoid a multiplication during iteration. + for (int i = 0; i < xs.size(); ++i) { + xs[i].lower *= channels; + xs[i].upper *= channels; + } + + const int64_t in_row_size = in_width * channels; + const int64_t in_batch_num_values = in_height * in_row_size; + const int64_t out_row_size = out_width * channels; + + const T* input_b_ptr = input_data; + float* output_y_ptr = output_data; + const int batch_size = 1; + + if (channels == 3) { + for (int b = 0; b < batch_size; ++b) { + for (int64_t y = 0; y < out_height; ++y) { + const T* ys_input_lower_ptr = input_b_ptr + ys[y].lower * in_row_size; + const T* ys_input_upper_ptr = input_b_ptr + ys[y].upper * in_row_size; + const float ys_lerp = ys[y].lerp; + for (int64_t x = 0; x < out_width; ++x) { + const int64_t xs_lower = xs[x].lower; + const int64_t xs_upper = xs[x].upper; + const float xs_lerp = xs[x].lerp; + + // Read channel 0. + const float top_left0(ys_input_lower_ptr[xs_lower + 0]); + const float top_right0(ys_input_lower_ptr[xs_upper + 0]); + const float bottom_left0(ys_input_upper_ptr[xs_lower + 0]); + const float bottom_right0(ys_input_upper_ptr[xs_upper + 0]); + + // Read channel 1. + const float top_left1(ys_input_lower_ptr[xs_lower + 1]); + const float top_right1(ys_input_lower_ptr[xs_upper + 1]); + const float bottom_left1(ys_input_upper_ptr[xs_lower + 1]); + const float bottom_right1(ys_input_upper_ptr[xs_upper + 1]); + + // Read channel 2. + const float top_left2(ys_input_lower_ptr[xs_lower + 2]); + const float top_right2(ys_input_lower_ptr[xs_upper + 2]); + const float bottom_left2(ys_input_upper_ptr[xs_lower + 2]); + const float bottom_right2(ys_input_upper_ptr[xs_upper + 2]); + + // Compute output. 
+ output_y_ptr[x * channels + 0] = + compute_lerp(top_left0, top_right0, bottom_left0, bottom_right0, xs_lerp, ys_lerp); + output_y_ptr[x * channels + 1] = + compute_lerp(top_left1, top_right1, bottom_left1, bottom_right1, xs_lerp, ys_lerp); + output_y_ptr[x * channels + 2] = + compute_lerp(top_left2, top_right2, bottom_left2, bottom_right2, xs_lerp, ys_lerp); + } + output_y_ptr += out_row_size; + } + input_b_ptr += in_batch_num_values; + } + } else { + for (int b = 0; b < batch_size; ++b) { + for (int64_t y = 0; y < out_height; ++y) { + const T* ys_input_lower_ptr = input_b_ptr + ys[y].lower * in_row_size; + const T* ys_input_upper_ptr = input_b_ptr + ys[y].upper * in_row_size; + const float ys_lerp = ys[y].lerp; + for (int64_t x = 0; x < out_width; ++x) { + auto xs_lower = xs[x].lower; + auto xs_upper = xs[x].upper; + auto xs_lerp = xs[x].lerp; + for (int c = 0; c < channels; ++c) { + const float top_left(ys_input_lower_ptr[xs_lower + c]); + const float top_right(ys_input_lower_ptr[xs_upper + c]); + const float bottom_left(ys_input_upper_ptr[xs_lower + c]); + const float bottom_right(ys_input_upper_ptr[xs_upper + c]); + output_y_ptr[x * channels + c] = + compute_lerp(top_left, top_right, bottom_left, bottom_right, xs_lerp, ys_lerp); + } + } + output_y_ptr += out_row_size; + } + input_b_ptr += in_batch_num_values; + } + } +} + +template void ResizeImageInMemory(const float* input_data, float* output_data, int in_height, int in_width, + int out_height, int out_width, int channels); + +template void ResizeImageInMemory(const uint8_t* input_data, float* output_data, int in_height, int in_width, + int out_height, int out_width, int channels); + +InceptionPreprocessing::InceptionPreprocessing(int out_height, int out_width, int channels) + : out_height_(out_height), out_width_(out_width), channels_(channels) { + if (!CreateImageLoader(&image_loader_)) { + throw std::runtime_error("create image loader failed"); + } +} + +// see: https://github.com/tensorflow/models/blob/master/research/slim/preprocessing/inception_preprocessing.py +// function: preprocess_for_eval +void InceptionPreprocessing::operator()(_In_ const void* input_data, + _Out_writes_bytes_all_(output_len) void* output_data, size_t output_len) const { + const TCharString& file_name = *reinterpret_cast(input_data); + size_t output_count = channels_ * out_height_ * out_width_; + if (output_len < output_count * sizeof(float)) { + throw std::runtime_error("buffer is too small"); + } + float* float_file_data_pointer; + int bbox_h_size, bbox_w_size; + ORT_THROW_ON_ERROR(LoadImageFromFileAndCrop(image_loader_, file_name.c_str(), central_fraction_, + &float_file_data_pointer, &bbox_w_size, &bbox_h_size)); + auto output_data_ = reinterpret_cast(output_data); + ResizeImageInMemory(float_file_data_pointer, output_data_, bbox_h_size, bbox_w_size, out_height_, out_width_, + channels_); + free(float_file_data_pointer); + + for (size_t i = 0; i != output_count; ++i) { + output_data_[i] = (output_data_[i] - 0.5f) * 2.f; + } +} diff --git a/samples/c_cxx/imagenet/image_loader.h b/samples/c_cxx/imagenet/image_loader.h new file mode 100644 index 0000000000000..ea60ac07acb8d --- /dev/null +++ b/samples/c_cxx/imagenet/image_loader.h @@ -0,0 +1,48 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once +#include +#include +#include +#include "cached_interpolation.h" +#include "sync_api.h" +#include "data_processing.h" +#include + +template +void ResizeImageInMemory(const T* input_data, float* output_data, int in_height, int in_width, int out_height, + int out_width, int channels); + +template +class OutputCollector { + public: + virtual void operator()(const std::vector& task_id_list, const OrtValue* tensor) = 0; + // Release the internal cache. It need be called whenever batchsize is changed + virtual void ResetCache() = 0; + virtual ~OutputCollector() = default; +}; + +bool CreateImageLoader(void** out); +OrtStatus* LoadImageFromFileAndCrop(void* loader, const ORTCHAR_T* filename, double central_crop_fraction, float** out, + int* out_width, int* out_height); + +void ReleaseImageLoader(void* p); + +class InceptionPreprocessing : public DataProcessing { + private: + const int out_height_; + const int out_width_; + const int channels_; + const double central_fraction_ = 0.875; + void* image_loader_; + public: + InceptionPreprocessing(int out_height, int out_width, int channels); + + void operator()(_In_ const void* input_data, _Out_writes_bytes_all_(output_len) void* output_data, size_t output_len) const override; + + // output data from this class is in NWHC format + std::vector GetOutputShape(size_t batch_size) const override { + return {(int64_t)batch_size, out_height_, out_width_, channels_}; + } +}; diff --git a/samples/c_cxx/imagenet/image_loader_libjpeg.cc b/samples/c_cxx/imagenet/image_loader_libjpeg.cc new file mode 100644 index 0000000000000..4645475cc433d --- /dev/null +++ b/samples/c_cxx/imagenet/image_loader_libjpeg.cc @@ -0,0 +1,85 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "image_loader.h" +#include +#include "jpeg_mem.h" +#include "local_filesystem.h" +#include + +bool CreateImageLoader(void** out) { + *out = nullptr; + return true; +} + +void ReleaseImageLoader(void*) {} + +OrtStatus* LoadImageFromFileAndCrop(void*, const ORTCHAR_T* filename, double central_crop_fraction, float** out, + int* out_width, int* out_height) { + const int channels_ = 3; + UncompressFlags flags; + flags.components = channels_; + // The TensorFlow-chosen default for jpeg decoding is IFAST, sacrificing + // image quality for speed. + flags.dct_method = JDCT_IFAST; + size_t file_len; + void* file_data; + ReadFileAsString(filename, file_data, file_len); + int width; + int height; + int channels; + std::unique_ptr decompressed_image( + Uncompress(file_data, static_cast(file_len), flags, &width, &height, &channels, nullptr)); + free(file_data); + + if (decompressed_image == nullptr) { + std::ostringstream oss; + oss << "decompress '" << filename << "' failed"; + return OrtCreateStatus(ORT_FAIL, oss.str().c_str()); + } + + if (channels != channels_) { + std::ostringstream oss; + oss << "input format error, expect 3 channels, got " << channels; + return OrtCreateStatus(ORT_FAIL, oss.str().c_str()); + } + + // cast uint8 to float + // See: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/image_ops_impl.py of + // tf.image.convert_image_dtype + + // crop it, and cast each pixel value from uint8 to float in range of [0,1] + // TODO: should the result be in range of [0,1) or [0,1]? 
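+  // Central crop: keep the middle `central_crop_fraction` of the image in each dimension;
+  // bbox_*_start is the margin trimmed from each side and bbox_*_size is what remains. The
+  // cropped pixels are then converted from uint8 to float in [0,1] below.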
+ + int bbox_h_start = + static_cast((static_cast(height) - static_cast(height) * central_crop_fraction) / 2); + int bbox_w_start = + static_cast((static_cast(width) - static_cast(width) * central_crop_fraction) / 2); + int bbox_h_size = height - bbox_h_start * 2; + int bbox_w_size = width - bbox_w_start * 2; + const size_t ele_count = bbox_h_size * bbox_w_size * channels; + float* float_file_data = (float*)malloc(ele_count * sizeof(float)); + if (float_file_data == nullptr) { + return OrtCreateStatus(ORT_FAIL, "out of memory"); + } + + { + auto p = decompressed_image.get() + (bbox_h_start * width + bbox_w_start) * channels; + + size_t len = bbox_w_size * channels; + float* wptr = float_file_data; + for (int i = 0; i != bbox_h_size; ++i) { + for (int j = 0; j != len; ++j) { + // TODO: should it be divided by 255 or 256? + *wptr++ = static_cast(p[j]) / 255; + } + p += width * channels; + } + assert(wptr == float_file_data + ele_count); + } + + *out = float_file_data; + *out_width = bbox_w_size; + *out_height = bbox_h_size; + return nullptr; +} diff --git a/samples/c_cxx/imagenet/image_loader_wic.cc b/samples/c_cxx/imagenet/image_loader_wic.cc new file mode 100644 index 0000000000000..066038877e012 --- /dev/null +++ b/samples/c_cxx/imagenet/image_loader_wic.cc @@ -0,0 +1,106 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "image_loader.h" +#include +#include +#include +#include + +bool CreateImageLoader(void** out) { + IWICImagingFactory* piFactory; + auto hr = CoCreateInstance(CLSID_WICImagingFactory, NULL, CLSCTX_INPROC_SERVER, IID_PPV_ARGS(&piFactory)); + if (!SUCCEEDED(hr)) return false; + *out = piFactory; + return true; +} + +void ReleaseImageLoader(void* p){ + auto piFactory = reinterpret_cast(p); + piFactory->Release(); +} + + +template +static void PrintErrorDescription(HRESULT hr, std::basic_ostringstream& oss) { + if (FACILITY_WINDOWS == HRESULT_FACILITY(hr)) hr = HRESULT_CODE(hr); + TCHAR* szErrMsg; + + if (FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM, NULL, hr, + MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPTSTR)&szErrMsg, 0, NULL) != 0) { + oss << szErrMsg; + LocalFree(szErrMsg); + } else { + oss << TEXT("[Could not find a description for error # ") << hr; + } +} + +OrtStatus* LoadImageFromFileAndCrop(void* loader, const ORTCHAR_T* filename, double central_crop_fraction, float** out, + int* out_width, int* out_height) { + auto piFactory = reinterpret_cast(loader); + const int channels = 3; + try { + CComPtr piDecoder; + ATLENSURE_SUCCEEDED( + piFactory->CreateDecoderFromFilename(filename, NULL, GENERIC_READ, + WICDecodeMetadataCacheOnDemand, // defer parsing non-critical metadata + &piDecoder)); + + UINT count = 0; + ATLENSURE_SUCCEEDED(piDecoder->GetFrameCount(&count)); + if(count != 1){ + return OrtCreateStatus(ORT_FAIL, "The image has multiple frames, I don't know which to choose"); + } + + CComPtr piFrameDecode; + ATLENSURE_SUCCEEDED(piDecoder->GetFrame(0, &piFrameDecode)); + UINT width, height; + ATLENSURE_SUCCEEDED(piFrameDecode->GetSize(&width, &height)); + CComPtr ppIFormatConverter; + ATLENSURE_SUCCEEDED(piFactory->CreateFormatConverter(&ppIFormatConverter)); + ATLENSURE_SUCCEEDED(ppIFormatConverter->Initialize(piFrameDecode, // Source frame to convert + GUID_WICPixelFormat24bppRGB, // The desired pixel format + WICBitmapDitherTypeNone, // The desired dither pattern + NULL, // The desired palette + 0.f, // The desired alpha threshold + WICBitmapPaletteTypeCustom // Palette 
translation type + )); + int bbox_h_start = + static_cast((static_cast(height) - static_cast(height) * central_crop_fraction) / 2); + int bbox_w_start = + static_cast((static_cast(width) - static_cast(width) * central_crop_fraction) / 2); + int bbox_h_size = height - bbox_h_start * 2; + int bbox_w_size = width - bbox_w_start * 2; + UINT stride = bbox_w_size * channels; + UINT result_buffer_size = bbox_h_size * bbox_w_size * channels; + // TODO: check result_buffer_size <= UNIT_MAX + std::vector data(result_buffer_size); + WICRect rect; + memset(&rect, 0, sizeof(WICRect)); + rect.X = bbox_w_start; + rect.Y = bbox_h_start; + rect.Height = bbox_h_size; + rect.Width = bbox_w_size; + + ATLENSURE_SUCCEEDED(ppIFormatConverter->CopyPixels(&rect, stride, static_cast(data.size()), data.data())); + float* float_file_data = (float*)malloc(data.size() * sizeof(float)); + size_t len = data.size(); + for (size_t i = 0; i != len; ++i) { + float_file_data[i] = static_cast(data[i]) / 255; + } + + *out = float_file_data; + *out_width = bbox_w_size; + *out_height = bbox_h_size; + return nullptr; + } catch (std::exception& ex) { + std::ostringstream oss; + oss << "Load " << filename << " failed:" << ex.what(); + return OrtCreateStatus(ORT_FAIL, oss.str().c_str()); + } catch (const CAtlException& ex) { + std::ostringstream oss; + oss << "Load " << filename << " failed:"; + PrintErrorDescription(ex.m_hr, oss); + return OrtCreateStatus(ORT_FAIL, oss.str().c_str()); + } +} \ No newline at end of file diff --git a/samples/c_cxx/imagenet/jpeg_handle.cc b/samples/c_cxx/imagenet/jpeg_handle.cc new file mode 100644 index 0000000000000..c3a491d40e77a --- /dev/null +++ b/samples/c_cxx/imagenet/jpeg_handle.cc @@ -0,0 +1,170 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file implements a memory destination for libjpeg +// The design is very similar to jdatadst.c in libjpeg +// These functions are not meant to be used directly, see jpeg_mem.h instead. 
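
With a libjpeg back-end for Linux and a WIC back-end for Windows, callers only ever see the small C-style API declared in image_loader.h. The following is a minimal usage sketch, not part of the patch; LoadOneImage is our own name, the OrtGetErrorMessage/OrtReleaseStatus calls are assumed from the onnxruntime C API, and the sketch relies on both implementations returning a malloc()-allocated buffer:

```
// Minimal caller of the image_loader.h API (assumptions noted above).
#include <cstdio>
#include <cstdlib>

#include "image_loader.h"

int LoadOneImage(const ORTCHAR_T* path) {
  void* loader = nullptr;
  if (!CreateImageLoader(&loader)) return -1;  // WIC build: creates the imaging factory; libjpeg build: no-op

  float* pixels = nullptr;
  int width = 0, height = 0;
  OrtStatus* status = LoadImageFromFileAndCrop(loader, path, 0.875, &pixels, &width, &height);
  if (status != nullptr) {
    std::fprintf(stderr, "%s\n", OrtGetErrorMessage(status));
    OrtReleaseStatus(status);
    ReleaseImageLoader(loader);
    return -1;
  }

  // pixels holds width * height * 3 floats in [0,1], HWC layout, cropped to the
  // central fraction of the source image; both implementations allocate it with malloc().
  std::free(pixels);
  ReleaseImageLoader(loader);
  return 0;
}
```
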
+// We are filling out stubs required by jpeglib, those stubs are private to +// the implementation, we are just making available JPGMemSrc, JPGMemDest + +#include "jpeg_handle.h" + +#include +#include +#include + +void CatchError(j_common_ptr cinfo) { + (*cinfo->err->output_message)(cinfo); + jmp_buf* jpeg_jmpbuf = reinterpret_cast(cinfo->client_data); + jpeg_destroy(cinfo); + longjmp(*jpeg_jmpbuf, 1); +} + +// ***************************************************************************** +// ***************************************************************************** +// ***************************************************************************** +// Destination functions + +// ----------------------------------------------------------------------------- +void MemInitDestination(j_compress_ptr cinfo) { + MemDestMgr* dest = reinterpret_cast(cinfo->dest); + std::cout << "Initializing buffer=" << dest->bufsize << " bytes"; + dest->pub.next_output_byte = dest->buffer; + dest->pub.free_in_buffer = dest->bufsize; + dest->datacount = 0; + if (dest->dest) { + dest->dest->clear(); + } +} + +// ----------------------------------------------------------------------------- +boolean MemEmptyOutputBuffer(j_compress_ptr cinfo) { + MemDestMgr* dest = reinterpret_cast(cinfo->dest); + std::cout << "Writing " << dest->bufsize << " bytes"; + if (dest->dest) { + dest->dest->append(reinterpret_cast(dest->buffer), dest->bufsize); + } + dest->pub.next_output_byte = dest->buffer; + dest->pub.free_in_buffer = dest->bufsize; + return TRUE; +} + +// ----------------------------------------------------------------------------- +void MemTermDestination(j_compress_ptr cinfo) { + MemDestMgr* dest = reinterpret_cast(cinfo->dest); + std::cout << "Writing " << dest->bufsize - dest->pub.free_in_buffer << " bytes"; + if (dest->dest) { + dest->dest->append(reinterpret_cast(dest->buffer), dest->bufsize - dest->pub.free_in_buffer); + std::cout << "Total size= " << dest->dest->size(); + } + dest->datacount = dest->bufsize - dest->pub.free_in_buffer; +} + +// ----------------------------------------------------------------------------- +void SetDest(j_compress_ptr cinfo, void* buffer, int bufsize) { SetDest(cinfo, buffer, bufsize, nullptr); } + +// ----------------------------------------------------------------------------- +void SetDest(j_compress_ptr cinfo, void* buffer, int bufsize, std::string* destination) { + MemDestMgr* dest; + if (cinfo->dest == nullptr) { + cinfo->dest = reinterpret_cast( + (*cinfo->mem->alloc_small)(reinterpret_cast(cinfo), JPOOL_PERMANENT, sizeof(MemDestMgr))); + } + + dest = reinterpret_cast(cinfo->dest); + dest->bufsize = bufsize; + dest->buffer = static_cast(buffer); + dest->dest = destination; + dest->pub.init_destination = MemInitDestination; + dest->pub.empty_output_buffer = MemEmptyOutputBuffer; + dest->pub.term_destination = MemTermDestination; +} + +// ***************************************************************************** +// ***************************************************************************** +// ***************************************************************************** +// Source functions + +// ----------------------------------------------------------------------------- +void MemInitSource(j_decompress_ptr cinfo) { + MemSourceMgr* src = reinterpret_cast(cinfo->src); + src->pub.next_input_byte = src->data; + src->pub.bytes_in_buffer = src->datasize; +} + +// ----------------------------------------------------------------------------- +// We emulate the same 
error-handling as fill_input_buffer() from jdatasrc.c, +// for coherency's sake. +boolean MemFillInputBuffer(j_decompress_ptr cinfo) { + static const JOCTET kEOIBuffer[2] = {0xff, JPEG_EOI}; + MemSourceMgr* src = reinterpret_cast(cinfo->src); + if (src->pub.bytes_in_buffer == 0 && src->pub.next_input_byte == src->data) { + // empty file -> treated as an error. + ERREXIT(cinfo, JERR_INPUT_EMPTY); + return FALSE; + } else if (src->pub.bytes_in_buffer) { + // if there's still some data left, it's probably corrupted + return src->try_recover_truncated_jpeg ? TRUE : FALSE; + } else if (src->pub.next_input_byte != kEOIBuffer && src->try_recover_truncated_jpeg) { + // In an attempt to recover truncated files, we insert a fake EOI + WARNMS(cinfo, JWRN_JPEG_EOF); + src->pub.next_input_byte = kEOIBuffer; + src->pub.bytes_in_buffer = 2; + return TRUE; + } else { + // We already inserted a fake EOI and it wasn't enough, so this time + // it's really an error. + ERREXIT(cinfo, JERR_FILE_READ); + return FALSE; + } +} + +// ----------------------------------------------------------------------------- +void MemTermSource(j_decompress_ptr) {} + +// ----------------------------------------------------------------------------- +void MemSkipInputData(j_decompress_ptr cinfo, long jump) { + MemSourceMgr* src = reinterpret_cast(cinfo->src); + if (jump < 0) { + return; + } + if (jump > src->pub.bytes_in_buffer) { + src->pub.bytes_in_buffer = 0; + (void)MemFillInputBuffer(cinfo); // warn with a fake EOI or error + } else { + src->pub.bytes_in_buffer -= jump; + src->pub.next_input_byte += jump; + } +} + +// ----------------------------------------------------------------------------- +void SetSrc(j_decompress_ptr cinfo, const void* data, unsigned long int datasize, bool try_recover_truncated_jpeg) { + MemSourceMgr* src; + + cinfo->src = reinterpret_cast( + (*cinfo->mem->alloc_small)(reinterpret_cast(cinfo), JPOOL_PERMANENT, sizeof(MemSourceMgr))); + + src = reinterpret_cast(cinfo->src); + src->pub.init_source = MemInitSource; + src->pub.fill_input_buffer = MemFillInputBuffer; + src->pub.skip_input_data = MemSkipInputData; + src->pub.resync_to_restart = jpeg_resync_to_restart; + src->pub.term_source = MemTermSource; + src->data = reinterpret_cast(data); + src->datasize = datasize; + src->pub.bytes_in_buffer = 0; + src->pub.next_input_byte = nullptr; + src->try_recover_truncated_jpeg = try_recover_truncated_jpeg; +} diff --git a/samples/c_cxx/imagenet/jpeg_handle.h b/samples/c_cxx/imagenet/jpeg_handle.h new file mode 100644 index 0000000000000..a16aadb2070c4 --- /dev/null +++ b/samples/c_cxx/imagenet/jpeg_handle.h @@ -0,0 +1,56 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file declares the functions and structures for memory I/O with libjpeg +// These functions are not meant to be used directly, see jpeg_mem.h instead. 
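
SetSrc() is what lets libjpeg consume a buffer that is already in memory; the rest of this file is the bookkeeping libjpeg expects from a custom jpeg_source_mgr. As a quick illustration of the mechanism (a sketch for reference, not part of the patch; GetJpegSize is our own name), the snippet below uses SetSrc() with the standard libjpeg calls to read just the header of an in-memory JPEG. jpeg_mem.cc further down builds the full decoder on the same foundation:

```
// Peek at the dimensions of an in-memory JPEG via SetSrc().
#include <setjmp.h>

#include "jpeg_handle.h"

bool GetJpegSize(const void* data, unsigned long size, int* width, int* height) {
  jpeg_decompress_struct cinfo;
  jpeg_error_mgr jerr;
  cinfo.err = jpeg_std_error(&jerr);
  jerr.error_exit = CatchError;  // fatal libjpeg errors longjmp back to the setjmp below

  jmp_buf jpeg_jmpbuf;
  cinfo.client_data = &jpeg_jmpbuf;
  if (setjmp(jpeg_jmpbuf)) return false;  // CatchError has already destroyed cinfo

  jpeg_create_decompress(&cinfo);
  SetSrc(&cinfo, data, size, /*try_recover_truncated_jpeg=*/false);
  jpeg_read_header(&cinfo, TRUE);
  jpeg_calc_output_dimensions(&cinfo);
  *width = static_cast<int>(cinfo.output_width);
  *height = static_cast<int>(cinfo.output_height);
  jpeg_destroy_decompress(&cinfo);
  return true;
}
```
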
+ +#pragma once + +#include +#include +#include +#include +#include +extern "C" { +#include "jerror.h" +#include "jpeglib.h" +} + +// Handler for fatal JPEG library errors: clean up & return +void CatchError(j_common_ptr cinfo); + +typedef struct { + struct jpeg_destination_mgr pub; + JOCTET* buffer; + int bufsize; + int datacount; + std::string* dest; +} MemDestMgr; + +typedef struct { + struct jpeg_source_mgr pub; + const unsigned char* data; + unsigned long int datasize; + bool try_recover_truncated_jpeg; +} MemSourceMgr; + +void SetSrc(j_decompress_ptr cinfo, const void* data, unsigned long int datasize, bool try_recover_truncated_jpeg); + +// JPEG destination: we will store all the data in a buffer "buffer" of total +// size "bufsize", if the buffer overflows, we will be in trouble. +void SetDest(j_compress_ptr cinfo, void* buffer, int bufsize); +// Same as above, except that buffer is only used as a temporary structure and +// is emptied into "destination" as soon as it fills up. +void SetDest(j_compress_ptr cinfo, void* buffer, int bufsize, std::string* destination); diff --git a/samples/c_cxx/imagenet/jpeg_mem.cc b/samples/c_cxx/imagenet/jpeg_mem.cc new file mode 100644 index 0000000000000..400fe0b35cf6b --- /dev/null +++ b/samples/c_cxx/imagenet/jpeg_mem.cc @@ -0,0 +1,403 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines functions to compress and uncompress JPEG data +// to and from memory, as well as some direct manipulations of JPEG string + +#include "jpeg_mem.h" + +#include +#include +#include +#include +#include +#include + +#include + +#include "jpeg_handle.h" +#include +#include + +// ----------------------------------------------------------------------------- +// Decompression + +namespace { + +enum JPEGErrors { JPEGERRORS_OK, JPEGERRORS_UNEXPECTED_END_OF_DATA, JPEGERRORS_BAD_PARAM }; + +// Prevent bad compiler behavior in ASAN mode by wrapping most of the +// arguments in a struct struct. 
+class FewerArgsForCompiler { + public: + FewerArgsForCompiler(int datasize, const UncompressFlags& flags, int64* nwarn, + std::function allocate_output) + : datasize_(datasize), + flags_(flags), + pnwarn_(nwarn), + allocate_output_(std::move(allocate_output)), + height_read_(0), + height_(0), + stride_(0) { + if (pnwarn_ != nullptr) *pnwarn_ = 0; + } + + const int datasize_; + const UncompressFlags flags_; + int64* const pnwarn_; + std::function allocate_output_; + int height_read_; // number of scanline lines successfully read + int height_; + int stride_; +}; + +uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) { + // unpack the argball + const int datasize = argball->datasize_; + const auto& flags = argball->flags_; + const int ratio = flags.ratio; + int components = flags.components; + int stride = flags.stride; // may be 0 + int64* const nwarn = argball->pnwarn_; // may be NULL + + // Can't decode if the ratio is not recognized by libjpeg + if ((ratio != 1) && (ratio != 2) && (ratio != 4) && (ratio != 8)) { + return nullptr; + } + + // Channels must be autodetect, grayscale, or rgb. + if (!(components == 0 || components == 1 || components == 3)) { + return nullptr; + } + + // if empty image, return + if (datasize == 0 || srcdata == nullptr) return nullptr; + + // Declare temporary buffer pointer here so that we can free on error paths + JSAMPLE* tempdata = nullptr; + + // Initialize libjpeg structures to have a memory source + // Modify the usual jpeg error manager to catch fatal errors. + JPEGErrors error = JPEGERRORS_OK; + struct jpeg_decompress_struct cinfo; + struct jpeg_error_mgr jerr; + cinfo.err = jpeg_std_error(&jerr); + jerr.error_exit = CatchError; + +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + jerr.output_message = no_print; +#endif + + jmp_buf jpeg_jmpbuf; + cinfo.client_data = &jpeg_jmpbuf; + if (setjmp(jpeg_jmpbuf)) { + delete[] tempdata; + return nullptr; + } + + jpeg_create_decompress(&cinfo); + SetSrc(&cinfo, srcdata, datasize, flags.try_recover_truncated_jpeg); + jpeg_read_header(&cinfo, TRUE); + + // Set components automatically if desired, autoconverting cmyk to rgb. + if (components == 0) components = std::min(cinfo.num_components, 3); + + // set grayscale and ratio parameters + switch (components) { + case 1: + cinfo.out_color_space = JCS_GRAYSCALE; + break; + case 3: + if (cinfo.jpeg_color_space == JCS_CMYK || cinfo.jpeg_color_space == JCS_YCCK) { + // Always use cmyk for output in a 4 channel jpeg. libjpeg has a builtin + // decoder. We will further convert to rgb below. + cinfo.out_color_space = JCS_CMYK; + } else { + cinfo.out_color_space = JCS_RGB; + } + break; + default: + std::cout << " Invalid components value " << components << std::endl; + jpeg_destroy_decompress(&cinfo); + return nullptr; + } + cinfo.do_fancy_upsampling = boolean(flags.fancy_upscaling); + cinfo.scale_num = 1; + cinfo.scale_denom = ratio; + cinfo.dct_method = flags.dct_method; + + // Determine the output image size before attempting decompress to prevent + // OOM'ing doing the decompress + jpeg_calc_output_dimensions(&cinfo); + + int64 total_size = static_cast(cinfo.output_height) * static_cast(cinfo.output_width) * + static_cast(cinfo.num_components); + // Some of the internal routines do not gracefully handle ridiculously + // large images, so fail fast. 
+ if (cinfo.output_width <= 0 || cinfo.output_height <= 0) { + std::cout << "Invalid image size: " << cinfo.output_width << " x " << cinfo.output_height; + jpeg_destroy_decompress(&cinfo); + return nullptr; + } + if (total_size >= (1LL << 29)) { + std::cout << "Image too large: " << total_size; + jpeg_destroy_decompress(&cinfo); + return nullptr; + } + + jpeg_start_decompress(&cinfo); + + JDIMENSION target_output_width = cinfo.output_width; + JDIMENSION target_output_height = cinfo.output_height; + JDIMENSION skipped_scanlines = 0; + + // check for compatible stride + const int min_stride = target_output_width * components * sizeof(JSAMPLE); + if (stride == 0) { + stride = min_stride; + } else if (stride < min_stride) { + std::cout << "Incompatible stride: " << stride << " < " << min_stride; + jpeg_destroy_decompress(&cinfo); + return nullptr; + } + + // Remember stride and height for use in Uncompress + argball->height_ = target_output_height; + argball->stride_ = stride; + + uint8* dstdata = argball->allocate_output_(target_output_width, target_output_height, components); + + if (dstdata == nullptr) { + jpeg_destroy_decompress(&cinfo); + return nullptr; + } + JSAMPLE* output_line = static_cast(dstdata); + + // jpeg_read_scanlines requires the buffers to be allocated based on + // cinfo.output_width, but the target image width might be different if crop + // is enabled and crop_width is not MCU aligned. In this case, we need to + // realign the scanline output to achieve the exact cropping. Notably, only + // cinfo.output_width needs to fall on MCU boundary, while cinfo.output_height + // has no such constraint. + const bool need_realign_cropped_scanline = (target_output_width != cinfo.output_width); + const bool use_cmyk = (cinfo.out_color_space == JCS_CMYK); + + if (use_cmyk) { + // Temporary buffer used for CMYK -> RGB conversion. + tempdata = new JSAMPLE[cinfo.output_width * 4]; + } else if (need_realign_cropped_scanline) { + // Temporary buffer used for MCU-aligned scanline data. + tempdata = new JSAMPLE[cinfo.output_width * components]; + } + + // If there is an error reading a line, this aborts the reading. + // Save the fraction of the image that has been read. + argball->height_read_ = target_output_height; + + // These variables are just to avoid repeated computation in the loop. + const int max_scanlines_to_read = skipped_scanlines + target_output_height; + const int mcu_align_offset = (cinfo.output_width - target_output_width) * (use_cmyk ? 4 : components); + while (cinfo.output_scanline < max_scanlines_to_read) { + int num_lines_read = 0; + if (use_cmyk) { + num_lines_read = jpeg_read_scanlines(&cinfo, &tempdata, 1); + if (num_lines_read > 0) { + // Convert CMYK to RGB if scanline read succeeded. + for (size_t i = 0; i < target_output_width; ++i) { + int offset = 4 * i; + if (need_realign_cropped_scanline) { + // Align the offset for MCU boundary. 
+ offset += mcu_align_offset; + } + const int c = tempdata[offset + 0]; + const int m = tempdata[offset + 1]; + const int y = tempdata[offset + 2]; + const int k = tempdata[offset + 3]; + int r, g, b; + if (cinfo.saw_Adobe_marker) { + r = (k * c) / 255; + g = (k * m) / 255; + b = (k * y) / 255; + } else { + r = (255 - k) * (255 - c) / 255; + g = (255 - k) * (255 - m) / 255; + b = (255 - k) * (255 - y) / 255; + } + output_line[3 * i + 0] = r; + output_line[3 * i + 1] = g; + output_line[3 * i + 2] = b; + } + } + } else if (need_realign_cropped_scanline) { + num_lines_read = jpeg_read_scanlines(&cinfo, &tempdata, 1); + if (num_lines_read > 0) { + memcpy(output_line, tempdata + mcu_align_offset, min_stride); + } + } else { + num_lines_read = jpeg_read_scanlines(&cinfo, &output_line, 1); + } + // Handle error cases + if (num_lines_read == 0) { + std::cout << "Premature end of JPEG data. Stopped at line " << cinfo.output_scanline - skipped_scanlines << "/" + << target_output_height; + if (!flags.try_recover_truncated_jpeg) { + argball->height_read_ = cinfo.output_scanline - skipped_scanlines; + error = JPEGERRORS_UNEXPECTED_END_OF_DATA; + } else { + for (size_t line = cinfo.output_scanline; line < max_scanlines_to_read; ++line) { + if (line == 0) { + // If even the first line is missing, fill with black color + memset(output_line, 0, min_stride); + } else { + // else, just replicate the line above. + memcpy(output_line, output_line - stride, min_stride); + } + output_line += stride; + } + argball->height_read_ = target_output_height; // consider all lines as read + // prevent error-on-exit in libjpeg: + cinfo.output_scanline = max_scanlines_to_read; + } + break; + } + assert(num_lines_read == 1); + output_line += stride; + } + delete[] tempdata; + tempdata = nullptr; + + + + // Convert the RGB data to RGBA, with alpha set to 0xFF to indicate + // opacity. + // RGBRGBRGB... --> RGBARGBARGBA... + if (components == 4) { + // Start on the last line. + JSAMPLE* scanlineptr = static_cast(dstdata + static_cast(target_output_height - 1) * stride); + const JSAMPLE kOpaque = -1; // All ones appropriate for JSAMPLE. + const int right_rgb = (target_output_width - 1) * 3; + const int right_rgba = (target_output_width - 1) * 4; + + for (int y = target_output_height; y-- > 0;) { + // We do all the transformations in place, going backwards for each row. + const JSAMPLE* rgb_pixel = scanlineptr + right_rgb; + JSAMPLE* rgba_pixel = scanlineptr + right_rgba; + scanlineptr -= stride; + for (int x = target_output_width; x-- > 0; rgba_pixel -= 4, rgb_pixel -= 3) { + // We copy the 3 bytes at rgb_pixel into the 4 bytes at rgba_pixel + // The "a" channel is set to be opaque. 
+ rgba_pixel[3] = kOpaque; + rgba_pixel[2] = rgb_pixel[2]; + rgba_pixel[1] = rgb_pixel[1]; + rgba_pixel[0] = rgb_pixel[0]; + } + } + } + + switch (components) { + case 1: + if (cinfo.output_components != 1) { + error = JPEGERRORS_BAD_PARAM; + } + break; + case 3: + case 4: + if (cinfo.out_color_space == JCS_CMYK) { + if (cinfo.output_components != 4) { + error = JPEGERRORS_BAD_PARAM; + } + } else { + if (cinfo.output_components != 3) { + error = JPEGERRORS_BAD_PARAM; + } + } + break; + default: + // will never happen, should be caught by the previous switch + std::cout << "Invalid components value " << components << std::endl; + jpeg_destroy_decompress(&cinfo); + return nullptr; + } + + // save number of warnings if requested + if (nwarn != nullptr) { + *nwarn = cinfo.err->num_warnings; + } + + // Handle errors in JPEG + switch (error) { + case JPEGERRORS_OK: + jpeg_finish_decompress(&cinfo); + break; + case JPEGERRORS_UNEXPECTED_END_OF_DATA: + case JPEGERRORS_BAD_PARAM: + jpeg_abort(reinterpret_cast(&cinfo)); + break; + default: + std::cout << "Unhandled case " << error; + break; + } + + jpeg_destroy_decompress(&cinfo); + return dstdata; +} + +} // anonymous namespace + +// ----------------------------------------------------------------------------- +// We do the apparently silly thing of packing 5 of the arguments +// into a structure that is then passed to another routine +// that does all the work. The reason is that we want to catch +// fatal JPEG library errors with setjmp/longjmp, and g++ and +// associated libraries aren't good enough to guarantee that 7 +// parameters won't get clobbered by the longjmp. So we help +// it out a little. +uint8* Uncompress(const void* srcdata, int datasize, const UncompressFlags& flags, int64* nwarn, + std::function allocate_output) { + FewerArgsForCompiler argball(datasize, flags, nwarn, std::move(allocate_output)); + uint8* const dstdata = UncompressLow(srcdata, &argball); + + const float fraction_read = + argball.height_ == 0 ? 1.0f : (static_cast(argball.height_read_) / argball.height_); + if (dstdata == nullptr || fraction_read < std::min(1.0f, flags.min_acceptable_fraction)) { + // Major failure, none or too-partial read returned; get out + return nullptr; + } + + // If there was an error in reading the jpeg data, + // set the unread pixels to black + if (argball.height_read_ != argball.height_) { + const int first_bad_line = argball.height_read_; + uint8* start = dstdata + first_bad_line * argball.stride_; + const int nbytes = (argball.height_ - first_bad_line) * argball.stride_; + memset(static_cast(start), 0, nbytes); + } + + return dstdata; +} + +uint8* Uncompress(const void* srcdata, int datasize, const UncompressFlags& flags, int* pwidth, int* pheight, + int* pcomponents, int64* nwarn) { + uint8* buffer = nullptr; + uint8* result = Uncompress(srcdata, datasize, flags, nwarn, [=, &buffer](int width, int height, int components) { + if (pwidth != nullptr) *pwidth = width; + if (pheight != nullptr) *pheight = height; + if (pcomponents != nullptr) *pcomponents = components; + buffer = new uint8[height * width * components]; + return buffer; + }); + if (!result) delete[] buffer; + return result; +} \ No newline at end of file diff --git a/samples/c_cxx/imagenet/jpeg_mem.h b/samples/c_cxx/imagenet/jpeg_mem.h new file mode 100644 index 0000000000000..1cb5b2595db23 --- /dev/null +++ b/samples/c_cxx/imagenet/jpeg_mem.h @@ -0,0 +1,93 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 
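
Of the two Uncompress() overloads just defined, the callback form lets the caller decide the ownership model once the JPEG header has been parsed, instead of receiving a new[]-ed buffer it must remember to delete. A sketch of that overload decoding into a caller-owned std::vector (for reference, not part of the patch; DecodeIntoVector is our own name):

```
// Drive the callback overload of Uncompress() from jpeg_mem.h.
#include <cstdint>
#include <vector>

#include "jpeg_mem.h"

bool DecodeIntoVector(const void* jpeg_bytes, int jpeg_len, std::vector<uint8>* pixels,
                      int* width, int* height, int* channels) {
  UncompressFlags flags;
  flags.components = 3;  // request RGB output
  uint8* result = Uncompress(jpeg_bytes, jpeg_len, flags, /*nwarn=*/nullptr,
                             [&](int w, int h, int c) -> uint8* {
                               *width = w;
                               *height = h;
                               *channels = c;
                               pixels->resize(static_cast<size_t>(w) * h * c);
                               return pixels->data();
                             });
  // On failure Uncompress returns nullptr; the vector is released automatically, which
  // satisfies the header's "free the memory even along error paths" requirement.
  return result != nullptr;
}
```
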
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines functions to compress and uncompress JPEG files +// to and from memory. It provides interfaces for raw images +// (data array and size fields). +// Direct manipulation of JPEG strings are supplied: Flip, Rotate, Crop.. + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +extern "C" { +#include "jerror.h" +#include "jpeglib.h" +} + +using uint8 = std::uint8_t; +using int64 = std::int64_t; + +// Flags for Uncompress +struct UncompressFlags { + // ratio can be 1, 2, 4, or 8 and represent the denominator for the scaling + // factor (eg ratio = 4 means that the resulting image will be at 1/4 original + // size in both directions). + int ratio = 1; + + // The number of bytes per pixel (1, 3 or 4), or 0 for autodetect. + int components = 0; + + // If true, decoder will use a slower but nicer upscaling of the chroma + // planes (yuv420/422 only). + bool fancy_upscaling = true; + + // If true, will attempt to fill in missing lines of truncated files + bool try_recover_truncated_jpeg = false; + + // The minimum required fraction of lines read before the image is accepted. + float min_acceptable_fraction = 1.0; + + // The distance in bytes from one scanline to the other. Should be at least + // equal to width*components*sizeof(JSAMPLE). If 0 is passed, the stride + // used will be this minimal value. + int stride = 0; + + // Setting of J_DCT_METHOD enum in jpeglib.h, for choosing which + // algorithm to use for DCT/IDCT. + // + // Setting this has a quality/speed trade-off implication. + J_DCT_METHOD dct_method = JDCT_DEFAULT; +}; + +// Uncompress some raw JPEG data given by the pointer srcdata and the length +// datasize. +// - width and height are the address where to store the size of the +// uncompressed image in pixels. May be nullptr. +// - components is the address where the number of read components are +// stored. This is *output only*: to request a specific number of +// components use flags.components. May be nullptr. +// - nwarn is the address in which to store the number of warnings. +// May be nullptr. +// The function returns a pointer to the raw uncompressed data or NULL if +// there was an error. The caller of the function is responsible for +// freeing the memory (using delete []). +uint8* Uncompress(const void* srcdata, int datasize, const UncompressFlags& flags, int* width, int* height, + int* components, // Output only: useful with autodetect + int64* nwarn); + +// Version of Uncompress that allocates memory via a callback. The callback +// arguments are (width, height, components). If the size is known ahead of +// time this function can return an existing buffer; passing a callback allows +// the buffer to be shaped based on the JPEG header. The caller is responsible +// for freeing the memory *even along error paths*. 
+uint8* Uncompress(const void* srcdata, int datasize, const UncompressFlags& flags, int64* nwarn, + std::function allocate_output); diff --git a/samples/c_cxx/imagenet/local_filesystem.h b/samples/c_cxx/imagenet/local_filesystem.h new file mode 100644 index 0000000000000..dbb2055be0b11 --- /dev/null +++ b/samples/c_cxx/imagenet/local_filesystem.h @@ -0,0 +1,142 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include +#include +#include +#include +#ifdef _WIN32 +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#endif +#include +void ReadFileAsString(const ORTCHAR_T* fname, void*& p, size_t& len); + +enum class OrtFileType { TYPE_BLK, TYPE_CHR, TYPE_DIR, TYPE_FIFO, TYPE_LNK, TYPE_REG, TYPE_SOCK, TYPE_UNKNOWN }; +using TCharString = std::basic_string; + +#ifdef _WIN32 +inline OrtFileType DTToFileType(DWORD dwFileAttributes) { + if (dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) { + return OrtFileType::TYPE_DIR; + } + // TODO: test if it is reg + return OrtFileType::TYPE_REG; +} + +inline std::string FormatErrorCode(DWORD dw) { + char* lpMsgBuf; + FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, dw, + MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL); + std::string s(lpMsgBuf); + LocalFree(lpMsgBuf); + return s; +} + +template +void LoopDir(const std::wstring& dir_name, T func) { + std::wstring pattern = dir_name + L"\\*"; + WIN32_FIND_DATAW ffd; + std::unique_ptr hFind(FindFirstFileW(pattern.c_str(), &ffd), FindClose); + if (hFind.get() == INVALID_HANDLE_VALUE) { + DWORD dw = GetLastError(); + std::string s = FormatErrorCode(dw); + throw std::runtime_error(s); + } + do { + if (!func(ffd.cFileName, DTToFileType(ffd.dwFileAttributes))) return; + } while (FindNextFileW(hFind.get(), &ffd) != 0); + DWORD dwError = GetLastError(); + if (dwError != ERROR_NO_MORE_FILES) { + DWORD dw = GetLastError(); + std::string s = FormatErrorCode(dw); + throw std::runtime_error(s); + } +} +#else + +inline void ReportSystemError(const char* operation_name, const TCharString& path) { + auto e = errno; + char buf[1024]; + const char* msg = ""; + if (e > 0) { +#if defined(__GLIBC__) && defined(_GNU_SOURCE) && !defined(__ANDROID__) + msg = strerror_r(e, buf, sizeof(buf)); +#else + // for Mac OS X and Android lower than API 23 + if (strerror_r(e, buf, sizeof(buf)) != 0) { + buf[0] = '\0'; + } + msg = buf; +#endif + } + std::ostringstream oss; + oss << operation_name << " file \"" << path << "\" failed: " << msg; + throw std::runtime_error(oss.str()); +} + +inline OrtFileType DTToFileType(unsigned char t) { + switch (t) { + case DT_BLK: + return OrtFileType::TYPE_BLK; + case DT_CHR: + return OrtFileType::TYPE_CHR; + case DT_DIR: + return OrtFileType::TYPE_DIR; + case DT_FIFO: + return OrtFileType::TYPE_FIFO; + case DT_LNK: + return OrtFileType::TYPE_LNK; + case DT_REG: + return OrtFileType::TYPE_REG; + case DT_SOCK: + return OrtFileType::TYPE_SOCK; + default: + return OrtFileType::TYPE_UNKNOWN; + } +} + +template +void LoopDir(const TCharString& dir_name, T func) { + DIR* dir = opendir(dir_name.c_str()); + if (dir == nullptr) { + auto e = errno; + char buf[1024]; + char* msg; +#if defined(__GLIBC__) && defined(_GNU_SOURCE) && !defined(__ANDROID__) + msg = strerror_r(e, buf, sizeof(buf)); +#else + if (strerror_r(e, buf, sizeof(buf)) != 0) { + buf[0] = '\0'; + } + msg = buf; +#endif + std::ostringstream oss; + oss << 
"couldn't open '" << dir_name << "':" << msg; + std::string s = oss.str(); + throw std::runtime_error(s); + } + try { + struct dirent* dp; + while ((dp = readdir(dir)) != nullptr) { + if (!func(dp->d_name, DTToFileType(dp->d_type))) { + break; + } + } + } catch (std::exception& ex) { + closedir(dir); + throw; + } + closedir(dir); +} +#endif \ No newline at end of file diff --git a/samples/c_cxx/imagenet/local_filesystem_posix.cc b/samples/c_cxx/imagenet/local_filesystem_posix.cc new file mode 100644 index 0000000000000..ba96f04619381 --- /dev/null +++ b/samples/c_cxx/imagenet/local_filesystem_posix.cc @@ -0,0 +1,50 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "local_filesystem.h" +#include +#include + +static std::mutex m; + +void ReadFileAsString(const ORTCHAR_T* fname, void*& p, size_t& len) { + std::lock_guard g(m); + if (!fname) { + throw std::runtime_error("ReadFileAsString: 'fname' cannot be NULL"); + } + int fd = open(fname, O_RDONLY); + if (fd < 0) { + return ReportSystemError("open", fname); + } + struct stat stbuf; + if (fstat(fd, &stbuf) != 0) { + return ReportSystemError("fstat", fname); + } + + if (!S_ISREG(stbuf.st_mode)) { + throw std::runtime_error("ReadFileAsString: input is not a regular file"); + } + // TODO:check overflow + len = static_cast(stbuf.st_size); + + if (len == 0) { + p = nullptr; + } else { + char* buffer = reinterpret_cast(malloc(len)); + char* wptr = reinterpret_cast(buffer); + auto length_remain = len; + do { + size_t bytes_to_read = length_remain; + ssize_t bytes_read; + TEMP_FAILURE_RETRY(bytes_read = read(fd, wptr, bytes_to_read)); + if (bytes_read <= 0) { + return ReportSystemError("read", fname); + } + assert(static_cast(bytes_read) <= bytes_to_read); + wptr += bytes_read; + length_remain -= bytes_read; + } while (length_remain > 0); + p = buffer; + } + close(fd); +} diff --git a/samples/c_cxx/imagenet/local_filesystem_win.cc b/samples/c_cxx/imagenet/local_filesystem_win.cc new file mode 100644 index 0000000000000..ccb66120c4758 --- /dev/null +++ b/samples/c_cxx/imagenet/local_filesystem_win.cc @@ -0,0 +1,70 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "local_filesystem.h" +#include +#include + +static std::mutex m; + +void ReadFileAsString(const ORTCHAR_T* fname, void*& p, size_t& len) { + if (!fname) { + throw std::runtime_error("ReadFileAsString: 'fname' cannot be NULL"); + } + + HANDLE hFile = CreateFileW(fname, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); + if (hFile == INVALID_HANDLE_VALUE) { + int err = GetLastError(); + std::ostringstream oss; + oss << "open file " << fname << " fail, errcode =" << err; + throw std::runtime_error(oss.str().c_str()); + } + std::unique_ptr handler_holder(hFile, CloseHandle); + LARGE_INTEGER filesize; + if (!GetFileSizeEx(hFile, &filesize)) { + int err = GetLastError(); + std::ostringstream oss; + oss << "GetFileSizeEx file " << fname << " fail, errcode =" << err; + throw std::runtime_error(oss.str().c_str()); + } + if (static_cast(filesize.QuadPart) > std::numeric_limits::max()) { + throw std::runtime_error("ReadFileAsString: File is too large"); + } + len = static_cast(filesize.QuadPart); + // check the file file for avoiding allocating a zero length buffer + if (len == 0) { // empty file + p = nullptr; + len = 0; + return; + } + std::unique_ptr buffer(reinterpret_cast(malloc(len))); + char* wptr = reinterpret_cast(buffer.get()); + size_t length_remain = len; + DWORD bytes_read = 0; + for (; length_remain > 0; wptr += bytes_read, length_remain -= bytes_read) { + // read at most 1GB each time + DWORD bytes_to_read; + if (length_remain > (1 << 30)) { + bytes_to_read = 1 << 30; + } else { + bytes_to_read = static_cast(length_remain); + } + if (ReadFile(hFile, wptr, bytes_to_read, &bytes_read, nullptr) != TRUE) { + int err = GetLastError(); + p = nullptr; + len = 0; + std::ostringstream oss; + oss << "ReadFile " << fname << " fail, errcode =" << err; + throw std::runtime_error(oss.str().c_str()); + } + if (bytes_read != bytes_to_read) { + p = nullptr; + len = 0; + std::ostringstream oss; + oss << "ReadFile " << fname << " fail: unexpected end"; + throw std::runtime_error(oss.str().c_str()); + } + } + p = buffer.release(); + return; +} diff --git a/samples/c_cxx/imagenet/main.cc b/samples/c_cxx/imagenet/main.cc new file mode 100644 index 0000000000000..1077bcd40789f --- /dev/null +++ b/samples/c_cxx/imagenet/main.cc @@ -0,0 +1,262 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
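
Both ReadFileAsString() implementations return a malloc-allocated buffer through an out-parameter and signal failure by throwing, so the natural calling pattern is read, guard, free. A small sketch (for reference, not part of the patch; ReadAndMeasure is our own name):

```
// Exception-safe use of ReadFileAsString(); the buffer is malloc-allocated on both platforms.
#include <cstdlib>
#include <memory>

#include "local_filesystem.h"

size_t ReadAndMeasure(const ORTCHAR_T* path) {
  void* data = nullptr;
  size_t len = 0;
  ReadFileAsString(path, data, len);  // throws std::runtime_error on failure
  std::unique_ptr<void, decltype(&std::free)> guard(data, &std::free);  // data is nullptr for an empty file
  // ... consume the len bytes at data here ...
  return len;
}
```
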
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "providers.h" +#include "local_filesystem.h" +#include "sync_api.h" + +#include + +#include "image_loader.h" +#include "async_ring_buffer.h" +#include +#include +#ifdef _WIN32 +#include +#endif +using namespace std::chrono; + +class Validator : public OutputCollector { + private: + static std::vector ReadFileToVec(const TCharString& file_path, size_t expected_line_count) { + std::ifstream ifs(file_path); + if (!ifs) { + throw std::runtime_error("open file failed"); + } + std::string line; + std::vector labels; + while (std::getline(ifs, line)) { + if (!line.empty()) labels.push_back(line); + } + if (labels.size() != expected_line_count) { + std::ostringstream oss; + oss << "line count mismatch, expect " << expected_line_count << " from " << file_path.c_str() << ", got " + << labels.size(); + throw std::runtime_error(oss.str()); + } + return labels; + } + + // input file name has pattern like: + //"C:\tools\imagnet_validation_data\ILSVRC2012_val_00000001.JPEG" + //"C:\tools\imagnet_validation_data\ILSVRC2012_val_00000002.JPEG" + static int ExtractImageNumberFromFileName(const TCharString& image_file) { + size_t s = image_file.rfind('.'); + if (s == std::string::npos) throw std::runtime_error("illegal filename"); + size_t s2 = image_file.rfind('_'); + if (s2 == std::string::npos) throw std::runtime_error("illegal filename"); + + const ORTCHAR_T* start_ptr = image_file.c_str() + s2 + 1; + const ORTCHAR_T* endptr = nullptr; + long value = my_strtol(start_ptr, (ORTCHAR_T**)&endptr, 10); + if (start_ptr == endptr || value > INT32_MAX || value <= 0) throw std::runtime_error("illegal filename"); + return static_cast(value); + } + + static void VerifyInputOutputCount(OrtSession* session) { + size_t count; + ORT_THROW_ON_ERROR(OrtSessionGetInputCount(session, &count)); + assert(count == 1); + ORT_THROW_ON_ERROR(OrtSessionGetOutputCount(session, &count)); + assert(count == 1); + } + + OrtSession* session_ = nullptr; + const int output_class_count_ = 1001; + std::vector labels_; + std::vector validation_data_; + std::atomic top_1_correct_count_; + std::atomic finished_count_; + int image_size_; + + std::mutex m_; + char* input_name_ = nullptr; + char* output_name_ = nullptr; + OrtEnv* const env_; + const TCharString model_path_; + system_clock::time_point start_time_; + + public: + int GetImageSize() const { return image_size_; } + + ~Validator() { + free(input_name_); + free(output_name_); + OrtReleaseSession(session_); + } + + void PrintResult() { + if (finished_count_ == 0) return; + printf("Top-1 Accuracy %f\n", ((float)top_1_correct_count_.load() / finished_count_)); + } + + void ResetCache() override { + OrtReleaseSession(session_); + CreateSession(); + } + + void CreateSession() { + OrtSessionOptions* session_option; + ORT_THROW_ON_ERROR(OrtCreateSessionOptions(&session_option)); +#ifdef USE_CUDA + ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_CUDA(session_option, 0)); +#endif + ORT_THROW_ON_ERROR(OrtCreateSession(env_, model_path_.c_str(), session_option, &session_)); + OrtReleaseSessionOptions(session_option); + } + + Validator(OrtEnv* env, const TCharString& model_path, const TCharString& label_file_path, + const TCharString& validation_file_path, size_t input_image_count) + : labels_(ReadFileToVec(label_file_path, 1000)), + validation_data_(ReadFileToVec(validation_file_path, input_image_count)), + top_1_correct_count_(0), + finished_count_(0), + 
env_(env), + model_path_(model_path) { + CreateSession(); + VerifyInputOutputCount(session_); + OrtAllocator* ort_alloc; + ORT_THROW_ON_ERROR(OrtCreateDefaultAllocator(&ort_alloc)); + { + char* t; + ORT_THROW_ON_ERROR(OrtSessionGetInputName(session_, 0, ort_alloc, &t)); + input_name_ = my_strdup(t); + OrtAllocatorFree(ort_alloc, t); + ORT_THROW_ON_ERROR(OrtSessionGetOutputName(session_, 0, ort_alloc, &t)); + output_name_ = my_strdup(t); + OrtAllocatorFree(ort_alloc, t); + } + + OrtReleaseAllocator(ort_alloc); + OrtTypeInfo* info; + ORT_THROW_ON_ERROR(OrtSessionGetInputTypeInfo(session_, 0, &info)); + const OrtTensorTypeAndShapeInfo* tensor_info; + ORT_THROW_ON_ERROR(OrtCastTypeInfoToTensorInfo(info, &tensor_info)); + size_t dim_count; + ORT_THROW_ON_ERROR(OrtGetDimensionsCount(tensor_info, &dim_count)); + assert(dim_count == 4); + std::vector dims(dim_count); + ORT_THROW_ON_ERROR(OrtGetDimensions(tensor_info, dims.data(), dims.size())); + if (dims[1] != dims[2] || dims[3] != 3) { + throw std::runtime_error("This model is not supported by this program. input tensor need be in NHWC format"); + } + + image_size_ = static_cast(dims[1]); + start_time_ = system_clock::now(); + } + + void operator()(const std::vector& task_id_list, const OrtValue* input_tensor) override { + { + std::lock_guard l(m_); + const size_t remain = task_id_list.size(); + OrtValue* output_tensor = nullptr; + ORT_THROW_ON_ERROR(OrtRun(session_, nullptr, &input_name_, &input_tensor, 1, &output_name_, 1, &output_tensor)); + float* probs; + ORT_THROW_ON_ERROR(OrtGetTensorMutableData(output_tensor, (void**)&probs)); + for (const auto& s : task_id_list) { + float* end = probs + output_class_count_; + float* max_p = std::max_element(probs + 1, end); + auto max_prob_index = std::distance(probs, max_p); + assert(max_prob_index >= 1); + int test_data_id = ExtractImageNumberFromFileName(s); + assert(test_data_id >= 1); + if (labels_[max_prob_index - 1] == validation_data_[test_data_id - 1]) { + ++top_1_correct_count_; + } + probs = end; + } + size_t finished = finished_count_ += static_cast(remain); + float progress = static_cast(finished) / validation_data_.size(); + auto elapsed = system_clock::now() - start_time_; + auto eta = progress > 0 ? duration_cast(elapsed * (1 - progress) / progress).count() : 9999999; + float accuracy = finished > 0 ? 
top_1_correct_count_ / static_cast(finished) : 0; + printf("accuracy = %.2f, progress %.2f%%, expect to be finished in %d minutes\n", accuracy, progress * 100, eta); + OrtReleaseValue(output_tensor); + } + } +}; + +int real_main(int argc, ORTCHAR_T* argv[]) { + if (argc < 6) return -1; + std::vector image_file_paths; + TCharString data_dir = argv[1]; + TCharString model_path = argv[2]; + // imagenet_lsvrc_2015_synsets.txt + TCharString label_file_path = argv[3]; + TCharString validation_file_path = argv[4]; + const int batch_size = std::stoi(argv[5]); + + // TODO: remove the slash at the end of data_dir string + LoopDir(data_dir, [&data_dir, &image_file_paths](const ORTCHAR_T* filename, OrtFileType filetype) -> bool { + if (filetype != OrtFileType::TYPE_REG) return true; + if (filename[0] == '.') return true; + const ORTCHAR_T* p = my_strrchr(filename, '.'); + if (p == nullptr) return true; + // as we tested filename[0] is not '.', p should larger than filename + assert(p > filename); + if (my_strcasecmp(p, ORT_TSTR(".JPEG")) != 0 && my_strcasecmp(p, ORT_TSTR(".JPG")) != 0) return true; + TCharString v(data_dir); +#ifdef _WIN32 + v.append(1, '\\'); +#else + v.append(1, '/'); +#endif + v.append(filename); + image_file_paths.emplace_back(v); + return true; + }); + + std::vector data; + Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "Default"); + + Validator v(env, model_path, label_file_path, validation_file_path, image_file_paths.size()); + + //Which image size does the model expect? 224, 299, or ...? + int image_size = v.GetImageSize(); + const int channels = 3; + std::atomic finished(0); + + InceptionPreprocessing prepro(image_size, image_size, channels); + Controller c; + AsyncRingBuffer::iterator> buffer(batch_size, 160, c, image_file_paths.begin(), + image_file_paths.end(), &prepro, &v); + buffer.StartDownloadTasks(); + std::string err = c.Wait(); + if (err.empty()) { + buffer.ProcessRemain(); + v.PrintResult(); + return 0; + } + fprintf(stderr, "%s\n", err.c_str()); + return -1; +} +#ifdef _WIN32 +int wmain(int argc, ORTCHAR_T* argv[]) { + HRESULT hr = CoInitializeEx(NULL, COINIT_MULTITHREADED); + if (!SUCCEEDED(hr)) return -1; +#else +int main(int argc, ORTCHAR_T* argv[]) { +#endif + int ret = -1; + try { + ret = real_main(argc, argv); + } catch (const std::exception& ex) { + fprintf(stderr, "%s\n", ex.what()); + } +#ifdef _WIN32 + CoUninitialize(); +#endif + return ret; +} \ No newline at end of file diff --git a/samples/c_cxx/imagenet/resize_image_cmd.cc b/samples/c_cxx/imagenet/resize_image_cmd.cc new file mode 100644 index 0000000000000..86a1588f966d7 --- /dev/null +++ b/samples/c_cxx/imagenet/resize_image_cmd.cc @@ -0,0 +1,65 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +// A simple tool to test if the image resizing code works + +#include "image_loader.h" +#include "CachedInterpolation.h" +#include +#include "local_filesystem.h" +#include "jpeg_mem.h" + +#include + +int main(int argc, char* argv[]) { + std::string file_name(argv[1]); + std::string output_file_name(argv[2]); + int out_width = 299; + int out_height = 299; + + int width; + int height; + int channels; + + UncompressFlags flags; + flags.components = 3; + // The TensorFlow-chosen default for jpeg decoding is IFAST, sacrificing + // image quality for speed. 
+ flags.dct_method = JDCT_IFAST; + size_t file_len; + void* file_data; + ReadFileAsString(file_name.c_str(), file_data, file_len); + uint8_t* image_data = Uncompress(file_data, file_len, flags, &width, &height, &channels, nullptr); + free(file_data); + + if (channels != 3) { + std::ostringstream oss; + oss << "input format error, expect 3 channels, got " << channels; + throw std::runtime_error(oss.str()); + } + + std::vector output_data(height * width * channels); + + ResizeImageInMemory((uint8_t*)image_data, output_data.data(), height, width, out_height, out_width, channels); + delete[](uint8*) image_data; + + std::vector model_output_bytes(output_data.size()); + for (size_t i = 0; i != output_data.size(); ++i) { + model_output_bytes[i] = (png_byte)(output_data[i]); + } + + png_image image; + memset(&image, 0, (sizeof image)); + image.version = PNG_IMAGE_VERSION; + image.format = PNG_FORMAT_RGB; + image.height = out_height; + image.width = out_width; + + if (png_image_write_to_file(&image, output_file_name.c_str(), 0 /*convert_to_8bit*/, model_output_bytes.data(), + 0 /*row_stride*/, nullptr /*colormap*/) == 0) { + printf("write to '%s' failed:%s\n", output_file_name.c_str(), image.message); + return -1; + } + + return 0; +} \ No newline at end of file diff --git a/samples/c_cxx/imagenet/runnable_task.h b/samples/c_cxx/imagenet/runnable_task.h new file mode 100644 index 0000000000000..c7ba25deefa88 --- /dev/null +++ b/samples/c_cxx/imagenet/runnable_task.h @@ -0,0 +1,12 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include +#include "sync_api.h" + +class RunnableTask : public std::unary_function { + public: + virtual void operator()(_Inout_opt_ ONNXRUNTIME_CALLBACK_INSTANCE pci) noexcept = 0; + virtual ~RunnableTask() = default; +}; diff --git a/samples/c_cxx/imagenet/single_consumer.h b/samples/c_cxx/imagenet/single_consumer.h new file mode 100644 index 0000000000000..aa08524a7617d --- /dev/null +++ b/samples/c_cxx/imagenet/single_consumer.h @@ -0,0 +1,90 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
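
RunnableTask is the unit of work the async ring buffer hands to the thread pool: a nothrow call operator that receives the callback instance. A hypothetical adapter, not something the sample defines, that wraps an arbitrary callable might look like this:

```
// Wrap a std::function as a RunnableTask; FunctionTask is a hypothetical helper.
#include <functional>
#include <utility>

#include "runnable_task.h"

class FunctionTask : public RunnableTask {
 public:
  explicit FunctionTask(std::function<void()> fn) : fn_(std::move(fn)) {}

  void operator()(ONNXRUNTIME_CALLBACK_INSTANCE /*pci*/) noexcept override {
    fn_();  // the interface is noexcept, so the wrapped callable must not throw
  }

 private:
  std::function<void()> fn_;
};
```
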
+ +#pragma once + +#include +#include + +/** + * A special FIFO that is restricted to have only one consumer + * The consumer must return the previous borrowed item before taking the next + */ +template +class SingleConsumerFIFO { + public: + struct ListEntry { + ValueType value; + ListEntry* next = nullptr; + }; + + private: + // fixed size + ListEntry* values_; + ListEntry* free_list_ = nullptr; + // whenever free_list_ is nullptr, free_list_tail_ should equal to &free_list_; + ListEntry** free_list_tail_ = &free_list_; + bool is_consumer_running_ = false; + size_t len_; +#ifndef NDEBUG + size_t count_ = 0; +#endif + public: + explicit SingleConsumerFIFO(size_t len) : values_(new ListEntry[len]), len_(len) {} + + // destruct values earlier + void Release() { + delete[] values_; + values_ = nullptr; + } + ~SingleConsumerFIFO() noexcept { delete[] values_; } + + template + void Init(const T& t) { + for (size_t i = 0; i != len_; ++i) { + t(values_[i].value); + } + } + + /** + * Return a borrowed item + * @param e a pointer returned from the Take() function + * @return ID of the entry, in [0,len) + */ + size_t Return(ListEntry* e) { + is_consumer_running_ = false; + return e - values_; + } + + template + void Put(size_t element_id, const FUNC& f) { + assert(element_id < len_); +#ifndef NDEBUG + ++count_; +#endif + + // printf("Append %zd to the free list\n", element_id); + ListEntry* t = &values_[element_id]; + t->next = nullptr; + (*free_list_tail_) = t; + free_list_tail_ = &t->next; + f(t->value); + } + + ListEntry* Take() { + if (is_consumer_running_) return nullptr; + if (free_list_ == nullptr) { + is_consumer_running_ = false; + assert(count_ == 0); + return nullptr; + } + auto input_tensor = free_list_; + is_consumer_running_ = true; + if ((free_list_ = free_list_->next) == nullptr) free_list_tail_ = &free_list_; +#ifndef NDEBUG + --count_; + assert(free_list_ != nullptr || count_ == 0); +#endif + return input_tensor; + } +}; \ No newline at end of file diff --git a/samples/c_cxx/imagenet/sync_api.h b/samples/c_cxx/imagenet/sync_api.h new file mode 100644 index 0000000000000..4d574af1744f3 --- /dev/null +++ b/samples/c_cxx/imagenet/sync_api.h @@ -0,0 +1,62 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
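
SingleConsumerFIFO pre-allocates all of its entries and hands them out by slot index; the single consumer must Return() a borrowed entry before the next Take() can succeed, and the queue is not thread-safe by itself, so callers have to serialize access. A short sketch of that protocol (for reference, not part of the patch; the int element type and values are made up):

```
// The Put/Take/Return protocol of SingleConsumerFIFO.
#include <cassert>
#include <cstdio>

#include "single_consumer.h"

void FifoDemo() {
  SingleConsumerFIFO<int> fifo(4);      // four pre-allocated slots, ids 0..3
  fifo.Init([](int& v) { v = 0; });     // initialize every slot once

  fifo.Put(2, [](int& v) { v = 42; });  // producer publishes slot 2

  auto* entry = fifo.Take();            // consumer borrows the next ready slot
  assert(entry != nullptr && entry->value == 42);
  assert(fifo.Take() == nullptr);       // only one entry may be borrowed at a time

  size_t id = fifo.Return(entry);       // hand the slot back before taking another
  std::printf("slot %zu processed\n", id);
}
```
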
+ +#pragma once + +#ifdef _WIN32 +#include +#else +#include +#endif +#include +#include + +#ifdef _WIN32 +#define my_strtol wcstol +#define my_strrchr wcsrchr +#define my_strcasecmp _wcsicmp +#define my_strdup _strdup +#else +#define my_strtol strtol +#define my_strrchr strrchr +#define my_strcasecmp strcasecmp +#define my_strdup strdup +#endif + +#ifdef _WIN32 +using ONNXRUNTIME_CALLBACK_INSTANCE = PTP_CALLBACK_INSTANCE; +using ONNXRUNTIME_EVENT = HANDLE; +#define ONNXRUNTIME_CALLBACK __stdcall +using ONNXRUNTIME_WORK = PTP_WORK; +using PThreadPoolCallbackEnv = PTP_CALLBACK_ENVIRON; +using ONNXRUNTIME_CALLBACK_FUNCTION = PTP_WORK_CALLBACK; +#define OnnxRuntimeCloseThreadpoolWork CloseThreadpoolWork +inline PThreadPoolCallbackEnv GetDefaultThreadPool() { return nullptr; } +#else +#define ONNXRUNTIME_CALLBACK +namespace Eigen { +class ThreadPoolInterface; +} +using PThreadPoolCallbackEnv = Eigen::ThreadPoolInterface*; +#define ONNXRUNTIME_WORK void* +struct OnnxRuntimeEvent; +using ONNXRUNTIME_EVENT = OnnxRuntimeEvent*; + +class OnnxRuntimeCallbackInstance; +using ONNXRUNTIME_CALLBACK_INSTANCE = OnnxRuntimeCallbackInstance*; +using ONNXRUNTIME_CALLBACK_FUNCTION = void ONNXRUNTIME_CALLBACK (*)(ONNXRUNTIME_CALLBACK_INSTANCE pci, void* context, + ONNXRUNTIME_WORK work); +#endif + +// The returned value will be used with CreateAndSubmitThreadpoolWork function +PThreadPoolCallbackEnv GetDefaultThreadPool(); +// On Windows, the last parameter can be null, in that case it will use the default thread pool. +// On Linux, there is no per process default thread pool. You have to pass a non-null pointer. +// Caller must delete the data pointer if this function returns a non-ok status. Otherwise, the ownership is transferred +void CreateAndSubmitThreadpoolWork(_In_ ONNXRUNTIME_CALLBACK_FUNCTION callback, _In_ void* data, + _In_opt_ PThreadPoolCallbackEnv pool); +ONNXRUNTIME_EVENT CreateOnnxRuntimeEvent(); +// pci is a pointer, can be NULL. If pci is NULL, signal the event immediately +void OnnxRuntimeSetEventWhenCallbackReturns(_Inout_opt_ ONNXRUNTIME_CALLBACK_INSTANCE pci, + _In_ ONNXRUNTIME_EVENT finish_event); +void WaitAndCloseEvent(_In_ ONNXRUNTIME_EVENT finish_event); diff --git a/samples/c_cxx/imagenet/sync_api_posix.cc b/samples/c_cxx/imagenet/sync_api_posix.cc new file mode 100644 index 0000000000000..00363645f3d29 --- /dev/null +++ b/samples/c_cxx/imagenet/sync_api_posix.cc @@ -0,0 +1,109 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
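
sync_api.h wraps the Windows thread-pool primitives and gives Linux an equivalent surface. Below is a sketch of the intended flow, following the signatures declared in this header (DoWork and RunOneTask are our own names; note the POSIX implementation further down uses Status-returning variants of the same functions, so treat this strictly as a sketch of the header's Windows-flavored interface):

```
// Submit one task to the pool and wait for it via an event.
#include "sync_api.h"

static void ONNXRUNTIME_CALLBACK DoWork(ONNXRUNTIME_CALLBACK_INSTANCE pci, void* context,
                                        ONNXRUNTIME_WORK /*work*/) {
  // ... run the background task here ...
  auto finish_event = static_cast<ONNXRUNTIME_EVENT>(context);
  // Signal once this callback has fully returned (or immediately if pci is null).
  OnnxRuntimeSetEventWhenCallbackReturns(pci, finish_event);
}

void RunOneTask() {
  ONNXRUNTIME_EVENT finish_event = CreateOnnxRuntimeEvent();
  // nullptr selects the process-default pool on Windows; Linux requires a real pool pointer.
  CreateAndSubmitThreadpoolWork(DoWork, finish_event, GetDefaultThreadPool());
  WaitAndCloseEvent(finish_event);  // blocks until DoWork signals, then closes the event
}
```
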
+ +#include "sync_api.h" +#include +#include +#include +#include +#include "simple_thread_pool.h" +#include "onnxruntime_event.h" + +using onnxruntime::common::Status; + +// this can be passed to one of the following functions: +// OnnxRuntimeSetEventWhenCallbackReturns +class OnnxRuntimeCallbackInstance { + private: + std::vector events_to_signal_; + + public: + void AddEvent(ONNXRUNTIME_EVENT event); + onnxruntime::common::Status SignalAllEvents(); +}; + +Status WaitAndCloseEvent(ONNXRUNTIME_EVENT finish_event) { + if (finish_event == nullptr) + return Status(onnxruntime::common::ONNXRUNTIME, onnxruntime::common::INVALID_ARGUMENT, ""); + pthread_mutex_lock(&finish_event->finish_event_mutex); + while (!finish_event->finished) { + pthread_cond_wait(&finish_event->finish_event_data, &finish_event->finish_event_mutex); + } + pthread_mutex_unlock(&finish_event->finish_event_mutex); + delete finish_event; + return Status::OK(); +} + +Status CreateAndSubmitThreadpoolWork(ONNXRUNTIME_CALLBACK_FUNCTION callback, void* data, PThreadPool pool) { + if (callback == nullptr) + return Status(onnxruntime::common::ONNXRUNTIME, onnxruntime::common::INVALID_ARGUMENT, "callback cannot be NULL"); + if (pool == nullptr) + return Status(onnxruntime::common::ONNXRUNTIME, onnxruntime::common::INVALID_ARGUMENT, "pool cannot be NULL"); + pool->Schedule([=]() { + OnnxRuntimeCallbackInstance instance; + callback(&instance, data, nullptr); + Status st = instance.SignalAllEvents(); + if (!st.IsOK()) { + LOGF_DEFAULT(ERROR, "SignalAllEvents failed:%s. aborting...\n", st.ErrorMessage().c_str()); + abort(); + } + }); + return Status::OK(); +} + +using DefaultThreadPoolType = onnxruntime::SimpleThreadPoolTempl; +static std::unique_ptr default_pool; +static std::once_flag default_pool_init; + +PThreadPool GetDefaultThreadPool(const onnxruntime::Env& env) { + std::call_once(default_pool_init, [&env] { + int core_num = env.GetNumCpuCores(); + default_pool.reset(new DefaultThreadPoolType(core_num, env)); + }); + return default_pool.get(); +} + +Status OnnxRuntimeSetEventWhenCallbackReturns(ONNXRUNTIME_CALLBACK_INSTANCE pci, ONNXRUNTIME_EVENT finish_event) { + if (finish_event == nullptr) + return Status(onnxruntime::common::ONNXRUNTIME, onnxruntime::common::INVALID_ARGUMENT, ""); + + if (pci == nullptr) { + if (pthread_mutex_lock(&finish_event->finish_event_mutex)) { + return ONNXRUNTIME_MAKE_STATUS(ONNXRUNTIME, FAIL, "lock failed"); + } + finish_event->finished = true; + if (pthread_mutex_unlock(&finish_event->finish_event_mutex)) + return ONNXRUNTIME_MAKE_STATUS(ONNXRUNTIME, FAIL, "unlock failed"); + if (!pthread_cond_broadcast(&finish_event->finish_event_data)) + return Status::OK(); + else + return ONNXRUNTIME_MAKE_STATUS(ONNXRUNTIME, FAIL, "pthread_cond_broadcast failed"); + } else { + pci->AddEvent(finish_event); + return Status::OK(); + } +} + +void OnnxRuntimeCallbackInstance::AddEvent(ONNXRUNTIME_EVENT event) { events_to_signal_.push_back(event); } + +Status OnnxRuntimeCallbackInstance::SignalAllEvents() { + for (ONNXRUNTIME_EVENT finish_event : events_to_signal_) { + if (pthread_mutex_lock(&finish_event->finish_event_mutex)) { + return ONNXRUNTIME_MAKE_STATUS(ONNXRUNTIME, FAIL, "lock failed"); + } + finish_event->finished = true; + if (pthread_mutex_unlock(&finish_event->finish_event_mutex)) + return ONNXRUNTIME_MAKE_STATUS(ONNXRUNTIME, FAIL, "unlock failed"); + if (pthread_cond_broadcast(&finish_event->finish_event_data)) + return ONNXRUNTIME_MAKE_STATUS(ONNXRUNTIME, FAIL, "pthread_cond_broadcast failed"); + } + 
return Status::OK(); +} + +Status CreateOnnxRuntimeEvent(ONNXRUNTIME_EVENT* out) { + if (out == nullptr) return Status(onnxruntime::common::ONNXRUNTIME, onnxruntime::common::INVALID_ARGUMENT, ""); + *out = new OnnxRuntimeEvent(); + return Status::OK(); +} + +void ONNXRuntimeCloseEvent(ONNXRUNTIME_EVENT finish_event) { delete finish_event; } diff --git a/samples/c_cxx/imagenet/sync_api_win.cc b/samples/c_cxx/imagenet/sync_api_win.cc new file mode 100644 index 0000000000000..afada9c03efb7 --- /dev/null +++ b/samples/c_cxx/imagenet/sync_api_win.cc @@ -0,0 +1,41 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "sync_api.h" + +void CreateAndSubmitThreadpoolWork(_In_ ONNXRUNTIME_CALLBACK_FUNCTION callback, _In_ void* data, + _In_opt_ PThreadPoolCallbackEnv pool) { + PTP_WORK work = CreateThreadpoolWork(callback, data, pool); + if (!work) { + throw std::runtime_error("create thread pool task failed"); + } + SubmitThreadpoolWork(work); +} + +void WaitAndCloseEvent(_In_ ONNXRUNTIME_EVENT finish_event) { + DWORD dwWaitResult = WaitForSingleObject(finish_event, INFINITE); + (void)CloseHandle(finish_event); + if (dwWaitResult != WAIT_OBJECT_0) { + throw std::runtime_error("WaitForSingleObject failed"); + } +} + +ONNXRUNTIME_EVENT CreateOnnxRuntimeEvent() { + HANDLE finish_event = CreateEvent(NULL, // default security attributes + TRUE, // manual-reset event + FALSE, // initial state is nonsignaled + NULL); + if (finish_event == NULL) { + throw std::runtime_error("unable to create finish event"); + } + return finish_event; +} + +void OnnxRuntimeSetEventWhenCallbackReturns(_Inout_opt_ ONNXRUNTIME_CALLBACK_INSTANCE pci, + _In_ ONNXRUNTIME_EVENT finish_event) { + if (pci) + SetEventWhenCallbackReturns(pci, finish_event); + else if (!SetEvent(finish_event)) { + throw std::runtime_error("SetEvent failed"); + } +} diff --git a/samples/c_cxx/imagenet/taskflow.png b/samples/c_cxx/imagenet/taskflow.png new file mode 100644 index 0000000000000..1208a169e3fc8 Binary files /dev/null and b/samples/c_cxx/imagenet/taskflow.png differ diff --git a/samples/c_cxx/include/providers.h b/samples/c_cxx/include/providers.h new file mode 100644 index 0000000000000..8b2dbe1193f27 --- /dev/null +++ b/samples/c_cxx/include/providers.h @@ -0,0 +1,24 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include "onnxruntime/core/providers/cpu/cpu_provider_factory.h" + +#ifdef USE_CUDA +#include "onnxruntime/core/providers/cuda/cuda_provider_factory.h" +#endif +#ifdef USE_MKLDNN +#include "onnxruntime/core/providers/mkldnn/mkldnn_provider_factory.h" +#endif +#ifdef USE_NGRAPH +#include "onnxruntime/core/providers/ngraph/ngraph_provider_factory.h" +#endif +#ifdef USE_NUPHAR +#include "onnxruntime/core/providers/nuphar/nuphar_provider_factory.h" +#endif +#if USE_BRAINSLICE +#include "onnxruntime/core/providers/brainslice/brainslice_provider_factory.h" +#endif +#ifdef USE_TENSORRT +#include "onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.h" +#endif diff --git a/samples/c_cxx/vs.png b/samples/c_cxx/vs.png new file mode 100644 index 0000000000000..a886e277cbb6f Binary files /dev/null and b/samples/c_cxx/vs.png differ diff --git a/setup.py b/setup.py index 03413d8bcc3cf..aa7cc8cdabc86 100644 --- a/setup.py +++ b/setup.py @@ -3,9 +3,14 @@ # Licensed under the MIT License. 
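
The guarded includes in providers.h are meant to be paired with the matching OrtSessionOptionsAppendExecutionProvider_* call when session options are built, as the USE_CUDA block in main.cc does. A sketch of that pattern, assuming the same onnxruntime C API headers and ORT_THROW_ON_ERROR macro that main.cc already uses (ConfigureProviders is our own name; only the CUDA call is shown because it is the one the sample exercises):

```
// Append the optional execution providers this build was configured with.
#include "providers.h"

void ConfigureProviders(OrtSessionOptions* session_options) {
#ifdef USE_CUDA
  // Device id 0, mirroring Validator::CreateSession() in main.cc.
  ORT_THROW_ON_ERROR(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0));
#endif
  // With nothing appended explicitly, onnxruntime falls back to the default CPU provider.
}
```
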
#-------------------------------------------------------------------------- -from setuptools import setup, find_packages -from os import path, getcwd +from setuptools import setup, find_packages, Extension +from distutils import log as logger +from distutils.command.build_ext import build_ext as _build_ext +from glob import glob +from os import path, getcwd, environ, remove +from shutil import copyfile import platform +import subprocess import sys import datetime @@ -38,12 +43,69 @@ nightly_build = True sys.argv.remove('--nightly_build') +is_manylinux2010 = False +if environ.get('AUDITWHEEL_PLAT', None) == 'manylinux2010_x86_64': + is_manylinux2010 = True + + +class build_ext(_build_ext): + def build_extension(self, ext): + dest_file = self.get_ext_fullpath(ext.name) + logger.info('copying %s -> %s', ext.sources[0], dest_file) + copyfile(ext.sources[0], dest_file) + + try: from wheel.bdist_wheel import bdist_wheel as _bdist_wheel class bdist_wheel(_bdist_wheel): def finalize_options(self): _bdist_wheel.finalize_options(self) - self.root_is_pure = False + if not is_manylinux2010: + self.root_is_pure = False + + def _rewrite_ld_preload(self, to_preload): + with open('onnxruntime/capi/_ld_preload.py', 'rt') as f: + ld_preload = f.read().splitlines() + with open('onnxruntime/capi/_ld_preload.py', 'wt') as f: + for line in ld_preload: + f.write(line) + f.write('\n') + if 'LD_PRELOAD_BEGIN_MARK' in line: + break + if len(to_preload) > 0: + f.write('from ctypes import CDLL, RTLD_GLOBAL\n') + for library in to_preload: + f.write('_{} = CDLL("{}", mode=RTLD_GLOBAL)\n'.format(library.split('.')[0], library)) + + def run(self): + if is_manylinux2010: + source = 'onnxruntime/capi/onnxruntime_pybind11_state.so' + dest = 'onnxruntime/capi/onnxruntime_pybind11_state_manylinux2010.so' + logger.info('copying %s -> %s', source, dest) + copyfile(source, dest) + result = subprocess.run(['patchelf', '--print-needed', dest], check=True, stdout=subprocess.PIPE, universal_newlines=True) + cuda_dependencies = ['libcublas.so', 'libcudnn.so', 'libcudart.so'] + to_preload = [] + args = ['patchelf', '--debug'] + for line in result.stdout.split('\n'): + for dependency in cuda_dependencies: + if dependency in line: + to_preload.append(line) + args.extend(['--remove-needed', line]) + args.append(dest) + if len(to_preload) > 0: + subprocess.run(args, check=True, stdout=subprocess.PIPE) + self._rewrite_ld_preload(to_preload) + _bdist_wheel.run(self) + if is_manylinux2010: + file = glob(path.join(self.dist_dir, '*linux*.whl'))[0] + logger.info('repairing %s for manylinux2010', file) + try: + subprocess.run(['auditwheel', 'repair', '--plat', 'manylinux2010_x86_64', '-w', self.dist_dir, file], check=True, stdout=subprocess.PIPE) + finally: + logger.info('removing %s', file) + remove(file) + except ImportError: bdist_wheel = None @@ -56,8 +118,20 @@ def finalize_options(self): libs = ['onnxruntime_pybind11_state.so', 'libmkldnn.0.dylib'] # TODO add libmklml and libiomp5 later. 
else: libs = ['onnxruntime_pybind11_state.pyd', 'mkldnn.dll', 'mklml.dll', 'libiomp5md.dll'] + libs.extend(['ngraph.dll', 'cpu_backend.dll', 'tbb.dll']) + +if is_manylinux2010: + data = [] + ext_modules = [ + Extension( + 'onnxruntime.capi.onnxruntime_pybind11_state', + ['onnxruntime/capi/onnxruntime_pybind11_state_manylinux2010.so'], + ), + ] +else: + data = [path.join('capi', x) for x in libs if path.isfile(path.join('onnxruntime', 'capi', x))] + ext_modules = [] -data = [path.join('capi', x) for x in libs if path.isfile(path.join('onnxruntime', 'capi', x))] python_modules_list = list() if '--use_openvino' in sys.argv: @@ -66,7 +140,7 @@ def finalize_options(self): sys.argv.remove('--use_openvino') # Additional examples -examples_names = ["mul_1.pb", "logreg_iris.onnx", "sigmoid.onnx"] +examples_names = ["mul_1.onnx", "logreg_iris.onnx", "sigmoid.onnx"] examples = [path.join('datasets', x) for x in examples_names] # Extra files such as EULA and ThirdPartyNotices @@ -98,7 +172,7 @@ def finalize_options(self): long_description=long_description, author='Microsoft Corporation', author_email='onnx@microsoft.com', - cmdclass={'bdist_wheel': bdist_wheel}, + cmdclass={'bdist_wheel': bdist_wheel, 'build_ext': build_ext}, license="MIT License", packages=['onnxruntime', 'onnxruntime.backend', @@ -106,6 +180,7 @@ def finalize_options(self): 'onnxruntime.datasets', 'onnxruntime.tools', ], + ext_modules=ext_modules, package_data={ 'onnxruntime': data + examples + extra, }, diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index c1f0a1629e35a..bbaa891ef282b 100755 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -111,6 +111,12 @@ def parse_arguments(): parser.add_argument("--arm64", action='store_true', help="Create ARM64 makefiles. Requires --update and no existing cache CMake setup. Delete CMakeCache.txt if needed") parser.add_argument("--msvc_toolset", help="MSVC toolset to use. e.g. 14.11") + parser.add_argument("--android", action='store_true', help='Build for Android') + parser.add_argument("--android_abi", type=str, default='arm64-v8a', + help='') + parser.add_argument("--android_api", type=int, default=27, + help='Android API Level, e.g. 
21') + parser.add_argument("--android_ndk_path", default="", help="Path to the Android NDK") # Arguments needed by CI parser.add_argument("--cmake_path", default="cmake", help="Path to the CMake program.") @@ -123,7 +129,8 @@ def parse_arguments(): parser.add_argument("--use_mklml", action='store_true', help="Build with MKLML.") parser.add_argument("--use_ngraph", action='store_true', help="Build with nGraph.") parser.add_argument("--use_openvino", nargs="?", const="CPU_FP32", - choices=["CPU_FP32","GPU_FP32","GPU_FP16","VAD-R_FP16","MYRIAD_FP16"], help="Build with OpenVINO for specific hardware.") + choices=["CPU_FP32","GPU_FP32","GPU_FP16","VAD-M_FP16","MYRIAD_FP16"], help="Build with OpenVINO for specific hardware.") + parser.add_argument("--use_dnnlibrary", action='store_true', help="Build with DNNLibrary.") parser.add_argument("--use_nsync", action='store_true', help="Build with NSYNC.") parser.add_argument("--use_preinstalled_eigen", action='store_true', help="Use pre-installed eigen.") parser.add_argument("--eigen_path", help="Path to pre-installed eigen.") @@ -146,6 +153,8 @@ def parse_arguments(): parser.add_argument("--skip_onnx_tests", action='store_true', help="Explicitly disable all onnx related tests") parser.add_argument("--enable_msvc_static_runtime", action='store_true', help="Enable static linking of MSVC runtimes.") parser.add_argument("--enable_language_interop_ops", action='store_true', help="Enable operator implemented in language other than cpp") + parser.add_argument("--cmake_generator", choices=['Visual Studio 15 2017', 'Visual Studio 16 2019'], + default='Visual Studio 15 2017', help="Specify the generator that CMake invokes. This is only supported on Windows") return parser.parse_args() def resolve_executable_path(command_or_path): @@ -181,6 +190,7 @@ def run_subprocess(args, cwd=None, capture=False, dll_path=None, shell=False): return subprocess.run(args, cwd=cwd, check=True, stdout=stdout, stderr=stderr, env=my_env, shell=shell) def update_submodules(source_dir): + run_subprocess(["git", "submodule", "sync", "--recursive"], cwd=source_dir) run_subprocess(["git", "submodule", "update", "--init", "--recursive"], cwd=source_dir) def is_docker(): @@ -308,7 +318,7 @@ def generate_build_tree(cmake_path, source_dir, build_dir, cuda_home, cudnn_home cmake_args = [cmake_path, cmake_dir, "-Donnxruntime_RUN_ONNX_TESTS=" + ("ON" if args.enable_onnx_tests else "OFF"), "-Donnxruntime_GENERATE_TEST_REPORTS=ON", - "-Donnxruntime_DEV_MODE=ON", + "-Donnxruntime_DEV_MODE=" + ("OFF" if args.android else "ON"), "-DPYTHON_EXECUTABLE=" + sys.executable, "-Donnxruntime_USE_CUDA=" + ("ON" if args.use_cuda else "OFF"), "-Donnxruntime_USE_NSYNC=" + ("OFF" if is_windows() or not args.use_nsync else "ON"), @@ -317,7 +327,7 @@ def generate_build_tree(cmake_path, source_dir, build_dir, cuda_home, cudnn_home "-Donnxruntime_USE_JEMALLOC=" + ("ON" if args.use_jemalloc else "OFF"), "-Donnxruntime_ENABLE_PYTHON=" + ("ON" if args.enable_pybind else "OFF"), "-Donnxruntime_BUILD_CSHARP=" + ("ON" if args.build_csharp else "OFF"), - "-Donnxruntime_BUILD_SHARED_LIB=" + ("ON" if args.build_shared_lib else "OFF"), + "-Donnxruntime_BUILD_SHARED_LIB=" + ("ON" if args.build_shared_lib or args.build_server else "OFF"), "-Donnxruntime_USE_EIGEN_FOR_BLAS=" + ("OFF" if args.use_openblas else "ON"), "-Donnxruntime_USE_OPENBLAS=" + ("ON" if args.use_openblas else "OFF"), "-Donnxruntime_USE_MKLDNN=" + ("ON" if args.use_mkldnn else "OFF"), @@ -330,8 +340,9 @@ def generate_build_tree(cmake_path, source_dir, 
build_dir, cuda_home, cudnn_home "-Donnxruntime_USE_OPENVINO_GPU_FP32=" + ("ON" if args.use_openvino == "GPU_FP32" else "OFF"), "-Donnxruntime_USE_OPENVINO_GPU_FP16=" + ("ON" if args.use_openvino == "GPU_FP16" else "OFF"), "-Donnxruntime_USE_OPENVINO_CPU_FP32=" + ("ON" if args.use_openvino == "CPU_FP32" else "OFF"), - "-Donnxruntime_USE_OPENVINO_VAD_R=" + ("ON" if args.use_openvino == "VAD-R_FP16" else "OFF"), - "-Donnxruntime_USE_OPENMP=" + ("ON" if args.use_openmp and not args.use_mklml and not args.use_ngraph else "OFF"), + "-Donnxruntime_USE_OPENVINO_VAD_M=" + ("ON" if args.use_openvino == "VAD-M_FP16" else "OFF"), + "-Donnxruntime_USE_NNAPI=" + ("ON" if args.use_dnnlibrary else "OFF"), + "-Donnxruntime_USE_OPENMP=" + ("ON" if args.use_openmp and not args.use_dnnlibrary and not args.use_mklml and not args.use_ngraph else "OFF"), "-Donnxruntime_USE_TVM=" + ("ON" if args.use_tvm else "OFF"), "-Donnxruntime_USE_LLVM=" + ("ON" if args.use_llvm else "OFF"), "-Donnxruntime_ENABLE_MICROSOFT_INTERNAL=" + ("ON" if args.enable_msinternal else "OFF"), @@ -369,11 +380,18 @@ def generate_build_tree(cmake_path, source_dir, build_dir, cuda_home, cudnn_home cmake_args += ["-Donnxruntime_USE_PREINSTALLED_EIGEN=ON", "-Deigen_SOURCE_PATH=" + args.eigen_path] + if args.android: + cmake_args += ["-DCMAKE_TOOLCHAIN_FILE=" + args.android_ndk_path + "/build/cmake/android.toolchain.cmake", + "-DANDROID_PLATFORM=android-" + str(args.android_api), + "-DANDROID_ABI=" + str(args.android_abi)] + if path_to_protoc_exe: cmake_args += ["-DONNX_CUSTOM_PROTOC_EXECUTABLE=%s" % path_to_protoc_exe] if args.gen_doc: cmake_args += ["-Donnxruntime_PYBIND_EXPORT_OPSCHEMA=ON"] + else: + cmake_args += ["-Donnxruntime_PYBIND_EXPORT_OPSCHEMA=OFF"] cmake_args += ["-D{}".format(define) for define in cmake_extra_defines] @@ -487,6 +505,8 @@ def setup_cuda_vars(args): "Current version is {}. 
CUDA 9.2 requires version 14.11.*".format(vc_ver_str), "If necessary manually install the 14.11 toolset using the Visual Studio 2017 updater.", "See 'Windows CUDA Build' in build.md in the root directory of this repository.") + + # TODO: check if cuda_version >=10.1, when cuda is enabled and VS version >=2019 return cuda_home, cudnn_home @@ -515,10 +535,29 @@ def setup_tensorrt_vars(args): return tensorrt_home +def adb_push(source_dir, src, dest, **kwargs): + return run_subprocess([os.path.join(source_dir, 'tools', 'ci_build', 'github', 'android', 'adb-push.sh'), src, dest], **kwargs) + +def adb_shell(*args, **kwargs): + return run_subprocess(['adb', 'shell', *args], **kwargs) + def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs, enable_python_tests, enable_tvm = False, enable_tensorrt = False, enable_ngraph = False): for config in configs: log.info("Running tests for %s configuration", config) cwd = get_config_build_dir(build_dir, config) + android_x86_64 = args.android_abi == 'x86_64' + if android_x86_64: + run_subprocess(os.path.join(source_dir, 'tools', 'ci_build', 'github', 'android', 'start_android_emulator.sh')) + adb_push(source_dir, 'testdata', '/data/local/tmp/', cwd=cwd) + adb_push(source_dir, os.path.join(source_dir, 'cmake', 'external', 'onnx', 'onnx', 'backend', 'test'), '/data/local/tmp/', cwd=cwd) + adb_push(source_dir, 'onnxruntime_test_all', '/data/local/tmp/', cwd=cwd) + adb_push(source_dir, 'onnx_test_runner', '/data/local/tmp/', cwd=cwd) + adb_shell('cd /data/local/tmp && /data/local/tmp/onnxruntime_test_all') + if args.use_dnnlibrary: + adb_shell('cd /data/local/tmp && /data/local/tmp/onnx_test_runner -e nnapi /data/local/tmp/test') + else: + adb_shell('cd /data/local/tmp && /data/local/tmp/onnx_test_runner /data/local/tmp/test') + continue if enable_tvm: dll_path = os.path.join(build_dir, config, "external", "tvm", config) elif enable_tensorrt: @@ -635,19 +674,25 @@ def split_server_binary_and_symbol(build_dir, configs): run_subprocess(['objcopy', '--only-keep-debug', 'onnxruntime_server', 'onnxruntime_server.symbol'], cwd=config_build_dir) run_subprocess(['strip', '--strip-debug', '--strip-unneeded', 'onnxruntime_server'], cwd=config_build_dir) run_subprocess(['objcopy', '--add-gnu-debuglink=onnxruntime_server.symbol', 'onnxruntime_server'], cwd=config_build_dir) + libonnx = glob.glob(os.path.join(config_build_dir, "libonnxruntime.so.*")) + if len(libonnx) != 1 : + raise ValueError("Too many libonxruntime.so.*") + libonnx = libonnx[0] + run_subprocess(['objcopy', '--only-keep-debug', libonnx, libonnx+'.symbol'], cwd=config_build_dir) + run_subprocess(['strip', '--strip-debug', libonnx], cwd=config_build_dir) + run_subprocess(['objcopy', '--add-gnu-debuglink={}.symbol'.format(libonnx), libonnx], cwd=config_build_dir) def run_server_tests(build_dir, configs): pip_freeze_result = run_subprocess([sys.executable, '-m', 'pip', 'freeze'], capture=True).stdout installed_packages = [r.decode().split('==')[0] for r in pip_freeze_result.split()] - if not (('requests' in installed_packages) and ('protobuf' in installed_packages) and ('numpy' in installed_packages)): + if not (('requests' in installed_packages) and ('protobuf' in installed_packages) and ('numpy' in installed_packages) and ('grpcio' in installed_packages)): if hasattr(sys, 'real_prefix'): # In virtualenv - run_subprocess([sys.executable, '-m', 'pip', 'install', '--trusted-host', 'files.pythonhosted.org', 'requests', 'protobuf', 'numpy']) + run_subprocess([sys.executable, '-m', 'pip', 
'install', '--trusted-host', 'files.pythonhosted.org', 'requests', 'protobuf', 'numpy', 'grpcio']) else: # Outside virtualenv - run_subprocess([sys.executable, '-m', 'pip', 'install', '--user', '--trusted-host', 'files.pythonhosted.org', 'requests', 'protobuf', 'numpy']) - + run_subprocess([sys.executable, '-m', 'pip', 'install', '--user', '--trusted-host', 'files.pythonhosted.org', 'requests', 'protobuf', 'numpy', 'grpcio']) for config in configs: config_build_dir = get_config_build_dir(build_dir, config) if is_windows(): @@ -683,51 +728,39 @@ def run_server_model_tests(build_dir, configs): def build_python_wheel(source_dir, build_dir, configs, use_cuda, use_ngraph, use_tensorrt, use_openvino, nightly_build = False): for config in configs: cwd = get_config_build_dir(build_dir, config) - if is_windows(): cwd = os.path.join(cwd, config) + args = [sys.executable, os.path.join(source_dir, 'setup.py'), 'bdist_wheel'] if nightly_build: - if use_tensorrt: - run_subprocess([sys.executable, os.path.join(source_dir, 'setup.py'), 'bdist_wheel', '--use_tensorrt', '--nightly_build'], cwd=cwd) - elif use_cuda: - run_subprocess([sys.executable, os.path.join(source_dir, 'setup.py'), 'bdist_wheel', '--use_cuda', '--nightly_build'], cwd=cwd) - elif use_ngraph: - run_subprocess([sys.executable, os.path.join(source_dir, 'setup.py'), 'bdist_wheel', '--use_ngraph', '--nightly-build'], cwd=cwd) - elif use_openvino: - run_subprocess([sys.executable, os.path.join(source_dir, 'setup.py'), 'bdist_wheel', '--use_openvino', '--nightly-build'], cwd=cwd) - else: - run_subprocess([sys.executable, os.path.join(source_dir, 'setup.py'), 'bdist_wheel', '--nightly_build'], cwd=cwd) - else: - if use_tensorrt: - run_subprocess([sys.executable, os.path.join(source_dir, 'setup.py'), 'bdist_wheel', '--use_tensorrt'], cwd=cwd) - elif use_cuda: - run_subprocess([sys.executable, os.path.join(source_dir, 'setup.py'), 'bdist_wheel', '--use_cuda'], cwd=cwd) - elif use_ngraph: - run_subprocess([sys.executable, os.path.join(source_dir, 'setup.py'), 'bdist_wheel', '--use_ngraph'], cwd=cwd) - elif use_openvino: - run_subprocess([sys.executable, os.path.join(source_dir, 'setup.py'), 'bdist_wheel', '--use_openvino'], cwd=cwd) - else: - run_subprocess([sys.executable, os.path.join(source_dir, 'setup.py'), 'bdist_wheel'], cwd=cwd) - if is_ubuntu_1604(): - run_subprocess([os.path.join(source_dir, 'rename_manylinux.sh')], cwd=cwd+'/dist') - -def build_protoc_for_windows_host(cmake_path, source_dir, build_dir): - if not is_windows(): + args.append('--nightly_build') + if use_tensorrt: + args.append('--use_tensorrt') + elif use_cuda: + args.append('--use_cuda') + elif use_ngraph: + args.append('--use_ngraph') + elif use_openvino: + args.append('--use_openvino') + run_subprocess(args, cwd=cwd) + +def build_protoc_for_host(cmake_path, source_dir, build_dir, args): + if (args.arm or args.arm64) and not is_windows(): raise BuildError('Currently only support building protoc for Windows host while cross-compiling for ARM/ARM64 arch') log.info("Building protoc for host to be used in cross-compiled build process") - protoc_build_dir = os.path.join(build_dir, 'host_protoc') + protoc_build_dir = os.path.join(os.getcwd(), build_dir, 'host_protoc') os.makedirs(protoc_build_dir, exist_ok=True) # Generate step cmd_args = [cmake_path, - os.path.join(source_dir, 'cmake\external\protobuf\cmake'), - '-T', - 'host=x64', - '-G', - 'Visual Studio 15 2017', + os.path.join(source_dir, 'cmake', 'external', 'protobuf', 'cmake'), '-Dprotobuf_BUILD_TESTS=OFF', 
'-Dprotobuf_WITH_ZLIB_DEFAULT=OFF', '-Dprotobuf_BUILD_SHARED_LIBS=OFF'] + if is_windows(): + cmd_args += ['-T', + 'host=x64', + '-G', + args.cmake_generator] run_subprocess(cmd_args, cwd= protoc_build_dir) # Build step cmd_args = [cmake_path, @@ -736,8 +769,12 @@ def build_protoc_for_windows_host(cmake_path, source_dir, build_dir): "--target", "protoc"] run_subprocess(cmd_args) - if not os.path.exists(os.path.join(build_dir, 'host_protoc', 'Release', 'protoc.exe')): - raise BuildError("Couldn't build protoc.exe for host. Failing build.") + # Absolute protoc path is needed for cmake + expected_protoc_path = os.path.join(protoc_build_dir, 'Release', 'protoc.exe') if is_windows() else os.path.join(protoc_build_dir, 'protoc') + if not os.path.exists(expected_protoc_path): + raise BuildError("Couldn't build protoc for host. Failing build.") + + return expected_protoc_path def generate_documentation(source_dir, build_dir, configs): operator_doc_path = os.path.join(source_dir, 'docs', 'ContribOperators.md') @@ -767,13 +804,15 @@ def main(): cmake_extra_defines = args.cmake_extra_defines if args.cmake_extra_defines else [] + cross_compiling = args.arm or args.arm64 or args.android + # if there was no explicit argument saying what to do, default to update, build and test (for native builds). if (args.update == False and args.clean == False and args.build == False and args.test == False): log.debug("Defaulting to running update, build [and test for native builds].") args.update = True args.build = True - if args.arm or args.arm64: - args.test = False + if cross_compiling: + args.test = args.android_abi == 'x86_64' else: args.test = True @@ -806,18 +845,19 @@ def main(): log.info("Build started") if (args.update): cmake_extra_args = [] + path_to_protoc_exe = None if(is_windows()): if (args.x86): - cmake_extra_args = ['-A','Win32','-T','host=x64','-G', 'Visual Studio 15 2017'] + cmake_extra_args = ['-A','Win32','-T','host=x64','-G', args.cmake_generator] elif (args.arm or args.arm64): # Cross-compiling for ARM(64) architecture # First build protoc for host to use during cross-compilation - build_protoc_for_windows_host(cmake_path, source_dir, build_dir) + path_to_protoc_exe = build_protoc_for_host(cmake_path, source_dir, build_dir, args) if args.arm: cmake_extra_args = ['-A', 'ARM'] else: cmake_extra_args = ['-A', 'ARM64'] - cmake_extra_args += ['-G', 'Visual Studio 15 2017'] + cmake_extra_args += ['-G', args.cmake_generator] # Cannot test on host build machine for cross-compiled builds (Override any user-defined behaviour for test if any) if args.test: log.info("Cannot test on host build machine for cross-compiled ARM(64) builds. 
Will skip test running after build.") @@ -829,7 +869,10 @@ def main(): if (args.cuda_version): toolset += ',cuda=' + args.cuda_version - cmake_extra_args = ['-A','x64','-T', toolset, '-G', 'Visual Studio 15 2017'] + cmake_extra_args = ['-A','x64','-T', toolset, '-G', args.cmake_generator] + if args.android: + # Cross-compiling for Android + path_to_protoc_exe = build_protoc_for_host(cmake_path, source_dir, build_dir, args) if is_ubuntu_1604(): if (args.arm or args.arm64): raise BuildError("Only Windows ARM(64) cross-compiled builds supported currently through this script") @@ -847,12 +890,8 @@ def main(): raise UsageError("The test_data_url and test_data_checksum arguments are required.") setup_test_data(build_dir, configs, args.test_data_url, args.test_data_checksum, args.azure_sas_key) - path_to_protoc_exe = None if args.path_to_protoc_exe: path_to_protoc_exe = args.path_to_protoc_exe - # Need to provide path to protoc.exe built for host to be used in the cross-compiled build process - elif args.arm or args.arm64: - path_to_protoc_exe = os.path.join(build_dir, 'host_protoc', 'Release', 'protoc.exe') generate_build_tree(cmake_path, source_dir, build_dir, cuda_home, cudnn_home, tensorrt_home, path_to_protoc_exe, configs, cmake_extra_defines, args, cmake_extra_args) diff --git a/tools/ci_build/github/android/adb-push.sh b/tools/ci_build/github/android/adb-push.sh new file mode 100755 index 0000000000000..e017dd746e7cc --- /dev/null +++ b/tools/ci_build/github/android/adb-push.sh @@ -0,0 +1,17 @@ +#! /usr/bin/env bash +# Created by daquexian +# A bash script that push folders recursively, "adb push" doesn't work on some devices + +if [[ $# -ne 2 ]]; then + echo "Usage: $0 src dest" + exit 1 +fi + +src=`realpath $1` +src_basename=`basename $1` +pushd `dirname $src` +if [[ -d $src ]]; then + find $src_basename -type d -print -exec adb shell mkdir -p $2/{} \; > /dev/null +fi +find $src_basename -type f -exec adb push {} $2/{} \; > /dev/null +popd diff --git a/tools/ci_build/github/android/start_android_emulator.sh b/tools/ci_build/github/android/start_android_emulator.sh new file mode 100755 index 0000000000000..31e68ddf30ca6 --- /dev/null +++ b/tools/ci_build/github/android/start_android_emulator.sh @@ -0,0 +1,23 @@ +#! 
/usr/bin/env bash +# Created by daquexian + +set -e + +echo "y" | $ANDROID_HOME/tools/bin/sdkmanager --install 'system-images;android-28;google_apis;x86_64' + +echo "no" | $ANDROID_HOME/tools/bin/avdmanager create avd -n android_emulator -k 'system-images;android-28;google_apis;x86_64' --force + +echo "Starting emulator" + +# Start emulator in background +nohup $ANDROID_HOME/emulator/emulator -avd android_emulator -no-snapshot -no-audio & + +# start server in advance, so that the result of watch will only change when device gets online +$ANDROID_HOME/platform-tools/adb start-server + +echo "Waiting for device to come online" +# Sometimes wait-for-device hangs, so add a timeout here +timeout 180 adb wait-for-device shell 'while [[ -z $(getprop sys.boot_completed) ]]; do sleep 1; done; input keyevent 82' + +echo "Emulator is online" + diff --git a/tools/ci_build/github/azure-pipelines/android-arm64-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/android-arm64-crosscompile-ci-pipeline.yml deleted file mode 100644 index cfd453fa31aad..0000000000000 --- a/tools/ci_build/github/azure-pipelines/android-arm64-crosscompile-ci-pipeline.yml +++ /dev/null @@ -1,15 +0,0 @@ -jobs: -- job: Linux_CI_Dev - pool: Linux-CPU - steps: - - template: templates/set-test-data-variables-step.yml - - - script: 'tools/ci_build/github/linux/run_dockerbuild.sh -o android -r $(Build.BinariesDirectory) -d cpu -x "--build_wheel"' - displayName: 'Command Line Script' - - - task: ms.vss-governance-buildtask.governance-build-task-component-detection.ComponentGovernanceComponentDetection@0 - displayName: 'Component Detection' - condition: and(succeeded(), in(variables['Build.Reason'], 'IndividualCI', 'BatchedCI')) - - - template: templates/clean-agent-build-directory-step.yml - diff --git a/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml new file mode 100644 index 0000000000000..60761c61637d4 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml @@ -0,0 +1,15 @@ +jobs: +- job: Android_CI + pool: + vmImage: 'macOS-10.14' + steps: + # cmake 3.15 breaks Android NDK https://gitlab.kitware.com/cmake/cmake/issues/19515. 
pip install cmake==3.13.2 installs 3.12.2 + - script: pip install cmake==3.13.2.post1 && alias cmake=/usr/local/bin/cmake && cmake --version && brew install coreutils && alias time=gtimeout + displayName: Install cmake 3.14 and coreutils + - script: 'git submodule update --init --recursive --progress' + displayName: Clone submodules + - script: echo "y" | $ANDROID_HOME/tools/bin/sdkmanager --install 'ndk-bundle' + displayName: Install Android NDK + - script: tools/ci_build/build.py --android --build_dir build --android_ndk $ANDROID_HOME/ndk-bundle --android_abi=x86_64 --skip_submodule_sync --parallel --use_dnnlibrary + displayName: Build and Test on Android Emulator + diff --git a/tools/ci_build/github/azure-pipelines/azure-pipelines-py-packaging.yml b/tools/ci_build/github/azure-pipelines/azure-pipelines-py-packaging.yml index de22b1ca27798..8721078422c21 100644 --- a/tools/ci_build/github/azure-pipelines/azure-pipelines-py-packaging.yml +++ b/tools/ci_build/github/azure-pipelines/azure-pipelines-py-packaging.yml @@ -1,5 +1,5 @@ jobs: -- job: Ubuntu1604_py_Wheels +- job: Manylinux2010_py_Wheels pool: Linux-CPU strategy: matrix: @@ -24,7 +24,7 @@ jobs: displayName: 'Download test data' inputs: scriptPath: '$(Build.SourcesDirectory)/tools/ci_build/github/download_test_data.py' - arguments: --test_data_url $(TestDataUrl) + arguments: --test_data_url $(TestDataUrl) --build_dir $(Build.BinariesDirectory) pythonInterpreter: '/usr/bin/python3' workingDirectory: $(Build.BinariesDirectory) @@ -32,13 +32,13 @@ jobs: displayName: 'Run build script' inputs: scriptPath: 'tools/ci_build/github/linux/run_dockerbuild.sh' - args: '-c Release -o ubuntu16.04 -d cpu -r $(Build.BinariesDirectory) -p $(python.version) -x "--use_openmp --build_wheel"' + args: '-c Release -o manylinux2010 -d cpu -r $(Build.BinariesDirectory) -p $(python.version) -x "--use_openmp --build_wheel"' - task: CopyFiles@2 displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' inputs: SourceFolder: '$(Build.BinariesDirectory)' - Contents: 'Release/dist/*-manylinux1_x86_64.whl' + Contents: 'Release/dist/*-manylinux2010_x86_64.whl' TargetFolder: '$(Build.ArtifactStagingDirectory)' - task: PublishBuildArtifacts@1 @@ -51,7 +51,7 @@ jobs: - template: templates/clean-agent-build-directory-step.yml -- job: Ubuntu1604_py_GPU_Wheels +- job: Manylinux2010_py_GPU_Wheels pool: Linux-GPU strategy: matrix: @@ -86,7 +86,7 @@ jobs: displayName: 'Download test data' inputs: scriptPath: '$(Build.SourcesDirectory)/tools/ci_build/github/download_test_data.py' - arguments: --test_data_url $(TestDataUrl) + arguments: --test_data_url $(TestDataUrl) --build_dir $(Build.BinariesDirectory) pythonInterpreter: '/usr/bin/python3' workingDirectory: $(Build.BinariesDirectory) @@ -94,13 +94,13 @@ jobs: displayName: 'Run build script' inputs: scriptPath: 'tools/ci_build/github/linux/run_dockerbuild.sh' - args: '-c Release -o ubuntu16.04 -d gpu -c cuda9.1-cudnn7.1 -r $(Build.BinariesDirectory) -p $(python.version) -x "--use_openmp --build_wheel"' + args: '-c Release -o manylinux2010 -d gpu -c cuda10.1 -r $(Build.BinariesDirectory) -p $(python.version) -x "--use_openmp --build_wheel"' - task: CopyFiles@2 displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' inputs: SourceFolder: '$(Build.BinariesDirectory)' - Contents: 'Release/dist/*-manylinux1_x86_64.whl' + Contents: 'Release/dist/*-manylinux2010_x86_64.whl' TargetFolder: '$(Build.ArtifactStagingDirectory)' - task: PublishBuildArtifacts@1 @@ -161,7 +161,7 @@ jobs: pool: Win-GPU 
variables: buildDirectory: '$(Build.SourcesDirectory)\build' - CUDA_VERSION: '9.1' + CUDA_VERSION: '10.0' strategy: matrix: Python35: @@ -178,12 +178,6 @@ jobs: packageSpecs: 'python=$(python.version)' cleanEnvironment: true - - task: PowerShell@1 - displayName: 'Set CUDA path' - inputs: - scriptName: 'tools/ci_build/github/windows/set_cuda_path.ps1' - arguments: '-CudaMsbuildPath C:\local\cudaMsbuildIntegration-9.1.85-windows10-x64-0 -CudaVersion $(CUDA_VERSION)' - - task: BatchScript@1 displayName: 'Setup VS2017 env vars' inputs: @@ -195,8 +189,8 @@ jobs: displayName: 'Run build script' inputs: filename: 'build.bat' - arguments: ' --use_cuda --cuda_home="C:\local\cuda-9.1.85-windows10-x64-0" - --cudnn_home="C:\local\cudnn-9.1-windows10-x64-v7.1\cuda" --build_dir $(buildDirectory) --config Release --use_openmp --build_wheel' + arguments: ' --use_cuda --cuda_home="C:\local\cuda_10.0.130_win10" + --cudnn_home="C:\local\cudnn-10.0-windows10-x64-v7.3.1.20\cuda" --build_dir $(buildDirectory) --config Release --use_openmp --build_wheel' workingFolder: "$(Build.SourcesDirectory)" - task: CopyFiles@2 @@ -215,7 +209,7 @@ jobs: displayName: 'Component Detection' - template: templates/clean-agent-build-directory-step.yml - + - job: MacOS_py_Wheels pool: vmImage: 'macOS-10.13' @@ -238,14 +232,14 @@ jobs: - script: | sudo python -m pip install numpy==1.15.0 sudo xcode-select --switch /Applications/Xcode_10.app/Contents/Developer - ./build.sh --config Release --skip_submodule_sync --parallel --use_openmp --build_wheel - displayName: 'Command Line Script' - + ./build.sh --config Release --skip_submodule_sync --parallel --use_openmp --build_wheel + displayName: 'Command Line Script' + - task: CopyFiles@2 displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' inputs: - SourceFolder: '$(Build.SourcesDirectory)' - Contents: '**/dist/*.whl' + SourceFolder: '$(Build.SourcesDirectory)/build/Linux/Release/dist' + Contents: '*.whl' TargetFolder: '$(Build.ArtifactStagingDirectory)' - task: PublishBuildArtifacts@1 @@ -256,4 +250,4 @@ jobs: - task: ms.vss-governance-buildtask.governance-build-task-component-detection.ComponentGovernanceComponentDetection@0 displayName: 'Component Detection' - - template: templates/clean-agent-build-directory-step.yml + - template: templates/clean-agent-build-directory-step.yml \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/c-api-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-packaging-pipelines.yml index 22296ee997273..d8222370b9433 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-packaging-pipelines.yml @@ -3,10 +3,10 @@ jobs: - job: Linux_C_API_Packaging_CPU_x64 pool: Linux-CPU steps: - - template: templates/set-test-data-variables-step.yml + - template: templates/linux-set-variables-and-download.yml - template: templates/set-version-number-variables-step.yml - - script: 'tools/ci_build/github/linux/run_dockerbuild.sh -o ubuntu16.04 -d cpu -r $(Build.BinariesDirectory) -x "--test_data_url $(TestDataUrl) --test_data_checksum $(TestDataChecksum)"' + - script: 'tools/ci_build/github/linux/run_dockerbuild.sh -o ubuntu16.04 -d cpu -r $(Build.BinariesDirectory)' displayName: 'Build and Test Linux on Docker' - template: templates/c-api-artifacts-package-and-publish-steps-posix.yml parameters: @@ -19,10 +19,10 @@ jobs: - job: Linux_C_API_Packaging_CPU_x86 pool: Linux-CPU steps: - - template: templates/set-test-data-variables-step.yml + - 
template: templates/linux-set-variables-and-download.yml - template: templates/set-version-number-variables-step.yml - - script: 'tools/ci_build/github/linux/run_dockerbuild.sh -o ubuntu16.04 -d cpu -r $(Build.BinariesDirectory) -a x86 -x "--x86 --test_data_url $(TestDataUrl) --test_data_checksum $(TestDataChecksum)"' + - script: 'tools/ci_build/github/linux/run_dockerbuild.sh -o ubuntu16.04 -d cpu -r $(Build.BinariesDirectory) -a x86 -x "--x86"' displayName: 'Build and Test Linux on Docker' - template: templates/c-api-artifacts-package-and-publish-steps-posix.yml parameters: @@ -35,10 +35,10 @@ jobs: - job: Linux_C_API_Packaging_GPU_x64 pool: Linux-GPU steps: - - template: templates/set-test-data-variables-step.yml + - template: templates/linux-set-variables-and-download.yml - template: templates/set-version-number-variables-step.yml - - script: 'tools/ci_build/github/linux/run_dockerbuild.sh -o ubuntu16.04 -d gpu -c cuda9.1-cudnn7.1 -r $(Build.BinariesDirectory) -x "--test_data_url $(TestDataUrl) --test_data_checksum $(TestDataChecksum)"' + - script: 'tools/ci_build/github/linux/run_dockerbuild.sh -o ubuntu16.04 -d gpu -r $(Build.BinariesDirectory)' displayName: 'Build and Test Linux on Docker' - template: templates/c-api-artifacts-package-and-publish-steps-posix.yml parameters: @@ -54,12 +54,12 @@ jobs: vmImage: 'macOS-10.13' steps: - - template: templates/set-test-data-variables-step.yml + - template: templates/mac-set-variables-and-download.yml - template: templates/set-version-number-variables-step.yml - script: | sudo xcode-select --switch /Applications/Xcode_10.app/Contents/Developer - python3 $(Build.SourcesDirectory)/tools/ci_build/build.py --use_openmp --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --parallel --build_shared_lib --config RelWithDebInfo --enable_onnx_tests --test_data_url $(TestDataUrl) --test_data_checksum $(TestDataChecksum) + python3 $(Build.SourcesDirectory)/tools/ci_build/build.py --use_openmp --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --parallel --build_shared_lib --config RelWithDebInfo --enable_onnx_tests displayName: 'Build and Test MacOS' - template: templates/c-api-artifacts-package-and-publish-steps-posix.yml parameters: @@ -146,21 +146,6 @@ jobs: - template: templates/set-test-data-variables-step.yml - template: templates/set-version-number-variables-step.yml - - task: CmdLine@2 - displayName: 'Set CUDA 9.1 path' - inputs: - script: | - set PATH=C:\local\cuda-9.1.85-windows10-x64-0\bin;C:\local\cudnn-9.1-windows10-x64-v7.1\cuda\bin;%PATH% - modifyEnvironment: true - workingDirectory: '$(Build.BinariesDirectory)' - - - task: PowerShell@2 - displayName: 'Set CUDA 9.1 MSBuild properties' - inputs: - targetType: 'filePath' - filePath: '$(Build.SourcesDirectory)/tools/ci_build/github/windows/set_cuda_path.ps1' - arguments: '-CudaMsbuildPath C:\local\cudaMsbuildIntegration-9.1.85-windows10-x64-0 -CudaVersion 9.1' - - template: templates/windows-build-tools-setup-steps.yml parameters: EnvSetupScript: 'setup_env.bat' @@ -171,7 +156,7 @@ jobs: displayName: 'Build and Test OnnxRuntime' inputs: script: | - $(Build.BinariesDirectory)\packages\python\python.exe $(Build.SourcesDirectory)\tools\ci_build\build.py --config $(buildConfig) --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --enable_onnx_tests --test_data_url $(TestDataUrl) --test_data_checksum $(TestDataChecksum) --use_openmp 
--msvc_toolset=14.11 --use_cuda --cuda_version 9.1 --cuda_home="C:\local\cuda-9.1.85-windows10-x64-0" --cudnn_home="C:\local\cudnn-9.1-windows10-x64-v7.1\cuda" + $(Build.BinariesDirectory)\packages\python\python.exe $(Build.SourcesDirectory)\tools\ci_build\build.py --config $(buildConfig) --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --enable_onnx_tests --use_openmp --msvc_toolset=14.11 --use_cuda --cuda_home="C:\local\cuda_10.0.130_win10" --cudnn_home="C:\local\cudnn-10.0-windows10-x64-v7.3.1.20\cuda" workingDirectory: '$(Build.BinariesDirectory)' - template: templates/c-api-artifacts-package-and-publish-steps-windows.yml @@ -180,11 +165,4 @@ jobs: artifactName: 'onnxruntime-win-$(buildArch)-gpu-$(OnnxRuntimeVersion)' commitId: $(OnnxRuntimeGitCommitHash) - - task: PowerShell@2 - displayName: 'Clean up Cuda Path 9.1' - inputs: - targetType: 'filePath' - filePath: '$(Build.SourcesDirectory)/tools/ci_build/github/windows/clean_up_cuda_prop_files.ps1' - arguments: '-CudaVersion 9.1' - - template: templates/clean-agent-build-directory-step.yml diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline-cuda9.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline-cuda9.yml index aa262490ee54f..1f6de3aef300d 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline-cuda9.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline-cuda9.yml @@ -16,7 +16,7 @@ jobs: displayName: 'Download test data' inputs: scriptPath: '$(Build.SourcesDirectory)/tools/ci_build/github/download_test_data.py' - arguments: --test_data_url $(TestDataUrl) + arguments: --test_data_url $(TestDataUrl) --build_dir $(Build.BinariesDirectory) pythonInterpreter: '/usr/bin/python3' workingDirectory: $(Build.BinariesDirectory) diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml index 90f597a28a72a..8b5fe3a3cd9eb 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml @@ -3,9 +3,24 @@ jobs: pool: Linux-GPU-CUDA10 steps: - template: templates/set-test-data-variables-step.yml + - task: CmdLine@2 + displayName: 'Download azcopy' + inputs: + script: | + curl -so azcopy.tar.gz -L 'https://aka.ms/downloadazcopy-v10-linux' + tar -zxvf azcopy.tar.gz --strip 1 + workingDirectory: $(Build.BinariesDirectory) - # There are some tests in 20190130.zip that TensorRT can't run. Instead here use 20181210 opset8 for TensorRT test. - - script: 'tools/ci_build/github/linux/run_dockerbuild.sh -o ubuntu16.04 -d tensorrt -r $(Build.BinariesDirectory) -x "--build_wheel --test_data_url https://onnxruntimetestdata.blob.core.windows.net/models/20181210.zip --test_data_checksum a966def7447f4ff04f5665bca235b3f3"' + - task: PythonScript@0 + displayName: 'Download test data' + inputs: + scriptPath: '$(Build.SourcesDirectory)/tools/ci_build/github/download_test_data.py' + # There are some tests in 20190130.zip that TensorRT can't run. Instead here use 20181210 opset8 for TensorRT test. 
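+      # Test data is downloaded ahead of time here, so the run_dockerbuild.sh step below no longer needs --test_data_url or --test_data_checksum.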
+ arguments: --test_data_url https://onnxruntimetestdata.blob.core.windows.net/models/20181210.zip --build_dir $(Build.BinariesDirectory) + pythonInterpreter: '/usr/bin/python3' + workingDirectory: $(Build.BinariesDirectory) + + - script: 'tools/ci_build/github/linux/run_dockerbuild.sh -o ubuntu16.04 -d tensorrt -r $(Build.BinariesDirectory) -x "--build_wheel"' displayName: 'Command Line Script' diff --git a/tools/ci_build/github/azure-pipelines/linux-ngraph-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-ngraph-ci-pipeline.yml index fe0769044e4d5..131c4cc06e83f 100644 --- a/tools/ci_build/github/azure-pipelines/linux-ngraph-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-ngraph-ci-pipeline.yml @@ -26,7 +26,8 @@ jobs: displayName: 'Download test data' inputs: scriptPath: '$(Build.SourcesDirectory)/tools/ci_build/github/download_test_data.py' - arguments: --test_data_url $(TestDataUrl) + # nGraph provider fails on the latest 20190729.zip test. revert back to previous zip file until failures can be investigated + arguments: --test_data_url https://onnxruntimetestdata.blob.core.windows.net/models/20190419.zip --build_dir $(Build.BinariesDirectory) pythonInterpreter: '/usr/bin/python3' workingDirectory: $(Build.BinariesDirectory) diff --git a/tools/ci_build/github/azure-pipelines/linux-nocontribops-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-nocontribops-ci-pipeline.yml index 40e05acbd0d66..d518ec3285071 100644 --- a/tools/ci_build/github/azure-pipelines/linux-nocontribops-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-nocontribops-ci-pipeline.yml @@ -31,7 +31,7 @@ jobs: displayName: 'Download test data' inputs: scriptPath: '$(Build.SourcesDirectory)/tools/ci_build/github/download_test_data.py' - arguments: --test_data_url $(TestDataUrlNoContribOps) + arguments: --test_data_url $(TestDataUrlNoContribOps) --build_dir $(Build.BinariesDirectory) pythonInterpreter: '/usr/bin/python3' workingDirectory: $(Build.BinariesDirectory) diff --git a/tools/ci_build/github/azure-pipelines/linux-openvino-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-openvino-ci-pipeline.yml index 94f8550542ac2..07ba00ff3bd71 100644 --- a/tools/ci_build/github/azure-pipelines/linux-openvino-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-openvino-ci-pipeline.yml @@ -26,7 +26,7 @@ jobs: displayName: 'Download test data' inputs: scriptPath: '$(Build.SourcesDirectory)/tools/ci_build/github/download_test_data.py' - arguments: --test_data_url $(TestDataUrl) + arguments: --test_data_url $(TestDataUrl) --build_dir $(Build.BinariesDirectory) pythonInterpreter: '/usr/bin/python3' workingDirectory: $(Build.BinariesDirectory) diff --git a/tools/ci_build/github/azure-pipelines/linux-ort-srv-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-ort-srv-ci-pipeline.yml index a98484bc7bef8..2786df3c4e905 100644 --- a/tools/ci_build/github/azure-pipelines/linux-ort-srv-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-ort-srv-ci-pipeline.yml @@ -26,11 +26,11 @@ jobs: displayName: 'Download test data' inputs: scriptPath: '$(Build.SourcesDirectory)/tools/ci_build/github/download_test_data.py' - arguments: --test_data_url $(TestDataUrl) + arguments: --test_data_url $(TestDataUrl) --build_dir $(Build.BinariesDirectory) pythonInterpreter: '/usr/bin/python3' workingDirectory: $(Build.BinariesDirectory) - - script: 'tools/ci_build/github/linux/run_dockerbuild.sh -o ubuntu16.04 -d cpu -r $(Build.BinariesDirectory) -x 
"--config Debug Release --build_server --use_openmp --use_full_protobuf --enable_server_tests"' + - script: 'tools/ci_build/github/linux/server_run_dockerbuild.sh -o ubuntu16.04 -d cpu -r $(Build.BinariesDirectory) -x "--config Debug Release --build_server --use_openmp --use_full_protobuf --enable_server_tests"' displayName: 'Run build script' - task: ms.vss-governance-buildtask.governance-build-task-component-detection.ComponentGovernanceComponentDetection@0 diff --git a/tools/ci_build/github/azure-pipelines/linux-ort-srv-nightly-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-ort-srv-nightly-pipeline.yml index 50b84d48c6616..0000e6c7097cf 100644 --- a/tools/ci_build/github/azure-pipelines/linux-ort-srv-nightly-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-ort-srv-nightly-pipeline.yml @@ -26,11 +26,11 @@ jobs: displayName: 'Download test data' inputs: scriptPath: '$(Build.SourcesDirectory)/tools/ci_build/github/download_test_data.py' - arguments: --test_data_url $(TestDataUrl) + arguments: --test_data_url $(TestDataUrl) --build_dir $(Build.BinariesDirectory) pythonInterpreter: '/usr/bin/python3' workingDirectory: $(Build.BinariesDirectory) - - script: 'tools/ci_build/github/linux/run_dockerbuild.sh -o ubuntu16.04 -d cpu -r $(Build.BinariesDirectory) -x "--config RelWithDebInfo --build_server --use_openmp --use_full_protobuf --enable_server_model_tests --cmake_extra_defines onnxruntime_SERVER_VERSION=$(cat ./VERSION_NUMBER)-$(Build.BuildNumber) onnxruntime_LATEST_COMMIT_ID=$(Build.SourceVersion) onnxruntime_USE_SYSLOG=1"' + - script: 'tools/ci_build/github/linux/server_run_dockerbuild.sh -o ubuntu16.04 -d cpu -r $(Build.BinariesDirectory) -x "--config RelWithDebInfo --build_server --use_openmp --use_full_protobuf --enable_server_model_tests --cmake_extra_defines onnxruntime_SERVER_VERSION=$(cat ./VERSION_NUMBER)-$(Build.BuildNumber) onnxruntime_LATEST_COMMIT_ID=$(Build.SourceVersion) onnxruntime_USE_SYSLOG=1"' displayName: 'Run build script with model tests' - script: 'tools/ci_build/github/linux/upload_ortsrv_binaries.sh -a $(Build.BinariesDirectory) -r $(Build.BinariesDirectory)/RelWithDebInfo -i $(Build.BuildNumber) -c $(Build.SourceVersion) -b "$(blob.binary_upload_url)" -p "--config RelWithDebInfo --build_server --use_openmp --use_full_protobuf --enable_server_model_tests --cmake_extra_defines onnxruntime_SERVER_VERSION=$(cat ./VERSION_NUMBER)-$(Build.BuildNumber) onnxruntime_LATEST_COMMIT_ID=$(Build.SourceVersion)"' diff --git a/tools/ci_build/github/azure-pipelines/linux-x86-nocontribops-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-x86-nocontribops-ci-pipeline.yml index ef570a3cb6872..71b5d5fc94117 100644 --- a/tools/ci_build/github/azure-pipelines/linux-x86-nocontribops-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-x86-nocontribops-ci-pipeline.yml @@ -32,7 +32,7 @@ jobs: displayName: 'Download test data' inputs: scriptPath: '$(Build.SourcesDirectory)/tools/ci_build/github/download_test_data.py' - arguments: --test_data_url $(TestDataUrlNoContribOps) + arguments: --test_data_url $(TestDataUrlNoContribOps) --build_dir $(Build.BinariesDirectory) pythonInterpreter: '/usr/bin/python3' workingDirectory: $(Build.BinariesDirectory) diff --git a/tools/ci_build/github/azure-pipelines/mac-nocontribops-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/mac-nocontribops-ci-pipeline.yml index 74a004d89fe21..39aca7ec08596 100644 --- a/tools/ci_build/github/azure-pipelines/mac-nocontribops-ci-pipeline.yml +++ 
b/tools/ci_build/github/azure-pipelines/mac-nocontribops-ci-pipeline.yml @@ -19,7 +19,7 @@ jobs: displayName: 'Download test data' inputs: scriptPath: '$(Build.SourcesDirectory)/tools/ci_build/github/download_test_data.py' - arguments: --test_data_url $(TestDataUrlNoContribOps) --azure_region centralus + arguments: --test_data_url $(TestDataUrlNoContribOps) --azure_region centralus --build_dir $(Build.BinariesDirectory) pythonInterpreter: '/usr/local/bin/python3' workingDirectory: $(Build.BinariesDirectory) - script: | diff --git a/tools/ci_build/github/azure-pipelines/nuget/cpu-nocontribops-arm64-esrp-pipeline.yml b/tools/ci_build/github/azure-pipelines/nuget/cpu-nocontribops-arm64-esrp-pipeline.yml new file mode 100644 index 0000000000000..56c9a86167b74 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/nuget/cpu-nocontribops-arm64-esrp-pipeline.yml @@ -0,0 +1,22 @@ +# Defined as pipeline variables +# variables: +# AgentPoolWin : 'Win-CPU' +# AgentPoolLinux : 'Linux-CPU' +# AgentPoolMacOS : 'macOS-10.13' + +schedules: +- cron: "0 14 * * *" + displayName: Daily Build + branches: + include: + - master + always: true + +variables: + DisableContribOps: 'ON' + +jobs: +- template: templates/cpu-nocontribops-arm64.yml + parameters: + AgentPool : $(AgentPoolWin) + DoEsrp: 'true' diff --git a/tools/ci_build/github/azure-pipelines/nuget/cpu-nocontribops-arm64-pipeline.yml b/tools/ci_build/github/azure-pipelines/nuget/cpu-nocontribops-arm64-pipeline.yml new file mode 100644 index 0000000000000..b09b21ef1ca44 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/nuget/cpu-nocontribops-arm64-pipeline.yml @@ -0,0 +1,14 @@ +# Defined as pipeline variables +# variables: +# AgentPoolWin : 'Win-CPU' +# AgentPoolLinux : 'Linux-CPU' +# AgentPoolMacOS : 'macOS-10.13' + +variables: + DisableContribOps: 'ON' + +jobs: +- template: templates/cpu-nocontribops-arm64.yml + parameters: + AgentPool : $(AgentPoolWin) + DoEsrp: 'false' diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/cpu-mklml.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/cpu-mklml.yml index 1d7c5d1f307f5..700f7b2b4c200 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/cpu-mklml.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/cpu-mklml.yml @@ -12,7 +12,7 @@ jobs: parameters: AgentPool : $(AgentPoolWin) JobName: 'Windows_CI_Dev' - BuildCommand: '$(Build.SourcesDirectory)\tools\ci_build\build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --use_mklml --use_openmp --build_shared_lib --build_csharp --enable_onnx_tests --test_data_url $(TestDataUrl) --test_data_checksum $(TestDataChecksum)' + BuildCommand: '$(Build.SourcesDirectory)\tools\ci_build\build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --use_mklml --build_shared_lib --build_csharp --enable_onnx_tests' DoDebugBuild: 'false' DoNugetPack : 'true' DoCompliance: 'false' @@ -38,14 +38,14 @@ jobs: ldd $(Build.BinariesDirectory)/linux-x64/libonnxruntime.so cd $(Build.BinariesDirectory) zip -r linux-x64.zip linux-x64 - cp $(Build.BinariesDirectory)/*.zip $(Build.ArtifactStagingDirectory) + cp $(Build.BinariesDirectory)/linux*.zip $(Build.ArtifactStagingDirectory) ls -al $(Build.ArtifactStagingDirectory) - template: ../../templates/mac-ci.yml parameters: 
AgentPool : $(AgentPoolMacOS) JobName: 'MacOS_CI_Dev' - BuildCommand: 'python3 $(Build.SourcesDirectory)/tools/ci_build/build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --parallel --build_shared_lib --use_mklml --use_openmp --enable_onnx_tests --config RelWithDebInfo' + BuildCommand: 'python3 $(Build.SourcesDirectory)/tools/ci_build/build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --parallel --build_shared_lib --use_mklml --enable_onnx_tests --config RelWithDebInfo' DoNugetPack : 'true' NuPackScript: | set -e -x @@ -133,7 +133,7 @@ jobs: - template: ../../templates/esrp_nuget.yml parameters: DisplayName: 'ESRP - sign NuGet package' - FolderPath: '$(Build.BinariesDirectory)/nuget-artifact' + FolderPath: '$(Build.ArtifactStagingDirectory)' DoEsrp: ${{ parameters.DoEsrp }} - task: PublishPipelineArtifact@0 diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/cpu-nocontribops-arm64.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/cpu-nocontribops-arm64.yml new file mode 100644 index 0000000000000..8cdcbadece4e8 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/cpu-nocontribops-arm64.yml @@ -0,0 +1,223 @@ +# Defined as pipeline variables +# variables: +# AgentPoolWin : 'Win-CPU' +# AgentPoolLinux : 'Linux-CPU' +# AgentPoolMacOS : 'macOS-10.13' + +parameters: + DoEsrp: 'false' + +jobs: +- template: ../../templates/win-ci.yml + parameters: + AgentPool : $(AgentPoolWin) + JobName: 'Windows_CI_Dev' + BuildCommand: '$(Build.SourcesDirectory)\tools\ci_build\build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --disable_contrib_ops --enable_msvc_static_runtime --build_shared_lib --build_csharp --enable_onnx_tests' + DoDebugBuild: 'false' + DoNugetPack : 'true' + DoCompliance: 'false' + DoEsrp: ${{ parameters.DoEsrp }} + NuPackScript: | + msbuild $(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.proj /p:Configuration=RelWithDebInfo /t:CreatePackage + copy $(Build.SourcesDirectory)\csharp\src\Microsoft.ML.OnnxRuntime\bin\RelWithDebInfo\*.nupkg $(Build.ArtifactStagingDirectory) + + +- template: ../../templates/win-x86-ci.yml + parameters: + AgentPool : $(AgentPoolWin) + JobName: 'Windows_CI_Dev_x86' + BuildCommand: '$(Build.SourcesDirectory)\tools\ci_build\build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --disable_contrib_ops --enable_msvc_static_runtime --build_shared_lib --build_csharp --enable_onnx_tests --x86' + DoDebugBuild: 'false' + DoNugetPack : 'true' + DoCompliance: 'false' + DoEsrp: ${{ parameters.DoEsrp }} + NuPackScript: | + msbuild $(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.proj /p:Configuration=RelWithDebInfo /p:TargetArchitecture=x86 /t:CreatePackage + cd $(Build.SourcesDirectory)\csharp\src\Microsoft.ML.OnnxRuntime\bin\RelWithDebInfo\ + ren *.nupkg win-x86.zip + copy $(Build.SourcesDirectory)\csharp\src\Microsoft.ML.OnnxRuntime\bin\RelWithDebInfo\*zip $(Build.ArtifactStagingDirectory) + +- template: ../../templates/win-ci-arm.yml + parameters: + AgentPool : $(AgentPoolWin) + JobName: 'Windows_Arm64_Dev' + BuildCommand: '$(Build.SourcesDirectory)\tools\ci_build\build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path 
$(Build.BinariesDirectory)\cmake\bin\ctest.exe --disable_contrib_ops --enable_msvc_static_runtime --build_shared_lib --arm64' + DoDebugBuild: 'false' + DoNugetPack : 'true' + DoCompliance: 'false' + MsbuildArguments: '/m /p:platform=arm64' + DoEsrp: ${{ parameters.DoEsrp }} + ArtifactName: 'drop-nuget-arm64' + NuPackScript: | + mkdir $(Build.BinariesDirectory)\arm64\runtimes\win10-arm\native + cd $(Build.BinariesDirectory)\arm64 + copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\onnxruntime.pdb $(Build.BinariesDirectory)\arm64\runtimes\win10-arm\native + copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\onnxruntime.lib $(Build.BinariesDirectory)\arm64\runtimes\win10-arm\native + copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\onnxruntime.dll $(Build.BinariesDirectory)\arm64\runtimes\win10-arm\native + powershell -Command "Invoke-WebRequest http://stahlworks.com/dev/zip.exe -OutFile zip.exe" + zip -r win10-arm.zip runtimes + copy *.zip $(Build.ArtifactStagingDirectory) + +- template: ../../templates/linux-ci.yml + parameters: + AgentPool : $(AgentPoolLinux) + JobName: 'Linux_CI_Dev' + BuildCommand: 'tools/ci_build/github/linux/run_dockerbuild.sh -o ubuntu16.04 -d cpu -r $(Build.BinariesDirectory) -x "--disable_contrib_ops"' + DoNugetPack : 'true' + ArtifactName: 'drop-linux' + NuPackScript: | + set -e -x + mkdir $(Build.BinariesDirectory)/linux-x64 + cp $(Build.BinariesDirectory)/Release/libonnxruntime.so $(Build.BinariesDirectory)/linux-x64 + cd $(Build.BinariesDirectory) + zip -r linux-x64.zip linux-x64 + cp $(Build.BinariesDirectory)/linux*.zip $(Build.ArtifactStagingDirectory) + ls -al $(Build.ArtifactStagingDirectory) + +- template: ../../templates/linux-ci.yml + parameters: + AgentPool : $(AgentPoolLinux) + JobName: 'Linux_CI_Dev_x86' + BuildCommand : 'tools/ci_build/github/linux/run_dockerbuild.sh -o ubuntu16.04 -d cpu -r $(Build.BinariesDirectory) -a x86 -x "--x86 --disable_contrib_ops"' + DoNugetPack : 'true' + ArtifactName: 'drop-linux-x86' + NuPackScript: | + set -e -x + mkdir $(Build.BinariesDirectory)/linux-x86 + cp $(Build.BinariesDirectory)/Release/libonnxruntime.so $(Build.BinariesDirectory)/linux-x86 + cd $(Build.BinariesDirectory) + zip -r linux-x86.zip linux-x86 + cp $(Build.BinariesDirectory)/linux*.zip $(Build.ArtifactStagingDirectory) + ls -al $(Build.ArtifactStagingDirectory) + +- template: ../../templates/mac-ci.yml + parameters: + AgentPool : $(AgentPoolMacOS) + JobName: 'MacOS_CI_Dev' + BuildCommand: 'python3 $(Build.SourcesDirectory)/tools/ci_build/build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --parallel --build_shared_lib --disable_contrib_ops --use_openmp --enable_onnx_tests --config RelWithDebInfo' + DoNugetPack : 'true' + NuPackScript: | + set -e -x + mkdir $(Build.BinariesDirectory)/osx-x64 + find $(Build.BinariesDirectory) + cp $(Build.BinariesDirectory)/RelWithDebInfo/libonnxruntime.dylib $(Build.BinariesDirectory)/osx-x64/ + dsymutil $(Build.BinariesDirectory)/osx-x64/libonnxruntime.dylib -o $(Build.BinariesDirectory)/osx-x64/libonnxruntime.dylib.dSYM + strip -S -x $(Build.BinariesDirectory)/osx-x64/libonnxruntime.dylib + find $(Build.BinariesDirectory)/osx-x64 + cwd=`pwd` + cd $(Build.BinariesDirectory) + zip -r osx-x64.zip osx-x64 + cp $(Build.BinariesDirectory)/osx-x64.zip $(Build.ArtifactStagingDirectory) + cd $cwd + +- job: NuGet_Packaging + pool: $(AgentPoolWin) + dependsOn: + - Windows_CI_Dev + - Windows_CI_Dev_x86 + - Windows_Arm64_Dev + - Linux_CI_Dev + - Linux_CI_Dev_x86 + - 
MacOS_CI_Dev + condition: succeeded() + steps: + - task: DownloadPipelineArtifact@0 + displayName: 'Download Pipeline Artifact - NuGet' + inputs: + artifactName: 'drop-nuget' + targetPath: '$(Build.BinariesDirectory)/nuget-artifact' + continueOnError: true + + - task: DownloadPipelineArtifact@0 + displayName: 'Download Pipeline Artifact - Win-x86' + inputs: + artifactName: 'drop-win-x86-zip' + targetPath: '$(Build.BinariesDirectory)/nuget-artifact' + continueOnError: true + + - task: DownloadPipelineArtifact@0 + displayName: 'Download Pipeline Artifact - Arm64' + inputs: + artifactName: 'drop-nuget-arm64' + targetPath: '$(Build.BinariesDirectory)/nuget-artifact' + continueOnError: true + + - task: DownloadPipelineArtifact@0 + displayName: 'Download Pipeline Artifact - Linux' + inputs: + artifactName: 'drop-linux' + targetPath: '$(Build.BinariesDirectory)/nuget-artifact' + continueOnError: true + + - task: DownloadPipelineArtifact@0 + displayName: 'Download Pipeline Artifact - Linux-x86' + inputs: + artifactName: 'drop-linux-x86' + targetPath: '$(Build.BinariesDirectory)/nuget-artifact' + continueOnError: true + + - task: DownloadPipelineArtifact@0 + displayName: 'Download Pipeline Artifact - MacOS' + inputs: + artifactName: 'drop-osx' + targetPath: '$(Build.BinariesDirectory)/nuget-artifact' + continueOnError: true + + - script: | + pushd $(Build.BinariesDirectory)\nuget-artifact + dir + powershell -Command "Invoke-WebRequest http://stahlworks.com/dev/unzip.exe -OutFile unzip.exe" + powershell -Command "Invoke-WebRequest http://stahlworks.com/dev/zip.exe -OutFile zip.exe" + set PATH=%CD%;%PATH% + FOR /R %%i IN (*.nupkg) do ( + rename %%~ni.nupkg %%~ni.zip + unzip %%~ni.zip -d %%~ni + del /Q %%~ni.zip + unzip win-x86.zip -d win-x86 + unzip win10-arm.zip -d win10-arm + unzip linux-x64.zip -d linux-x64 + unzip linux-x86.zip -d linux-x86 + mkdir %%~ni\runtimes\win-x86 + mkdir %%~ni\runtimes\win10-arm + mkdir %%~ni\runtimes\linux-x64 + mkdir %%~ni\runtimes\linux-x86 + mkdir %%~ni\runtimes\win-x86\native + mkdir %%~ni\runtimes\win10-arm\native + mkdir %%~ni\runtimes\linux-x64\native + mkdir %%~ni\runtimes\linux-x86\native + move win-x86\runtimes\win-x86\native\onnxruntime.dll %%~ni\runtimes\win-x86\native\onnxruntime.dll + move win-x86\runtimes\win-x86\native\onnxruntime.lib %%~ni\runtimes\win-x86\native\onnxruntime.lib + move win-x86\runtimes\win-x86\native\onnxruntime.pdb %%~ni\runtimes\win-x86\native\onnxruntime.pdb + move win10-arm\runtimes\win10-arm\native\onnxruntime.lib %%~ni\runtimes\win10-arm\native\onnxruntime.lib + move win10-arm\runtimes\win10-arm\native\onnxruntime.dll %%~ni\runtimes\win10-arm\native\onnxruntime.dll + move win10-arm\runtimes\win10-arm\native\onnxruntime.pdb %%~ni\runtimes\win10-arm\native\onnxruntime.pdb + move linux-x64\linux-x64\libonnxruntime.so %%~ni\runtimes\linux-x64\native\libonnxruntime.so + move linux-x86\linux-x86\libonnxruntime.so %%~ni\runtimes\linux-x86\native\libonnxruntime.so + unzip osx-x64.zip -d osx-x64 + dir osx-x64 /s + mkdir %%~ni\runtimes\osx-x64 + mkdir %%~ni\runtimes\osx-x64\native + move osx-x64\osx-x64\libonnxruntime.dylib %%~ni\runtimes\osx-x64\native\libonnxruntime.dylib + move osx-x64\osx-x64\libonnxruntime.dylib.dSYM %%~ni\runtimes\osx-x64\native\libonnxruntime.dylib.dSYM + pushd %%~ni + zip -r ..\%%~ni.zip . 
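+        rem The rebuilt zip now contains the native binaries for every platform and is renamed back to .nupkg below.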
+ popd + move %%~ni.zip %%~ni.nupkg + ) + popd + copy $(Build.BinariesDirectory)\nuget-artifact\*.nupkg $(Build.ArtifactStagingDirectory) + displayName: 'Bundle NuGet and other binaries' + + - template: ../../templates/esrp_nuget.yml + parameters: + DisplayName: 'ESRP - sign NuGet package' + FolderPath: '$(Build.ArtifactStagingDirectory)' + DoEsrp: ${{ parameters.DoEsrp }} + + - task: PublishPipelineArtifact@0 + displayName: 'Publish Pipeline NuGet Artifact' + inputs: + artifactName: 'drop-signed-nuget' + targetPath: '$(Build.ArtifactStagingDirectory)' + +- template: test_all_os.yml diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/cpu-nocontribops.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/cpu-nocontribops.yml index 114e632221e8d..659b5dad12e0d 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/cpu-nocontribops.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/cpu-nocontribops.yml @@ -12,7 +12,7 @@ jobs: parameters: AgentPool : $(AgentPoolWin) JobName: 'Windows_CI_Dev' - BuildCommand: '$(Build.SourcesDirectory)\tools\ci_build\build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --disable_contrib_ops --enable_msvc_static_runtime --build_shared_lib --build_csharp --enable_onnx_tests --test_data_url $(TestDataUrl) --test_data_checksum $(TestDataChecksum)' + BuildCommand: '$(Build.SourcesDirectory)\tools\ci_build\build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --disable_contrib_ops --enable_msvc_static_runtime --build_shared_lib --build_csharp --enable_onnx_tests' DoDebugBuild: 'false' DoNugetPack : 'true' DoCompliance: 'false' @@ -26,7 +26,7 @@ jobs: parameters: AgentPool : $(AgentPoolWin) JobName: 'Windows_CI_Dev_x86' - BuildCommand: '$(Build.SourcesDirectory)\tools\ci_build\build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --disable_contrib_ops --enable_msvc_static_runtime --build_shared_lib --build_csharp --enable_onnx_tests --test_data_url $(TestDataUrl) --test_data_checksum $(TestDataChecksum) --x86' + BuildCommand: '$(Build.SourcesDirectory)\tools\ci_build\build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --disable_contrib_ops --enable_msvc_static_runtime --build_shared_lib --build_csharp --enable_onnx_tests --x86' DoDebugBuild: 'false' DoNugetPack : 'true' DoCompliance: 'false' @@ -50,7 +50,7 @@ jobs: cp $(Build.BinariesDirectory)/Release/libonnxruntime.so $(Build.BinariesDirectory)/linux-x64 cd $(Build.BinariesDirectory) zip -r linux-x64.zip linux-x64 - cp $(Build.BinariesDirectory)/*.zip $(Build.ArtifactStagingDirectory) + cp $(Build.BinariesDirectory)/linux*.zip $(Build.ArtifactStagingDirectory) ls -al $(Build.ArtifactStagingDirectory) - template: ../../templates/linux-ci.yml @@ -66,7 +66,7 @@ jobs: cp $(Build.BinariesDirectory)/Release/libonnxruntime.so $(Build.BinariesDirectory)/linux-x86 cd $(Build.BinariesDirectory) zip -r linux-x86.zip linux-x86 - cp $(Build.BinariesDirectory)/*.zip $(Build.ArtifactStagingDirectory) + cp $(Build.BinariesDirectory)/linux*.zip 
$(Build.ArtifactStagingDirectory) ls -al $(Build.ArtifactStagingDirectory) - template: ../../templates/mac-ci.yml @@ -139,7 +139,7 @@ jobs: - template: ../../templates/esrp_nuget.yml parameters: DisplayName: 'ESRP - sign NuGet package' - FolderPath: '$(Build.BinariesDirectory)/nuget-artifact' + FolderPath: '$(Build.ArtifactStagingDirectory)' DoEsrp: ${{ parameters.DoEsrp }} - task: PublishPipelineArtifact@0 diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/cpu.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/cpu.yml index db871065c51c7..72377b20683ef 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/cpu.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/cpu.yml @@ -13,10 +13,10 @@ jobs: parameters: AgentPool : $(AgentPoolWin) JobName: 'Windows_CI_Dev' - BuildCommand: '$(Build.SourcesDirectory)\tools\ci_build\build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --use_openmp --build_shared_lib --build_csharp --enable_onnx_tests --test_data_url $(TestDataUrl) --test_data_checksum $(TestDataChecksum)' + BuildCommand: '$(Build.SourcesDirectory)\tools\ci_build\build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --use_openmp --build_shared_lib --build_csharp --enable_onnx_tests' DoDebugBuild: 'false' DoNugetPack : 'true' - DoCompliance: $${ parameters.DoCompliance }} + DoCompliance: ${{ parameters.DoCompliance }} DoEsrp: ${{ parameters.DoEsrp }} NuPackScript: | msbuild $(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.proj /p:Configuration=RelWithDebInfo /t:CreatePackage @@ -27,7 +27,7 @@ jobs: parameters: AgentPool : $(AgentPoolWin) JobName: 'Windows_CI_Dev_x86' - BuildCommand: '$(Build.SourcesDirectory)\tools\ci_build\build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --use_openmp --build_shared_lib --build_csharp --enable_onnx_tests --test_data_url $(TestDataUrl) --test_data_checksum $(TestDataChecksum) --x86' + BuildCommand: '$(Build.SourcesDirectory)\tools\ci_build\build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --use_openmp --build_shared_lib --build_csharp --enable_onnx_tests --x86' DoDebugBuild: 'false' DoNugetPack : 'true' DoCompliance: 'false' @@ -51,7 +51,7 @@ jobs: cp $(Build.BinariesDirectory)/Release/libonnxruntime.so $(Build.BinariesDirectory)/linux-x64 cd $(Build.BinariesDirectory) zip -r linux-x64.zip linux-x64 - cp $(Build.BinariesDirectory)/*.zip $(Build.ArtifactStagingDirectory) + cp $(Build.BinariesDirectory)/linux*.zip $(Build.ArtifactStagingDirectory) ls -al $(Build.ArtifactStagingDirectory) - template: ../../templates/linux-ci.yml @@ -67,7 +67,7 @@ jobs: cp $(Build.BinariesDirectory)/Release/libonnxruntime.so $(Build.BinariesDirectory)/linux-x86 cd $(Build.BinariesDirectory) zip -r linux-x86.zip linux-x86 - cp $(Build.BinariesDirectory)/*.zip $(Build.ArtifactStagingDirectory) + cp $(Build.BinariesDirectory)/linux*.zip $(Build.ArtifactStagingDirectory) ls -al $(Build.ArtifactStagingDirectory) - template: ../../templates/mac-ci.yml @@ -100,6 +100,7 @@ jobs: - 
MacOS_CI_Dev condition: succeeded() steps: + - task: DownloadPipelineArtifact@0 displayName: 'Download Pipeline Artifact - NuGet' inputs: @@ -107,6 +108,7 @@ jobs: targetPath: '$(Build.BinariesDirectory)/nuget-artifact' continueOnError: true + - task: DownloadPipelineArtifact@0 displayName: 'Download Pipeline Artifact - Win-x86' inputs: @@ -140,14 +142,48 @@ jobs: - template: ../../templates/esrp_nuget.yml parameters: DisplayName: 'ESRP - sign NuGet package' - FolderPath: '$(Build.BinariesDirectory)/nuget-artifact' + FolderPath: '$(Build.ArtifactStagingDirectory)' DoEsrp: ${{ parameters.DoEsrp }} - - task: PublishPipelineArtifact@0 displayName: 'Publish Pipeline NuGet Artifact' inputs: artifactName: 'drop-signed-nuget' targetPath: '$(Build.ArtifactStagingDirectory)' + - template: test_all_os.yml + +- job: Publish_NuGet_Package_And_Report + variables: + - group: Dashboard_MySQL_Secret + pool: + name: Hosted Windows 2019 with VS2019 + # The AzureFileCopy@3 task has a bug: it depends on a particular version of Azure PowerShell, + # which is not available in OnnxRuntime build VMs, but available in the latest hosted agents. + # So, all the copy/publish jobs are run on hosted agents + # TODO: install the desired azureps on our VMs or use a later, bug-fixed version of AzureFileCopy + demands: azureps + condition: and (${{ parameters.DoEsrp }}, eq(variables['Build.SourceBranch'], 'refs/heads/master')) + dependsOn: + - NuGet_Test_Win + - NuGet_Test_Linux + - NuGet_Test_MacOS + steps: + + - template: ../../templates/set-version-number-variables-step.yml + - template: upload-binary-sizes-from-nuget-package.yml + parameters: + downloadPath: $(Build.BinariesDirectory)/nuget-artifact/final-package + gitCommitHash: $(OnnxRuntimeGitCommitHashShort) + + - task: AzureFileCopy@3 + displayName: 'Copy Signed NuGet Package to Blob Store' + condition: ne(variables['IsReleaseBuild'], 'true') # release builds have a different package naming scheme + inputs: + sourcePath: '$(Build.BinariesDirectory)/nuget-artifact/final-package/Microsoft.ML.OnnxRuntime.$(OnnxRuntimeVersion)-dev-$(OnnxRuntimeGitCommitHashShort).nupkg' + azureSubscription: 'AIInfraBuildOnnxRuntimeOSS' + destination: azureBlob + storage: ortpackages + containerName: ortpackages + diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/gpu.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/gpu.yml index 20610d93bdcb0..ebe56764e8b80 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/gpu.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/gpu.yml @@ -13,7 +13,7 @@ jobs: parameters: AgentPool : $(AgentPoolWin) JobName: 'Windows_CI_GPU_Dev' - BuildCommand: '$(Build.SourcesDirectory)\tools\ci_build\build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --enable_pybind --use_openmp --use_mkldnn --use_mkldnn --build_shared_lib --build_csharp --enable_onnx_tests --use_cuda --cuda_home="C:\local\cuda_10.0.130_win10" --cudnn_home="C:\local\cudnn-10.0-windows10-x64-v7.3.1.20\cuda" --test_data_url $(TestDataUrl) --test_data_checksum $(TestDataChecksum) --msvc_toolset=14.11' + BuildCommand: '$(Build.SourcesDirectory)\tools\ci_build\build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --enable_pybind --use_openmp --build_shared_lib --build_csharp 
--enable_onnx_tests --use_cuda --cuda_home="C:\local\cuda_10.0.130_win10" --cudnn_home="C:\local\cudnn-10.0-windows10-x64-v7.3.1.20\cuda" --msvc_toolset=14.11' DoDebugBuild: 'false' DoNugetPack : 'true' DoCompliance: 'false' @@ -41,7 +41,7 @@ jobs: cp $(Build.BinariesDirectory)/Release/libonnxruntime.so $(Build.BinariesDirectory)/linux-x64 cd $(Build.BinariesDirectory) zip -r linux-x64.zip linux-x64 - cp $(Build.BinariesDirectory)/*.zip $(Build.ArtifactStagingDirectory) + cp $(Build.BinariesDirectory)/linux*.zip $(Build.ArtifactStagingDirectory) ls -al $(Build.ArtifactStagingDirectory) @@ -92,7 +92,7 @@ jobs: - template: ../../templates/esrp_nuget.yml parameters: DisplayName: 'ESRP - sign NuGet package' - FolderPath: '$(Build.BinariesDirectory)/nuget-artifact' + FolderPath: '$(Build.ArtifactStagingDirectory)' DoEsrp: ${{ parameters.DoEsrp }} - task: PublishPipelineArtifact@0 @@ -107,3 +107,4 @@ jobs: - template: test_linux.yml parameters: AgentPool : $(AgentPoolLinux) + TestGPU : 'true' diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml index 1eabf3fdf8408..2cb8763745653 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml @@ -1,5 +1,6 @@ parameters: AgentPool: 'Linux-CPU' + TestGPU: 'false' jobs: - job: NuGet_Test_Linux @@ -18,12 +19,21 @@ jobs: artifactName: 'drop-signed-nuget' targetPath: '$(Build.BinariesDirectory)/nuget-artifact' - - script: | - set -e -x - $(Build.SourcesDirectory)/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest-docker.sh $(Build.SourcesDirectory) $(Build.BinariesDirectory) nuget-artifact - displayName: 'Run Package Test (x64)' - env: - OnnxRuntimeBuildDirectory: $(Build.BinariesDirectory) + - ${{ if eq(parameters['TestGPU'], 'false') }}: + - script: | + set -e -x + $(Build.SourcesDirectory)/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest-docker.sh $(Build.SourcesDirectory) $(Build.BinariesDirectory) nuget-artifact + displayName: 'Run Package Test (x64)' + env: + OnnxRuntimeBuildDirectory: $(Build.BinariesDirectory) + + - ${{ if eq(parameters['TestGPU'], 'true') }}: + - script: | + set -e -x + $(Build.SourcesDirectory)/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest-docker-gpu.sh $(Build.SourcesDirectory) $(Build.BinariesDirectory) nuget-artifact + displayName: 'Run Package Test GPU (x64)' + env: + OnnxRuntimeBuildDirectory: $(Build.BinariesDirectory) - task: ms.vss-governance-buildtask.governance-build-task-component-detection.ComponentGovernanceComponentDetection@0 displayName: 'Component Detection' diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/test_win.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/test_win.yml index c85001e7ba1e5..4336c4279c599 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/test_win.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/test_win.yml @@ -25,7 +25,7 @@ jobs: - script: | @echo "Running build.py --update" - $(Build.BinariesDirectory)\packages\python\python.exe $(Build.SourcesDirectory)\tools\ci_build\build.py --build_dir $(Build.BinariesDirectory) --config Debug --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --update --download_test_data --test_data_url $(TestDataUrl) --test_data_checksum $(TestDataChecksum) + $(Build.BinariesDirectory)\packages\python\python.exe 
$(Build.SourcesDirectory)\tools\ci_build\build.py --build_dir $(Build.BinariesDirectory) --config Debug --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --update --download_test_data displayName: 'Download Test Data' @@ -67,7 +67,23 @@ jobs: EXIT 1 } workingDirectory: '$(Build.SourcesDirectory)\csharp' - displayName: 'Run End to End Test (C++) ' + displayName: 'Run End to End Test (C-API) ' + timeoutInMinutes: 30 + continueOnError: true + + # test C++ API sample + - script: | + @echo ##vso[task.setvariable variable=OnnxRuntimeSampleCode]CXX_Api_Sample.cpp + + - script: | + @echo "Running runtest.bat" + test\Microsoft.ML.OnnxRuntime.EndToEndTests.Capi\runtest.bat $(Build.BinariesDirectory)\nuget-artifact + REM Need an ErrorLevel check, since the script uses Exit /B + IF NOT %ERRORLEVEL% EQU 0 { + EXIT 1 + } + workingDirectory: '$(Build.SourcesDirectory)\csharp' + displayName: 'Run End to End Test (C++ API) ' timeoutInMinutes: 30 continueOnError: true diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/upload-binary-sizes-from-nuget-package.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/upload-binary-sizes-from-nuget-package.yml new file mode 100644 index 0000000000000..2b1e0aca9a537 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/upload-binary-sizes-from-nuget-package.yml @@ -0,0 +1,49 @@ +parameters: + gitCommitHash: '' + downloadPath: $(Build.BinariesDirectory)/nuget-artifact/final-package + +steps: +- task: DownloadPipelineArtifact@0 + displayName: 'Download Pipeline Artifact - Signed NuGet Package' + inputs: + artifactName: 'drop-signed-nuget' + targetPath: '${{ parameters.downloadPath }}' + +- task: UsePythonVersion@0 + inputs: + versionSpec: '3.7' + addToPath: true + architecture: 'x64' + +- task: CmdLine@1 + displayName: 'Install conda modules mysql-connector-python' + inputs: + filename: '%CONDA%\condabin\conda.bat' + arguments: 'install -q --insecure -y mysql-connector-python' + timeoutInMinutes: 10 + +- task: CmdLine@2 + displayName: 'Post binary sizes to the dashboard database using command line' + inputs: + script: | + echo changing directory to artifact download path + pushd "${{ parameters.downloadPath }}" + echo processing nupkg + FOR /R %%i IN (*.nupkg) do ( + echo processing %%~ni.nupkg + copy %%~ni.nupkg %%~ni.zip + echo copied to zip + echo listing lib files in the zip + REM use a single .csv file to put the data + echo os,arch,build_config,size > binary_size_data.txt + 7z.exe l -slt %%~ni.zip runtimes\linux-x64\native\libonnxruntime.so | findstr /R /C:"^Size = [0-9]*" | for /F "tokens=3" %%a in ('more') do if not "%%a" == "" echo linux,x64,openmp,%%a >> binary_size_data.txt + 7z.exe l -slt %%~ni.zip runtimes\linux-x86\native\libonnxruntime.so | findstr /R /C:"^Size = [0-9]*" | for /F "tokens=3" %%a in ('more') do if not "%%a" == "" echo linux,x86,openmp,%%a >> binary_size_data.txt + 7z.exe l -slt %%~ni.zip runtimes\osx-x64\native\libonnxruntime.dylib | findstr /R /C:"^Size = [0-9]*" | for /F "tokens=3" %%a in ('more') do if not "%%a" == "" echo osx,x64,openmp,%%a >> binary_size_data.txt + 7z.exe l -slt %%~ni.zip runtimes\win-x64\native\onnxruntime.dll | findstr /R /C:"^Size = [0-9]*" | for /F "tokens=3" %%a in ('more') do if not "%%a" == "" echo win,x64,openmp,%%a >> binary_size_data.txt + 7z.exe l -slt %%~ni.zip runtimes\win-x86\native\onnxruntime.dll | findstr /R /C:"^Size = [0-9]*" | for /F "tokens=3" %%a in ('more') do if not "%%a" == "" echo 
win,x86,openmp,%%a >> binary_size_data.txt + echo calling python script to post to database + %CONDA%\python.exe $(Build.SourcesDirectory)\tools\ci_build\github\windows\post_binary_sizes_to_dashboard.py --commit_hash=${{ parameters.gitCommitHash }} --size_data_file=binary_size_data.txt --build_project=Lotus --build_id=$(Build.BuildId) + ) + + env: + DASHBOARD_MYSQL_ORT_PASSWORD: $(dashboard-mysql-ort-password) diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows.yml index 807e6246f78c7..503bb0595532a 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows.yml @@ -12,11 +12,15 @@ steps: mkdir $(Build.BinariesDirectory)\${{parameters.artifactName}} mkdir $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib mkdir $(Build.BinariesDirectory)\${{parameters.artifactName}}\include + echo "Directories created" copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime.pdb $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime.lib $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib copy $(Build.SourcesDirectory)\include\onnxruntime\core\session\onnxruntime_*.h $(Build.BinariesDirectory)\${{parameters.artifactName}}\include + copy $(Build.SourcesDirectory)\include\onnxruntime\core\providers\cpu\cpu_provider_factory.h $(Build.BinariesDirectory)\${{parameters.artifactName}}\include + copy $(Build.SourcesDirectory)\include\onnxruntime\core\providers\cuda\cuda_provider_factory.h $(Build.BinariesDirectory)\${{parameters.artifactName}}\include + # copy the README, licence and TPN copy $(Build.SourcesDirectory)\README.md $(Build.BinariesDirectory)\${{parameters.artifactName}}\README.md copy $(Build.SourcesDirectory)\docs\C_API.md $(Build.BinariesDirectory)\${{parameters.artifactName}}\C_API.md diff --git a/tools/ci_build/github/azure-pipelines/templates/linux-set-variables-and-download.yml b/tools/ci_build/github/azure-pipelines/templates/linux-set-variables-and-download.yml index 0242ac0a65187..405dcc9418aa1 100644 --- a/tools/ci_build/github/azure-pipelines/templates/linux-set-variables-and-download.yml +++ b/tools/ci_build/github/azure-pipelines/templates/linux-set-variables-and-download.yml @@ -24,7 +24,7 @@ steps: displayName: 'Download test data' inputs: scriptPath: '$(Build.SourcesDirectory)/tools/ci_build/github/download_test_data.py' - arguments: --test_data_url $(TestDataUrl) + arguments: --test_data_url $(TestDataUrl) --build_dir $(Build.BinariesDirectory) pythonInterpreter: '/usr/bin/python3' workingDirectory: $(Build.BinariesDirectory) diff --git a/tools/ci_build/github/azure-pipelines/templates/mac-set-variables-and-download.yml b/tools/ci_build/github/azure-pipelines/templates/mac-set-variables-and-download.yml index 829ef22b97612..233fab77222d2 100644 --- a/tools/ci_build/github/azure-pipelines/templates/mac-set-variables-and-download.yml +++ b/tools/ci_build/github/azure-pipelines/templates/mac-set-variables-and-download.yml @@ -13,6 +13,6 @@ steps: 
displayName: 'Download test data' inputs: scriptPath: '$(Build.SourcesDirectory)/tools/ci_build/github/download_test_data.py' - arguments: --test_data_url $(TestDataUrl) --azure_region centralus + arguments: --test_data_url $(TestDataUrl) --azure_region centralus --build_dir $(Build.BinariesDirectory) pythonInterpreter: '/usr/local/bin/python3' workingDirectory: $(Build.BinariesDirectory) diff --git a/tools/ci_build/github/azure-pipelines/templates/set-test-data-variables-step.yml b/tools/ci_build/github/azure-pipelines/templates/set-test-data-variables-step.yml index 5cd9b12eecaeb..960f29d25bf78 100644 --- a/tools/ci_build/github/azure-pipelines/templates/set-test-data-variables-step.yml +++ b/tools/ci_build/github/azure-pipelines/templates/set-test-data-variables-step.yml @@ -1,8 +1,8 @@ # sets variables $(TestDataUrl) and $(TestDataChecksum) parameters: - TestDataUrl: https://onnxruntimetestdata.blob.core.windows.net/models/20190419.zip - TestDataChecksum: 3f46c31ee02345dbe707210b339e31fe + TestDataUrl: https://onnxruntimetestdata.blob.core.windows.net/models/20190729.zip + TestDataChecksum: ef36198a9ebaca8413b9739b15e87954 steps: - task: CmdLine@1 diff --git a/tools/ci_build/github/azure-pipelines/templates/set-version-number-variables-step.yml b/tools/ci_build/github/azure-pipelines/templates/set-version-number-variables-step.yml index 007d40b7aef0f..dcbdcffb730d8 100644 --- a/tools/ci_build/github/azure-pipelines/templates/set-version-number-variables-step.yml +++ b/tools/ci_build/github/azure-pipelines/templates/set-version-number-variables-step.yml @@ -12,6 +12,10 @@ steps: FOR /F "tokens=* USEBACKQ" %%F IN (`git rev-parse HEAD`) DO ( @echo ##vso[task.setvariable variable=OnnxRuntimeGitCommitHash;]%%F ) + + FOR /F "tokens=* USEBACKQ" %%F IN (`git rev-parse --short HEAD`) DO ( + @echo ##vso[task.setvariable variable=OnnxRuntimeGitCommitHashShort;]%%F + ) workingDirectory: '$(Build.SourcesDirectory)' condition: eq(variables['Agent.OS'], 'Windows_NT') @@ -26,5 +30,8 @@ steps: _OnnxRuntimeGitCommitHash=$(git rev-parse HEAD) echo "##vso[task.setvariable variable=OnnxRuntimeGitCommitHash;]$_OnnxRuntimeGitCommitHash" + _OnnxRuntimeGitCommitHash=$(git rev-parse --short=8 HEAD) + echo "##vso[task.setvariable variable=OnnxRuntimeGitCommitHashShort;]$_OnnxRuntimeGitCommitHash" + workingDirectory: '$(Build.SourcesDirectory)' condition: not(eq(variables['Agent.OS'], 'Windows_NT')) \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci-arm.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci-arm.yml new file mode 100644 index 0000000000000..4eeea5285074d --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci-arm.yml @@ -0,0 +1,103 @@ +parameters: + AgentPool : 'Win-CPU' + DoDebugBuild: 'true' + BuildCommand: '' + JobName: 'Windows_CI_Dev' + DoNugetPack: 'false' + NuPackScript : '' + ArtifactName: 'drop-nuget' + DoEsrp: 'false' + BuildArch: 'x64' + SetVcvars: 'false' + MsbuildArguments: '/m' + EnvSetupScript: 'setup_env.bat' + CudaVersion: '' + +jobs: +- job: ${{ parameters.JobName }} + timeoutInMinutes: 120 + pool: ${{ parameters.AgentPool }} + variables: + buildDirectory: '$(Build.BinariesDirectory)' + BuildCommand: ${{ parameters.BuildCommand }} + OnnxRuntimeBuildDirectory: '$(Build.BinariesDirectory)' + DotNetExe: 'dotnet.exe' + CUDA_VERSION: ${{ parameters.CudaVersion }} + + steps: + - template: set-test-data-variables-step.yml + - template: windows-build-tools-setup-steps.yml + parameters: + EnvSetupScript: ${{ 
parameters.EnvSetupScript }} + buildArch: ${{ parameters.BuildArch }} + setVcvars: ${{ parameters.SetVcvars }} + + - task: CmdLine@1 + displayName: 'Download test data and generate cmake config' + inputs: + filename: '$(Build.BinariesDirectory)\packages\python\python.exe' + arguments: '$(BuildCommand) --update --config Debug RelWithDebInfo' + workingDirectory: "$(Build.BinariesDirectory)" + + # Build Debug Mode + - ${{ if eq(parameters['DoDebugBuild'], 'true') }}: + - task: VSBuild@1 + displayName: 'Build Debug' + inputs: + solution: '$(Build.BinariesDirectory)\Debug\onnxruntime.sln' + platform: 'arm64' + configuration: 'Debug' + msbuildArguments: ${{ parameters.MsbuildArguments }} + msbuildArchitecture: 'x64' + logProjectEvents: true + workingFolder: '$(Build.BinariesDirectory)\Debug' + createLogFile: true + + # Build RelWithDebInfo -- this variable required to build C# + - script: | + @echo ##vso[task.setvariable variable=Configuration]RelWithDebInfo + + - task: VSBuild@1 + displayName: 'Build RelWithDebInfo' + inputs: + solution: '$(Build.BinariesDirectory)\RelWithDebInfo\onnxruntime.sln' + platform: 'arm64' + configuration: 'RelWithDebInfo' + msbuildArguments: ${{ parameters.MsbuildArguments }} + msbuildArchitecture: 'x64' + logProjectEvents: true + workingFolder: '$(Build.BinariesDirectory)\RelWithDebInfo' + + # Nuget packaging if needed + - ${{ if eq(parameters['DoNugetPack'], 'true') }}: + - task: BatchScript@1 + displayName: 'Setup VS2017 env vars' + inputs: + filename: 'C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat' + arguments: 'x64 -vcvars_ver=14.11' + modifyEnvironment: true + + # Esrp signing + - template: esrp_dll.yml + parameters: + FolderPath: '$(Build.BinariesDirectory)\RelWithDebInfo' + DisplayName: 'ESRP - Sign Native dlls' + DoEsrp: ${{ parameters.DoEsrp }} + + - script: | + ${{ parameters.NuPackScript }} + workingDirectory: '$(Build.SourcesDirectory)\csharp' + displayName: 'Create NuGet Package' + + - task: PublishPipelineArtifact@0 + displayName: 'Publish Pipeline Artifact: drop-nuget' + inputs: + artifactName: ${{ parameters.ArtifactName }} + targetPath: '$(Build.ArtifactStagingDirectory)' + + - task: ms.vss-governance-buildtask.governance-build-task-component-detection.ComponentGovernanceComponentDetection@0 + displayName: 'Component Detection' + condition: succeeded() + + - template: clean-agent-build-directory-step.yml + diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index 8ca6418322185..bcd1cb8da9b76 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -1,11 +1,11 @@ parameters: - AgentPool : 'Win-CPU' DoDebugBuild: 'true' DoCompliance: 'false' BuildCommand: '' JobName: 'Windows_CI_Dev' DoNugetPack: 'false' NuPackScript : '' + ArtifactName: 'drop-nuget' DoEsrp: 'false' DoTestCoverage: 'false' BuildArch: 'x64' @@ -13,6 +13,7 @@ parameters: MsbuildArguments: '/m' EnvSetupScript: 'setup_env.bat' CudaVersion: '' + AgentPool: 'Win-CPU' jobs: - job: ${{ parameters.JobName }} @@ -33,6 +34,14 @@ jobs: buildArch: ${{ parameters.BuildArch }} setVcvars: ${{ parameters.SetVcvars }} + # Copy CUDA props files if needed + - ${{ if eq(parameters['CudaVersion'], '10.0') }}: + - task: PowerShell@1 + displayName: 'Set CUDA path' + inputs: + scriptName: 'tools/ci_build/github/windows/set_cuda_path.ps1' + arguments: '-CudaMsbuildPath 
C:\local\cudaMsbuildIntegration-10.0.130-win10 -CudaVersion ${{ parameters.CudaVersion }}' + - task: CmdLine@1 displayName: 'Download test data and generate cmake config' inputs: @@ -163,9 +172,18 @@ jobs: - task: PublishPipelineArtifact@0 displayName: 'Publish Pipeline Artifact: drop-nuget' inputs: - artifactName: 'drop-nuget' + artifactName: ${{ parameters.ArtifactName }} targetPath: '$(Build.ArtifactStagingDirectory)' + # Remove CUDA props files after build + - ${{ if eq(parameters['CudaVersion'], '10.0') }}: + - task: PowerShell@1 + displayName: 'Clean up CUDA props files' + inputs: + scriptName: 'tools/ci_build/github/windows/clean_up_cuda_prop_files.ps1' + arguments: '-CudaVersion ${{ parameters.CudaVersion }}' + + # Compliance tasks require logs from Debug Build - ${{ if eq(parameters['DoCompliance'], 'true') }}: - template: compliance.yml diff --git a/tools/ci_build/github/azure-pipelines/templates/windows-build-and-test-steps.yml b/tools/ci_build/github/azure-pipelines/templates/windows-build-and-test-steps.yml index a846bb8dcb78f..a6a4c4d06206e 100644 --- a/tools/ci_build/github/azure-pipelines/templates/windows-build-and-test-steps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/windows-build-and-test-steps.yml @@ -11,7 +11,7 @@ steps: displayName: 'Download test data and generate cmake config' inputs: script: | - $(Build.BinariesDirectory)\packages\python\python.exe $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{parameters.buildConfig}} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --enable_onnx_tests --test_data_url $(TestDataUrl) --test_data_checksum $(TestDataChecksum) --update ${{parameters.buildAdditionalParams}} + $(Build.BinariesDirectory)\packages\python\python.exe $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{parameters.buildConfig}} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --enable_onnx_tests --update ${{parameters.buildAdditionalParams}} workingDirectory: '$(Build.BinariesDirectory)' - task: VSBuild@1 @@ -29,7 +29,7 @@ steps: displayName: 'Test ${{parameters.buildConfig}}' inputs: filename: '$(Build.BinariesDirectory)\packages\python\python.exe' - arguments: '$(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{parameters.buildConfig}} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --build_shared_lib --enable_onnx_tests --test_data_url $(TestDataUrl) --test_data_checksum $(TestDataChecksum) --test ${{parameters.buildAdditionalParams}}' + arguments: '$(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{parameters.buildConfig}} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --build_shared_lib --enable_onnx_tests --test ${{parameters.buildAdditionalParams}}' workingFolder: '$(Build.BinariesDirectory)' - task: PublishTestResults@2 diff --git a/tools/ci_build/github/azure-pipelines/templates/windows-build-tools-setup-steps.yml b/tools/ci_build/github/azure-pipelines/templates/windows-build-tools-setup-steps.yml index 
d149775a78317..d9fda65add37d 100644 --- a/tools/ci_build/github/azure-pipelines/templates/windows-build-tools-setup-steps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/windows-build-tools-setup-steps.yml @@ -26,19 +26,25 @@ steps: # downloadDirectory: '$(Build.BinariesDirectory)\python' # Temporary bypass of artifacts permission issue - - task: PowerShell@2 + - task: CmdLine@1 + displayName: 'Download azcopy' + inputs: + filename: 'AzCopy.exe' + arguments: '/Y /Source:https://onnxruntimetestdata.blob.core.windows.net/models/azcopy.exe /Dest:$(Build.BinariesDirectory)\azcopy.exe' + + - task: CmdLine@1 displayName: 'Download python' inputs: - targetType: 'inline' - script: 'Invoke-WebRequest -OutFile installer.exe https://onnxruntimeinstaller.blob.core.windows.net/conda-installer/installer.exe' - workingDirectory: '$(Build.BinariesDirectory)' - + filename: 'AzCopy.exe' + arguments: '/Y /Source:https://onnxruntimetestdata.blob.core.windows.net/models/Miniconda3-4.7.10-Windows-x86_64.exe /Dest:$(Build.BinariesDirectory)\Miniconda3-4.7.10-Windows-x86_64.exe' + - task: CmdLine@1 displayName: 'Run python installer' inputs: - filename: '$(Build.BinariesDirectory)\installer.exe' + filename: '$(Build.BinariesDirectory)\Miniconda3-4.7.10-Windows-x86_64.exe' arguments: '/S /NoRegistry=1 /AddToPath=0 /RegisterPython=0 /D=$(Build.BinariesDirectory)\packages\python' timeoutInMinutes: 10 + - task: BatchScript@1 displayName: 'setup env' inputs: @@ -52,24 +58,13 @@ steps: arguments: 'install -q --insecure -y pyopenssl setuptools wheel numpy' timeoutInMinutes: 10 - - task: CmdLine@1 - displayName: 'Download cmake' - inputs: - filename: '$(Build.BinariesDirectory)\packages\python\python.exe' - arguments: '$(Build.SourcesDirectory)\tools\ci_build\github\windows\download_cmake.py --build_dir $(Build.BinariesDirectory)' - - - task: PowerShell@2 - displayName: 'Download OpenCppCoverage installer' - continueOnError: true + - task: PythonScript@0 + displayName: 'Download test data' inputs: - targetType: 'inline' - script: ' - New-Item -Path "$(Build.BinariesDirectory)\installer" -ItemType "directory" - - New-Item -Path "$(Build.BinariesDirectory)\installer\opencppcoverage" -ItemType "directory" - - Invoke-WebRequest -OutFile $(Build.BinariesDirectory)\installer\opencppcoverage\installer.exe https://onnxruntimeinstaller.blob.core.windows.net/opencppcovergae-installer/OpenCppCoverageSetup-x64-0.9.7.0.exe - ' + scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\github\download_test_data.py' + arguments: --test_data_url $(TestDataUrl) --build_dir $(Build.BinariesDirectory) + pythonInterpreter: '$(Build.BinariesDirectory)\packages\python\python.exe' + workingDirectory: $(Build.BinariesDirectory) - task: CmdLine@1 continueOnError: true diff --git a/tools/ci_build/github/azure-pipelines/win-arm-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-arm-crosscompile-ci-pipeline.yml index 644acf7dfa9a0..6a9cddaee3a44 100644 --- a/tools/ci_build/github/azure-pipelines/win-arm-crosscompile-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-arm-crosscompile-ci-pipeline.yml @@ -1,5 +1,6 @@ jobs: - job: Windows_ARM_CrossCompile_CI_Dev + timeoutInMinutes: 120 variables: buildDirectory: '$(Build.BinariesDirectory)' steps: diff --git a/tools/ci_build/github/azure-pipelines/win-arm64-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-arm64-crosscompile-ci-pipeline.yml index b70c419a49a96..6c44fc7ea6a70 100644 --- 
a/tools/ci_build/github/azure-pipelines/win-arm64-crosscompile-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-arm64-crosscompile-ci-pipeline.yml @@ -1,5 +1,6 @@ jobs: - job: Windows_ARM_CrossCompile_CI_Dev + timeoutInMinutes: 120 variables: buildDirectory: '$(Build.BinariesDirectory)' steps: diff --git a/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml index c5021e5bb4edc..bdaa8f0cf5289 100644 --- a/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml @@ -4,7 +4,7 @@ jobs: AgentPool : 'Win-CPU' DoDebugBuild: 'true' DoCompliance: 'false' - BuildCommand: '$(Build.SourcesDirectory)\tools\ci_build\build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --use_tvm --enable_pybind --use_mkldnn --use_mklml --use_openmp --build_shared_lib --build_csharp --enable_onnx_tests --test_data_url $(TestDataUrl) --test_data_checksum $(TestDataChecksum) --gen_doc' + BuildCommand: '$(Build.SourcesDirectory)\tools\ci_build\build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --use_tvm --enable_pybind --use_mkldnn --use_openmp --build_shared_lib --build_csharp --enable_onnx_tests --gen_doc' JobName: 'Windows_CI_Dev' DoNugetPack: 'false' NuPackScript : '' diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml index d4930faf193e7..a2c42de7d7367 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml @@ -4,7 +4,7 @@ jobs: AgentPool : 'Win-GPU-CUDA10' DoDebugBuild: 'true' DoCompliance: 'false' - BuildCommand: '$(Build.SourcesDirectory)\tools\ci_build\build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --enable_pybind --use_openmp --use_mkldnn --use_mkldnn --build_shared_lib --build_csharp --enable_onnx_tests --use_cuda --cuda_home="C:\local\cuda_10.0.130_win10" --cudnn_home="C:\local\cudnn-10.0-windows10-x64-v7.3.1.20\cuda" --test_data_url $(TestDataUrl) --test_data_checksum $(TestDataChecksum) --msvc_toolset=14.11' + BuildCommand: '$(Build.SourcesDirectory)\tools\ci_build\build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --enable_pybind --use_openmp --use_mkldnn --use_mkldnn --build_shared_lib --build_csharp --enable_onnx_tests --use_cuda --cuda_version=10.0 --cuda_home="C:\local\cuda_10.0.130_win10" --cudnn_home="C:\local\cudnn-10.0-windows10-x64-v7.3.1.20\cuda" --msvc_toolset=14.11' JobName: 'Windows_CI_GPU_Dev' DoNugetPack: 'false' NuPackScript : '' diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml index 35f555c182207..a31ad59615489 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml @@ -1,5 +1,6 @@ jobs: - job: Windows_CI_GPU_Dev + 
timeoutInMinutes: 120 pool: Win-GPU-CUDA10 variables: buildDirectory: '$(Build.BinariesDirectory)' @@ -16,11 +17,18 @@ jobs: buildArch: 'amd64' setVcvars: true + # Copy CUDA 10.0 props files + - task: PowerShell@1 + displayName: 'Set CUDA path' + inputs: + scriptName: 'tools/ci_build/github/windows/set_cuda_path.ps1' + arguments: '-CudaMsbuildPath C:\local\cudaMsbuildIntegration-10.0.130-win10 -CudaVersion 10.0' + - task: CmdLine@1 displayName: 'Download test data and generate cmake config' inputs: filename: '$(Build.BinariesDirectory)\packages\python\python.exe' - arguments: '$(Build.SourcesDirectory)\tools\ci_build\build.py --config Debug Release --build_dir $(Build.BinariesDirectory) --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --enable_pybind --use_openmp --use_mkldnn --build_shared_lib --enable_onnx_tests --cuda_home="C:\local\cuda_10.0.130_win10" --cudnn_home="C:\local\cudnn-10.0-windows10-x64-v7.3.1.20\cuda" --use_tensorrt --tensorrt_home="C:\local\TensorRT-5.0.4.3" --test_data_url $(TestDataUrl) --test_data_checksum $(TestDataChecksum) --update --msvc_toolset=14.11' + arguments: '$(Build.SourcesDirectory)\tools\ci_build\build.py --config Debug Release --build_dir $(Build.BinariesDirectory) --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --enable_pybind --use_openmp --use_mkldnn --build_shared_lib --enable_onnx_tests --cuda_home="C:\local\cuda_10.0.130_win10" --cudnn_home="C:\local\cudnn-10.0-windows10-x64-v7.3.1.20\cuda" --use_tensorrt --tensorrt_home="C:\local\TensorRT-5.0.4.3" --update --msvc_toolset=14.11' workingDirectory: "$(Build.BinariesDirectory)" - task: VSBuild@1 @@ -37,7 +45,7 @@ jobs: displayName: 'Test Debug' inputs: filename: '$(Build.BinariesDirectory)\packages\python\python.exe' - arguments: '$(Build.SourcesDirectory)\tools\ci_build\build.py --config Debug --build_dir $(Build.BinariesDirectory) --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --enable_pybind --use_openmp --use_mkldnn --build_shared_lib --enable_onnx_tests --cuda_home="C:\local\cuda_10.0.130_win10" --cudnn_home="C:\local\cudnn-10.0-windows10-x64-v7.3.1.20\cuda" --use_tensorrt --tensorrt_home="C:\local\TensorRT-5.0.4.3" --test_data_url $(TestDataUrl) --test_data_checksum $(TestDataChecksum) --test' + arguments: '$(Build.SourcesDirectory)\tools\ci_build\build.py --config Debug --build_dir $(Build.BinariesDirectory) --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --enable_pybind --use_openmp --use_mkldnn --build_shared_lib --enable_onnx_tests --cuda_version=10.0 --cuda_home="C:\local\cuda_10.0.130_win10" --cudnn_home="C:\local\cudnn-10.0-windows10-x64-v7.3.1.20\cuda" --use_tensorrt --tensorrt_home="C:\local\TensorRT-5.0.4.3" --test' workingFolder: '$(Build.BinariesDirectory)' - task: VSBuild@1 displayName: 'Build C# Debug' @@ -73,7 +81,7 @@ jobs: displayName: 'Test Release' inputs: filename: '$(Build.BinariesDirectory)\packages\python\python.exe' - arguments: '$(Build.SourcesDirectory)\tools\ci_build\build.py --config Release --build_dir $(Build.BinariesDirectory) --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --enable_pybind --use_openmp --use_mkldnn --build_shared_lib --enable_onnx_tests --cuda_home="C:\local\cuda_10.0.130_win10" 
--cudnn_home="C:\local\cudnn-10.0-windows10-x64-v7.3.1.20\cuda" --use_tensorrt --tensorrt_home="C:\local\TensorRT-5.0.4.3" --test_data_url $(TestDataUrl) --test_data_checksum $(TestDataChecksum) --test' + arguments: '$(Build.SourcesDirectory)\tools\ci_build\build.py --config Release --build_dir $(Build.BinariesDirectory) --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --enable_pybind --use_openmp --use_mkldnn --build_shared_lib --enable_onnx_tests --cuda_version=10.0 --cuda_home="C:\local\cuda_10.0.130_win10" --cudnn_home="C:\local\cudnn-10.0-windows10-x64-v7.3.1.20\cuda" --use_tensorrt --tensorrt_home="C:\local\TensorRT-5.0.4.3" --test' workingFolder: "$(Build.BinariesDirectory)" - task: VSBuild@1 @@ -103,6 +111,13 @@ jobs: testRunTitle: 'Unit Test Run' condition: succeededOrFailed() + # Remove CUDA 10.0 props files after build + - task: PowerShell@1 + displayName: 'Clean up CUDA props files' + inputs: + scriptName: 'tools/ci_build/github/windows/clean_up_cuda_prop_files.ps1' + arguments: '-CudaVersion 10.0' + - task: ms.vss-governance-buildtask.governance-build-task-component-detection.ComponentGovernanceComponentDetection@0 displayName: 'Component Detection' condition: and(succeeded(), in(variables['Build.Reason'], 'IndividualCI', 'BatchedCI')) diff --git a/tools/ci_build/github/azure-pipelines/win-mklml-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-mklml-ci-pipeline.yml index f059dbb907fb9..7b539c4c6b24d 100644 --- a/tools/ci_build/github/azure-pipelines/win-mklml-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-mklml-ci-pipeline.yml @@ -1,5 +1,6 @@ jobs: - job: Windows_CI_Dev + timeoutInMinutes: 120 variables: buildDirectory: '$(Build.BinariesDirectory)' steps: @@ -13,7 +14,7 @@ jobs: displayName: 'Download test data and generate cmake config' inputs: filename: '$(Build.BinariesDirectory)\packages\python\python.exe' - arguments: '$(Build.SourcesDirectory)\tools\ci_build\build.py --config Debug Release --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --use_mklml --use_openmp --build_shared_lib --build_csharp --enable_onnx_tests --test_data_url $(TestDataUrl) --test_data_checksum $(TestDataChecksum) --update' + arguments: '$(Build.SourcesDirectory)\tools\ci_build\build.py --config Debug Release --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --use_mklml --build_shared_lib --build_csharp --enable_onnx_tests --update' workingDirectory: "$(Build.BinariesDirectory)" - task: VSBuild@1 @@ -30,7 +31,7 @@ jobs: displayName: 'Test Debug' inputs: filename: '$(Build.BinariesDirectory)\packages\python\python.exe' - arguments: '$(Build.SourcesDirectory)\tools\ci_build\build.py --config Debug --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --use_mklml --use_openmp --build_shared_lib --build_csharp --enable_onnx_tests --test_data_url $(TestDataUrl) --test_data_checksum $(TestDataChecksum) --test' + arguments: '$(Build.SourcesDirectory)\tools\ci_build\build.py --config Debug --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path 
$(Build.BinariesDirectory)\cmake\bin\ctest.exe --use_mklml --use_openmp --build_shared_lib --build_csharp --enable_onnx_tests --test' workingFolder: '$(Build.BinariesDirectory)' - task: VSBuild@1 displayName: 'Build C# Debug' @@ -65,7 +66,7 @@ jobs: displayName: 'Test Release' inputs: filename: '$(Build.BinariesDirectory)\packages\python\python.exe' - arguments: '$(Build.SourcesDirectory)\tools\ci_build\build.py --config Release --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --use_mklml --use_openmp --build_shared_lib --build_csharp --enable_onnx_tests --test_data_url $(TestDataUrl) --test_data_checksum $(TestDataChecksum) --test' + arguments: '$(Build.SourcesDirectory)\tools\ci_build\build.py --config Release --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --use_mklml --use_openmp --build_shared_lib --build_csharp --enable_onnx_tests --test' workingFolder: "$(Build.BinariesDirectory)" - task: VSBuild@1 diff --git a/tools/ci_build/github/azure-pipelines/win-ngraph-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-ngraph-ci-pipeline.yml index fd7d73fec6e2a..b45c0bf74c44b 100644 --- a/tools/ci_build/github/azure-pipelines/win-ngraph-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-ngraph-ci-pipeline.yml @@ -1,7 +1,11 @@ jobs: - job: Windows_nGraph_CI_Dev + timeoutInMinutes: 120 variables: buildDirectory: '$(Build.BinariesDirectory)' + # nGraph provider fails on the latest 20190729.zip test. revert back to previous zip file until failures can be investigated + TestDataUrl: https://onnxruntimetestdata.blob.core.windows.net/models/20190419.zip + TestDataChecksum: 3f46c31ee02345dbe707210b339e31fe steps: - template: templates/set-test-data-variables-step.yml - template: templates/windows-build-tools-setup-steps.yml @@ -13,7 +17,7 @@ jobs: displayName: 'Download test data and generate cmake config' inputs: filename: '$(Build.BinariesDirectory)\packages\python\python.exe' - arguments: '$(Build.SourcesDirectory)\tools\ci_build\build.py --config Debug Release --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --enable_pybind --use_openmp --use_ngraph --use_full_protobuf --build_shared_lib --test_data_url $(TestDataUrl) --test_data_checksum $(TestDataChecksum) --gen_doc --update' + arguments: '$(Build.SourcesDirectory)\tools\ci_build\build.py --config Debug Release --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --enable_pybind --use_openmp --use_ngraph --use_full_protobuf --build_shared_lib --gen_doc --update' workingDirectory: "$(Build.BinariesDirectory)" - task: VSBuild@1 displayName: 'Build Debug' @@ -29,7 +33,7 @@ jobs: displayName: 'Test Debug' inputs: filename: '$(Build.BinariesDirectory)\packages\python\python.exe' - arguments: '$(Build.SourcesDirectory)\tools\ci_build\build.py --config Debug --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --enable_pybind --use_openmp --use_ngraph --use_full_protobuf --build_shared_lib 
--test_data_url $(TestDataUrl) --test_data_checksum $(TestDataChecksum) --gen_doc --test' + arguments: '$(Build.SourcesDirectory)\tools\ci_build\build.py --config Debug --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --enable_pybind --use_ngraph --use_full_protobuf --build_shared_lib --gen_doc --test' workingFolder: '$(Build.BinariesDirectory)' - task: VSBuild@1 displayName: 'Build Release' @@ -45,7 +49,7 @@ jobs: displayName: 'Test Release' inputs: filename: '$(Build.BinariesDirectory)\packages\python\python.exe' - arguments: '$(Build.SourcesDirectory)\tools\ci_build\build.py --config Release --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --enable_pybind --use_openmp --use_ngraph --use_full_protobuf --build_shared_lib --enable_onnx_tests --test_data_url $(TestDataUrl) --test_data_checksum $(TestDataChecksum) --test' + arguments: '$(Build.SourcesDirectory)\tools\ci_build\build.py --config Release --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --enable_pybind --use_openmp --use_ngraph --use_full_protobuf --build_shared_lib --enable_onnx_tests --test' workingFolder: "$(Build.BinariesDirectory)" - task: ms.vss-governance-buildtask.governance-build-task-component-detection.ComponentGovernanceComponentDetection@0 diff --git a/tools/ci_build/github/azure-pipelines/win-nocontribops-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-nocontribops-ci-pipeline.yml index 9b2135b5f0ae1..b41400e0bd7ca 100644 --- a/tools/ci_build/github/azure-pipelines/win-nocontribops-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-nocontribops-ci-pipeline.yml @@ -3,107 +3,13 @@ variables: TestDataUrlNoContribOps : https://onnxruntimetestdata.blob.core.windows.net/models/20181210.zip jobs: -- job: Windows_CI_Dev - variables: - buildDirectory: '$(Build.BinariesDirectory)' - steps: - - template: templates/set-test-data-variables-step.yml - - template: templates/windows-build-tools-setup-steps.yml - parameters: - EnvSetupScript: 'setup_env.bat' - buildArch: 'x64' - setVcvars: false - - task: CmdLine@1 - displayName: 'Download test data and generate cmake config' - inputs: - filename: '$(Build.BinariesDirectory)\packages\python\python.exe' - arguments: '$(Build.SourcesDirectory)\tools\ci_build\build.py --config Debug Release --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --disable_contrib_ops --enable_msvc_static_runtime --build_shared_lib --build_csharp --enable_onnx_tests --test_data_url $(TestDataUrlNoContribOps) --test_data_checksum $(TestDataChecksum) --update' - workingDirectory: "$(Build.BinariesDirectory)" - - - task: VSBuild@1 - displayName: 'Build Debug' - inputs: - solution: '$(Build.BinariesDirectory)\Debug\onnxruntime.sln' - platform: 'x64' - configuration: 'Debug' - msbuildArgs: '/m' - msbuildArchitecture: 'x64' - logProjectEvents: true - workingFolder: '$(Build.BinariesDirectory)\Debug' - - - task: BatchScript@1 - displayName: 'Test Debug' - inputs: - filename: '$(Build.BinariesDirectory)\packages\python\python.exe' - arguments: 
'$(Build.SourcesDirectory)\tools\ci_build\build.py --config Debug --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --disable_contrib_ops --enable_msvc_static_runtime --build_shared_lib --build_csharp --enable_onnx_tests --test_data_url $(TestDataUrlNoContribOps) --test_data_checksum $(TestDataChecksum) --test' - workingFolder: '$(Build.BinariesDirectory)' - - - task: VSBuild@1 - displayName: 'Build C# Debug' - inputs: - solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' - platform: 'any cpu' - configuration: 'Debug' - restoreNugetPackages: false - msbuildArchitecture: 'x64' - workingFolder: '$(Build.SourcesDirectory)\csharp' - msbuildArgs: '/m /p:OnnxRuntimeBuildDirectory=$(Build.BinariesDirectory)' - - - task: VSTest@2 - displayName: 'VsTest - C# Debug' - inputs: - testAssemblyVer2: '**\bin\Debug\**\*Tests.dll' - searchFolder: '$(Build.SourcesDirectory)\csharp\test' - runInParallel: true - configuration: Debug - - - task: VSBuild@1 - displayName: 'Build Release' - inputs: - solution: '$(Build.BinariesDirectory)\Release\onnxruntime.sln' - platform: 'x64' - configuration: 'Release' - msbuildArgs: '/m' - msbuildArchitecture: 'x64' - logProjectEvents: true - workingFolder: '$(Build.BinariesDirectory)\Release' - - - task: BatchScript@1 - displayName: 'Test Release' - inputs: - filename: '$(Build.BinariesDirectory)\packages\python\python.exe' - arguments: '$(Build.SourcesDirectory)\tools\ci_build\build.py --config Release --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --disable_contrib_ops --enable_msvc_static_runtime --build_shared_lib --build_csharp --enable_onnx_tests --test_data_url $(TestDataUrlNoContribOps) --test_data_checksum $(TestDataChecksum) --test' - workingFolder: "$(Build.BinariesDirectory)" - - - task: VSBuild@1 - displayName: 'Build c# Release' - inputs: - solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' - platform: 'any cpu' - configuration: 'Release' - msbuildArchitecture: 'x64' - restoreNugetPackages: false - workingFolder: '$(Build.SourcesDirectory)\csharp' - msbuildArgs: '/m /p:OnnxRuntimeBuildDirectory=$(Build.BinariesDirectory)' - - - task: VSTest@2 - displayName: 'VsTest - C# Release' - inputs: - testAssemblyVer2: '**\bin\Release\**\*Tests.dll' - searchFolder: '$(Build.SourcesDirectory)\csharp\test' - runInParallel: true - configuration: Release - - - task: PublishTestResults@2 - displayName: 'Publish unit test results' - inputs: - testResultsFiles: '**\*.results.xml' - searchFolder: '$(Build.BinariesDirectory)' - testRunTitle: 'Unit Test Run' - condition: succeededOrFailed() - - - task: ms.vss-governance-buildtask.governance-build-task-component-detection.ComponentGovernanceComponentDetection@0 - displayName: 'Component Detection' - condition: and(succeeded(), in(variables['Build.Reason'], 'IndividualCI', 'BatchedCI')) - - - template: templates/clean-agent-build-directory-step.yml +- template: templates/win-ci.yml + parameters: + AgentPool : 'Win-CPU' + DoDebugBuild: 'true' + DoCompliance: 'false' + BuildCommand: '$(Build.SourcesDirectory)\tools\ci_build\build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --disable_contrib_ops 
--enable_msvc_static_runtime --build_shared_lib --build_csharp --enable_onnx_tests' + JobName: 'Windows_CI_Dev' + DoNugetPack: 'false' + NuPackScript : '' + DoTestCoverage: 'false' \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/win-x86-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-x86-ci-pipeline.yml index 87e7b6eec4fca..3d3ced7591015 100644 --- a/tools/ci_build/github/azure-pipelines/win-x86-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-x86-ci-pipeline.yml @@ -4,7 +4,7 @@ jobs: AgentPool : 'Win-CPU' DoDebugBuild: 'true' DoCompliance: 'false' - BuildCommand: '$(Build.SourcesDirectory)\tools\ci_build\build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --use_openmp --build_shared_lib --build_csharp --enable_onnx_tests --test_data_url $(TestDataUrl) --test_data_checksum $(TestDataChecksum) --x86' + BuildCommand: '$(Build.SourcesDirectory)\tools\ci_build\build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --use_openmp --build_shared_lib --build_csharp --enable_onnx_tests --x86' JobName: 'Windows_CI_Dev_x86' DoNugetPack: 'false' NuPackScript : '' diff --git a/tools/ci_build/github/azure-pipelines/win-x86-nocontribops-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-x86-nocontribops-ci-pipeline.yml index e8f8229c38d6f..96a62acee69b6 100644 --- a/tools/ci_build/github/azure-pipelines/win-x86-nocontribops-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-x86-nocontribops-ci-pipeline.yml @@ -4,108 +4,12 @@ variables: TestDataUrlNoContribOps : https://onnxruntimetestdata.blob.core.windows.net/models/20181210.zip jobs: -- job: Windows_CI_Dev - variables: - buildDirectory: '$(Build.BinariesDirectory)' - steps: - - template: templates/set-test-data-variables-step.yml - - template: templates/windows-build-tools-setup-steps.yml - parameters: - EnvSetupScript: 'setup_env.bat' - buildArch: 'x86' - setVcvars: false - - - task: CmdLine@1 - displayName: 'Download test data and generate cmake config' - inputs: - filename: '$(Build.BinariesDirectory)\packages\python\python.exe' - arguments: '$(Build.SourcesDirectory)\tools\ci_build\build.py --config Debug Release --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --use_openmp --build_shared_lib --enable_onnx_tests --test_data_url $(TestDataUrlNoContribOps) --test_data_checksum $(TestDataChecksum) --update --x86' - workingDirectory: "$(Build.BinariesDirectory)" - - - task: VSBuild@1 - displayName: 'Build Debug' - inputs: - solution: '$(Build.BinariesDirectory)\Debug\onnxruntime.sln' - platform: Win32 - msbuildArguments: '/m /p:PlatformTarget=x86' - configuration: 'Debug' - msbuildArchitecture: 'x86' - logProjectEvents: true - workingFolder: '$(Build.BinariesDirectory)\Debug' - - - task: BatchScript@1 - displayName: 'Test Debug' - inputs: - filename: '$(Build.BinariesDirectory)\packages\python\python.exe' - arguments: '$(Build.SourcesDirectory)\tools\ci_build\build.py --config Debug --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --use_openmp 
--build_shared_lib --enable_onnx_tests --test_data_url $(TestDataUrlNoContribOps) --test_data_checksum $(TestDataChecksum) --test --x86 --disable_contrib_ops --enable_msvc_static_runtime' - workingFolder: '$(Build.BinariesDirectory)' - - - task: VSBuild@1 - displayName: 'Build C# Debug' - inputs: - solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' - platform: 'any cpu' - configuration: 'Debug' - restoreNugetPackages: false - msbuildArchitecture: 'x86' - workingFolder: '$(Build.SourcesDirectory)\csharp' - msbuildArgs: '/m /p:OnnxRuntimeBuildDirectory=$(Build.BinariesDirectory)' - - - task: VSTest@2 - displayName: 'VsTest - C# Debug' - inputs: - testAssemblyVer2: '**\bin\Debug\**\*Tests.dll' - searchFolder: '$(Build.SourcesDirectory)\csharp\test' - runInParallel: true - configuration: Debug - - - task: VSBuild@1 - displayName: 'Build Release' - inputs: - solution: '$(Build.BinariesDirectory)\Release\onnxruntime.sln' - platform: Win32 - msbuildArguments: '/m /p:PlatformTarget=x86' - configuration: 'Release' - msbuildArchitecture: 'x86' - logProjectEvents: true - workingFolder: '$(Build.BinariesDirectory)\Release' - - - task: BatchScript@1 - displayName: 'Test Release' - inputs: - filename: '$(Build.BinariesDirectory)\packages\python\python.exe' - arguments: '$(Build.SourcesDirectory)\tools\ci_build\build.py --config Release --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --use_openmp --build_shared_lib --enable_onnx_tests --test_data_url $(TestDataUrlNoContribOps) --test_data_checksum $(TestDataChecksum) --test --x86 --disable_contrib_ops --enable_msvc_static_runtime' - workingFolder: "$(Build.BinariesDirectory)" - - - task: VSBuild@1 - displayName: 'Build C# Release' - inputs: - solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' - platform: 'any cpu' - configuration: 'Release' - msbuildArchitecture: 'x86' - restoreNugetPackages: false - workingFolder: '$(Build.SourcesDirectory)\csharp' - msbuildArgs: '/m /p:OnnxRuntimeBuildDirectory=$(Build.BinariesDirectory)' - - - task: VSTest@2 - displayName: 'VsTest - C# Release' - inputs: - testAssemblyVer2: '**\bin\Release\**\*Tests.dll' - searchFolder: '$(Build.SourcesDirectory)\csharp\test' - runInParallel: true - configuration: Release - - - task: PublishTestResults@2 - displayName: 'Publish unit test results' - inputs: - testResultsFiles: '**\*.results.xml' - searchFolder: '$(Build.BinariesDirectory)' - testRunTitle: 'Unit Test Run' - condition: succeededOrFailed() - - - task: ms.vss-governance-buildtask.governance-build-task-component-detection.ComponentGovernanceComponentDetection@0 - displayName: 'Component Detection' - condition: and(succeeded(), in(variables['Build.Reason'], 'IndividualCI', 'BatchedCI')) - - - template: templates/clean-agent-build-directory-step.yml +- template: templates/win-x86-ci.yml + parameters: + AgentPool : 'Win-CPU' + DoDebugBuild: 'true' + DoCompliance: 'false' + BuildCommand: '$(Build.SourcesDirectory)\tools\ci_build\build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --use_openmp --build_shared_lib --build_csharp --enable_onnx_tests --disable_contrib_ops --enable_msvc_static_runtime --x86' + JobName: 'Windows_CI_Dev_x86' + DoNugetPack: 'false' + NuPackScript : '' \ No newline at end of file diff --git 
a/tools/ci_build/github/download_test_data.py b/tools/ci_build/github/download_test_data.py index dc475071265fc..821b6054eca5e 100755 --- a/tools/ci_build/github/download_test_data.py +++ b/tools/ci_build/github/download_test_data.py @@ -1,12 +1,16 @@ #!/usr/bin/python3 - import urllib.request import json import subprocess import os +import sys +import shutil import argparse from urllib.parse import urlparse +def is_windows(): + return sys.platform.startswith("win") + def get_azure_region(): req = urllib.request.Request('http://169.254.169.254/metadata/instance?api-version=2018-10-01') req.add_header('Metadata', 'true') @@ -18,28 +22,60 @@ def parse_arguments(): parser = argparse.ArgumentParser(description="ONNXRuntime Data Downloader.") parser.add_argument("--test_data_url", help="Test data URL.") parser.add_argument("--azure_region", help="Azure region") + parser.add_argument("--build_dir", required=True, help="Path to the build directory.") return parser.parse_args() def get_server_hostname(azure_location): if azure_location is None: #should be northcentralus or centralus - azure_location=get_azure_region() + azure_location = get_azure_region() print("This VM is in azure location: %s" % azure_location) if azure_location == 'centralus': - hostname='onnxruntimetestdata' + hostname = 'onnxruntimetestdata' elif azure_location == 'northcentralus': - hostname='onnxruntimetestdata2' + hostname = 'onnxruntimetestdata2' else: print('warning: no local data cache for azure region %s' % azure_location) - hostname='onnxruntimetestdata2' + hostname = 'onnxruntimetestdata2' return hostname + +def download_and_unzip(build_dir, url, dest_folder): + print("Downloading %s" % url) + dest_folder = os.path.join(build_dir, dest_folder) + subprocess.run([os.path.join(build_dir,'azcopy'),'cp', '--log-level','ERROR', url, build_dir],check=True) + os.makedirs(dest_folder,exist_ok=True) + local_file_name = os.path.join(build_dir, os.path.basename(urlparse(url).path)) + if is_windows(): + print("unzip %s" % local_file_name) + if shutil.which('7z'): # 7-Zip + subprocess.run(['7z','x', local_file_name, '-y', '-o' + dest_folder], check=True) + elif shutil.which('7za'): # 7-Zip standalone + subprocess.run(['7za', 'x', local_file_name, '-y', '-o' + dest_folder], check=True) + else: + print("No suitable unzip tool (7z/7za) found in PATH") + sys.exit(1) + else: + subprocess.run(['unzip','-qd', dest_folder ,local_file_name], check=True) + os.unlink(local_file_name) + args = parse_arguments() -hostname=get_server_hostname(args.azure_region) -url=args.test_data_url.replace('onnxruntimetestdata', hostname) +hostname = get_server_hostname(args.azure_region) +url = args.test_data_url.replace('onnxruntimetestdata', hostname) print('data url=%s' % url) -subprocess.run(['./azcopy','cp', '--log-level','ERROR', url,'.'],check=True) -os.makedirs('models',exist_ok=True) -local_file_name = os.path.basename(urlparse(url).path) -subprocess.run(['unzip', '-qd','models',local_file_name]) +download_and_unzip(args.build_dir, url, 'models') +if is_windows(): + url = 'https://onnxruntimetestdata.blob.core.windows.net/models/cmake-3.15.1-win64-x64.zip' + url = url.replace('onnxruntimetestdata', hostname) + download_and_unzip(args.build_dir, url, 'cmake_temp') + dest_dir = os.path.join(args.build_dir,'cmake') + if os.path.exists(dest_dir): + print('deleting %s' % dest_dir) + shutil.rmtree(dest_dir) + shutil.move(os.path.join(args.build_dir,'cmake_temp','cmake-3.15.1-win64-x64'),dest_dir) + url = 
'https://onnxruntimetestdata.blob.core.windows.net/models/OpenCppCoverageSetup-x64-0.9.7.0.exe' + url = url.replace('onnxruntimetestdata', hostname) + dest_folder = os.path.join(args.build_dir, 'installer','opencppcoverage') + os.makedirs(dest_folder,exist_ok=True) + subprocess.run([os.path.join(args.build_dir,'azcopy'),'cp', '--log-level','ERROR', url, os.path.join(dest_folder,'installer.exe')],check=True) diff --git a/tools/ci_build/github/linux/copy_strip_binary.sh b/tools/ci_build/github/linux/copy_strip_binary.sh index 8f15d9ba42c36..585310f6ce1d9 100644 --- a/tools/ci_build/github/linux/copy_strip_binary.sh +++ b/tools/ci_build/github/linux/copy_strip_binary.sh @@ -32,6 +32,9 @@ then ln -s $LIB_NAME $BINARY_DIR/$ARTIFACT_NAME/lib/libonnxruntime.so fi cp $SOURCE_DIR/include/onnxruntime/core/session/onnxruntime_c_api.h $BINARY_DIR/$ARTIFACT_NAME/include +cp $SOURCE_DIR/include/onnxruntime/core/providers/cpu/cpu_provider_factory.h $BINARY_DIR/$ARTIFACT_NAME/include +cp $SOURCE_DIR/include/onnxruntime/core/providers/cuda/cuda_provider_factory.h $BINARY_DIR/$ARTIFACT_NAME/include + # copy the README, licence and TPN cp $SOURCE_DIR/README.md $BINARY_DIR/$ARTIFACT_NAME/README.md cp $SOURCE_DIR/docs/C_API.md $BINARY_DIR/$ARTIFACT_NAME/C_API.md diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2010 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2010 new file mode 100644 index 0000000000000..17a3417bfd043 --- /dev/null +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2010 @@ -0,0 +1,31 @@ +FROM quay.io/pypa/manylinux2010_x86_64:latest + +ARG PYTHON_VERSION=3.5 + +ADD scripts/install_manylinux2010.sh /tmp/scripts/install_manylinux2010.sh +RUN /tmp/scripts/install_manylinux2010.sh -p ${PYTHON_VERSION} +ADD scripts/install_protobuf.sh /tmp/scripts/install_protobuf.sh +RUN (source /opt/onnxruntime-python/bin/activate; pip install cmake && /tmp/scripts/install_protobuf.sh && pip uninstall -y cmake) +ADD scripts /tmp/scripts +RUN (source /opt/onnxruntime-python/bin/activate; /tmp/scripts/install_deps.sh) +RUN rm -rf /tmp/scripts # not useful at all except not to see the scripts + +RUN echo "#!/bin/bash" > /opt/entrypoint.sh && \ + echo "set -e" >> /opt/entrypoint.sh && \ + echo "source /opt/onnxruntime-python/bin/activate" >> /opt/entrypoint.sh && \ + echo "exec \"\$@\"" >> /opt/entrypoint.sh +RUN cat /opt/entrypoint.sh +RUN chmod +x /opt/entrypoint.sh + +WORKDIR /root + +ENV LD_LIBRARY_PATH /usr/local/openblas/lib:$LD_LIBRARY_PATH + +ARG BUILD_UID=1000 +ARG BUILD_USER=onnxruntimedev +WORKDIR /home/$BUILD_USER +# --disabled-password +RUN adduser --comment 'onnxruntime Build User' $BUILD_USER --uid $BUILD_UID +USER $BUILD_USER + +ENTRYPOINT ["/opt/entrypoint.sh"] diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2010_gpu b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2010_gpu new file mode 100644 index 0000000000000..578c7b28657b0 --- /dev/null +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2010_gpu @@ -0,0 +1,66 @@ +FROM quay.io/pypa/manylinux2010_x86_64:latest + +ARG PYTHON_VERSION=3.5 + +ADD scripts/install_manylinux2010.sh /tmp/scripts/install_manylinux2010.sh +RUN /tmp/scripts/install_manylinux2010.sh -p ${PYTHON_VERSION} +ADD scripts/install_protobuf.sh /tmp/scripts/install_protobuf.sh +RUN (source /opt/onnxruntime-python/bin/activate; pip install cmake && /tmp/scripts/install_protobuf.sh && pip uninstall -y cmake) +ADD scripts /tmp/scripts +RUN (source /opt/onnxruntime-python/bin/activate; 
/tmp/scripts/install_deps.sh) +RUN rm -rf /tmp/scripts # not useful at all except not to see the scripts + +RUN echo "#!/bin/bash" > /opt/entrypoint.sh && \ + echo "set -e" >> /opt/entrypoint.sh && \ + echo "source /opt/onnxruntime-python/bin/activate" >> /opt/entrypoint.sh && \ + echo "exec \"\$@\"" >> /opt/entrypoint.sh +RUN cat /opt/entrypoint.sh +RUN chmod +x /opt/entrypoint.sh + +RUN NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ + curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel6/x86_64/7fa2af80.pub | sed '/^Version/d' > /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \ + echo "$NVIDIA_GPGKEY_SUM /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA" | sha256sum -c - + +COPY cuda_manylinux2010.repo /etc/yum.repos.d/cuda.repo + +ENV CUDA_VERSION 10.1.168 +ENV CUDA_PKG_VERSION 10-1-$CUDA_VERSION-1 + +# For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a +RUN yum install -y \ + cuda-cudart-$CUDA_PKG_VERSION \ + cuda-compat-10-1 \ + cuda-libraries-$CUDA_PKG_VERSION \ + cuda-nvtx-$CUDA_PKG_VERSION \ + cuda-libraries-dev-$CUDA_PKG_VERSION \ + cuda-nvml-dev-$CUDA_PKG_VERSION \ + cuda-minimal-build-$CUDA_PKG_VERSION \ + cuda-command-line-tools-$CUDA_PKG_VERSION \ + && \ + ln -s cuda-10.1 /usr/local/cuda && \ + rm -rf /var/cache/yum/* + +# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement +RUN CUDNN_DOWNLOAD_SUM=e956c6f9222fcb867a10449cfc76dee5cfd7c7531021d95fe9586d7e043b57d7 && \ + curl -fsSL http://developer.download.nvidia.com/compute/redist/cudnn/v7.6.0/cudnn-10.1-linux-x64-v7.6.0.64.tgz -O && \ + echo "$CUDNN_DOWNLOAD_SUM cudnn-10.1-linux-x64-v7.6.0.64.tgz" | sha256sum -c - && \ + tar --no-same-owner -xzf cudnn-10.1-linux-x64-v7.6.0.64.tgz -C /usr/local && \ + rm cudnn-10.1-linux-x64-v7.6.0.64.tgz && \ + ldconfig + +ENV LD_LIBRARY_PATH /usr/local/openblas/lib:/usr/local/cuda/lib64/stubs:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH +ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} + +# nvidia-container-runtime +ENV NVIDIA_VISIBLE_DEVICES all +ENV NVIDIA_DRIVER_CAPABILITIES compute,utility +ENV NVIDIA_REQUIRE_CUDA "cuda>=10.1 brand=tesla,driver>=384,driver<385 brand=tesla,driver>=396,driver<397 brand=tesla,driver>=410,driver<411" + +ARG BUILD_UID=1000 +ARG BUILD_USER=onnxruntimedev +WORKDIR /home/$BUILD_USER +# --disabled-password +RUN adduser --comment 'onnxruntime Build User' $BUILD_USER --uid $BUILD_UID +USER $BUILD_USER + +ENTRYPOINT ["/opt/entrypoint.sh"] diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino index 27cf31c9e549b..b993c360ef741 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino @@ -2,7 +2,7 @@ ARG OS_VERSION=16.04 FROM ubuntu:${OS_VERSION} ARG PYTHON_VERSION=3.5 -ARG OPENVINO_VERSION=2018_R5 +ARG OPENVINO_VERSION=2019_R1.1 ADD scripts /tmp/scripts ENV PATH="/opt/cmake/bin:${PATH}" @@ -14,9 +14,10 @@ RUN /tmp/scripts/install_openvino.sh -o ${OPENVINO_VERSION} && \ WORKDIR /root -ENV INTEL_CVSDK_DIR /data/dldt +ENV INTEL_CVSDK_DIR /data/dldt/openvino_2019.1.144 +ENV INTEL_OPENVINO_DIR /data/dldt/openvino_2019.1.144 -ENV LD_LIBRARY_PATH $INTEL_CVSDK_DIR/deployment_tools/inference_engine/lib/ubuntu_16.04/intel64:$INTEL_CVSDK_DIR/deployment_tools/inference_engine/temp/omp/lib:/usr/local/openblas/lib:$LD_LIBRARY_PATH +ENV LD_LIBRARY_PATH 
$INTEL_CVSDK_DIR/deployment_tools/inference_engine/lib/intel64:$INTEL_CVSDK_DIR/deployment_tools/inference_engine/temp/omp/lib:$INTEL_CVSDK_DIR/deployment_tools/inference_engine/external/tbb/lib:/usr/local/openblas/lib:$LD_LIBRARY_PATH ENV PATH $INTEL_CVSDK_DIR/deployment_tools/model_optimizer:$PATH ENV PYTHONPATH $INTEL_CVSDK_DIR/deployment_tools/model_optimizer:$INTEL_CVSDK_DIR/tools:$PYTHONPATH diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_server b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_server new file mode 100644 index 0000000000000..c056fa4d6e2c4 --- /dev/null +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_server @@ -0,0 +1,18 @@ +ARG OS_VERSION=16.04 +FROM ubuntu:${OS_VERSION} + +ARG PYTHON_VERSION=3.5 + +ADD scripts /tmp/scripts +RUN /tmp/scripts/install_ubuntu.sh -p ${PYTHON_VERSION} && /tmp/scripts/install_deps.sh && /tmp/scripts/install_server_deps.sh && rm -rf /tmp/scripts + +WORKDIR /root + +ENV LD_LIBRARY_PATH /usr/local/openblas/lib:$LD_LIBRARY_PATH +ENV PATH /usr/local/go/bin:$PATH + +ARG BUILD_UID=1000 +ARG BUILD_USER=onnxruntimedev +RUN adduser --gecos 'onnxruntime Build User' --disabled-password $BUILD_USER --uid $BUILD_UID +WORKDIR /home/$BUILD_USER +USER $BUILD_USER diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt index ba17a56f29939..8f6264f71f092 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt @@ -7,7 +7,8 @@ FROM nvcr.io/nvidia/tensorrt:19.02-py3 ARG PYTHON_VERSION=3.5 ADD scripts /tmp/scripts -RUN /tmp/scripts/install_ubuntu.sh -p ${PYTHON_VERSION} && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts +RUN /tmp/scripts/install_ubuntu.sh -p ${PYTHON_VERSION} && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts \ + && rm /usr/local/bin/cmake && rm /usr/local/bin/ctest && rm -r /usr/local/share/cmake-3.12 WORKDIR /root diff --git a/tools/ci_build/github/linux/docker/cuda_manylinux2010.repo b/tools/ci_build/github/linux/docker/cuda_manylinux2010.repo new file mode 100644 index 0000000000000..20972766acf7b --- /dev/null +++ b/tools/ci_build/github/linux/docker/cuda_manylinux2010.repo @@ -0,0 +1,6 @@ +[cuda] +name=cuda +baseurl=http://developer.download.nvidia.com/compute/cuda/repos/rhel6/x86_64 +enabled=1 +gpgcheck=1 +gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA \ No newline at end of file diff --git a/tools/ci_build/github/linux/docker/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/scripts/install_deps.sh index 9cfdd6a1bd90f..c128feef450a6 100755 --- a/tools/ci_build/github/linux/docker/scripts/install_deps.sh +++ b/tools/ci_build/github/linux/docker/scripts/install_deps.sh @@ -37,6 +37,9 @@ rm -rf /tmp/src DISTRIBUTOR=$(lsb_release -i -s) if [ "$DISTRIBUTOR" = "Ubuntu" ]; then apt-get -y remove libprotobuf-dev protobuf-compiler +elif [ "$AUDITWHEEL_PLAT" = "manylinux2010_x86_64" ]; then + # we did not install protobuf 2.x no need to uninstall + : else dnf remove -y protobuf-devel protobuf-compiler fi diff --git a/tools/ci_build/github/linux/docker/scripts/install_deps_android.sh b/tools/ci_build/github/linux/docker/scripts/install_deps_android.sh index 771b2ae4f2854..445e166644d5f 100755 --- a/tools/ci_build/github/linux/docker/scripts/install_deps_android.sh +++ b/tools/ci_build/github/linux/docker/scripts/install_deps_android.sh @@ -1,19 +1,12 @@ #!/bin/bash set -e -mkdir -p /tmp/src -aria2c -q -d /tmp/src 
https://github.com/Kitware/CMake/releases/download/v3.13.2/cmake-3.13.2.tar.gz -tar -xf /tmp/src/cmake-3.13.2.tar.gz -C /tmp/src -cd /tmp/src/cmake-3.13.2 -./configure --prefix=/usr --parallel=`nproc` --system-curl --system-zlib --system-expat -make -j`nproc` -make install -#download Android NDK r19c -aria2c -q -d /tmp https://dl.google.com/android/repository/android-ndk-r19c-linux-x86_64.zip -unzip -oq /tmp/android-ndk-r19c-linux-x86_64.zip -d /tmp/android-ndk && mv /tmp/android-ndk/* /android-ndk -cd / -rm -rf /tmp/src +# The cmake==3.13.2 pip package actually shipped CMake 3.12.2, so install the corrected 3.13.2.post1 release instead +python3 -m pip install cmake==3.13.2.post1 -apt-get -y remove libprotobuf-dev protobuf-compiler +cmake --version +# Download Android NDK r20, extract it under /temp-ndk and move its contents to /android-ndk +wget -qO- -O temp.zip https://dl.google.com/android/repository/android-ndk-r20-linux-x86_64.zip && unzip -oq temp.zip -d /temp-ndk && mv /temp-ndk/* /android-ndk && rm temp.zip && rm -rf /temp-ndk && ls /android-ndk +apt-get -y remove libprotobuf-dev protobuf-compiler diff --git a/tools/ci_build/github/linux/docker/scripts/install_manylinux2010.sh b/tools/ci_build/github/linux/docker/scripts/install_manylinux2010.sh new file mode 100755 index 0000000000000..50c2b9880f719 --- /dev/null +++ b/tools/ci_build/github/linux/docker/scripts/install_manylinux2010.sh @@ -0,0 +1,42 @@ +#!/bin/bash +set -e +set -x + +while getopts p: parameter_Option +do case "${parameter_Option}" +in +p) PYTHON_VER=${OPTARG};; +esac +done + +PYTHON_VER=${PYTHON_VER:=3.5} +CPYTHON_VER=cp${PYTHON_VER//./} + +# need to install rpmforge in order to get aria2 +curl -fsSLo /tmp/rpmforge.rpm http://repository.it4i.cz/mirrors/repoforge/redhat/el6/en/x86_64/rpmforge/RPMS/rpmforge-release-0.5.3-1.el6.rf.x86_64.rpm +yum -y install /tmp/rpmforge.rpm +rm -f /tmp/rpmforge.rpm + +yum -y install openblas-devel zlib-devel curl-devel expat-devel aria2 rsync redhat-lsb-core +yum -y clean all + +/opt/python/${CPYTHON_VER}-${CPYTHON_VER}m/bin/python -m venv /opt/onnxruntime-python +source /opt/onnxruntime-python/bin/activate +if [ ! -f /opt/onnxruntime-python/bin/python${PYTHON_VER} ]; then + ln -s python /opt/onnxruntime-python/bin/python${PYTHON_VER} +fi +python -m pip install --upgrade --force-reinstall pip==19.1.1 +python -m pip install --upgrade --force-reinstall numpy==1.15.0 +python -m pip install --upgrade --force-reinstall requests==2.21.0 +python -m pip install --upgrade --force-reinstall wheel==0.31.1 +python -m pip install --upgrade --force-reinstall setuptools==41.0.1 +python -m pip install --upgrade --force-reinstall pytest==4.6.2 + +ls -al /opt/onnxruntime-python/bin + +echo "#!/bin/sh" > /opt/entrypoint.sh +echo "source /opt/onnxruntime-python/bin/activate" >> /opt/entrypoint.sh +echo "exec \"\$@\"" >> /opt/entrypoint.sh + +mkdir -p $HOME/.aria2 +echo "ca-certificate=/opt/_internal/certs.pem" > $HOME/.aria2/aria2.conf diff --git a/tools/ci_build/github/linux/docker/scripts/install_onnx.sh b/tools/ci_build/github/linux/docker/scripts/install_onnx.sh index f06f000d8f7f0..a8495aff3a966 100755 --- a/tools/ci_build/github/linux/docker/scripts/install_onnx.sh +++ b/tools/ci_build/github/linux/docker/scripts/install_onnx.sh @@ -1,10 +1,10 @@ #!/bin/bash # The script is to generate all supported versions of onnx models which will be tested by onnx_test_runner -# in the end of ci build pipeline. The purpose is to make sure latest onnxruntime has no regressions. 
Note -# that the order of installation must be onnx123, onnx130, onnx141, onnx150 and onnxtip since we want +# in the end of ci build pipeline. The purpose is to make sure latest onnxruntime has no regressions. Note +# that the order of installation must be onnx123, onnx130, onnx141, onnx150 and onnxtip since we want # to keep the tip of master on script exit for onnx backend test which is also a part of build pipeline. -# One possible improvement here is to keep the models saved to some public storage instead of generating +# One possible improvement here is to keep the models saved to some public storage instead of generating # on the fly every time. set -e @@ -13,7 +13,7 @@ version2tag=(5af210ca8a1c73aa6bae8754c9346ec54d0a756e-onnx123 bae6333e149a59a3faa9c4d9c44974373dcf5256-onnx130 9e55ace55aad1ada27516038dfbdc66a8a0763db-onnx141 7d7bc83d29a328233d3e8affa4c4ea8b3e3599ef-onnx150 - d94f99d21a9a0820d58966410ceaf525132f85f1-onnxtip) + 65b8e0f9979fbade16e3becbdfa69c0764946f72-onnxtip) for v2t in ${version2tag[*]}; do onnx_version="$(cut -d'-' -f1<<<${v2t})" onnx_tag="$(cut -d'-' -f2<<<${v2t})" @@ -21,14 +21,14 @@ for v2t in ${version2tag[*]}; do echo "first pass"; else echo "deleting old onnx-${lastest_onnx_version}"; - /usr/bin/python${PYTHON_VER} -m pip uninstall -y onnx + python${PYTHON_VER} -m pip uninstall -y onnx fi lastest_onnx_version=$onnx_version aria2c -q -d /tmp/src https://github.com/onnx/onnx/archive/$onnx_version.tar.gz tar -xf /tmp/src/onnx-$onnx_version.tar.gz -C /tmp/src cd /tmp/src/onnx-$onnx_version git clone https://github.com/pybind/pybind11.git third_party/pybind11 - /usr/bin/python${PYTHON_VER} -m pip install . + python${PYTHON_VER} -m pip install . mkdir -p /data/onnx/${onnx_tag} backend-test-tools generate-data -o /data/onnx/$onnx_tag done diff --git a/tools/ci_build/github/linux/docker/scripts/install_openvino.sh b/tools/ci_build/github/linux/docker/scripts/install_openvino.sh index d205bcdf78e7c..0c44d8d97cd3b 100755 --- a/tools/ci_build/github/linux/docker/scripts/install_openvino.sh +++ b/tools/ci_build/github/linux/docker/scripts/install_openvino.sh @@ -7,10 +7,11 @@ o) OPENVINO_VERSION=${OPTARG};; esac done -OPENVINO_VERSION=${OPENVINO_VERSION:=2018_R5} -git clone https://github.com/opencv/dldt.git /data/dldt +OPENVINO_VERSION=${OPENVINO_VERSION:=2019_R1.1} +git clone https://github.com/opencv/dldt.git /data/dldt/openvino_2019.1.144 -export INTEL_CVSDK_DIR=/data/dldt +export INTEL_CVSDK_DIR=/data/dldt/openvino_2019.1.144 +apt-get update && apt-get -y install libusb-1.0-0-dev cd ${INTEL_CVSDK_DIR}/inference-engine git submodule init @@ -30,7 +31,9 @@ mv model-optimizer model_optimizer && mv model_optimizer deployment_tools/ cd ${INTEL_CVSDK_DIR}/deployment_tools/model_optimizer/install_prerequisites && ./install_prerequisites_onnx.sh cd ${INTEL_CVSDK_DIR}/deployment_tools/inference_engine -mkdir -p lib/ubuntu_16.04/intel64 -mv bin/intel64/Release/lib/* lib/ubuntu_16.04/intel64 +mkdir -p lib/intel64 +mkdir -p external/tbb/lib +mv bin/intel64/Release/lib/* lib/intel64 +mv temp/tbb/lib/* external/tbb/lib -cd ~ \ No newline at end of file +cd ~ diff --git a/tools/ci_build/github/linux/docker/scripts/install_server_deps.sh b/tools/ci_build/github/linux/docker/scripts/install_server_deps.sh new file mode 100755 index 0000000000000..d471064191684 --- /dev/null +++ b/tools/ci_build/github/linux/docker/scripts/install_server_deps.sh @@ -0,0 +1,14 @@ +#!/bin/bash +set -e + +SYS_LONG_BIT=$(getconf LONG_BIT) + +echo "Installing Go" +if [ $SYS_LONG_BIT = "64" ]; then + 
mkdir -p /tmp/go + cd /tmp/go + wget https://dl.google.com/go/go1.12.6.linux-amd64.tar.gz + tar -C /usr/local -vzxf /tmp/go/go1.12.6.linux-amd64.tar.gz +fi + + diff --git a/tools/ci_build/github/linux/run_build.sh b/tools/ci_build/github/linux/run_build.sh index 53c77ad40e867..150ebec491238 100755 --- a/tools/ci_build/github/linux/run_build.sh +++ b/tools/ci_build/github/linux/run_build.sh @@ -17,17 +17,37 @@ done if [ $BUILD_OS = "android" ]; then pushd /onnxruntime_src mkdir build-android && cd build-android - cmake -DCMAKE_TOOLCHAIN_FILE=/android-ndk/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DONNX_CUSTOM_PROTOC_EXECUTABLE=/usr/bin/protoc ../cmake + if [ $BUILD_DEVICE = "nnapi" ]; then + cmake -DCMAKE_TOOLCHAIN_FILE=/android-ndk/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DONNX_CUSTOM_PROTOC_EXECUTABLE=/usr/bin/protoc -Donnxruntime_USE_NNAPI=ON ../cmake + else + cmake -DCMAKE_TOOLCHAIN_FILE=/android-ndk/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DONNX_CUSTOM_PROTOC_EXECUTABLE=/usr/bin/protoc ../cmake + fi make -j$(nproc) else COMMON_BUILD_ARGS="--skip_submodule_sync --enable_onnx_tests --parallel --build_shared_lib --use_openmp --cmake_path /usr/bin/cmake --ctest_path /usr/bin/ctest" + if [ $BUILD_OS = "manylinux2010" ]; then + # FindPython3 does not work on manylinux2010 image, define things manually + # ask python where to find includes + COMMON_BUILD_ARGS="${COMMON_BUILD_ARGS} --cmake_extra_defines PYTHON_INCLUDE_DIR=$(python3 -c 'import distutils.sysconfig; print(distutils.sysconfig.get_python_inc())')" + # Python does not provide a shared library on manylinux, use another library + COMMON_BUILD_ARGS="${COMMON_BUILD_ARGS} PYTHON_LIBRARY=/usr/lib64/librt.so" + + fi if [ $BUILD_DEVICE = "gpu" ]; then - _CUDNN_VERSION=$(echo $CUDNN_VERSION | cut -d. -f1-2) - python3 $SCRIPT_DIR/../../build.py --build_dir /build \ - --config Debug Release $COMMON_BUILD_ARGS \ - --use_cuda \ - --cuda_home /usr/local/cuda \ - --cudnn_home /usr/local/cudnn-$_CUDNN_VERSION/cuda $BUILD_EXTR_PAR + if [ $BUILD_OS = "manylinux2010" ]; then + python3 $SCRIPT_DIR/../../build.py --build_dir /build \ + --config Debug Release $COMMON_BUILD_ARGS \ + --use_cuda \ + --cuda_home /usr/local/cuda \ + --cudnn_home /usr/local/cuda $BUILD_EXTR_PAR + else + _CUDNN_VERSION=$(echo $CUDNN_VERSION | cut -d. -f1-2) + python3 $SCRIPT_DIR/../../build.py --build_dir /build \ + --config Debug Release $COMMON_BUILD_ARGS \ + --use_cuda \ + --cuda_home /usr/local/cuda \ + --cudnn_home /usr/local/cudnn-$_CUDNN_VERSION/cuda $BUILD_EXTR_PAR + fi elif [ $BUILD_DEVICE = "tensorrt" ]; then _CUDNN_VERSION=$(echo $CUDNN_VERSION | cut -d. 
-f1-2) python3 $SCRIPT_DIR/../../build.py --build_dir /build \ diff --git a/tools/ci_build/github/linux/run_dockerbuild.sh b/tools/ci_build/github/linux/run_dockerbuild.sh index cbd1d803067d2..7d40231cb118e 100755 --- a/tools/ci_build/github/linux/run_dockerbuild.sh +++ b/tools/ci_build/github/linux/run_dockerbuild.sh @@ -8,7 +8,7 @@ CUDA_VER=cuda10.0-cudnn7.3 while getopts c:o:d:r:p:x:a:v: parameter_Option do case "${parameter_Option}" in -#android, ubuntu16.04 +#android, ubuntu16.04, manylinux2010 o) BUILD_OS=${OPTARG};; #cpu, gpu, tensorrt d) BUILD_DEVICE=${OPTARG};; @@ -21,8 +21,8 @@ x) BUILD_EXTR_PAR=${OPTARG};; c) CUDA_VER=${OPTARG};; # x86 or other, only for ubuntu16.04 os a) BUILD_ARCH=${OPTARG};; -# openvino version tag: 2018_R5, 2019_R1 (Default is 2018_R5) -v) OPENVINO_VERSION=${OPTARG};; +# openvino version tag: 2018_R5, 2019_R1.1 (Default is 2019_R1.1) +v) OPENVINO_VERSION=${OPTARG};; esac done @@ -35,6 +35,15 @@ if [ $BUILD_OS = "android" ]; then IMAGE="android" DOCKER_FILE=Dockerfile.ubuntu_for_android docker build -t "onnxruntime-$IMAGE" --build-arg BUILD_USER=onnxruntimedev --build-arg BUILD_UID=$(id -u) --build-arg PYTHON_VERSION=${PYTHON_VER} -f $DOCKER_FILE . +elif [ $BUILD_OS = "manylinux2010" ]; then + if [ $BUILD_DEVICE = "gpu" ]; then + IMAGE="manylinux2010-cuda10.1" + DOCKER_FILE=Dockerfile.manylinux2010_gpu + else + IMAGE="manylinux2010" + DOCKER_FILE=Dockerfile.manylinux2010 + fi + docker build -t "onnxruntime-$IMAGE" --build-arg BUILD_USER=onnxruntimedev --build-arg BUILD_UID=$(id -u) --build-arg PYTHON_VERSION=${PYTHON_VER} -f $DOCKER_FILE . else if [ $BUILD_DEVICE = "gpu" ]; then IMAGE="ubuntu16.04-$CUDA_VER" @@ -67,35 +76,26 @@ mkdir -p ~/.cache/onnxruntime mkdir -p ~/.onnx if [ -z "$NIGHTLY_BUILD" ]; then -set NIGHTLY_BUILD=0 + set NIGHTLY_BUILD=0 fi -if [ $BUILD_DEVICE = "cpu" ] || [ $BUILD_DEVICE = "ngraph" ] || [ $BUILD_DEVICE = "openvino" ]; then - docker rm -f "onnxruntime-$BUILD_DEVICE" || true - docker run -h $HOSTNAME \ - --name "onnxruntime-$BUILD_DEVICE" \ - --volume "$SOURCE_ROOT:/onnxruntime_src" \ - --volume "$BUILD_DIR:/build" \ - --volume "$HOME/.cache/onnxruntime:/home/onnxruntimedev/.cache/onnxruntime" \ - --volume "$HOME/.onnx:/home/onnxruntimedev/.onnx" \ - -e NIGHTLY_BUILD \ - "onnxruntime-$IMAGE" \ - /bin/bash /onnxruntime_src/tools/ci_build/github/linux/run_build.sh \ - -d $BUILD_DEVICE -x "$BUILD_EXTR_PAR" -o $BUILD_OS & +if [ $BUILD_DEVICE = "cpu" ] || [ $BUILD_DEVICE = "ngraph" ] || [ $BUILD_DEVICE = "openvino" ] || [ $BUILD_DEVICE = "nnapi" ]; then + RUNTIME= else - docker rm -f "onnxruntime-$BUILD_DEVICE" || true - nvidia-docker run --rm -h $HOSTNAME \ - --rm \ - --name "onnxruntime-$BUILD_DEVICE" \ - --volume "$SOURCE_ROOT:/onnxruntime_src" \ - --volume "$BUILD_DIR:/build" \ - --volume "$HOME/.cache/onnxruntime:/home/onnxruntimedev/.cache/onnxruntime" \ - --volume "$HOME/.onnx:/home/onnxruntimedev/.onnx" \ - -e NIGHTLY_BUILD \ - "onnxruntime-$IMAGE" \ - /bin/bash /onnxruntime_src/tools/ci_build/github/linux/run_build.sh \ - -d $BUILD_DEVICE -x "$BUILD_EXTR_PAR" -o $BUILD_OS & + RUNTIME="--runtime=nvidia" fi + +docker rm -f "onnxruntime-$BUILD_DEVICE" || true +docker run $RUNTIME -h $HOSTNAME \ + --name "onnxruntime-$BUILD_DEVICE" \ + --volume "$SOURCE_ROOT:/onnxruntime_src" \ + --volume "$BUILD_DIR:/build" \ + --volume "$HOME/.cache/onnxruntime:/home/onnxruntimedev/.cache/onnxruntime" \ + --volume "$HOME/.onnx:/home/onnxruntimedev/.onnx" \ + -e NIGHTLY_BUILD \ + "onnxruntime-$IMAGE" \ + /bin/bash 
/onnxruntime_src/tools/ci_build/github/linux/run_build.sh \ + -d $BUILD_DEVICE -x "$BUILD_EXTR_PAR" -o $BUILD_OS & wait $! EXIT_CODE=$? diff --git a/tools/ci_build/github/linux/server_run_build.sh b/tools/ci_build/github/linux/server_run_build.sh index bad12b38a48f4..9ca7ccd3b9ec2 100755 --- a/tools/ci_build/github/linux/server_run_build.sh +++ b/tools/ci_build/github/linux/server_run_build.sh @@ -15,17 +15,15 @@ done if [ $BUILD_DEVICE = "gpu" ]; then _CUDNN_VERSION=$(echo $CUDNN_VERSION | cut -d. -f1-2) - python3 $SCRIPT_DIR/../../build.py --build_dir /home/onnxruntimedev \ + python3 $SCRIPT_DIR/../../build.py --build_dir /build \ --config Debug Release \ --skip_submodule_sync --enable_onnx_tests \ --parallel --build_shared_lib \ --use_cuda --use_openmp \ --cuda_home /usr/local/cuda \ --cudnn_home /usr/local/cudnn-$_CUDNN_VERSION/cuda --build_shared_lib $BUILD_EXTR_PAR - /home/onnxruntimedev/Release/onnx_test_runner -e cuda /data/onnx else - python3 $SCRIPT_DIR/../../build.py --build_dir /home/onnxruntimedev \ + python3 $SCRIPT_DIR/../../build.py --build_dir /build \ --skip_submodule_sync \ --parallel $BUILD_EXTR_PAR - # /home/onnxruntimedev/Release/onnx_test_runner /data/onnx fi \ No newline at end of file diff --git a/tools/ci_build/github/linux/server_run_dockerbuild.sh b/tools/ci_build/github/linux/server_run_dockerbuild.sh index 87869b1df811d..f5b22bb9506b4 100755 --- a/tools/ci_build/github/linux/server_run_dockerbuild.sh +++ b/tools/ci_build/github/linux/server_run_dockerbuild.sh @@ -1,18 +1,17 @@ #!/bin/bash set -e -o -x - SCRIPT_DIR="$( dirname "${BASH_SOURCE[0]}" )" +SCRIPT_DIR="$( dirname "${BASH_SOURCE[0]}" )" SOURCE_ROOT=$(realpath $SCRIPT_DIR/../../../../) CUDA_VER=cuda10.0-cudnn7.3 - while getopts c:o:d:k:r:p:x: parameter_Option +while getopts c:o:d:r:p:x:a: parameter_Option do case "${parameter_Option}" in -#ubuntu16.04 +#android, ubuntu16.04 o) BUILD_OS=${OPTARG};; -#cpu, gpu +#cpu, gpu, tensorrt d) BUILD_DEVICE=${OPTARG};; -k) ACR_KEY=${OPTARG};; r) BUILD_DIR=${OPTARG};; #python version: 3.6 3.7 (absence means default 3.5) p) PYTHON_VER=${OPTARG};; @@ -20,58 +19,63 @@ p) PYTHON_VER=${OPTARG};; x) BUILD_EXTR_PAR=${OPTARG};; # "cuda10.0-cudnn7.3, cuda9.1-cudnn7.1" c) CUDA_VER=${OPTARG};; +# x86 or other, only for ubuntu16.04 os +a) BUILD_ARCH=${OPTARG};; esac done - EXIT_CODE=1 +EXIT_CODE=1 +PYTHON_VER=${PYTHON_VER:=3.5} +echo "bo=$BUILD_OS bd=$BUILD_DEVICE bdir=$BUILD_DIR pv=$PYTHON_VER bex=$BUILD_EXTR_PAR" - echo "bo=$BUILD_OS bd=$BUILD_DEVICE bdir=$BUILD_DIR pv=$PYTHON_VER bex=$BUILD_EXTR_PAR" +IMAGE=ubuntu16.04 + +cd $SCRIPT_DIR/docker +docker build -t "onnxruntime-server-$IMAGE" --build-arg BUILD_USER=onnxruntimedev --build-arg BUILD_UID=$(id -u) --build-arg OS_VERSION=16.04 --build-arg PYTHON_VERSION=${PYTHON_VER} -f Dockerfile.ubuntu_server . - cd $SCRIPT_DIR/docker -if [ $BUILD_DEVICE = "gpu" ]; then - IMAGE="ubuntu16.04-$CUDA_VER" - DOCKER_FILE=Dockerfile.ubuntu_gpu - if [ $CUDA_VER = "cuda9.1-cudnn7.1" ]; then - DOCKER_FILE=Dockerfile.ubuntu_gpu_cuda9 - fi - docker build -t "onnxruntime-$IMAGE" --build-arg BUILD_USER=onnxruntimedev --build-arg BUILD_UID=$(id -u) --build-arg PYTHON_VERSION=${PYTHON_VER} -f $DOCKER_FILE . 
-else - IMAGE="ubuntu16.04" - docker login onnxhostingdev.azurecr.io -u onnxhostingdev -p ${ACR_KEY} - docker pull onnxhostingdev.azurecr.io/onnxruntime-ubuntu16.04:latest - docker tag onnxhostingdev.azurecr.io/onnxruntime-ubuntu16.04:latest onnxruntime-ubuntu16.04:latest - docker images - id -fi - set +e +set +e +mkdir -p ~/.cache/onnxruntime +mkdir -p ~/.onnx +mkdir -p ~/.cache/go - if [ $BUILD_DEVICE = "cpu" ]; then +if [ -z "$NIGHTLY_BUILD" ]; then +set NIGHTLY_BUILD=0 +fi + +if [ $BUILD_DEVICE = "cpu" ] || [ $BUILD_DEVICE = "ngraph" ]; then docker rm -f "onnxruntime-$BUILD_DEVICE" || true docker run -h $HOSTNAME \ - --rm \ --name "onnxruntime-$BUILD_DEVICE" \ --volume "$SOURCE_ROOT:/onnxruntime_src" \ - --volume "$BUILD_DIR:/home/onnxruntimedev" \ + --volume "$BUILD_DIR:/build" \ --volume "$HOME/.cache/onnxruntime:/home/onnxruntimedev/.cache/onnxruntime" \ - "onnxruntime-$IMAGE" \ + --volume "$HOME/.onnx:/home/onnxruntimedev/.onnx" \ + --volume "$HOME/.cache/go:/home/onnxruntimedev/.cache/go" \ + -e NIGHTLY_BUILD \ + -e GOCACHE=/home/onnxruntimedev/.cache/go \ + "onnxruntime-server-$IMAGE" \ /bin/bash /onnxruntime_src/tools/ci_build/github/linux/server_run_build.sh \ - -d $BUILD_DEVICE -x "$BUILD_EXTR_PAR" & + -d $BUILD_DEVICE -x "$BUILD_EXTR_PAR" -o $BUILD_OS & else docker rm -f "onnxruntime-$BUILD_DEVICE" || true nvidia-docker run --rm -h $HOSTNAME \ --rm \ --name "onnxruntime-$BUILD_DEVICE" \ --volume "$SOURCE_ROOT:/onnxruntime_src" \ - --volume "$BUILD_DIR:/home/onnxruntimedev" \ + --volume "$BUILD_DIR:/build" \ --volume "$HOME/.cache/onnxruntime:/home/onnxruntimedev/.cache/onnxruntime" \ - "onnxruntime-$IMAGE" \ + --volume "$HOME/.onnx:/home/onnxruntimedev/.onnx" \ + --volume "$HOME/.cache/go:/home/onnxruntimedev/.cache/go" \ + -e NIGHTLY_BUILD \ + -e GOCACHE=/home/onnxruntimedev/.cache/go \ + "onnxruntime-server-$IMAGE" \ /bin/bash /onnxruntime_src/tools/ci_build/github/linux/server_run_build.sh \ - -d $BUILD_DEVICE -x "$BUILD_EXTR_PAR" & + -d $BUILD_DEVICE -x "$BUILD_EXTR_PAR" -o $BUILD_OS & fi -wait -n +wait $! - EXIT_CODE=$? +EXIT_CODE=$? - set -e -exit $EXIT_CODE \ No newline at end of file +set -e +exit $EXIT_CODE diff --git a/tools/ci_build/github/linux/ubuntu16.04/install.sh b/tools/ci_build/github/linux/ubuntu16.04/install.sh index 4339aed67b324..06813b9114ceb 100755 --- a/tools/ci_build/github/linux/ubuntu16.04/install.sh +++ b/tools/ci_build/github/linux/ubuntu16.04/install.sh @@ -74,5 +74,3 @@ done chmod 0777 /data/onnx rm -rf /tmp/src - - diff --git a/tools/ci_build/github/linux/upload_ortsrv_binaries.sh b/tools/ci_build/github/linux/upload_ortsrv_binaries.sh index dc7577c1926a4..9d6f9406e4181 100755 --- a/tools/ci_build/github/linux/upload_ortsrv_binaries.sh +++ b/tools/ci_build/github/linux/upload_ortsrv_binaries.sh @@ -21,6 +21,8 @@ echo "Creating temp folder $BINARY_DIR/$BUILD_ID ... " mkdir $BINARY_DIR/$BUILD_ID cp $BINARY_DIR/onnxruntime_server $BINARY_DIR/$BUILD_ID cp $BINARY_DIR/onnxruntime_server.symbol $BINARY_DIR/$BUILD_ID +cp $BINARY_DIR/libonnxruntime.so.* $BINARY_DIR/$BUILD_ID +cp $BINARY_DIR/libonnxruntime.so.*.symbol $BINARY_DIR/$BUILD_ID echo "Create build info file ..." 
echo "Build parameters: $BUILD_PARAMETERS" >> $BINARY_DIR/$BUILD_ID/build_info.txt diff --git a/tools/ci_build/github/windows/post_binary_sizes_to_dashboard.py b/tools/ci_build/github/windows/post_binary_sizes_to_dashboard.py new file mode 100644 index 0000000000000..7161b6f897457 --- /dev/null +++ b/tools/ci_build/github/windows/post_binary_sizes_to_dashboard.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + + +import argparse +import mysql.connector +import xml.etree.ElementTree as ET +import sys +import os + +def parse_arguments(): + parser = argparse.ArgumentParser(description="ONNXRuntime binary size uploader for dashboard") + parser.add_argument("--commit_hash", help="Full Git commit hash") + parser.add_argument("--build_project", default='Lotus', choices=['Lotus','onnxruntime'], help="Lotus or onnxruntime build project, to construct the build URL") + parser.add_argument("--build_id", help="Build Id") + parser.add_argument("--size_data_file", help="Path to file that contains the binary size data") + + return parser.parse_args() + +# Assumes size_data_file is a csv file with a header line, containing binary sizes and other attributes +# CSV fields are: +# os,arch,build_config,size +# No empty line or space between fields expected +def get_binary_sizes(size_data_file): + binary_size = [] + with open(size_data_file, 'r') as f: + line = f.readline() + headers = line.strip().split(',') + while line: + line = f.readline() + if not line: + break; + linedata = line.strip().split(',') + tablerow = {} + for i in range(0,len(headers)): + if headers[i] == 'size': + tablerow[headers[i]] = int(linedata[i]) + else: + tablerow[headers[i]] = linedata[i] + binary_size.append(tablerow) + return binary_size + + +def write_to_db(binary_size_data, args): + # connect to database + + cnx = mysql.connector.connect( + user='ort@onnxruntimedashboard', + password=os.environ.get('DASHBOARD_MYSQL_ORT_PASSWORD'), + host='onnxruntimedashboard.mysql.database.azure.com', + database='onnxruntime') + + try: + cursor = cnx.cursor() + + #delete old records + delete_query = ('DELETE FROM onnxruntime.binary_size ' + 'WHERE build_time < DATE_SUB(Now(), INTERVAL 30 DAY);' + ) + + cursor.execute(delete_query) + + #insert current records + for row in binary_size_data: + insert_query = ('INSERT INTO onnxruntime.binary_size ' + '(build_time, build_project, build_id, commit_id, os, arch, build_config, size) ' + 'VALUES (Now(), "%s", "%s", "%s", "%s", "%s", "%s", %d) ' + 'ON DUPLICATE KEY UPDATE ' + 'build_time=Now(), build_project="%s", build_id="%s", size=%d;' + ) % ( + args.build_project, + args.build_id, + args.commit_hash, + row['os'], + row['arch'], + row['build_config'], + row['size'], + + args.build_project, + args.build_id, + row['size'] + ) + cursor.execute(insert_query) + + cnx.commit() + + # # Use below for debugging: + # cursor.execute('select * from onnxruntime.binary_size') + # for r in cursor: + # print(r) + + cursor.close() + cnx.close() + except BaseException as e: + cnx.close() + raise e + + +if __name__ == "__main__": + try: + args = parse_arguments() + binary_size_data = get_binary_sizes(args.size_data_file) + write_to_db(binary_size_data, args) + except BaseException as e: + print(str(e)) + sys.exit(1) + + + diff --git a/tools/python/get_submodules.py b/tools/python/get_submodules.py new file mode 100644 index 0000000000000..6c86048367291 --- /dev/null +++ b/tools/python/get_submodules.py @@ -0,0 +1,33 @@ +import 
pygit2 +import argparse +import json + +def format_component(submod): + return {"component":{"type":"git","git":{"commitHash":str(submod.head_id), "repositoryUrl":str(submod.url)}}} + +def process_component(repo): + return [repo.lookup_submodule(submod) for submod in repo.listall_submodules()] + +def recursive_process(base_repo): + processed_subs = [] + repos_to_process = [base_repo] + while repos_to_process: + repo = repos_to_process.pop() + submodules = process_component(repo) + processed_subs.extend(submodules) + repos_to_process.extend([mod.open() for mod in submodules]) + return {"Registrations":[format_component(component) for component in processed_subs]} + +def main(repo_path, output_file): + repo = pygit2.Repository(repo_path) + registrations = recursive_process(repo) + with open(output_file, 'w') as f: + json.dump(registrations, f, indent=4, sort_keys=True) + +if __name__=="__main__": + parser = argparse.ArgumentParser() + parser.add_argument("base_repository", help="path to base repository to get registrations for.") + parser.add_argument("-o", "--output", help="output file name.", default="cgmanifest.json") + args = parser.parse_args() + main(args.base_repository, args.output) + diff --git a/tools/python/update_version.py b/tools/python/update_version.py index 340f66bb2ea75..2aa4712823f29 100755 --- a/tools/python/update_version.py +++ b/tools/python/update_version.py @@ -13,18 +13,23 @@ def update_version(): for line in lines: if line.startswith('|'): sections = line.split('|') - if len(sections) == 6 and sections[1].strip()[0].isdigit() : + if len(sections) == 8 and sections[1].strip()[0].isdigit() : current_version = sections[1].strip() break + print ('Current version of ORT seems to be: ' + current_version) if version != current_version: with open(file_path, 'w') as f: for i,line in enumerate(lines): f.write(line) if line.startswith('|--'): sections = lines[i+1].split('|') - sections[1] = ' ' + version + ' ' - new_line = '|'.join(sections) - f.write(new_line) + # Make sure there are no 'False Positive' version additions + # by making sure the line we are building a new line from + # contains the current_version + if len(sections) > 1 and sections[1].strip() == current_version: + sections[1] = ' ' + version + ' ' + new_line = '|'.join(sections) + f.write(new_line) lines = [] current_version = '' file_path = os.path.join(cwd, '..', '..', 'docs', 'python', 'README.rst') @@ -43,6 +48,7 @@ def update_version(): if inserted == False and len(sections) == 3 and sections[0].isdigit() and sections[1].isdigit() and sections[2].isdigit(): f.write(version + '\n') f.write('^^^^^\n\n') + f.write('Release Notes : https://github.com/Microsoft/onnxruntime/releases/tag/v' + version.strip() + '\n\n') inserted = True f.write(line) lines = []
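
The update_version.py change above only rewrites the version-table row whose first cell matches the detected current version. The sketch below replays that guard on a small in-memory example; the table contents and the version strings are illustrative assumptions and are not part of this patch.

```python
# Illustrative sketch only, not part of the patch: replay update_version.py's
# guarded row replacement, which inserts a row for the new version only when
# the row following the '|--' separator really holds the current version.
current_version = '0.4.0'   # assumed values for the example
new_version = '0.5.0'

lines = [
    '| Version | ... |\n',
    '|--|--|\n',
    '| 0.4.0 | ... |\n',
]

out = []
for i, line in enumerate(lines):
    out.append(line)
    if line.startswith('|--'):
        sections = lines[i + 1].split('|')
        # Guard against false positives: only act when the next row's first
        # cell equals the detected current version.
        if len(sections) > 1 and sections[1].strip() == current_version:
            sections[1] = ' ' + new_version + ' '
            out.append('|'.join(sections))

print(''.join(out), end='')
```

Writing the new row immediately after the separator keeps the previous release's row in place below it, so the table accumulates the version history rather than overwriting it.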