diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake
index 615491d3b0b54..0447e4814d37d 100644
--- a/cmake/onnxruntime_providers.cmake
+++ b/cmake/onnxruntime_providers.cmake
@@ -77,6 +77,17 @@ if(HAS_DEPRECATED_COPY)
   set_source_files_properties("${ONNXRUNTIME_ROOT}/core/providers/cpu/tensor/onehot.cc" PROPERTIES COMPILE_FLAGS -Wno-deprecated-copy)
   set_source_files_properties("${ONNXRUNTIME_ROOT}/core/providers/cpu/tensor/where_op.cc" PROPERTIES COMPILE_FLAGS -Wno-deprecated-copy)
 endif()
+
+if((CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64") AND NOT MSVC)
+  # For x86 platforms it is important to pass this flag to the compiler; without it, gemmlowp falls back to slow reference code.
+  # These optimizations are not enabled for MSVC, so it is excluded.
+  message("enabling optimizations for gemmlowp")
+  set_source_files_properties("${ONNXRUNTIME_ROOT}/core/providers/cpu/math/matmul_integer.cc" PROPERTIES COMPILE_FLAGS "-msse4.1")
+  set_source_files_properties("${ONNXRUNTIME_ROOT}/core/providers/cpu/math/quantize_linear_matmul.cc" PROPERTIES COMPILE_FLAGS "-msse4.1")
+  set_source_files_properties("${ONNXRUNTIME_ROOT}/core/providers/cpu/nn/qlinearconv.cc" PROPERTIES COMPILE_FLAGS "-msse4.1")
+  set_source_files_properties("${ONNXRUNTIME_ROOT}/core/providers/cpu/nn/conv_integer.cc" PROPERTIES COMPILE_FLAGS "-msse4.1")
+endif()
+
 set(gemmlowp_src ${PROJECT_SOURCE_DIR}/external/gemmlowp)
 set(re2_src ${ONNXRUNTIME_ROOT}/../cmake/external/re2)
 target_include_directories(onnxruntime_providers PRIVATE ${ONNXRUNTIME_ROOT} ${eigen_INCLUDE_DIRS} ${gemmlowp_src} ${re2_src})
diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/InferenceSession.cs b/csharp/src/Microsoft.ML.OnnxRuntime/InferenceSession.cs
index 1738fc4ba2067..5f89bad8bbe9b 100644
--- a/csharp/src/Microsoft.ML.OnnxRuntime/InferenceSession.cs
+++ b/csharp/src/Microsoft.ML.OnnxRuntime/InferenceSession.cs
@@ -303,7 +303,7 @@ internal static NodeMetadata GetMetadataFromTypeInfo(IntPtr typeInfo)
             OnnxValueType valueType;
             unsafe
             {
-                NativeApiStatus.VerifySuccess(NativeMethods.OrtOnnxTypeFromTypeInfo(typeInfo, new IntPtr(&valueType)));
+                NativeApiStatus.VerifySuccess(NativeMethods.OrtGetOnnxTypeFromTypeInfo(typeInfo, new IntPtr(&valueType)));
             }
             if (valueType != OnnxValueType.ONNX_TYPE_TENSOR && valueType != OnnxValueType.ONNX_TYPE_SPARSETENSOR)
             {
diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.cs b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.cs
index 7fd2c33c6a6cb..4c213ec66d58e 100644
--- a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.cs
+++ b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.cs
@@ -87,7 +87,7 @@ IntPtr[] outputValues /* An array of output value pointers. Array must be alloca
         [DllImport(nativeLib, CharSet = charSet)]
         public static extern IntPtr /*(OrtStatus*)*/OrtSessionGetOutputName(
                                 IntPtr /*(OrtSession*)*/ session,
-                                UIntPtr index,
+                                UIntPtr index,
                                 IntPtr /*(OrtAllocator*)*/ allocator,
                                 out IntPtr /*(char**)*/name);
@@ -253,7 +253,7 @@ public enum MemoryType
         public static extern IntPtr /*(OrtStatus*)*/ OrtGetValueType(IntPtr /*(OrtValue*)*/ value, IntPtr /*(OnnxValueType*)*/ onnxtype);

         [DllImport(nativeLib, CharSet = charSet)]
-        public static extern IntPtr /*(OrtStatus*)*/ OrtOnnxTypeFromTypeInfo(IntPtr /*(OrtTypeInfo*)*/ typeinfo, IntPtr /*(OnnxValueType*)*/ onnxtype);
+        public static extern IntPtr /*(OrtStatus*)*/ OrtGetOnnxTypeFromTypeInfo(IntPtr /*(OrtTypeInfo*)*/ typeinfo, IntPtr /*(OnnxValueType*)*/ onnxtype);

         [DllImport(nativeLib, CharSet = charSet)]
         public static extern IntPtr /*(OrtStatus*)*/ OrtGetValueCount(IntPtr /*(OrtValue*)*/ value, out IntPtr /*(size_t*)*/ count);
diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h
index 1b14ae61f34cd..6848fc31e453c 100644
--- a/include/onnxruntime/core/session/onnxruntime_c_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -186,10 +186,10 @@ ORT_API_STATUS(OrtCreateEnvWithCustomLogger, OrtLoggingFunction logging_function
 // execution of OrtCreateSession, or does the OrtSession retain a handle to the file/directory
 // and continue to access throughout the OrtSession lifetime?
 // What sort of access is needed to model_path : read or read/write?
-ORT_API_STATUS(OrtCreateSession, _In_ OrtEnv* env, _In_ const ORTCHAR_T* model_path,
+ORT_API_STATUS(OrtCreateSession, _In_ const OrtEnv* env, _In_ const ORTCHAR_T* model_path,
                _In_ const OrtSessionOptions* options, _Outptr_ OrtSession** out);
-ORT_API_STATUS(OrtCreateSessionFromArray, _In_ OrtEnv* env, _In_ const void* model_data, size_t model_data_length,
+ORT_API_STATUS(OrtCreateSessionFromArray, _In_ const OrtEnv* env, _In_ const void* model_data, size_t model_data_length,
                _In_ const OrtSessionOptions* options, _Outptr_ OrtSession** out);
 ORT_API_STATUS(OrtRun, _Inout_ OrtSession* sess,
@@ -203,43 +203,43 @@ ORT_API_STATUS(OrtRun, _Inout_ OrtSession* sess,
 ORT_API_STATUS(OrtCreateSessionOptions, _Outptr_ OrtSessionOptions** options);
 // create a copy of an existing OrtSessionOptions
-ORT_API_STATUS(OrtCloneSessionOptions, _In_ OrtSessionOptions* in_options, _Outptr_ OrtSessionOptions** out_options);
-ORT_API_STATUS(OrtEnableSequentialExecution, _In_ OrtSessionOptions* options);
-ORT_API_STATUS(OrtDisableSequentialExecution, _In_ OrtSessionOptions* options);
+ORT_API_STATUS(OrtCloneSessionOptions, _In_ const OrtSessionOptions* in_options, _Outptr_ OrtSessionOptions** out_options);
+ORT_API_STATUS(OrtEnableSequentialExecution, _Inout_ OrtSessionOptions* options);
+ORT_API_STATUS(OrtDisableSequentialExecution, _Inout_ OrtSessionOptions* options);
 // Enable profiling for this session.
-ORT_API_STATUS(OrtEnableProfiling, _In_ OrtSessionOptions* options, _In_ const ORTCHAR_T* profile_file_prefix);
-ORT_API_STATUS(OrtDisableProfiling, _In_ OrtSessionOptions* options);
+ORT_API_STATUS(OrtEnableProfiling, _Inout_ OrtSessionOptions* options, _In_ const ORTCHAR_T* profile_file_prefix);
+ORT_API_STATUS(OrtDisableProfiling, _Inout_ OrtSessionOptions* options);
 // Enable the memory pattern optimization.
 // The idea is that if the input shapes are the same, we could trace the internal memory allocation
 // and generate a memory pattern for future requests. So next time we could just do one allocation
 // with a big chunk for all the internal memory allocation.
 // Note: memory pattern optimization is only available when SequentialExecution is enabled.
-ORT_API_STATUS(OrtEnableMemPattern, _In_ OrtSessionOptions* options);
-ORT_API_STATUS(OrtDisableMemPattern, _In_ OrtSessionOptions* options);
+ORT_API_STATUS(OrtEnableMemPattern, _Inout_ OrtSessionOptions* options);
+ORT_API_STATUS(OrtDisableMemPattern, _Inout_ OrtSessionOptions* options);
 // Enable the memory arena on CPU
 // Arena may pre-allocate memory for future usage.
 // set this option to false if you don't want it.
-ORT_API_STATUS(OrtEnableCpuMemArena, _In_ OrtSessionOptions* options);
-ORT_API_STATUS(OrtDisableCpuMemArena, _In_ OrtSessionOptions* options);
+ORT_API_STATUS(OrtEnableCpuMemArena, _Inout_ OrtSessionOptions* options);
+ORT_API_STATUS(OrtDisableCpuMemArena, _Inout_ OrtSessionOptions* options);
 // < logger id to use for session output
-ORT_API_STATUS(OrtSetSessionLogId, _In_ OrtSessionOptions* options, const char* logid);
+ORT_API_STATUS(OrtSetSessionLogId, _Inout_ OrtSessionOptions* options, const char* logid);
 // < applies to session load, initialization, etc
-ORT_API_STATUS(OrtSetSessionLogVerbosityLevel, _In_ OrtSessionOptions* options, int session_log_verbosity_level);
+ORT_API_STATUS(OrtSetSessionLogVerbosityLevel, _Inout_ OrtSessionOptions* options, int session_log_verbosity_level);
 // Set Graph optimization level.
 // Available options are: 0, 1, 2.
 // 0 -> Disable all optimizations
 // 1 -> Enable basic optimizations
 // 2 -> Enable all optimizations
-ORT_API_STATUS(OrtSetSessionGraphOptimizationLevel, _In_ OrtSessionOptions* options, int graph_optimization_level);
+ORT_API_STATUS(OrtSetSessionGraphOptimizationLevel, _Inout_ OrtSessionOptions* options, int graph_optimization_level);
 // How many threads in the session thread pool.
-ORT_API_STATUS(OrtSetSessionThreadPoolSize, _In_ OrtSessionOptions* options, int session_thread_pool_size);
+ORT_API_STATUS(OrtSetSessionThreadPoolSize, _Inout_ OrtSessionOptions* options, int session_thread_pool_size);
 /**
  * To use additional providers, you must build ORT with the extra providers enabled. Then call one of these
@@ -278,7 +278,7 @@ ORT_API_STATUS(OrtSessionGetOutputName, _In_ const OrtSession* sess, size_t inde
  */
 ORT_API_STATUS(OrtCreateRunOptions, _Outptr_ OrtRunOptions** out);
-ORT_API_STATUS(OrtRunOptionsSetRunLogVerbosityLevel, _In_ OrtRunOptions* options, int value);
+ORT_API_STATUS(OrtRunOptionsSetRunLogVerbosityLevel, _Inout_ OrtRunOptions* options, int value);
 ORT_API_STATUS(OrtRunOptionsSetRunTag, _In_ OrtRunOptions*, _In_ const char* run_tag);
 ORT_API_STATUS(OrtRunOptionsGetRunLogVerbosityLevel, _In_ const OrtRunOptions* options, _Out_ int* out);
@@ -286,8 +286,8 @@ ORT_API_STATUS(OrtRunOptionsGetRunTag, _In_ const OrtRunOptions*, _Out_ const ch
 // Set a flag so that any running OrtRun* calls that are using this instance of OrtRunOptions
 // will exit as soon as possible if the flag is true.
-// flag can be either 1 (true) or 0 (false)
-ORT_API_STATUS(OrtRunOptionsSetTerminate, _In_ OrtRunOptions* options, _In_ int flag);
+ORT_API_STATUS(OrtRunOptionsEnableTerminate, _Inout_ OrtRunOptions* options);
+ORT_API_STATUS(OrtRunOptionsDisableTerminate, _Inout_ OrtRunOptions* options);
 /**
  * Create a tensor from an allocator. OrtReleaseValue will also release the buffer inside the output value
@@ -321,7 +321,7 @@ ORT_API_STATUS(OrtIsTensor, _In_ const OrtValue* value, _Out_ int* out);
 * \param s An array of strings. Each string in this array must be null terminated.
 * \param s_len length of s
 */
-ORT_API_STATUS(OrtFillStringTensor, _In_ OrtValue* value, _In_ const char* const* s, size_t s_len);
+ORT_API_STATUS(OrtFillStringTensor, _Inout_ OrtValue* value, _In_ const char* const* s, size_t s_len);
 /**
 * \param value A tensor created from OrtCreateTensor... function.
 * \param len total data length, not including the trailing '\0' chars.
@@ -368,19 +368,19 @@ ORT_API_STATUS(OrtGetTensorMemSizeInBytesFromTensorProto, _In_ const void* input
 /**
 * Don't free the 'out' value
 */
-ORT_API_STATUS(OrtCastTypeInfoToTensorInfo, _In_ OrtTypeInfo*, _Out_ const OrtTensorTypeAndShapeInfo** out);
+ORT_API_STATUS(OrtCastTypeInfoToTensorInfo, _In_ const OrtTypeInfo*, _Out_ const OrtTensorTypeAndShapeInfo** out);
 /**
 * Return OnnxType from OrtTypeInfo
 */
-ORT_API_STATUS(OrtOnnxTypeFromTypeInfo, _In_ const OrtTypeInfo*, _Out_ enum ONNXType* out);
+ORT_API_STATUS(OrtGetOnnxTypeFromTypeInfo, _In_ const OrtTypeInfo*, _Out_ enum ONNXType* out);
 /**
 * The 'out' value should be released by calling OrtReleaseTensorTypeAndShapeInfo
 */
 ORT_API_STATUS(OrtCreateTensorTypeAndShapeInfo, _Outptr_ OrtTensorTypeAndShapeInfo** out);
-ORT_API_STATUS(OrtSetTensorElementType, _In_ OrtTensorTypeAndShapeInfo*, enum ONNXTensorElementDataType type);
+ORT_API_STATUS(OrtSetTensorElementType, _Inout_ OrtTensorTypeAndShapeInfo*, enum ONNXTensorElementDataType type);
 /**
 * \param info Created from OrtCreateTensorTypeAndShapeInfo() function
@@ -525,7 +525,7 @@ ORT_API_STATUS(OrtGetValueCount, _In_ const OrtValue* value, _Out_ size_t* out);
 * sequence. 'in' should be an array of N OrtValues.
 * \value_type should be either map or sequence.
 */
-ORT_API_STATUS(OrtCreateValue, _In_ OrtValue** in, size_t num_values, enum ONNXType value_type,
+ORT_API_STATUS(OrtCreateValue, _In_ const OrtValue* const* in, size_t num_values, enum ONNXType value_type,
                _Outptr_ OrtValue** out);
 /*
@@ -561,12 +561,12 @@ struct OrtCustomOpApi {
   OrtStatus*(ORT_API_CALL* SetDimensions)(OrtTensorTypeAndShapeInfo* info, _In_ const int64_t* dim_values, size_t dim_count);
   OrtStatus*(ORT_API_CALL* GetTensorMutableData)(_Inout_ OrtValue* value, _Outptr_ void** data);
-  void(ORT_API_CALL* ReleaseTensorTypeAndShapeInfo)(OrtTensorTypeAndShapeInfo* input);
+  void(ORT_API_CALL* ReleaseTensorTypeAndShapeInfo)(_In_ OrtTensorTypeAndShapeInfo* input);
-  OrtStatus*(ORT_API_CALL* KernelContext_GetInputCount)(const OrtKernelContext* context, _Out_ size_t* out);
-  OrtStatus*(ORT_API_CALL* KernelContext_GetInput)(const OrtKernelContext* context, _In_ size_t index, _Out_ const OrtValue** out);
-  OrtStatus*(ORT_API_CALL* KernelContext_GetOutputCount)(const OrtKernelContext* context, _Out_ size_t* out);
-  OrtStatus*(ORT_API_CALL* KernelContext_GetOutput)(OrtKernelContext* context, _In_ size_t index, _In_ const int64_t* dim_values, size_t dim_count, _Outptr_ OrtValue** out);
+  OrtStatus*(ORT_API_CALL* KernelContext_GetInputCount)(_In_ const OrtKernelContext* context, _Out_ size_t* out);
+  OrtStatus*(ORT_API_CALL* KernelContext_GetInput)(_In_ const OrtKernelContext* context, _In_ size_t index, _Out_ const OrtValue** out);
+  OrtStatus*(ORT_API_CALL* KernelContext_GetOutputCount)(_In_ const OrtKernelContext* context, _Out_ size_t* out);
+  OrtStatus*(ORT_API_CALL* KernelContext_GetOutput)(_Inout_ OrtKernelContext* context, _In_ size_t index, _In_ const int64_t* dim_values, size_t dim_count, _Outptr_ OrtValue** out);
 };
 typedef struct OrtCustomOpApi OrtCustomOpApi;
@@ -607,13 +607,13 @@ ORT_API_STATUS(OrtCreateCustomOpDomain, _In_ const char* domain, _Outptr_ OrtCus
 * Add custom ops to the OrtCustomOpDomain
 * Note: The OrtCustomOp* pointer must remain valid until the OrtCustomOpDomain using it is released
 */
-ORT_API_STATUS(OrtCustomOpDomain_Add, _In_ OrtCustomOpDomain* custom_op_domain, _In_ OrtCustomOp* op);
+ORT_API_STATUS(OrtCustomOpDomain_Add, _Inout_ OrtCustomOpDomain* custom_op_domain, _In_ OrtCustomOp* op);
 /*
 * Add a custom op domain to the OrtSessionOptions
 * Note: The OrtCustomOpDomain* must not be deleted until the sessions using it are released
 */
-ORT_API_STATUS(OrtAddCustomOpDomain, _In_ OrtSessionOptions* options, _In_ OrtCustomOpDomain* custom_op_domain);
+ORT_API_STATUS(OrtAddCustomOpDomain, _Inout_ OrtSessionOptions* options, _In_ OrtCustomOpDomain* custom_op_domain);
 /*
 * END EXPERIMENTAL
 */
diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
index dc10322a01b94..e21e87596781e 100644
--- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
@@ -123,7 +123,8 @@ struct RunOptions : Base<OrtRunOptions> {
   RunOptions& SetRunTag(const char* run_tag);
   const char* GetRunTag() const;
-  RunOptions& SetTerminate(bool flag);
+  RunOptions& EnableTerminate();
+  RunOptions& DisableTerminate();
 };
 struct SessionOptions : Base<OrtSessionOptions> {
diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
index c03b61137de92..0fbbbde445b16 100644
--- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
+++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
@@ -113,8 +113,13 @@ inline const char* RunOptions::GetRunTag() const {
   return out;
 }
-inline RunOptions& RunOptions::SetTerminate(bool flag) {
-  ORT_THROW_ON_ERROR(OrtRunOptionsSetTerminate(p_, flag ? 1 : 0));
+inline RunOptions& RunOptions::EnableTerminate() {
+  ORT_THROW_ON_ERROR(OrtRunOptionsEnableTerminate(p_));
+  return *this;
+}
+
+inline RunOptions& RunOptions::DisableTerminate() {
+  ORT_THROW_ON_ERROR(OrtRunOptionsDisableTerminate(p_));
   return *this;
 }
@@ -284,7 +289,7 @@ inline Unowned<TensorTypeAndShapeInfo> TypeInfo::GetTensorTypeAndShapeInfo() con
 inline ONNXType TypeInfo::GetONNXType() const {
   ONNXType out;
-  ORT_THROW_ON_ERROR(OrtOnnxTypeFromTypeInfo(p_, &out));
+  ORT_THROW_ON_ERROR(OrtGetOnnxTypeFromTypeInfo(p_, &out));
   return out;
 }
@@ -405,7 +410,7 @@ inline std::string CustomOpApi::KernelInfoGetAttribute(_In_ const O
     OrtReleaseStatus(status);
     out.resize(size);
     ORT_THROW_ON_ERROR(api_.KernelInfoGetAttribute_string(info, name, &out[0], &size));
-    out.resize(size - 1); // remove the terminating character '\0'
+    out.resize(size - 1);  // remove the terminating character '\0'
   } else {
     ORT_THROW_ON_ERROR(status);
   }
diff --git a/onnxruntime/core/common/profiler.cc b/onnxruntime/core/common/profiler.cc
index 18c46a994f4d2..d8eb1b2354027 100644
--- a/onnxruntime/core/common/profiler.cc
+++ b/onnxruntime/core/common/profiler.cc
@@ -72,6 +72,11 @@ std::string Profiler::EndProfiling() {
     profile_with_logger_ = false;
     return std::string();
  }
+
+  if (session_logger_) {
+    LOGS(*session_logger_, INFO) << "Writing profiler data to file " << profile_stream_file_;
+  }
+
   std::lock_guard<std::mutex> lock(mutex_);
   profile_stream_ << "[\n";
diff --git a/onnxruntime/core/common/profiler.h b/onnxruntime/core/common/profiler.h
index 3e0496282719c..48ecf5747467a 100644
--- a/onnxruntime/core/common/profiler.h
+++ b/onnxruntime/core/common/profiler.h
@@ -44,7 +44,10 @@ class Profiler {
   */
   TimePoint StartTime() const;
-  bool FEnabled() const {
+  /*
+  Whether data collection and output from this profiler is enabled.
+  */
+  bool IsEnabled() const {
     return enabled_;
   }
diff --git a/onnxruntime/core/framework/onnxruntime_typeinfo.cc b/onnxruntime/core/framework/onnxruntime_typeinfo.cc
index fcb6c143e5397..4f00ec89dde4a 100644
--- a/onnxruntime/core/framework/onnxruntime_typeinfo.cc
+++ b/onnxruntime/core/framework/onnxruntime_typeinfo.cc
@@ -21,12 +21,12 @@ OrtTypeInfo::~OrtTypeInfo() {
   OrtReleaseTensorTypeAndShapeInfo(data);
 }
-ORT_API_STATUS_IMPL(OrtOnnxTypeFromTypeInfo, _In_ const struct OrtTypeInfo* input, ONNXType* out) {
+ORT_API_STATUS_IMPL(OrtGetOnnxTypeFromTypeInfo, _In_ const struct OrtTypeInfo* input, ONNXType* out) {
   *out = input->type;
   return nullptr;
 }
-ORT_API_STATUS_IMPL(OrtCastTypeInfoToTensorInfo, _In_ struct OrtTypeInfo* input, const struct OrtTensorTypeAndShapeInfo** out) {
+ORT_API_STATUS_IMPL(OrtCastTypeInfoToTensorInfo, _In_ const struct OrtTypeInfo* input, const struct OrtTensorTypeAndShapeInfo** out) {
   *out = input->type == ONNX_TYPE_TENSOR ? input->data : nullptr;
   return nullptr;
 }
diff --git a/onnxruntime/core/framework/parallel_executor.cc b/onnxruntime/core/framework/parallel_executor.cc
index dccdfc46d96de..72ee80cd421ee 100644
--- a/onnxruntime/core/framework/parallel_executor.cc
+++ b/onnxruntime/core/framework/parallel_executor.cc
@@ -35,8 +35,8 @@ Status ParallelExecutor::Execute(const SessionState& session_state, const std::v
                                  const std::unordered_map<size_t, CustomAllocator>& fetch_allocators,
                                  const logging::Logger& logger) {
   TimePoint tp;
-  bool f_profiler_enabled = session_state.Profiler().FEnabled();
-  if (f_profiler_enabled) {
+  const bool is_profiler_enabled = session_state.Profiler().IsEnabled();
+  if (is_profiler_enabled) {
     tp = session_state.Profiler().StartTime();
   }
@@ -102,7 +102,7 @@ Status ParallelExecutor::Execute(const SessionState& session_state, const std::v
     }
   }
-  if (f_profiler_enabled) {
+  if (is_profiler_enabled) {
     session_state.Profiler().EndTimeAndRecordEvent(profiling::SESSION_EVENT, "ParallelExecutor::Execute", tp);
   }
@@ -121,7 +121,7 @@ Status ParallelExecutor::RunNodeAsync(size_t p_node_index,
   auto graph_viewer = session_state.GetGraphViewer();
   TimePoint sync_time_begin;
   TimePoint kernel_begin_time;
-  bool f_profiler_enabled = session_state.Profiler().FEnabled();
+  const bool f_profiler_enabled = session_state.Profiler().IsEnabled();
   // Avoid context switching if possible.
   while (keep_running) {
diff --git a/onnxruntime/core/framework/run_options.cc b/onnxruntime/core/framework/run_options.cc
index d446dc42ca3fc..079be56fc5ae4 100644
--- a/onnxruntime/core/framework/run_options.cc
+++ b/onnxruntime/core/framework/run_options.cc
@@ -33,10 +33,12 @@ ORT_API_STATUS_IMPL(OrtRunOptionsGetRunTag, _In_ const OrtRunOptions* options, c
   return nullptr;
 }
-ORT_API_STATUS_IMPL(OrtRunOptionsSetTerminate, _In_ OrtRunOptions* options, int flag) {
-  if (!(flag == 0 || flag == 1)) {
-    return OrtCreateStatus(ORT_INVALID_ARGUMENT, "Invalid value for flag. Should be either 0 or 1");
-  }
-  options->terminate = flag;
+ORT_API_STATUS_IMPL(OrtRunOptionsEnableTerminate, _Inout_ OrtRunOptions* options) {
+  options->terminate = true;
+  return nullptr;
+}
+
+ORT_API_STATUS_IMPL(OrtRunOptionsDisableTerminate, _Inout_ OrtRunOptions* options) {
+  options->terminate = false;
   return nullptr;
 }
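Reviewer note: a minimal usage sketch (not part of this PR's changes) for the OrtRunOptionsEnableTerminate/OrtRunOptionsDisableTerminate pair that replaces OrtRunOptionsSetTerminate. Error handling is elided, and `session`, `input_names`, `inputs`, and `output_names` are hypothetical placeholders created elsewhere:

OrtRunOptions* run_options = NULL;
OrtCreateRunOptions(&run_options);

/* Inference thread: a long-running OrtRun observes the flag via run_options. */
OrtValue* output = NULL;
OrtRun(session, run_options, input_names, inputs, 1, output_names, 1, &output);

/* Watchdog thread: request that any in-flight OrtRun using these options
   exit as soon as possible... */
OrtRunOptionsEnableTerminate(run_options);

/* ...and clear the flag later so the same options object can be reused. */
OrtRunOptionsDisableTerminate(run_options);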
diff --git a/onnxruntime/core/framework/sequential_executor.cc b/onnxruntime/core/framework/sequential_executor.cc
index 44425c4ec705e..bd45bbfdc0b01 100644
--- a/onnxruntime/core/framework/sequential_executor.cc
+++ b/onnxruntime/core/framework/sequential_executor.cc
@@ -27,12 +27,12 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
                                    std::vector<OrtValue>& fetches,
                                    const std::unordered_map<size_t, CustomAllocator>& fetch_allocators,
                                    const logging::Logger& logger) {
-  bool f_profiler_enabled = session_state.Profiler().FEnabled();
+  const bool is_profiler_enabled = session_state.Profiler().IsEnabled();
   TimePoint tp;
   TimePoint sync_time_begin;
   TimePoint kernel_begin_time;
-  if (f_profiler_enabled) {
+  if (is_profiler_enabled) {
     tp = session_state.Profiler().StartTime();
   }
@@ -65,7 +65,7 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
     OpKernelContextInternal op_kernel_context(session_state, frame, *p_op_kernel, logger,
                                               p_op_kernel->Node().ImplicitInputDefs(), terminate_flag_);
     // TODO: log kernel outputs?
-    if (f_profiler_enabled) {
+    if (is_profiler_enabled) {
       sync_time_begin = session_state.Profiler().StartTime();
     }
@@ -104,7 +104,7 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
     utils::DumpNodeInputs(op_kernel_context, p_op_kernel->Node());
 #endif
-    if (f_profiler_enabled) {
+    if (is_profiler_enabled) {
       session_state.Profiler().EndTimeAndRecordEvent(profiling::NODE_EVENT,
                                                      p_op_kernel->Node().Name() + "_fence_before",
                                                      sync_time_begin,
@@ -128,7 +128,7 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
       return Status(compute_status.Category(), compute_status.Code(), msg_string);
     }
-    if (f_profiler_enabled) {
+    if (is_profiler_enabled) {
       session_state.Profiler().EndTimeAndRecordEvent(profiling::NODE_EVENT,
                                                      p_op_kernel->Node().Name() + "_kernel_time",
                                                      kernel_begin_time,
@@ -159,7 +159,7 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
       }
     }
-    if (f_profiler_enabled) {
+    if (is_profiler_enabled) {
       session_state.Profiler().EndTimeAndRecordEvent(profiling::NODE_EVENT,
                                                      p_op_kernel->Node().Name() + "_fence_after",
                                                      sync_time_begin,
@@ -199,7 +199,7 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
     }
   }
-  if (f_profiler_enabled) {
+  if (is_profiler_enabled) {
     session_state.Profiler().EndTimeAndRecordEvent(profiling::SESSION_EVENT, "SequentialExecutor::Execute", tp);
   }
diff --git a/onnxruntime/core/optimizer/matmul_add_fusion.cc b/onnxruntime/core/optimizer/matmul_add_fusion.cc
index a0c392ef75354..33bc507990a40 100644
--- a/onnxruntime/core/optimizer/matmul_add_fusion.cc
+++ b/onnxruntime/core/optimizer/matmul_add_fusion.cc
@@ -43,10 +43,13 @@ Status MatMulAddFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level)
     auto matmul_input_defs = matmul_node.MutableInputDefs();
     auto add_input_defs = add_node.MutableInputDefs();
-    // Gemm only support float, so the inputs of MatMul
+    // Gemm requires that inputs be the same data type and both floating point (float32/float16).
     auto matmul_type = matmul_input_defs[0]->Type();
     auto add_type = add_input_defs[0]->Type();
-    if ((*matmul_type) != "tensor(float)" || (*add_type) != "tensor(float)") {
+    if ((*matmul_type) != (*add_type)) {
+      continue;
+    }
+    if ((*matmul_type) != "tensor(float)" && (*matmul_type) != "tensor(float16)") {
       continue;
     }
diff --git a/onnxruntime/core/optimizer/nchwc_transformer.cc b/onnxruntime/core/optimizer/nchwc_transformer.cc
index e195afb8ebc33..1a2003aca9b2d 100644
--- a/onnxruntime/core/optimizer/nchwc_transformer.cc
+++ b/onnxruntime/core/optimizer/nchwc_transformer.cc
@@ -18,14 +18,18 @@ class NchwcTransformerImpl {
   void Transform(Node& node);
   void Finalize(bool& modified);
-  static constexpr int kNchwcDims = 4;
+  static constexpr int kNchwcBatchChannelDims = 2;
+  static constexpr int kNchwcSpatialDims = 2;
+  static constexpr int kNchwcDims = kNchwcBatchChannelDims + kNchwcSpatialDims;
 private:
   // Associate the following state with each created NCHWc output keyed off the
   // original NodeArg.
   struct NchwcArgument {
     // Symbolic shape information for this NCHWc output. Each dimension stores
-    // the original NodeArg* that sourced the value.
+    // the original NodeArg* that sourced the value. Spatial dimensions also
+    // track the number of times the original value has been shifted down due
+    // to a stride count of 2.
    //
    // For example, the first Conv node that takes NCHW input will create a
    // NchwcArgument with the shape referencing itself. Other NCHWc nodes that
@@ -39,6 +43,30 @@ class NchwcTransformerImpl {
     // fusion that can be detected using this additional shape hint.
     struct Shape {
       const NodeArg* dims_[kNchwcDims];
+      size_t shifts_[kNchwcSpatialDims];
+
+      Shape(const NodeArg* initial_dim) {
+        std::fill_n(dims_, kNchwcDims, initial_dim);
+        std::fill_n(shifts_, kNchwcSpatialDims, 0);
+      }
+
+      bool IsDimEqual(const Shape& other, int dim) const {
+        bool is_dim_equal = false;
+        // Test if this dimension is derived from the same NodeArg.
+        if (dims_[dim] == other.dims_[dim]) {
+          if (dim >= kNchwcBatchChannelDims) {
+            // Test if the NodeArg has been shifted down the same number of
+            // times due to striding.
+            int spatial_dim = dim - kNchwcBatchChannelDims;
+            if (shifts_[spatial_dim] == other.shifts_[spatial_dim]) {
+              is_dim_equal = true;
+            }
+          } else {
+            is_dim_equal = true;
+          }
+        }
+        return is_dim_equal;
+      }
     };
     // Stores the node that generated the NCHWc output.
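Reviewer note: the dims_/shifts_ bookkeeping above is easier to see outside diff form. The following standalone C++ sketch (invented names, not code from this PR) distills the rule that Shape::IsDimEqual implements for spatial dimensions:

#include <cstddef>

// Two spatial dimensions are considered equal only if they derive from the
// same symbolic source AND have been halved by stride-2 operations the same
// number of times.
struct SymbolicDim {
  const void* source;  // stands in for the originating NodeArg*
  std::size_t shift;   // times the value has been halved by a stride of 2
};

inline bool IsDimEqual(const SymbolicDim& a, const SymbolicDim& b) {
  return a.source == b.source && a.shift == b.shift;
}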
@@ -181,7 +209,7 @@ void NchwcTransformerImpl::ConvPoolShapeInference(const Node& node,
                                                   NchwcArgument::Shape& output_shape,
                                                   const ONNX_NAMESPACE::TensorProto* filter_shape) {
   // Skip the leading batch and channel counts.
-  const int kernel_size = kNchwcDims - 2;
+  const int kernel_size = kNchwcSpatialDims;
   // Maintain the batch count dimension from the NCHWc input.
   output_shape.dims_[0] = input_shape.dims_[0];
@@ -221,11 +249,18 @@ void NchwcTransformerImpl::ConvPoolShapeInference(const Node& node,
   }
   for (int i = 0; i < kernel_size; i++) {
-    if ((strides_attr != nullptr && strides_attr->ints(i) != 1) ||
-        (dilations_attr != nullptr && dilations_attr->ints(i) != 1)) {
+    if (dilations_attr != nullptr && dilations_attr->ints(i) != 1) {
       continue;
     }
+    int64_t stride = 1;
+    if (strides_attr != nullptr) {
+      stride = strides_attr->ints(i);
+      if (stride != 1 && stride != 2) {
+        continue;
+      }
+    }
+
     int64_t padding = 0;
     if (pads_attr != nullptr) {
       padding = pads_attr->ints(i) + pads_attr->ints(i + kernel_size);
@@ -238,8 +273,14 @@ void NchwcTransformerImpl::ConvPoolShapeInference(const Node& node,
       kernel = filter_shape->dims(2 + i);
     }
+    // Maintain the spatial dimension from the NCHWc input if the implicit or
+    // explicit padding results in the same symbolic dimension before applying
+    // the stride. When the stride is 2, the actual output dimension is
+    // half the original value. Track the number of times the symbolic dimension
+    // has been halved in the shifts field.
     if (padding + 1 == kernel || auto_pad_same_shape) {
-      output_shape.dims_[2 + i] = input_shape.dims_[2 + i];
+      output_shape.dims_[kNchwcBatchChannelDims + i] = input_shape.dims_[kNchwcBatchChannelDims + i];
+      output_shape.shifts_[i] = input_shape.shifts_[i] + static_cast<size_t>(stride) - 1;
     }
   }
 }
@@ -396,8 +437,7 @@ void NchwcTransformerImpl::TransformConv(Node& node) {
     nchwc_node.MutableInputDefs()[2] = nchwc_conv_B_arg;
   }
-  NchwcArgument::Shape output_shape;
-  std::fill_n(output_shape.dims_, kNchwcDims, output_defs[0]);
+  NchwcArgument::Shape output_shape(output_defs[0]);
   if (do_reorder_input) {
     auto it = nchwc_args_.find(input_defs[0]);
@@ -450,8 +490,7 @@ void NchwcTransformerImpl::TransformPool(Node& node) {
                                     kMSNchwcDomain);
   nchwc_node.SetExecutionProviderType(node.GetExecutionProviderType());
-  NchwcArgument::Shape output_shape;
-  std::fill_n(output_shape.dims_, kNchwcDims, output_defs[0]);
+  NchwcArgument::Shape output_shape(output_defs[0]);
   auto it = nchwc_args_.find(input_defs[0]);
   if (it == nchwc_args_.end()) {
@@ -492,7 +531,7 @@ void NchwcTransformerImpl::TransformAdd(Node& node) {
     auto* nchwc_input_n = nchwc_inputs[n];
     for (int i = 0; i < kNchwcDims; i++) {
       // Test if this dimension is derived from the same NodeArg.
-      if (nchwc_input_0->shape_.dims_[i] != nchwc_input_n->shape_.dims_[i]) {
+      if (!nchwc_input_0->shape_.IsDimEqual(nchwc_input_n->shape_, i)) {
         // Check if ONNX shape inferencing has computed a precise dimension value.
         auto* nchwc_input_n_shape = input_defs[n]->Shape();
         if ((nchwc_input_0_shape == nullptr) || (nchwc_input_n_shape == nullptr)) {
diff --git a/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.cc b/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.cc
index 8c202586f25ea..c5be268f59e2d 100644
--- a/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.cc
+++ b/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.cc
@@ -238,7 +238,7 @@ class UniDirectionalGru {
 // #define DUMP_MATRIXES to provide lots of diagnostic output
 #if defined(DUMP_MATRIXES)
-#define DumpMatrix(...) ::onnxruntime::rnn::detail::DumpMatrixImpl(__VA_ARGS__)
+#define DumpMatrix(...) onnxruntime::rnn::detail::DumpMatrixImpl(__VA_ARGS__)
 #else
 #define DumpMatrix(...) ((void)0)
 #endif
@@ -591,8 +591,9 @@ void UniDirectionalGru<T>::Compute(const gsl::span<const T>& inputs_arg,
   // for each item in sequence run all calculations
   for (int step = 0; step < max_sequence_length; step++) {
+#if defined(DUMP_MATRIXES)
     const std::string seqno_str = " [seqno=" + std::to_string(step) + "]";
-
+#endif
     DumpMatrix("Ht-1" + seqno_str, &*prev_Ht, batch_size_, hidden_size_);
     out_added_offset = (step * batch_size_) * hidden_size_x3;
@@ -657,7 +658,9 @@ void UniDirectionalGru<T>::Compute(const gsl::span<const T>& inputs_arg,
       }
     }
+#if defined(DUMP_MATRIXES)
     std::string label = linear_before_reset_ ? "rt (.) (Ht-1 * (Rh^T) + Rbh)" : "rt (.) Ht-1";
+#endif
     DumpMatrix(label + seqno_str, &*cur_h_local, batch_size_, hidden_size_);
     if (linear_before_reset_) {
@@ -676,7 +679,9 @@ void UniDirectionalGru<T>::Compute(const gsl::span<const T>& inputs_arg,
         }
       }
     } else {
+#if defined(DUMP_MATRIXES)
       label += " * Rh^T";
+#endif
       // out_H currently contains Xt*(Wh^T).
       auto out_H = outputZRH_.begin() + out_added_offset + hidden_size_x2;
@@ -708,9 +713,11 @@ void UniDirectionalGru<T>::Compute(const gsl::span<const T>& inputs_arg,
     for (int r = 0; r < batch_size_; r++) {
       if (step >= min_sequence_length && step >= sequence_lengths[r]) {
-        if (output_sequence) {
+        // If we need output for every step,
+        // or we need to set prev_Ht for an empty sequence, to avoid warnings about using uninitialized values
+        if (output_sequence || (step == 0 && sequence_lengths[r] == 0)) {
           auto fill_output = output + r * hidden_size_;
-          std::fill_n(fill_output, hidden_size_, T{});
+          std::fill_n(&*fill_output, hidden_size_, T{});
         }
         continue;
@@ -772,28 +779,29 @@ void UniDirectionalGru<T>::Compute(const gsl::span<const T>& inputs_arg,
   // copy last output to final_hidden_state
   for (int i = 0; i < batch_size_; i++) {
     const int seq_len = sequence_lengths[i];
-    if (seq_len == 0) {
-      auto final_hidden_state_dst = final_hidden_state.begin() + i * hidden_size_;
-      std::fill_n(final_hidden_state_dst, hidden_size_, T{});
-      continue;
-    }
     if (output_sequence) {
-      auto src = outputs.subspan((seq_len - 1) * output_step_length + i * hidden_size_, hidden_size_);
-      auto dest = final_hidden_state.subspan(i * hidden_size_, hidden_size_);
-      gsl::copy(src, dest);
+      if (seq_len == 0) {
+        auto final_hidden_state_dst = final_hidden_state.begin() + i * hidden_size_;
+        std::fill_n(&*final_hidden_state_dst, hidden_size_, T{});
+      } else {
+        auto src = outputs.subspan((seq_len - 1) * output_step_length + i * hidden_size_, hidden_size_);
+        auto dest = final_hidden_state.subspan(i * hidden_size_, hidden_size_);
+        gsl::copy(src, dest);
+      }
     }
   }
-  // zero any values beyond the evaluated steps
+  // zero any values beyond the evaluated steps if the maximum explicit sequence length we saw (max_sequence_length)
+  // was shorter than the maximum possible sequence length (seq_length_)
   if (output_sequence && max_sequence_length < seq_length_) {
     if (output_step_length == batch_size_ * hidden_size_) {  // contiguous
       const auto span_to_zero = outputs.subspan(
           max_sequence_length * output_step_length, (seq_length_ - max_sequence_length) * output_step_length);
-      std::fill_n(span_to_zero.begin(), span_to_zero.size(), T{});
+      std::fill_n(&*span_to_zero.begin(), span_to_zero.size(), T{});
     } else {
       for (int i = max_sequence_length; i < seq_length_; ++i) {  // non-contiguous
         const auto span_to_zero = outputs.subspan(i * output_step_length, batch_size_ * hidden_size_);
-        std::fill_n(span_to_zero.begin(), span_to_zero.size(), T{});
+        std::fill_n(&*span_to_zero.begin(), span_to_zero.size(), T{});
       }
     }
   }
diff --git a/onnxruntime/core/providers/cpu/symbols.txt b/onnxruntime/core/providers/cpu/symbols.txt
index fc4859442c667..fc7560f5b7696 100644
--- a/onnxruntime/core/providers/cpu/symbols.txt
+++ b/onnxruntime/core/providers/cpu/symbols.txt
@@ -51,7 +51,7 @@ OrtGetValueCount
 OrtGetValueType
 OrtGetVersionString
 OrtIsTensor
-OrtOnnxTypeFromTypeInfo
+OrtGetOnnxTypeFromTypeInfo
 OrtReleaseAllocator
 OrtReleaseAllocatorInfo
 OrtReleaseCustomOpDomain
@@ -69,7 +69,8 @@ OrtRunOptionsGetRunLogVerbosityLevel
 OrtRunOptionsGetRunTag
 OrtRunOptionsSetRunLogVerbosityLevel
 OrtRunOptionsSetRunTag
-OrtRunOptionsSetTerminate
+OrtRunOptionsEnableTerminate
+OrtRunOptionsDisableTerminate
 OrtSessionGetInputCount
 OrtSessionGetInputName
 OrtSessionGetInputTypeInfo
diff --git a/onnxruntime/core/providers/cpu/tensor/identity_op.h b/onnxruntime/core/providers/cpu/tensor/identity_op.h
index 5a583e48f1679..7cea426ff9a5d 100644
--- a/onnxruntime/core/providers/cpu/tensor/identity_op.h
+++ b/onnxruntime/core/providers/cpu/tensor/identity_op.h
@@ -43,7 +43,18 @@ class IdentityOp final : public OpKernel {
     }
     if (is_dropout) {
-      context->Output(1, std::vector<int64_t>());
+      Tensor* mask = context->Output(1, shape);
+      // a 'nullptr' returned would make it an unused optional output
+      if (mask != nullptr) {
+        // Opset 7 differs from Opset 10 in that the type of the 'mask'
+        // output is tied to the type of the input in Opset 7, whereas
+        // the type of 'mask' in Opset 10 is always 'bool',
+        // so we use a common solution
+        void* mask_data = mask->MutableDataRaw();
+        // In 'test'/'inference' mode, there are no input values dropped out
+        // so fill the buffer with 0/false
+        memset(mask_data, 0, mask->SizeInBytes());
+      }
     }
     return Status::OK();
diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
index bd7544396c11d..6509cf01fdf9a 100644
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -64,10 +64,6 @@ CUDAExecutionProvider::PerThreadContext::~PerThreadContext() {
 CUDAExecutionProvider::CUDAExecutionProvider(const CUDAExecutionProviderInfo& info)
     : IExecutionProvider{onnxruntime::kCudaExecutionProvider}, device_id_(info.device_id) {
   CUDA_CALL_THROW(cudaSetDevice(device_id_));
-  // create streams, default is nullptr
-  streams_[kCudaStreamDefault] = nullptr;
-  CUDA_CALL_THROW(cudaStreamCreateWithFlags(&streams_[kCudaStreamCopyIn], cudaStreamNonBlocking));
-  CUDA_CALL_THROW(cudaStreamCreateWithFlags(&streams_[kCudaStreamCopyOut], cudaStreamNonBlocking));
   DeviceAllocatorRegistrationInfo default_allocator_info(
       {OrtMemTypeDefault, [](int id) { return std::make_unique<CUDAAllocator>(id); }, std::numeric_limits<size_t>::max()});
@@ -93,9 +89,6 @@ CUDAExecutionProvider::~CUDAExecutionProvider() {
     CUDA_CALL_THROW(cudaEventDestroy(e));
     it = deferred_release_cpu_ptr_.erase(it);
   }
-  CUDA_CALL_THROW(cudaStreamDestroy(streams_[kCudaStreamCopyIn]));
-  CUDA_CALL_THROW(cudaStreamDestroy(streams_[kCudaStreamCopyOut]));
-
   ReleasePerThreadStuffs();
 }
@@ -207,7 +200,7 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, Un
 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 8, Flatten);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, Squeeze);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, Identity);
-class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, Dropout);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 9, Dropout);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, Gather);
 class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 8, float, Gemm);
 class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 8, double, Gemm);
@@ -522,6 +515,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, float, Shrink);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, double, Shrink);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, MLFloat16, Shrink);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, Dropout);
 static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
   static const BuildKernelCreateInfoFn function_table[] = {
@@ -532,7 +526,7 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
       BuildKernelCreateInfo,
      BuildKernelCreateInfo,
      BuildKernelCreateInfo,
-      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, Dropout)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 9, Dropout)>,
      BuildKernelCreateInfo,
      BuildKernelCreateInfo,
      BuildKernelCreateInfo,
@@ -847,6 +841,7 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
       BuildKernelCreateInfo,
      BuildKernelCreateInfo,
      BuildKernelCreateInfo,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, Dropout)>,
   };
   for (auto& function_table_entry : function_table) {
diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.h b/onnxruntime/core/providers/cuda/cuda_execution_provider.h
index ed3a509f853e4..bd6e25b18b7bb 100644
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.h
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.h
@@ -40,11 +40,6 @@ class CUDAExecutionProvider : public IExecutionProvider {
     return GetPerThreadContext().CudnnHandle();
   }
-  cudaStream_t GetStream(int queue_id) const {
-    ORT_ENFORCE(queue_id >= 0 && queue_id < kTotalCudaStreams);
-    return streams_[queue_id];
-  }
-
   template <typename T>
   const T* GetConstOnes(size_t count) {
     return GetPerThreadContext().template GetConstOnes<T>(count);
@@ -69,7 +64,6 @@ class CUDAExecutionProvider : public IExecutionProvider {
   int GetDeviceId() const { return device_id_; }
 private:
-  cudaStream_t streams_[kTotalCudaStreams];
   int device_id_;
   struct DeferredReleaseCPUPtrs {
diff --git a/onnxruntime/core/providers/cuda/gpu_data_transfer.cc b/onnxruntime/core/providers/cuda/gpu_data_transfer.cc
index ec930946aa8dc..8fae7ae8b0d34 100644
--- a/onnxruntime/core/providers/cuda/gpu_data_transfer.cc
+++ b/onnxruntime/core/providers/cuda/gpu_data_transfer.cc
@@ -12,6 +12,11 @@ GPUDataTransfer::GPUDataTransfer() {
   CUDA_CALL_THROW(cudaStreamCreateWithFlags(&streams_[kCudaStreamCopyOut], cudaStreamNonBlocking));
 }
+GPUDataTransfer::~GPUDataTransfer() {
+  CUDA_CALL(cudaStreamDestroy(streams_[kCudaStreamCopyIn]));
+  CUDA_CALL(cudaStreamDestroy(streams_[kCudaStreamCopyOut]));
+}
+
 bool GPUDataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const {
   return src_device.Type() == OrtDevice::GPU || src_device.MemType() == OrtDevice::MemType::CUDA_PINNED ||
          dst_device.Type() == OrtDevice::GPU || dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED;
diff --git a/onnxruntime/core/providers/cuda/gpu_data_transfer.h b/onnxruntime/core/providers/cuda/gpu_data_transfer.h
index f0acae0f8d448..0f3d4687eb5e5 100644
--- a/onnxruntime/core/providers/cuda/gpu_data_transfer.h
+++ b/onnxruntime/core/providers/cuda/gpu_data_transfer.h
@@ -18,6 +18,7 @@ enum CUDAStreamType : int {
 class GPUDataTransfer : public IDataTransfer {
  public:
   GPUDataTransfer();
+  ~GPUDataTransfer();
   bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override;
diff --git a/onnxruntime/core/providers/cuda/tensor/expand.cc b/onnxruntime/core/providers/cuda/tensor/expand.cc
index 599fa39f406b2..897e981877ad0 100644
--- a/onnxruntime/core/providers/cuda/tensor/expand.cc
+++ b/onnxruntime/core/providers/cuda/tensor/expand.cc
@@ -21,6 +21,11 @@ Status Expand::ComputeInternal(OpKernelContext* ctx) const {
   ORT_RETURN_IF_ERROR(ComputeOutputShape(Node().Name(), input0.Shape(), output_dims, output_shape));
   auto rank = output_shape.NumDimensions();
   auto& output_tensor = *ctx->Output(0, output_shape);
+
+  if (0 == output_shape.Size()) {
+    return Status::OK();
+  }
+
   auto input_shape = input0.Shape().GetDims();
   // pad input_dims with 1 to make ranks match
@@ -40,6 +45,8 @@ Status Expand::ComputeInternal(OpKernelContext* ctx) const {
   for (auto i = 0; i < rank; i++) {
     in_span[i] = fast_divmod(static_cast<int>(input_shape[i]));
     out_span[i] = fast_divmod(static_cast<int>(output_shape[i]));
+    // output_shape[i] won't be 0 here: that case is covered by the
+    // (0 == output_shape.Size()) early return above
     subdim_size /= output_shape[i];
     sdm_span[i] = static_cast<int>(subdim_size);
   }
diff --git a/onnxruntime/core/providers/cuda/tensor/identity_op.cc b/onnxruntime/core/providers/cuda/tensor/identity_op.cc
index e9a4125c43188..890bdf5cacf87 100644
--- a/onnxruntime/core/providers/cuda/tensor/identity_op.cc
+++ b/onnxruntime/core/providers/cuda/tensor/identity_op.cc
@@ -5,13 +5,28 @@
 namespace onnxruntime {
 namespace cuda {
+ONNX_OPERATOR_VERSIONED_KERNEL_EX(
+    Dropout,
+    kOnnxDomain,
+    7, 9,
+    kCudaExecutionProvider,
+    KernelDefBuilder()
+        .TypeConstraint("T", {DataTypeImpl::GetTensorType<float>(),
+                              DataTypeImpl::GetTensorType<double>(),
+                              DataTypeImpl::GetTensorType<MLFloat16>()})
+        .Alias(0, 0),
+    IdentityOp<true>);
+
 ONNX_OPERATOR_KERNEL_EX(
     Dropout,
     kOnnxDomain,
-    7,
+    10,
     kCudaExecutionProvider,
     KernelDefBuilder()
-        .TypeConstraint("T", {DataTypeImpl::GetTensorType<float>(), DataTypeImpl::GetTensorType<double>(), DataTypeImpl::GetTensorType<MLFloat16>()})
+        .TypeConstraint("T", {DataTypeImpl::GetTensorType<float>(),
+                              DataTypeImpl::GetTensorType<double>(),
+                              DataTypeImpl::GetTensorType<MLFloat16>()})
+        .TypeConstraint("T1", DataTypeImpl::GetTensorType<bool>())
         .Alias(0, 0),
     IdentityOp<true>);
diff --git a/onnxruntime/core/providers/cuda/tensor/identity_op.h b/onnxruntime/core/providers/cuda/tensor/identity_op.h
index d83fd541b3cb6..31dd544030b20 100644
--- a/onnxruntime/core/providers/cuda/tensor/identity_op.h
+++ b/onnxruntime/core/providers/cuda/tensor/identity_op.h
@@ -30,7 +30,18 @@ class IdentityOp final : public CudaKernel {
     }
     if (is_dropout) {
-      context->Output(1, std::vector<int64_t>());
+      Tensor* mask = context->Output(1, shape);
+      // a 'nullptr' returned would make it an unused optional output
+      if (mask != nullptr) {
+        // Opset 7 differs from Opset 10 in that the type of the 'mask'
+        // output is tied to the type of the input in Opset 7, whereas
+        // the type of 'mask' in Opset 10 is always 'bool',
+        // so we use a common solution
+        void* mask_data = mask->MutableDataRaw();
+        // In 'test'/'inference' mode, there are no input values dropped out
+        // so fill the buffer with 0/false
+        CUDA_RETURN_IF_ERROR(cudaMemsetAsync(mask_data, 0, mask->SizeInBytes()));
+      }
     }
     return Status::OK();
diff --git a/onnxruntime/core/session/abi_session_options.cc b/onnxruntime/core/session/abi_session_options.cc
index fadb56745a765..710ab2db8121f 100644
--- a/onnxruntime/core/session/abi_session_options.cc
+++ b/onnxruntime/core/session/abi_session_options.cc
@@ -28,7 +28,7 @@ ORT_API(void, OrtReleaseSessionOptions, OrtSessionOptions* ptr) {
   delete ptr;
 }
-ORT_API_STATUS_IMPL(OrtCloneSessionOptions, OrtSessionOptions* input, OrtSessionOptions** out) {
+ORT_API_STATUS_IMPL(OrtCloneSessionOptions, const OrtSessionOptions* input, OrtSessionOptions** out) {
   API_IMPL_BEGIN
   *out = new OrtSessionOptions(*input);
   return nullptr;
diff --git a/onnxruntime/core/session/custom_ops.cc b/onnxruntime/core/session/custom_ops.cc
index 3605850cfad7a..38b7699aa4d35 100644
--- a/onnxruntime/core/session/custom_ops.cc
+++ b/onnxruntime/core/session/custom_ops.cc
@@ -29,28 +29,28 @@ ORT_API_STATUS_IMPL(OrtKernelInfoGetAttribute_int64, _In_ const OrtKernelInfo* i
   return onnxruntime::ToOrtStatus(status);
 }
-ORT_API_STATUS_IMPL(OrtKernelContext_GetInputCount, const OrtKernelContext* context, _Out_ size_t* out) {
+ORT_API_STATUS_IMPL(OrtKernelContext_GetInputCount, _In_ const OrtKernelContext* context, _Out_ size_t* out) {
   *out = reinterpret_cast<const onnxruntime::OpKernelContext*>(context)->InputCount();
   return nullptr;
 };
-ORT_API_STATUS_IMPL(OrtKernelContext_GetOutputCount, const OrtKernelContext* context, _Out_ size_t* out) {
+ORT_API_STATUS_IMPL(OrtKernelContext_GetOutputCount, _In_ const OrtKernelContext* context, _Out_ size_t* out) {
   *out = reinterpret_cast<const onnxruntime::OpKernelContext*>(context)->OutputCount();
   return nullptr;
 };
-ORT_API_STATUS_IMPL(OrtKernelContext_GetInput, const OrtKernelContext* context, _In_ size_t index, _Out_ const OrtValue** out) {
+ORT_API_STATUS_IMPL(OrtKernelContext_GetInput, _In_ const OrtKernelContext* context, _In_ size_t index, _Out_ const OrtValue** out) {
   *out = reinterpret_cast<const OrtValue*>(reinterpret_cast<const onnxruntime::OpKernelContext*>(context)->GetInputMLValue(index));
   return nullptr;
 };
-ORT_API_STATUS_IMPL(OrtKernelContext_GetOutput, OrtKernelContext* context, _In_ size_t index, _In_ const int64_t* dim_values, size_t dim_count, _Out_ OrtValue** out) {
+ORT_API_STATUS_IMPL(OrtKernelContext_GetOutput, _Inout_ OrtKernelContext* context, _In_ size_t index, _In_ const int64_t* dim_values, size_t dim_count, _Out_ OrtValue** out) {
   onnxruntime::TensorShape shape(dim_values, dim_count);
   *out = reinterpret_cast<OrtValue*>(reinterpret_cast<onnxruntime::OpKernelContext*>(context)->OutputMLValue(index, shape));
   return nullptr;
 };
-ORT_API_STATUS_IMPL(OrtKernelInfoGetAttribute_string, _In_ const OrtKernelInfo* info, _In_ const char* name, _Out_ char* out, _Inout_ size_t *size) {
+ORT_API_STATUS_IMPL(OrtKernelInfoGetAttribute_string, _In_ const OrtKernelInfo* info, _In_ const char* name, _Out_ char* out, _Inout_ size_t* size) {
   std::string value;
   auto status = reinterpret_cast<const onnxruntime::OpKernelInfo*>(info)->GetAttr<std::string>(name, &value);
   if (status.IsOK()) {
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index 98cbb126cbf2e..10ceef943af0d 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -220,7 +220,7 @@ common::Status InferenceSession::Load(std::functionMutableData();
 auto len = static_cast(tensor->Shape().Size());
diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc
--- a/onnxruntime/core/session/onnxruntime_c_api.cc
+++ b/onnxruntime/core/session/onnxruntime_c_api.cc
@@ -365,7 +365,7 @@ ORT_API_STATUS_IMPL(OrtAddCustomOpDomain, _In_ OrtSessionOptions* options, OrtCu
 namespace {
 template <typename Loader>
-OrtStatus* CreateSessionImpl(_In_ OrtEnv* env, _In_ const OrtSessionOptions* options,
+OrtStatus* CreateSessionImpl(_In_ const OrtEnv* env, _In_ const OrtSessionOptions* options,
                              Loader loader, _Outptr_ OrtSession** out) {
   auto sess = std::make_unique<::onnxruntime::InferenceSession>(
       options == nullptr ? onnxruntime::SessionOptions() : options->value, env->loggingManager);
@@ -395,7 +395,7 @@ OrtStatus* CreateSessionImpl(_In_ OrtEnv* env, _In_ const OrtSessionOptions* opt
 }
 }  // namespace
-ORT_API_STATUS_IMPL(OrtCreateSession, _In_ OrtEnv* env, _In_ const ORTCHAR_T* model_path,
+ORT_API_STATUS_IMPL(OrtCreateSession, _In_ const OrtEnv* env, _In_ const ORTCHAR_T* model_path,
                     _In_ const OrtSessionOptions* options, _Outptr_ OrtSession** out) {
   API_IMPL_BEGIN
   const auto loader = [model_path](InferenceSession& sess) {
@@ -405,7 +405,7 @@ ORT_API_STATUS_IMPL(OrtCreateSession, _In_ OrtEnv* env, _In_ const ORTCHAR_T* mo
   API_IMPL_END
 }
-ORT_API_STATUS_IMPL(OrtCreateSessionFromArray, _In_ OrtEnv* env, _In_ const void* model_data, size_t model_data_length,
+ORT_API_STATUS_IMPL(OrtCreateSessionFromArray, _In_ const OrtEnv* env, _In_ const void* model_data, size_t model_data_length,
                     _In_ const OrtSessionOptions* options, _Outptr_ OrtSession** out) {
   API_IMPL_BEGIN
   const auto loader = [model_data, model_data_length](InferenceSession& sess) {
@@ -415,7 +415,7 @@ ORT_API_STATUS_IMPL(OrtCreateSessionFromArray, _In_ OrtEnv* env, _In_ const void
   API_IMPL_END
 }
-ORT_API_STATUS_IMPL(OrtRun, _In_ OrtSession* sess,
+ORT_API_STATUS_IMPL(OrtRun, _Inout_ OrtSession* sess,
                     _In_ const OrtRunOptions* run_options,
                     _In_ const char* const* input_names, _In_ const OrtValue* const* input, size_t input_len,
                     _In_ const char* const* output_names1, size_t output_names_len, _Outptr_ OrtValue** output) {
@@ -477,7 +477,7 @@ ORT_API_STATUS_IMPL(OrtRun, _In_ OrtSession* sess,
   API_IMPL_END
 }
-ORT_API_STATUS_IMPL(OrtGetTensorMutableData, _In_ OrtValue* value, _Outptr_ void** output) {
+ORT_API_STATUS_IMPL(OrtGetTensorMutableData, _Inout_ OrtValue* value, _Outptr_ void** output) {
   TENSOR_READWRITE_API_BEGIN
   //TODO: test if it's a string tensor
   *output = tensor->MutableDataRaw();
@@ -933,7 +933,7 @@ ORT_API_STATUS_IMPL(OrtGetValue, const OrtValue* value, int index, OrtAllocator*
 ///////////////////
 // OrtCreateValue
 template <typename T>
-static OrtStatus* OrtCreateValueImplSeqHelperMap(OrtValue** const in, size_t num_values, OrtValue** out) {
+static OrtStatus* OrtCreateValueImplSeqHelperMap(const OrtValue* const* in, size_t num_values, OrtValue** out) {
   using SeqType = std::vector<T>;
   auto vec_ptr = std::make_unique<SeqType>();
   vec_ptr->reserve(num_values);
@@ -951,7 +951,7 @@ static OrtStatus* OrtCreateValueImplSeqHelperMap(OrtValue** const in, size_t num
 }
 template <typename T>
-static OrtStatus* OrtCreateValueImplSeqHelper(OrtValue** in, size_t num_values, OrtValue** out) {
+static OrtStatus* OrtCreateValueImplSeqHelper(const OrtValue* const* in, size_t num_values, OrtValue** out) {
   using SeqType = std::vector<T>;
   auto vec_ptr = std::make_unique<SeqType>();
   vec_ptr->reserve(num_values);
@@ -972,7 +972,7 @@ static OrtStatus* OrtCreateValueImplSeqHelper(OrtValue** in, size_t num_values,
   return nullptr;
 }
-static OrtStatus* OrtCreateValueImplSeq(OrtValue** in, size_t num_values, OrtValue** out) {
+static OrtStatus* OrtCreateValueImplSeq(const OrtValue* const* in, size_t num_values, OrtValue** out) {
   // We only support limited sequence types. For the sake of simplicity the type of the first
   // OrtValue* in OrtValue** will determine the type of the vector used to create the output OrtValue
   // this type should be either a tensor of limited types or map of limited types
@@ -1069,7 +1069,7 @@ static OrtStatus* OrtCreateValueImplMapHelper(const Tensor& key_tensor, const Te
   }
 }
-static OrtStatus* OrtCreateValueImplMap(OrtValue** in, size_t num_values, OrtValue** out) {
+static OrtStatus* OrtCreateValueImplMap(const OrtValue* const* in, size_t num_values, OrtValue** out) {
   if (num_values != NUM_MAP_INDICES) {
     return OrtCreateStatus(ORT_FAIL, "For map type num_values MUST be 2");
   }
@@ -1102,7 +1102,7 @@ static OrtStatus* OrtCreateValueImplMap(OrtValue** in, size_t num_values, OrtVal
   return OrtCreateStatus(ORT_FAIL, "Key type is not supported yet.");
 }
-static OrtStatus* OrtCreateValueImpl(OrtValue** in, size_t num_values, enum ONNXType value_type, OrtValue** out) {
+static OrtStatus* OrtCreateValueImpl(const OrtValue* const* in, size_t num_values, enum ONNXType value_type, OrtValue** out) {
   if (num_values <= 0) {
     return OrtCreateStatus(ORT_FAIL, "Number of values should be at least 1.");
   }
@@ -1115,7 +1115,7 @@ static OrtStatus* OrtCreateValueImpl(OrtValue** in, size_t num_values, enum ONNX
   return OrtCreateStatus(ORT_FAIL, "Input is not of type sequence or map.");
 }
-ORT_API_STATUS_IMPL(OrtCreateValue, OrtValue** in, size_t num_values, enum ONNXType value_type, OrtValue** out) {
+ORT_API_STATUS_IMPL(OrtCreateValue, const OrtValue* const* in, size_t num_values, enum ONNXType value_type, OrtValue** out) {
   API_IMPL_BEGIN
   return OrtCreateValueImpl(in, num_values, value_type, out);
   API_IMPL_END
 }
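Reviewer note: a sketch (not part of this PR) of how a caller builds a sequence value against the const-corrected OrtCreateValue signature above. `tensor_a` and `tensor_b` are placeholder OrtValue* tensors created elsewhere; error checks are elided:

// Inputs are now taken as const OrtValue* const*, so callers no longer
// need non-const pointers just to assemble a sequence.
const OrtValue* elements[] = {tensor_a, tensor_b};
OrtValue* sequence = nullptr;
OrtStatus* status = OrtCreateValue(elements, 2, ONNX_TYPE_SEQUENCE, &sequence);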
NchwcOptimizerTester(build_test_case, check_nchwc_graph); } +TEST(NchwcOptimizerTests, ShapeInferencing2) { + auto build_test_case = [&](NchwcTestHelper& helper) { + ONNX_NAMESPACE::TypeProto type_proto; + type_proto.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + type_proto.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1); + type_proto.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1); + type_proto.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_param("input_height"); + type_proto.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_param("input_width"); + + auto* input_arg = helper.MakeInput({1, 1, 49, 98}, type_proto); + auto* output_arg = helper.MakeOutput(); + + auto* conv1_output_arg = helper.MakeIntermediate(); + helper.AddConvNode(input_arg, conv1_output_arg, {16, 1, 1, 1}); + + auto* conv2a1_output_arg = helper.MakeIntermediate(); + auto& conv2a1_node = helper.AddConvNode(conv1_output_arg, conv2a1_output_arg, {16, 16, 2, 2}); + conv2a1_node.AddAttribute("pads", std::vector{1, 1, 0, 0}); + conv2a1_node.AddAttribute("strides", std::vector{2, 2}); + + auto* conv2a_output_arg = helper.MakeIntermediate(); + auto& conv2a2_node = helper.AddConvNode(conv2a1_output_arg, conv2a_output_arg, {16, 16, 2, 2}); + conv2a2_node.AddAttribute("auto_pad", "SAME_UPPER"); + + auto* conv2b_output_arg = helper.MakeIntermediate(); + auto& conv2b_node = helper.AddConvNode(conv1_output_arg, conv2b_output_arg, {16, 16, 1, 1}); + conv2b_node.AddAttribute("strides", std::vector{2, 2}); + + helper.AddNode("Add", {conv2a_output_arg, conv2b_output_arg}, {output_arg}); + }; + + auto check_nchwc_graph = [&](NchwcInferenceSession& session) { + auto op_to_count = session.CountOpsInGraph(); + EXPECT_EQ(op_to_count["nchwc.Conv"], 4); + EXPECT_EQ(op_to_count["nchwc.ReorderInput"], 0); + EXPECT_EQ(op_to_count["nchwc.ReorderOutput"], 1); + EXPECT_EQ(op_to_count["Add"], 0); + }; + + // Verify that convolutions using strides of 2 and variable height/width are + // recognized as eligible for Conv/Add fusion. This pattern occurs in models + // such as Faster-RCNN. 
+ NchwcOptimizerTester(build_test_case, check_nchwc_graph); +} + TEST(NchwcOptimizerTests, MixedOutputUsage) { auto build_test_case = [&](NchwcTestHelper& helper) { auto* input_arg = helper.MakeInput({6, 5, 11, 11}); diff --git a/onnxruntime/test/providers/cpu/nn/dropout_op_test.cc b/onnxruntime/test/providers/cpu/nn/dropout_op_test.cc index 7b2e91b95afdf..802aaa84b310e 100644 --- a/onnxruntime/test/providers/cpu/nn/dropout_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/dropout_op_test.cc @@ -23,5 +23,29 @@ TEST(Dropout, Opset10) { test.Run(); } +TEST(Dropout, WithOptionalOutputOpset10) { + OpTester test("Dropout", 10, kOnnxDomain); + std::vector dims{2, 2}; + test.AddInput("X", dims, {1.0f, 2.0f, 3.0f, 5.0f}); + test.AddOutput("Y", dims, {1.0f, 2.0f, 3.0f, 5.0f}); + test.AddOutput("mask", dims, {false, false, false, false}); + // The NGraph execution provider doesn't seem to support 'Dropout' with optional mask output + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kNGraphExecutionProvider}); +} + +TEST(Dropout, WithOptionalOutputOpset7) { + // Opset 7 differs with Opset 10 in that the type of the 'mask' + // output is tied with the type of the input in Opset 7 whereas + // the type of 'mask' in Opset 10 is 'bool' always + OpTester test("Dropout", 7, kOnnxDomain); + std::vector dims{2, 2}; + test.AddInput("X", dims, {1.0f, 2.0f, 3.0f, 5.0f}); + test.AddOutput("Y", dims, {1.0f, 2.0f, 3.0f, 5.0f}); + test.AddOutput("mask", dims, {0.0f, 0.0f, 0.0f, 0.0f}); + // The NGraph execution provider doesn't seem to support 'Dropout' with optional mask output + // The TensorRT execution provider doesn't seem to support 'Dropout' with non-boolean mask output + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kNGraphExecutionProvider, kTensorrtExecutionProvider}); +} + } // namespace test } // namespace onnxruntime diff --git a/tools/ci_build/github/azure-pipelines/azure-pipelines-py-packaging.yml b/tools/ci_build/github/azure-pipelines/azure-pipelines-py-packaging.yml index 751c351a7804d..3ac6d3bc198b7 100644 --- a/tools/ci_build/github/azure-pipelines/azure-pipelines-py-packaging.yml +++ b/tools/ci_build/github/azure-pipelines/azure-pipelines-py-packaging.yml @@ -209,3 +209,45 @@ jobs: displayName: 'Component Detection' - template: templates/clean-agent-build-directory-step.yml + +- job: MacOS_py_Wheels + pool: + vmImage: 'macOS-10.13' + strategy: + matrix: + Python35: + python.version: '3.5' + Python36: + python.version: '3.6' + Python37: + python.version: '3.7' + steps: + - task: CondaEnvironment@1 + inputs: + createCustomEnvironment: true + environmentName: 'py$(python.version)' + packageSpecs: 'python=$(python.version)' + cleanEnvironment: true + + - script: | + sudo python -m pip install numpy==1.15.0 + sudo xcode-select --switch /Applications/Xcode_10.app/Contents/Developer + ./build.sh --config Release --skip_submodule_sync --parallel --use_openmp --build_wheel + displayName: 'Command Line Script' + + - task: CopyFiles@2 + displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' + inputs: + SourceFolder: '$(Build.SourcesDirectory)/build/Linux/Release/dist' + Contents: '*.whl' + TargetFolder: '$(Build.ArtifactStagingDirectory)' + + - task: PublishBuildArtifacts@1 + displayName: 'Publish Artifact: ONNXRuntime python wheel' + inputs: + ArtifactName: onnxruntime + + - task: ms.vss-governance-buildtask.governance-build-task-component-detection.ComponentGovernanceComponentDetection@0 + displayName: 'Component Detection' + + - template: 
diff --git a/tools/ci_build/github/azure-pipelines/azure-pipelines-py-packaging.yml b/tools/ci_build/github/azure-pipelines/azure-pipelines-py-packaging.yml index 751c351a7804d..3ac6d3bc198b7 100644 --- a/tools/ci_build/github/azure-pipelines/azure-pipelines-py-packaging.yml +++ b/tools/ci_build/github/azure-pipelines/azure-pipelines-py-packaging.yml @@ -209,3 +209,45 @@ jobs: displayName: 'Component Detection' - template: templates/clean-agent-build-directory-step.yml + +- job: MacOS_py_Wheels + pool: + vmImage: 'macOS-10.13' + strategy: + matrix: + Python35: + python.version: '3.5' + Python36: + python.version: '3.6' + Python37: + python.version: '3.7' + steps: + - task: CondaEnvironment@1 + inputs: + createCustomEnvironment: true + environmentName: 'py$(python.version)' + packageSpecs: 'python=$(python.version)' + cleanEnvironment: true + + - script: | + sudo python -m pip install numpy==1.15.0 + sudo xcode-select --switch /Applications/Xcode_10.app/Contents/Developer + ./build.sh --config Release --skip_submodule_sync --parallel --use_openmp --build_wheel + displayName: 'Command Line Script' + + - task: CopyFiles@2 + displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' + inputs: + SourceFolder: '$(Build.SourcesDirectory)/build/Linux/Release/dist' + Contents: '*.whl' + TargetFolder: '$(Build.ArtifactStagingDirectory)' + + - task: PublishBuildArtifacts@1 + displayName: 'Publish Artifact: ONNXRuntime python wheel' + inputs: + ArtifactName: onnxruntime + + - task: ms.vss-governance-buildtask.governance-build-task-component-detection.ComponentGovernanceComponentDetection@0 + displayName: 'Component Detection' + + - template: templates/clean-agent-build-directory-step.yml \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/cpu-nocontribops-arm64.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/cpu-nocontribops-arm64.yml index c69c292936d36..3d152e09ed92c 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/cpu-nocontribops-arm64.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/cpu-nocontribops-arm64.yml @@ -51,6 +51,8 @@ jobs: NuPackScript: | mkdir $(Build.BinariesDirectory)\arm64\runtimes\win10-arm\native cd $(Build.BinariesDirectory)\arm64 + copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\onnxruntime.pdb $(Build.BinariesDirectory)\arm64\runtimes\win10-arm\native + copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\onnxruntime.lib $(Build.BinariesDirectory)\arm64\runtimes\win10-arm\native copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\onnxruntime.dll $(Build.BinariesDirectory)\arm64\runtimes\win10-arm\native powershell -Command "Invoke-WebRequest http://stahlworks.com/dev/zip.exe -OutFile zip.exe" zip -r win10-arm.zip runtimes @@ -186,7 +188,9 @@ jobs: move win-x86\runtimes\win-x86\native\onnxruntime.dll %%~ni\runtimes\win-x86\native\onnxruntime.dll move win-x86\runtimes\win-x86\native\onnxruntime.lib %%~ni\runtimes\win-x86\native\onnxruntime.lib move win-x86\runtimes\win-x86\native\onnxruntime.pdb %%~ni\runtimes\win-x86\native\onnxruntime.pdb - move win10-arm\runtimes\win-x64\native\onnxruntime.dll %%~ni\runtimes\win10-arm\native\onnxruntime.dll + move win10-arm\runtimes\win10-arm\native\onnxruntime.lib %%~ni\runtimes\win10-arm\native\onnxruntime.lib + move win10-arm\runtimes\win10-arm\native\onnxruntime.dll %%~ni\runtimes\win10-arm\native\onnxruntime.dll + move win10-arm\runtimes\win10-arm\native\onnxruntime.pdb %%~ni\runtimes\win10-arm\native\onnxruntime.pdb move linux-x64\linux-x64\libonnxruntime.so %%~ni\runtimes\linux-x64\native\libonnxruntime.so move linux-x86\linux-x86\libonnxruntime.so %%~ni\runtimes\linux-x86\native\libonnxruntime.so unzip osx-x64.zip -d osx-x64 diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/cpu.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/cpu.yml index 93c94f35b6786..24805a2264f51 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/cpu.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/cpu.yml @@ -100,6 +100,7 @@ jobs: - MacOS_CI_Dev condition: succeeded() steps: + - task: DownloadPipelineArtifact@0 displayName: 'Download Pipeline Artifact - NuGet' inputs: @@ -107,6 +108,7 @@ targetPath: '$(Build.BinariesDirectory)/nuget-artifact' continueOnError: true + - task: DownloadPipelineArtifact@0 displayName: 'Download Pipeline Artifact - Win-x86' inputs: @@ -150,4 +152,39 @@ jobs: artifactName: 'drop-signed-nuget' targetPath: '$(Build.ArtifactStagingDirectory)' + - template: test_all_os.yml + +- job: Publish_NuGet_Package_And_Report + variables: + - group: Dashboard_MySQL_Secret + pool: + name: Hosted Windows 2019 with VS2019 + # The AzureFileCopy@3 task has a bug: it depends on a particular version of Azure PowerShell + # that is not available on the OnnxRuntime build VMs but is available on the latest hosted agents.
+ # So all the copy/publish jobs run on a hosted agent. + # TODO: install the desired Azure PowerShell version on our VMs or use a later, bug-fixed version of AzureFileCopy + demands: azureps + condition: and (${{ parameters.DoEsrp }}, eq(variables['Build.SourceBranch'], 'refs/heads/master')) + dependsOn: + - NuGet_Test_Win + - NuGet_Test_Linux + - NuGet_Test_MacOS + steps: + + - template: ../../templates/set-version-number-variables-step.yml + - template: upload-binary-sizes-from-nuget-package.yml + parameters: + downloadPath: $(Build.BinariesDirectory)/nuget-artifact/final-package + gitCommitHash: $(OnnxRuntimeGitCommitHashShort) + + - task: AzureFileCopy@3 + displayName: 'Copy Signed NuGet Package to Blob Store' + condition: ne(variables['IsReleaseBuild'], 'true') # release builds have a different package naming scheme + inputs: + sourcePath: '$(Build.BinariesDirectory)/nuget-artifact/final-package/Microsoft.ML.OnnxRuntime.$(OnnxRuntimeVersion)-dev-$(OnnxRuntimeGitCommitHashShort).nupkg' + azureSubscription: 'AIInfraBuildOnnxRuntimeOSS' + destination: azureBlob + storage: ortpackages + containerName: ortpackages + diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/upload-binary-sizes-from-nuget-package.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/upload-binary-sizes-from-nuget-package.yml new file mode 100644 index 0000000000000..2b1e0aca9a537 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/upload-binary-sizes-from-nuget-package.yml @@ -0,0 +1,49 @@ +parameters: + gitCommitHash: '' + downloadPath: $(Build.BinariesDirectory)/nuget-artifact/final-package + +steps: +- task: DownloadPipelineArtifact@0 + displayName: 'Download Pipeline Artifact - Signed NuGet Package' + inputs: + artifactName: 'drop-signed-nuget' + targetPath: '${{ parameters.downloadPath }}' + +- task: UsePythonVersion@0 + inputs: + versionSpec: '3.7' + addToPath: true + architecture: 'x64' + +- task: CmdLine@1 + displayName: 'Install conda modules mysql-connector-python' + inputs: + filename: '%CONDA%\condabin\conda.bat' + arguments: 'install -q --insecure -y mysql-connector-python' + timeoutInMinutes: 10 + +- task: CmdLine@2 + displayName: 'Post binary sizes to the dashboard database using command line' + inputs: + script: | + echo changing directory to artifact download path + pushd "${{ parameters.downloadPath }}" + echo processing nupkg + FOR /R %%i IN (*.nupkg) do ( + echo processing %%~ni.nupkg + copy %%~ni.nupkg %%~ni.zip + echo copied to zip + echo listing lib files in the zip + REM collect the data into a single CSV-formatted file + echo os,arch,build_config,size > binary_size_data.txt + 7z.exe l -slt %%~ni.zip runtimes\linux-x64\native\libonnxruntime.so | findstr /R /C:"^Size = [0-9]*" | for /F "tokens=3" %%a in ('more') do if not "%%a" == "" echo linux,x64,openmp,%%a >> binary_size_data.txt + 7z.exe l -slt %%~ni.zip runtimes\linux-x86\native\libonnxruntime.so | findstr /R /C:"^Size = [0-9]*" | for /F "tokens=3" %%a in ('more') do if not "%%a" == "" echo linux,x86,openmp,%%a >> binary_size_data.txt + 7z.exe l -slt %%~ni.zip runtimes\osx-x64\native\libonnxruntime.dylib | findstr /R /C:"^Size = [0-9]*" | for /F "tokens=3" %%a in ('more') do if not "%%a" == "" echo osx,x64,openmp,%%a >> binary_size_data.txt + 7z.exe l -slt %%~ni.zip runtimes\win-x64\native\onnxruntime.dll | findstr /R /C:"^Size = [0-9]*" | for /F "tokens=3" %%a in ('more') do if not "%%a" == "" echo win,x64,openmp,%%a >> binary_size_data.txt + 7z.exe l -slt %%~ni.zip runtimes\win-x86\native\onnxruntime.dll | findstr /R /C:"^Size = [0-9]*" | for /F "tokens=3" %%a in ('more') do if not "%%a" == "" echo win,x86,openmp,%%a >> binary_size_data.txt + echo calling python script to post to database + %CONDA%\python.exe $(Build.SourcesDirectory)\tools\ci_build\github\windows\post_binary_sizes_to_dashboard.py --commit_hash=${{ parameters.gitCommitHash }} --size_data_file=binary_size_data.txt --build_project=Lotus --build_id=$(Build.BuildId) + ) + + env: + DASHBOARD_MYSQL_ORT_PASSWORD: $(dashboard-mysql-ort-password)
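A .nupkg is an ordinary zip archive, which is what makes the 7z.exe size-listing trick above work. Should the cmd/findstr pipeline become hard to maintain, the same binary_size_data.txt could be produced with Python's standard library. A hypothetical sketch (the dump_binary_sizes helper and its arguments are illustrative, not part of the pipeline):

import csv
import zipfile

# (os, arch, archive member) triples matching the 7z.exe invocations above.
TARGETS = [
    ("linux", "x64", "runtimes/linux-x64/native/libonnxruntime.so"),
    ("linux", "x86", "runtimes/linux-x86/native/libonnxruntime.so"),
    ("osx", "x64", "runtimes/osx-x64/native/libonnxruntime.dylib"),
    ("win", "x64", "runtimes/win-x64/native/onnxruntime.dll"),
    ("win", "x86", "runtimes/win-x86/native/onnxruntime.dll"),
]

def dump_binary_sizes(nupkg_path, out_path="binary_size_data.txt"):
    # ZipInfo.file_size is the uncompressed size of an archive member,
    # the same number 7z.exe reports in its "Size = ..." field.
    with zipfile.ZipFile(nupkg_path) as pkg, open(out_path, "w", newline="") as out:
        writer = csv.writer(out)
        writer.writerow(["os", "arch", "build_config", "size"])
        for os_name, arch, member in TARGETS:
            writer.writerow([os_name, arch, "openmp", pkg.getinfo(member).file_size])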
diff --git a/tools/ci_build/github/azure-pipelines/templates/set-version-number-variables-step.yml b/tools/ci_build/github/azure-pipelines/templates/set-version-number-variables-step.yml index 007d40b7aef0f..dcbdcffb730d8 100644 --- a/tools/ci_build/github/azure-pipelines/templates/set-version-number-variables-step.yml +++ b/tools/ci_build/github/azure-pipelines/templates/set-version-number-variables-step.yml @@ -12,6 +12,10 @@ steps: FOR /F "tokens=* USEBACKQ" %%F IN (`git rev-parse HEAD`) DO ( @echo ##vso[task.setvariable variable=OnnxRuntimeGitCommitHash;]%%F ) + + FOR /F "tokens=* USEBACKQ" %%F IN (`git rev-parse --short HEAD`) DO ( + @echo ##vso[task.setvariable variable=OnnxRuntimeGitCommitHashShort;]%%F + ) workingDirectory: '$(Build.SourcesDirectory)' condition: eq(variables['Agent.OS'], 'Windows_NT') @@ -26,5 +30,8 @@ steps: _OnnxRuntimeGitCommitHash=$(git rev-parse HEAD) echo "##vso[task.setvariable variable=OnnxRuntimeGitCommitHash;]$_OnnxRuntimeGitCommitHash" + _OnnxRuntimeGitCommitHash=$(git rev-parse --short=8 HEAD) + echo "##vso[task.setvariable variable=OnnxRuntimeGitCommitHashShort;]$_OnnxRuntimeGitCommitHash" + workingDirectory: '$(Build.SourcesDirectory)' condition: not(eq(variables['Agent.OS'], 'Windows_NT')) \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index 8ce6984dc55b4..bcd1cb8da9b76 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -13,10 +13,12 @@ parameters: MsbuildArguments: '/m' EnvSetupScript: 'setup_env.bat' CudaVersion: '' + AgentPool: 'Win-CPU' jobs: - job: ${{ parameters.JobName }} timeoutInMinutes: 120 + pool: ${{ parameters.AgentPool }} variables: buildDirectory: '$(Build.BinariesDirectory)' BuildCommand: ${{ parameters.BuildCommand }} diff --git a/tools/ci_build/github/linux/docker/scripts/install_manylinux2010.sh b/tools/ci_build/github/linux/docker/scripts/install_manylinux2010.sh index 664684bd00c45..50c2b9880f719 100755 --- a/tools/ci_build/github/linux/docker/scripts/install_manylinux2010.sh +++ b/tools/ci_build/github/linux/docker/scripts/install_manylinux2010.sh @@ -26,7 +26,7 @@ if [ !
-f /opt/onnxruntime-python/bin/python${PYTHON_VER} ]; then ln -s python /opt/onnxruntime-python/bin/python${PYTHON_VER} fi python -m pip install --upgrade --force-reinstall pip==19.1.1 -python -m pip install --upgrade --force-reinstall numpy==1.16.3 +python -m pip install --upgrade --force-reinstall numpy==1.15.0 python -m pip install --upgrade --force-reinstall requests==2.21.0 python -m pip install --upgrade --force-reinstall wheel==0.31.1 python -m pip install --upgrade --force-reinstall setuptools==41.0.1 diff --git a/tools/ci_build/github/windows/post_binary_sizes_to_dashboard.py b/tools/ci_build/github/windows/post_binary_sizes_to_dashboard.py new file mode 100644 index 0000000000000..7161b6f897457 --- /dev/null +++ b/tools/ci_build/github/windows/post_binary_sizes_to_dashboard.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + + +import argparse +import mysql.connector +import xml.etree.ElementTree as ET +import sys +import os + +def parse_arguments(): + parser = argparse.ArgumentParser(description="ONNXRuntime binary size uploader for dashboard") + parser.add_argument("--commit_hash", help="Full Git commit hash") + parser.add_argument("--build_project", default='Lotus', choices=['Lotus','onnxruntime'], help="Lotus or onnxruntime build project, to construct the build URL") + parser.add_argument("--build_id", help="Build Id") + parser.add_argument("--size_data_file", help="Path to file that contains the binary size data") + + return parser.parse_args() + +# Assumes size_data_file is a CSV file with a header line, containing binary sizes and other attributes +# CSV fields are: +# os,arch,build_config,size +# No empty lines or extra spaces between fields are expected +def get_binary_sizes(size_data_file): + binary_size = [] + with open(size_data_file, 'r') as f: + line = f.readline() + headers = line.strip().split(',') + while line: + line = f.readline() + if not line: + break + linedata = line.strip().split(',') + tablerow = {} + for i in range(len(headers)): + if headers[i] == 'size': + tablerow[headers[i]] = int(linedata[i]) + else: + tablerow[headers[i]] = linedata[i] + binary_size.append(tablerow) + return binary_size + + +def write_to_db(binary_size_data, args): + # connect to database + + cnx = mysql.connector.connect( + user='ort@onnxruntimedashboard', + password=os.environ.get('DASHBOARD_MYSQL_ORT_PASSWORD'), + host='onnxruntimedashboard.mysql.database.azure.com', + database='onnxruntime') + + try: + cursor = cnx.cursor() + + # delete old records + delete_query = ('DELETE FROM onnxruntime.binary_size ' + 'WHERE build_time < DATE_SUB(Now(), INTERVAL 30 DAY);' + ) + + cursor.execute(delete_query) + + # insert current records + for row in binary_size_data: + insert_query = ('INSERT INTO onnxruntime.binary_size ' + '(build_time, build_project, build_id, commit_id, os, arch, build_config, size) ' + 'VALUES (Now(), "%s", "%s", "%s", "%s", "%s", "%s", %d) ' + 'ON DUPLICATE KEY UPDATE ' + 'build_time=Now(), build_project="%s", build_id="%s", size=%d;' + ) % ( + args.build_project, + args.build_id, + args.commit_hash, + row['os'], + row['arch'], + row['build_config'], + row['size'], + + args.build_project, + args.build_id, + row['size'] + ) + cursor.execute(insert_query) + + cnx.commit() + + # # Use below for debugging: + # cursor.execute('select * from onnxruntime.binary_size') + # for r in cursor: + # print(r) + + cursor.close() + cnx.close() + except BaseException as e: + cnx.close() + raise e + + +if
__name__ == "__main__": + try: + args = parse_arguments() + binary_size_data = get_binary_sizes(args.size_data_file) + write_to_db(binary_size_data, args) + except BaseException as e: + print(str(e)) + sys.exit(1) + + +
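As a side note on get_binary_sizes above: Python's standard csv module expresses the same parsing more robustly, since DictReader consumes the header row itself and tolerates quoted fields. A minimal equivalent sketch:

import csv

def get_binary_sizes(size_data_file):
    # Same contract as the hand-rolled parser above: one dict per data row,
    # with the 'size' column converted to int for the database insert.
    rows = []
    with open(size_data_file, 'r', newline='') as f:
        for row in csv.DictReader(f):
            row['size'] = int(row['size'])
            rows.append(row)
    return rows

Similarly, write_to_db could pass a parameterized query to cursor.execute (placeholders plus a tuple of values), letting mysql-connector-python handle quoting instead of building the INSERT statement with % string formatting.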