diff --git a/ci_build/azure_pipelines/mobile-examples-pipeline.yml b/ci_build/azure_pipelines/mobile-examples-pipeline.yml index 03019beb1d5cb..d04e418f9f3f9 100644 --- a/ci_build/azure_pipelines/mobile-examples-pipeline.yml +++ b/ci_build/azure_pipelines/mobile-examples-pipeline.yml @@ -1,37 +1,49 @@ -jobs: -- job: BasicUsageIos - pool: - vmImage: "macOS-10.15" - - steps: - - task: UsePythonVersion@0 - inputs: - versionSpec: '3.6' - addToPath: true - architecture: 'x64' - - - script: | - set -e - pip install -r ../model/requirements.txt - ../model/gen_model.sh ./OrtBasicUsage/model - workingDirectory: mobile/examples/basic_usage/ios - displayName: "Generate model" - - - task: CocoaPods@0 - inputs: - workingDirectory: 'mobile/examples/basic_usage/ios' - forceRepoUpdate: false - - - task: Xcode@5 - inputs: - actions: 'test' - configuration: 'Debug' - sdk: 'iphonesimulator' - xcWorkspacePath: 'mobile/examples/basic_usage/ios/OrtBasicUsage.xcworkspace' - scheme: 'OrtBasicUsage' - xcodeVersion: 'specifyPath' - xcodeDeveloperDir: '/Applications/Xcode_12.4.app/Contents/Developer' - packageApp: false - destinationPlatformOption: 'iOS' - destinationTypeOption: 'simulators' - destinationSimulators: 'iPhone 8' +jobs: + +# mobile/examples/basic_usage/ios +- job: BasicUsageIos + pool: + vmImage: "macOS-10.15" + + steps: + - template: templates/use-python-step.yml + + - bash: | + set -e + pip install -r ../model/requirements.txt + ../model/gen_model.sh ./OrtBasicUsage/model + workingDirectory: mobile/examples/basic_usage/ios + displayName: "Generate model" + + - script: pod install + workingDirectory: 'mobile/examples/basic_usage/ios' + displayName: "Install CocoaPods pods" + + - template: templates/xcode-build-and-test-step.yml + parameters: + xcWorkspacePath: 'mobile/examples/basic_usage/ios/OrtBasicUsage.xcworkspace' + scheme: 'OrtBasicUsage' + +# mobile/examples/speech_recognition/ios +- job: SpeechRecognitionIos + pool: + vmImage: "macOS-10.15" + + steps: + - template: templates/use-python-step.yml + + - bash: | + set -e + pip install -r ../model/requirements.txt + ../model/gen_model.sh ./SpeechRecognition/model + workingDirectory: mobile/examples/speech_recognition/ios + displayName: "Generate model" + + - script: pod install + workingDirectory: 'mobile/examples/speech_recognition/ios' + displayName: "Install CocoaPods pods" + + - template: templates/xcode-build-and-test-step.yml + parameters: + xcWorkspacePath: 'mobile/examples/speech_recognition/ios/SpeechRecognition.xcworkspace' + scheme: 'SpeechRecognition' diff --git a/ci_build/azure_pipelines/templates/use-python-step.yml b/ci_build/azure_pipelines/templates/use-python-step.yml new file mode 100644 index 0000000000000..3610547254ecd --- /dev/null +++ b/ci_build/azure_pipelines/templates/use-python-step.yml @@ -0,0 +1,7 @@ +steps: +- task: UsePythonVersion@0 + inputs: + versionSpec: '3.6' + addToPath: true + architecture: 'x64' + displayName: "Use Python 3.6" diff --git a/ci_build/azure_pipelines/templates/xcode-build-and-test-step.yml b/ci_build/azure_pipelines/templates/xcode-build-and-test-step.yml new file mode 100644 index 0000000000000..1338916e68953 --- /dev/null +++ b/ci_build/azure_pipelines/templates/xcode-build-and-test-step.yml @@ -0,0 +1,21 @@ +parameters: +- name: xcWorkspacePath + type: string +- name: scheme + type: string + +steps: +- task: Xcode@5 + inputs: + actions: 'test' + configuration: 'Debug' + sdk: 'iphonesimulator' + xcWorkspacePath: '${{ parameters.xcWorkspacePath }}' + scheme: '${{ parameters.scheme }}' + 
xcodeVersion: 'specifyPath' + xcodeDeveloperDir: '/Applications/Xcode_12.4.app/Contents/Developer' + packageApp: false + destinationPlatformOption: 'iOS' + destinationTypeOption: 'simulators' + destinationSimulators: 'iPhone 8' + displayName: "Xcode build and test" diff --git a/mobile/README.md b/mobile/README.md index 500ddcd2aafe1..a48aa8f899881 100644 --- a/mobile/README.md +++ b/mobile/README.md @@ -1,15 +1,43 @@ -# ONNX Runtime Mobile examples +# ONNX Runtime Mobile Examples -The following examples demonstrate how to use ONNX Runtime (ORT) Mobile in mobile applications. +These examples demonstrate how to use ONNX Runtime (ORT) Mobile in mobile applications. -## Basic usage +## General Prerequisites + +These are some general prerequisites. +Examples may specify other requirements if applicable. +Please refer to the instructions for each example. + +### Get the Code + +Clone this repo. + +```bash +git clone https://github.com/microsoft/onnxruntime-inference-examples.git +``` + +### iOS Example Prerequisites + +- Xcode 12.4+ +- CocoaPods +- A valid Apple Developer ID if you want to run the example on a device + +## Examples + +### Basic Usage The example app shows basic usage of the ORT APIs. - [iOS Basic Usage](examples/basic_usage/ios) -## Image classification +### Image Classification The example app uses image classification which is able to continuously classify the objects it sees from the device's camera in real-time and displays the most probable inference results on the screen. - [Android Image Classifier](examples/image_classifications/android) + +### Speech Recognition + +The example app uses speech recognition to transcribe speech from audio recorded by the device. + +- [iOS Speech Recognition](examples/speech_recognition/ios) diff --git a/mobile/examples/basic_usage/ios/Podfile b/mobile/examples/basic_usage/ios/Podfile index 89aff2e6a89fc..ad3801c5b0edc 100644 --- a/mobile/examples/basic_usage/ios/Podfile +++ b/mobile/examples/basic_usage/ios/Podfile @@ -3,7 +3,7 @@ platform :ios, '11.0' target 'OrtBasicUsage' do use_frameworks! - pod 'onnxruntime-mobile-objc', '1.8.0-preview' + pod 'onnxruntime-mobile-objc' target 'OrtBasicUsageTests' do inherit! :search_paths diff --git a/mobile/examples/basic_usage/model/gen_model.sh b/mobile/examples/basic_usage/model/gen_model.sh index 1f4adcfbe15f0..665c8c9b19aba 100755 --- a/mobile/examples/basic_usage/model/gen_model.sh +++ b/mobile/examples/basic_usage/model/gen_model.sh @@ -12,4 +12,3 @@ cd ${OUTPUT_DIR} python3 ${DIR}/single_add_gen.py python3 -m onnxruntime.tools.convert_onnx_models_to_ort . - diff --git a/mobile/examples/basic_usage/model/requirements.txt b/mobile/examples/basic_usage/model/requirements.txt index c98065b9d4b9c..4757a3717e3d8 100644 --- a/mobile/examples/basic_usage/model/requirements.txt +++ b/mobile/examples/basic_usage/model/requirements.txt @@ -1,2 +1,2 @@ -onnx==1.9.0 -onnxruntime==1.8.0 +onnx>=1.9.0 +onnxruntime>=1.8.0 diff --git a/mobile/examples/speech_recognition/ios/Podfile b/mobile/examples/speech_recognition/ios/Podfile new file mode 100644 index 0000000000000..7190ed4ffe162 --- /dev/null +++ b/mobile/examples/speech_recognition/ios/Podfile @@ -0,0 +1,12 @@ +platform :ios, '11.0' + +target 'SpeechRecognition' do + use_frameworks! + + pod 'onnxruntime-mobile-objc' + + target 'SpeechRecognitionTests' do + inherit! 
:search_paths + end + +end diff --git a/mobile/examples/speech_recognition/ios/SpeechRecognition.xcodeproj/project.pbxproj b/mobile/examples/speech_recognition/ios/SpeechRecognition.xcodeproj/project.pbxproj new file mode 100644 index 0000000000000..eb6d8f4084e6a --- /dev/null +++ b/mobile/examples/speech_recognition/ios/SpeechRecognition.xcodeproj/project.pbxproj @@ -0,0 +1,484 @@ +// !$*UTF8*$! +{ + archiveVersion = 1; + classes = { + }; + objectVersion = 51; + objects = { + +/* Begin PBXBuildFile section */ + EFE237D726855E4600234E2C /* SpeechRecognitionApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = EFE237D626855E4600234E2C /* SpeechRecognitionApp.swift */; }; + EFE237D926855E4600234E2C /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = EFE237D826855E4600234E2C /* ContentView.swift */; }; + EFE237DB26855E4B00234E2C /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = EFE237DA26855E4B00234E2C /* Assets.xcassets */; }; + EFE237DE26855E4B00234E2C /* Preview Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = EFE237DD26855E4B00234E2C /* Preview Assets.xcassets */; }; + EFE237E926855E4B00234E2C /* SpeechRecognitionTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = EFE237E826855E4B00234E2C /* SpeechRecognitionTests.swift */; }; + EFE2380226855FB900234E2C /* AudioRecorder.swift in Sources */ = {isa = PBXBuildFile; fileRef = EFE2380126855FB900234E2C /* AudioRecorder.swift */; }; + EFE2380526855FD700234E2C /* SpeechRecognizer.swift in Sources */ = {isa = PBXBuildFile; fileRef = EFE2380426855FD700234E2C /* SpeechRecognizer.swift */; }; + EFE238072685608A00234E2C /* wav2vec2-base-960h.all.ort in Resources */ = {isa = PBXBuildFile; fileRef = EFE238062685608A00234E2C /* wav2vec2-base-960h.all.ort */; }; +/* End PBXBuildFile section */ + +/* Begin PBXContainerItemProxy section */ + EFE237E526855E4B00234E2C /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = EFE237CB26855E4600234E2C /* Project object */; + proxyType = 1; + remoteGlobalIDString = EFE237D226855E4600234E2C; + remoteInfo = SpeechRecognition; + }; +/* End PBXContainerItemProxy section */ + +/* Begin PBXFileReference section */ + EFE237D326855E4600234E2C /* SpeechRecognition.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = SpeechRecognition.app; sourceTree = BUILT_PRODUCTS_DIR; }; + EFE237D626855E4600234E2C /* SpeechRecognitionApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SpeechRecognitionApp.swift; sourceTree = ""; }; + EFE237D826855E4600234E2C /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = ""; }; + EFE237DA26855E4B00234E2C /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; + EFE237DD26855E4B00234E2C /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = ""; }; + EFE237DF26855E4B00234E2C /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; + EFE237E426855E4B00234E2C /* SpeechRecognitionTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = SpeechRecognitionTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; }; + EFE237E826855E4B00234E2C /* SpeechRecognitionTests.swift */ = {isa = PBXFileReference; 
lastKnownFileType = sourcecode.swift; path = SpeechRecognitionTests.swift; sourceTree = ""; }; + EFE237EA26855E4B00234E2C /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; + EFE2380126855FB900234E2C /* AudioRecorder.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = AudioRecorder.swift; sourceTree = ""; }; + EFE2380326855FC800234E2C /* SpeechRecognition-Bridging-Header.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "SpeechRecognition-Bridging-Header.h"; sourceTree = ""; }; + EFE2380426855FD700234E2C /* SpeechRecognizer.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = SpeechRecognizer.swift; sourceTree = ""; }; + EFE238062685608A00234E2C /* wav2vec2-base-960h.all.ort */ = {isa = PBXFileReference; lastKnownFileType = file; name = "wav2vec2-base-960h.all.ort"; path = "model/wav2vec2-base-960h.all.ort"; sourceTree = ""; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + EFE237D026855E4600234E2C /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; + EFE237E126855E4B00234E2C /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + BE5303DF73A9410AA3E0E0EA /* Pods */ = { + isa = PBXGroup; + children = ( + ); + path = Pods; + sourceTree = ""; + }; + EFE237CA26855E4600234E2C = { + isa = PBXGroup; + children = ( + EFE237D526855E4600234E2C /* SpeechRecognition */, + EFE237E726855E4B00234E2C /* SpeechRecognitionTests */, + EFE237D426855E4600234E2C /* Products */, + BE5303DF73A9410AA3E0E0EA /* Pods */, + ); + sourceTree = ""; + }; + EFE237D426855E4600234E2C /* Products */ = { + isa = PBXGroup; + children = ( + EFE237D326855E4600234E2C /* SpeechRecognition.app */, + EFE237E426855E4B00234E2C /* SpeechRecognitionTests.xctest */, + ); + name = Products; + sourceTree = ""; + }; + EFE237D526855E4600234E2C /* SpeechRecognition */ = { + isa = PBXGroup; + children = ( + EFE2380126855FB900234E2C /* AudioRecorder.swift */, + EFE2380426855FD700234E2C /* SpeechRecognizer.swift */, + EFE2380326855FC800234E2C /* SpeechRecognition-Bridging-Header.h */, + EFE237D626855E4600234E2C /* SpeechRecognitionApp.swift */, + EFE237D826855E4600234E2C /* ContentView.swift */, + EFE237DA26855E4B00234E2C /* Assets.xcassets */, + EFE237DF26855E4B00234E2C /* Info.plist */, + EFE238062685608A00234E2C /* wav2vec2-base-960h.all.ort */, + EFE237DC26855E4B00234E2C /* Preview Content */, + ); + path = SpeechRecognition; + sourceTree = ""; + }; + EFE237DC26855E4B00234E2C /* Preview Content */ = { + isa = PBXGroup; + children = ( + EFE237DD26855E4B00234E2C /* Preview Assets.xcassets */, + ); + path = "Preview Content"; + sourceTree = ""; + }; + EFE237E726855E4B00234E2C /* SpeechRecognitionTests */ = { + isa = PBXGroup; + children = ( + EFE237E826855E4B00234E2C /* SpeechRecognitionTests.swift */, + EFE237EA26855E4B00234E2C /* Info.plist */, + ); + path = SpeechRecognitionTests; + sourceTree = ""; + }; +/* End PBXGroup section */ + +/* Begin PBXNativeTarget section */ + EFE237D226855E4600234E2C /* SpeechRecognition */ = { + isa = PBXNativeTarget; + buildConfigurationList = EFE237F826855E4B00234E2C /* Build configuration list for 
PBXNativeTarget "SpeechRecognition" */; + buildPhases = ( + EFE237CF26855E4600234E2C /* Sources */, + EFE237D026855E4600234E2C /* Frameworks */, + EFE237D126855E4600234E2C /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = SpeechRecognition; + productName = SpeechRecognition; + productReference = EFE237D326855E4600234E2C /* SpeechRecognition.app */; + productType = "com.apple.product-type.application"; + }; + EFE237E326855E4B00234E2C /* SpeechRecognitionTests */ = { + isa = PBXNativeTarget; + buildConfigurationList = EFE237FB26855E4B00234E2C /* Build configuration list for PBXNativeTarget "SpeechRecognitionTests" */; + buildPhases = ( + EFE237E026855E4B00234E2C /* Sources */, + EFE237E126855E4B00234E2C /* Frameworks */, + EFE237E226855E4B00234E2C /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + EFE237E626855E4B00234E2C /* PBXTargetDependency */, + ); + name = SpeechRecognitionTests; + productName = SpeechRecognitionTests; + productReference = EFE237E426855E4B00234E2C /* SpeechRecognitionTests.xctest */; + productType = "com.apple.product-type.bundle.unit-test"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + EFE237CB26855E4600234E2C /* Project object */ = { + isa = PBXProject; + attributes = { + LastSwiftUpdateCheck = 1250; + LastUpgradeCheck = 1250; + TargetAttributes = { + EFE237D226855E4600234E2C = { + CreatedOnToolsVersion = 12.5.1; + }; + EFE237E326855E4B00234E2C = { + CreatedOnToolsVersion = 12.5.1; + TestTargetID = EFE237D226855E4600234E2C; + }; + }; + }; + buildConfigurationList = EFE237CE26855E4600234E2C /* Build configuration list for PBXProject "SpeechRecognition" */; + compatibilityVersion = "Xcode 9.3"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = EFE237CA26855E4600234E2C; + productRefGroup = EFE237D426855E4600234E2C /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + EFE237D226855E4600234E2C /* SpeechRecognition */, + EFE237E326855E4B00234E2C /* SpeechRecognitionTests */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + EFE237D126855E4600234E2C /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + EFE238072685608A00234E2C /* wav2vec2-base-960h.all.ort in Resources */, + EFE237DE26855E4B00234E2C /* Preview Assets.xcassets in Resources */, + EFE237DB26855E4B00234E2C /* Assets.xcassets in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; + EFE237E226855E4B00234E2C /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + EFE237CF26855E4600234E2C /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + EFE2380226855FB900234E2C /* AudioRecorder.swift in Sources */, + EFE2380526855FD700234E2C /* SpeechRecognizer.swift in Sources */, + EFE237D926855E4600234E2C /* ContentView.swift in Sources */, + EFE237D726855E4600234E2C /* SpeechRecognitionApp.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; + EFE237E026855E4B00234E2C /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + EFE237E926855E4B00234E2C /* SpeechRecognitionTests.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin 
PBXTargetDependency section */ + EFE237E626855E4B00234E2C /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = EFE237D226855E4600234E2C /* SpeechRecognition */; + targetProxy = EFE237E526855E4B00234E2C /* PBXContainerItemProxy */; + }; +/* End PBXTargetDependency section */ + +/* Begin XCBuildConfiguration section */ + EFE237F626855E4B00234E2C /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; + CLANG_CXX_LIBRARY = "libc++"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + GCC_C_LANGUAGE_STANDARD = gnu11; + GCC_DYNAMIC_NO_PIC = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 14.5; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + MTL_FAST_MATH = YES; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + EFE237F726855E4B00234E2C /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; + CLANG_CXX_LIBRARY = "libc++"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = 
YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_C_LANGUAGE_STANDARD = gnu11; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 14.5; + MTL_ENABLE_DEBUG_INFO = NO; + MTL_FAST_MATH = YES; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + SWIFT_OPTIMIZATION_LEVEL = "-O"; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + EFE237F926855E4B00234E2C /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + DEVELOPMENT_ASSET_PATHS = "\"SpeechRecognition/Preview Content\""; + ENABLE_PREVIEWS = YES; + INFOPLIST_FILE = SpeechRecognition/Info.plist; + IPHONEOS_DEPLOYMENT_TARGET = 14.0; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + PRODUCT_BUNDLE_IDENTIFIER = com.onnxruntime.SpeechRecognition; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_OBJC_BRIDGING_HEADER = "SpeechRecognition/SpeechRecognition-Bridging-Header.h"; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = 1; + }; + name = Debug; + }; + EFE237FA26855E4B00234E2C /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Automatic; + DEVELOPMENT_ASSET_PATHS = "\"SpeechRecognition/Preview Content\""; + ENABLE_PREVIEWS = YES; + INFOPLIST_FILE = SpeechRecognition/Info.plist; + IPHONEOS_DEPLOYMENT_TARGET = 14.0; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + PRODUCT_BUNDLE_IDENTIFIER = com.onnxruntime.SpeechRecognition; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_OBJC_BRIDGING_HEADER = "SpeechRecognition/SpeechRecognition-Bridging-Header.h"; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = 1; + }; + name = Release; + }; + EFE237FC26855E4B00234E2C /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_EMBED_SWIFT_STANDARD_LIBRARIES = YES; + BUNDLE_LOADER = "$(TEST_HOST)"; + CODE_SIGN_STYLE = Automatic; + INFOPLIST_FILE = SpeechRecognitionTests/Info.plist; + IPHONEOS_DEPLOYMENT_TARGET = 14.0; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + "@loader_path/Frameworks", + ); + PRODUCT_BUNDLE_IDENTIFIER = com.onnxruntime.SpeechRecognitionTests; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + TEST_HOST = "$(BUILT_PRODUCTS_DIR)/SpeechRecognition.app/SpeechRecognition"; + }; + name = Debug; + }; + EFE237FD26855E4B00234E2C /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_EMBED_SWIFT_STANDARD_LIBRARIES = YES; + BUNDLE_LOADER = "$(TEST_HOST)"; + CODE_SIGN_STYLE = Automatic; + INFOPLIST_FILE = SpeechRecognitionTests/Info.plist; + IPHONEOS_DEPLOYMENT_TARGET = 14.0; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + "@loader_path/Frameworks", + ); + 
PRODUCT_BUNDLE_IDENTIFIER = com.onnxruntime.SpeechRecognitionTests; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + TEST_HOST = "$(BUILT_PRODUCTS_DIR)/SpeechRecognition.app/SpeechRecognition"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + EFE237CE26855E4600234E2C /* Build configuration list for PBXProject "SpeechRecognition" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + EFE237F626855E4B00234E2C /* Debug */, + EFE237F726855E4B00234E2C /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + EFE237F826855E4B00234E2C /* Build configuration list for PBXNativeTarget "SpeechRecognition" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + EFE237F926855E4B00234E2C /* Debug */, + EFE237FA26855E4B00234E2C /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + EFE237FB26855E4B00234E2C /* Build configuration list for PBXNativeTarget "SpeechRecognitionTests" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + EFE237FC26855E4B00234E2C /* Debug */, + EFE237FD26855E4B00234E2C /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + }; + rootObject = EFE237CB26855E4600234E2C /* Project object */; +} diff --git a/mobile/examples/speech_recognition/ios/SpeechRecognition.xcodeproj/project.xcworkspace/contents.xcworkspacedata b/mobile/examples/speech_recognition/ios/SpeechRecognition.xcodeproj/project.xcworkspace/contents.xcworkspacedata new file mode 100644 index 0000000000000..919434a6254f0 --- /dev/null +++ b/mobile/examples/speech_recognition/ios/SpeechRecognition.xcodeproj/project.xcworkspace/contents.xcworkspacedata @@ -0,0 +1,7 @@ + + + + + diff --git a/mobile/examples/speech_recognition/ios/SpeechRecognition.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist b/mobile/examples/speech_recognition/ios/SpeechRecognition.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist new file mode 100644 index 0000000000000..18d981003d68d --- /dev/null +++ b/mobile/examples/speech_recognition/ios/SpeechRecognition.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist @@ -0,0 +1,8 @@ + + + + + IDEDidComputeMac32BitWarning + + + diff --git a/mobile/examples/speech_recognition/ios/SpeechRecognition/Assets.xcassets/AccentColor.colorset/Contents.json b/mobile/examples/speech_recognition/ios/SpeechRecognition/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 0000000000000..eb87897008164 --- /dev/null +++ b/mobile/examples/speech_recognition/ios/SpeechRecognition/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 +1,11 @@ +{ + "colors" : [ + { + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/mobile/examples/speech_recognition/ios/SpeechRecognition/Assets.xcassets/AppIcon.appiconset/Contents.json b/mobile/examples/speech_recognition/ios/SpeechRecognition/Assets.xcassets/AppIcon.appiconset/Contents.json new file mode 100644 index 0000000000000..9221b9bb1a35f --- /dev/null +++ b/mobile/examples/speech_recognition/ios/SpeechRecognition/Assets.xcassets/AppIcon.appiconset/Contents.json @@ -0,0 +1,98 @@ +{ + "images" : [ + { + "idiom" : "iphone", + "scale" : "2x", + "size" : "20x20" + }, + { + "idiom" : "iphone", + "scale" : "3x", + "size" : "20x20" + }, + { + "idiom" : 
"iphone", + "scale" : "2x", + "size" : "29x29" + }, + { + "idiom" : "iphone", + "scale" : "3x", + "size" : "29x29" + }, + { + "idiom" : "iphone", + "scale" : "2x", + "size" : "40x40" + }, + { + "idiom" : "iphone", + "scale" : "3x", + "size" : "40x40" + }, + { + "idiom" : "iphone", + "scale" : "2x", + "size" : "60x60" + }, + { + "idiom" : "iphone", + "scale" : "3x", + "size" : "60x60" + }, + { + "idiom" : "ipad", + "scale" : "1x", + "size" : "20x20" + }, + { + "idiom" : "ipad", + "scale" : "2x", + "size" : "20x20" + }, + { + "idiom" : "ipad", + "scale" : "1x", + "size" : "29x29" + }, + { + "idiom" : "ipad", + "scale" : "2x", + "size" : "29x29" + }, + { + "idiom" : "ipad", + "scale" : "1x", + "size" : "40x40" + }, + { + "idiom" : "ipad", + "scale" : "2x", + "size" : "40x40" + }, + { + "idiom" : "ipad", + "scale" : "1x", + "size" : "76x76" + }, + { + "idiom" : "ipad", + "scale" : "2x", + "size" : "76x76" + }, + { + "idiom" : "ipad", + "scale" : "2x", + "size" : "83.5x83.5" + }, + { + "idiom" : "ios-marketing", + "scale" : "1x", + "size" : "1024x1024" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/mobile/examples/speech_recognition/ios/SpeechRecognition/Assets.xcassets/Contents.json b/mobile/examples/speech_recognition/ios/SpeechRecognition/Assets.xcassets/Contents.json new file mode 100644 index 0000000000000..73c00596a7fca --- /dev/null +++ b/mobile/examples/speech_recognition/ios/SpeechRecognition/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/mobile/examples/speech_recognition/ios/SpeechRecognition/AudioRecorder.swift b/mobile/examples/speech_recognition/ios/SpeechRecognition/AudioRecorder.swift new file mode 100644 index 0000000000000..3846727acec67 --- /dev/null +++ b/mobile/examples/speech_recognition/ios/SpeechRecognition/AudioRecorder.swift @@ -0,0 +1,130 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +import AVFoundation +import Foundation + +private let kSampleRate: Int = 16000 +private let kRecordingDuration: TimeInterval = 10 + +class AudioRecorder { + typealias RecordingBufferAndData = (buffer: AVAudioBuffer, data: Data) + typealias RecordResult = Result + typealias RecordingDoneCallback = (RecordResult) -> Void + + enum AudioRecorderError: Error { + case Error(message: String) + } + + func record(callback: @escaping RecordingDoneCallback) { + let session = AVAudioSession.sharedInstance() + session.requestRecordPermission { allowed in + do { + guard allowed else { + throw AudioRecorderError.Error(message: "Recording permission denied.") + } + + try session.setCategory(.record) + try session.setActive(true) + + let tempDir = FileManager.default.temporaryDirectory + + let recordingUrl = tempDir.appendingPathComponent("recording.wav") + + let formatSettings: [String: Any] = [ + AVFormatIDKey: kAudioFormatLinearPCM, + AVSampleRateKey: kSampleRate, + AVNumberOfChannelsKey: 1, + AVLinearPCMBitDepthKey: 16, + AVLinearPCMIsBigEndianKey: false, + AVLinearPCMIsFloatKey: false, + AVEncoderAudioQualityKey: AVAudioQuality.high.rawValue, + ] + + let recorder = try AVAudioRecorder(url: recordingUrl, settings: formatSettings) + self.recorder = recorder + + let delegate = RecorderDelegate(callback: callback) + recorder.delegate = delegate + self.recorderDelegate = delegate + + guard recorder.record(forDuration: kRecordingDuration) else { + throw AudioRecorderError.Error(message: "Failed to record.") + } + + // control should resume in recorder.delegate.audioRecorderDidFinishRecording() + } catch { + callback(.failure(error)) + } + } + } + + private var recorderDelegate: RecorderDelegate? + private var recorder: AVAudioRecorder? + + private class RecorderDelegate: NSObject, AVAudioRecorderDelegate { + private let callback: RecordingDoneCallback + + init(callback: @escaping RecordingDoneCallback) { + self.callback = callback + } + + func audioRecorderDidFinishRecording( + _ recorder: AVAudioRecorder, + successfully flag: Bool + ) { + let recordResult = RecordResult { () -> RecordingBufferAndData in + guard flag else { + throw AudioRecorderError.Error(message: "Recording was unsuccessful.") + } + + let recordingUrl = recorder.url + let recordingFile = try AVAudioFile(forReading: recordingUrl) + + guard + let format = AVAudioFormat( + commonFormat: .pcmFormatFloat32, + sampleRate: recordingFile.fileFormat.sampleRate, + channels: 1, + interleaved: false) + else { + throw AudioRecorderError.Error(message: "Failed to create audio format.") + } + + guard + let recordingBuffer = AVAudioPCMBuffer( + pcmFormat: format, + frameCapacity: AVAudioFrameCount(recordingFile.length)) + else { + throw AudioRecorderError.Error(message: "Failed to create audio buffer.") + } + + try recordingFile.read(into: recordingBuffer) + + guard let recordingFloatChannelData = recordingBuffer.floatChannelData else { + throw AudioRecorderError.Error(message: "Failed to get float channel data.") + } + + let recordingData = Data( + bytesNoCopy: recordingFloatChannelData[0], + count: Int(recordingBuffer.frameLength) * MemoryLayout.size, + deallocator: .none) + + return (recordingBuffer, recordingData) + } + + callback(recordResult) + } + + func audioRecorderEncodeErrorDidOccur( + _ recorder: AVAudioRecorder, + error: Error? 
+ ) { + if let error = error { + callback(.failure(error)) + } else { + callback(.failure(AudioRecorderError.Error(message: "Encoding was unsuccessful."))) + } + } + } +} diff --git a/mobile/examples/speech_recognition/ios/SpeechRecognition/ContentView.swift b/mobile/examples/speech_recognition/ios/SpeechRecognition/ContentView.swift new file mode 100644 index 0000000000000..105bf935aecc4 --- /dev/null +++ b/mobile/examples/speech_recognition/ios/SpeechRecognition/ContentView.swift @@ -0,0 +1,61 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +import SwiftUI + +struct ContentView: View { + private let audioRecorder = AudioRecorder() + private let speechRecognizer = try! SpeechRecognizer() + + @State private var message: String = "" + @State private var successful: Bool = true + + @State private var readyToRecord: Bool = true + + private func recordAndRecognize() { + audioRecorder.record { recordResult in + let recognizeResult = recordResult.flatMap { recordingBufferAndData in + return speechRecognizer.evaluate(inputData: recordingBufferAndData.data) + } + endRecordAndRecognize(recognizeResult) + } + } + + private func endRecordAndRecognize(_ result: Result) { + DispatchQueue.main.async { + switch result { + case .success(let transcription): + message = transcription + successful = true + case .failure(let error): + message = "Error: \(error)" + successful = false + } + readyToRecord = true + } + } + + var body: some View { + VStack { + Text("Press \"Record\", say something, and get recognized!") + .padding() + + Button("Record") { + readyToRecord = false + recordAndRecognize() + } + .padding() + .disabled(!readyToRecord) + + Text("\(message)") + .foregroundColor(successful ? .none : .red) + .padding() + } + } +} + +struct ContentView_Previews: PreviewProvider { + static var previews: some View { + ContentView() + } +} diff --git a/mobile/examples/speech_recognition/ios/SpeechRecognition/Info.plist b/mobile/examples/speech_recognition/ios/SpeechRecognition/Info.plist new file mode 100644 index 0000000000000..b4f95e235e355 --- /dev/null +++ b/mobile/examples/speech_recognition/ios/SpeechRecognition/Info.plist @@ -0,0 +1,52 @@ + + + + + CFBundleDevelopmentRegion + $(DEVELOPMENT_LANGUAGE) + CFBundleExecutable + $(EXECUTABLE_NAME) + CFBundleIdentifier + $(PRODUCT_BUNDLE_IDENTIFIER) + CFBundleInfoDictionaryVersion + 6.0 + CFBundleName + $(PRODUCT_NAME) + CFBundlePackageType + $(PRODUCT_BUNDLE_PACKAGE_TYPE) + CFBundleShortVersionString + 1.0 + CFBundleVersion + 1 + LSRequiresIPhoneOS + + UIApplicationSceneManifest + + UIApplicationSupportsMultipleScenes + + + UIApplicationSupportsIndirectInputEvents + + UILaunchScreen + + UIRequiredDeviceCapabilities + + armv7 + + UISupportedInterfaceOrientations + + UIInterfaceOrientationPortrait + UIInterfaceOrientationLandscapeLeft + UIInterfaceOrientationLandscapeRight + + UISupportedInterfaceOrientations~ipad + + UIInterfaceOrientationPortrait + UIInterfaceOrientationPortraitUpsideDown + UIInterfaceOrientationLandscapeLeft + UIInterfaceOrientationLandscapeRight + + NSMicrophoneUsageDescription + Audio is recorded for speech recognition. 
+ + diff --git a/mobile/examples/speech_recognition/ios/SpeechRecognition/Preview Content/Preview Assets.xcassets/Contents.json b/mobile/examples/speech_recognition/ios/SpeechRecognition/Preview Content/Preview Assets.xcassets/Contents.json new file mode 100644 index 0000000000000..73c00596a7fca --- /dev/null +++ b/mobile/examples/speech_recognition/ios/SpeechRecognition/Preview Content/Preview Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/mobile/examples/speech_recognition/ios/SpeechRecognition/SpeechRecognition-Bridging-Header.h b/mobile/examples/speech_recognition/ios/SpeechRecognition/SpeechRecognition-Bridging-Header.h new file mode 100644 index 0000000000000..021bfc4d277e7 --- /dev/null +++ b/mobile/examples/speech_recognition/ios/SpeechRecognition/SpeechRecognition-Bridging-Header.h @@ -0,0 +1,4 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#import diff --git a/mobile/examples/speech_recognition/ios/SpeechRecognition/SpeechRecognitionApp.swift b/mobile/examples/speech_recognition/ios/SpeechRecognition/SpeechRecognitionApp.swift new file mode 100644 index 0000000000000..1a00df92b56d5 --- /dev/null +++ b/mobile/examples/speech_recognition/ios/SpeechRecognition/SpeechRecognitionApp.swift @@ -0,0 +1,13 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +import SwiftUI + +@main +struct SpeechRecognitionApp: App { + var body: some Scene { + WindowGroup { + ContentView() + } + } +} diff --git a/mobile/examples/speech_recognition/ios/SpeechRecognition/SpeechRecognizer.swift b/mobile/examples/speech_recognition/ios/SpeechRecognition/SpeechRecognizer.swift new file mode 100644 index 0000000000000..799c46f9270a0 --- /dev/null +++ b/mobile/examples/speech_recognition/ios/SpeechRecognition/SpeechRecognizer.swift @@ -0,0 +1,97 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +import Foundation + +// these labels correspond to the model's output values +// the labels and postprocessing logic were copied and adapted from: +// https://github.com/pytorch/ios-demo-app/blob/f2b9aa196821c136d3299b99c5dd592de1fa1776/SpeechRecognition/create_wav2vec2.py#L10 +private let kLabels = [ + "", "", "", "", "|", "E", "T", "A", "O", "N", "I", "H", "S", "R", "D", "L", "U", "M", "W", "C", "F", + "G", "Y", "P", "B", "V", "K", "'", "X", "J", "Q", "Z", +] + +class SpeechRecognizer { + private let ortEnv: ORTEnv + private let ortSession: ORTSession + + enum SpeechRecognizerError: Error { + case Error(_ message: String) + } + + init() throws { + ortEnv = try ORTEnv(loggingLevel: ORTLoggingLevel.warning) + guard let modelPath = Bundle.main.path(forResource: "wav2vec2-base-960h.all", ofType: "ort") else { + throw SpeechRecognizerError.Error("Failed to find model file.") + } + ortSession = try ORTSession(env: ortEnv, modelPath: modelPath, sessionOptions: nil) + } + + private func postprocess(modelOutput: UnsafeBufferPointer) -> String { + func maxIndex(_ values: S) -> Int? where S: Sequence, S.Element == Float { + var max: (idx: Int, value: Float)? 
+ for (idx, value) in values.enumerated() { + if max == nil || value > max!.value { + max = (idx, value) + } + } + return max?.idx + } + + func labelIndexToOutput(_ index: Int) -> String { + if index == 4 { + return " " + } else if index > 4 && index < kLabels.count { + return kLabels[index] + } + return "" + } + + precondition(modelOutput.count % kLabels.count == 0) + let n = modelOutput.count / kLabels.count + var resultLabelIndices: [Int] = [] + + for i in 0.. Result { + return Result { () -> String in + let inputShape: [NSNumber] = [1, inputData.count / MemoryLayout.stride as NSNumber] + let input = try ORTValue( + tensorData: NSMutableData(data: inputData), + elementType: ORTTensorElementDataType.float, + shape: inputShape) + + let startTime = DispatchTime.now() + let outputs = try ortSession.run( + withInputs: ["input": input], + outputNames: ["output"], + runOptions: nil) + let endTime = DispatchTime.now() + print("ORT session run time: \(Float(endTime.uptimeNanoseconds - startTime.uptimeNanoseconds) / 1.0e6) ms") + + guard let output = outputs["output"] else { + throw SpeechRecognizerError.Error("Failed to get model output.") + } + + let outputData = try output.tensorData() as Data + let result = outputData.withUnsafeBytes { (buffer: UnsafeRawBufferPointer) -> String in + let floatBuffer = buffer.bindMemory(to: Float.self) + return postprocess(modelOutput: floatBuffer) + } + + print("result: '\(result)'") + return result + } + } +} diff --git a/mobile/examples/speech_recognition/ios/SpeechRecognitionTests/Info.plist b/mobile/examples/speech_recognition/ios/SpeechRecognitionTests/Info.plist new file mode 100644 index 0000000000000..64d65ca495770 --- /dev/null +++ b/mobile/examples/speech_recognition/ios/SpeechRecognitionTests/Info.plist @@ -0,0 +1,22 @@ + + + + + CFBundleDevelopmentRegion + $(DEVELOPMENT_LANGUAGE) + CFBundleExecutable + $(EXECUTABLE_NAME) + CFBundleIdentifier + $(PRODUCT_BUNDLE_IDENTIFIER) + CFBundleInfoDictionaryVersion + 6.0 + CFBundleName + $(PRODUCT_NAME) + CFBundlePackageType + $(PRODUCT_BUNDLE_PACKAGE_TYPE) + CFBundleShortVersionString + 1.0 + CFBundleVersion + 1 + + diff --git a/mobile/examples/speech_recognition/ios/SpeechRecognitionTests/SpeechRecognitionTests.swift b/mobile/examples/speech_recognition/ios/SpeechRecognitionTests/SpeechRecognitionTests.swift new file mode 100644 index 0000000000000..01fda60d1c1ab --- /dev/null +++ b/mobile/examples/speech_recognition/ios/SpeechRecognitionTests/SpeechRecognitionTests.swift @@ -0,0 +1,14 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +import XCTest + +@testable import SpeechRecognition + +class SpeechRecognitionTests: XCTestCase { + func testModelLoadsAndRuns() throws { + let recognizer = try SpeechRecognizer() + let dummyData = Data(count: 16000 * MemoryLayout.size) + _ = try recognizer.evaluate(inputData: dummyData).get() + } +} diff --git a/mobile/examples/speech_recognition/ios/images/screenshot.png b/mobile/examples/speech_recognition/ios/images/screenshot.png new file mode 100644 index 0000000000000..c1fce9f9e6688 Binary files /dev/null and b/mobile/examples/speech_recognition/ios/images/screenshot.png differ diff --git a/mobile/examples/speech_recognition/ios/readme.md b/mobile/examples/speech_recognition/ios/readme.md new file mode 100644 index 0000000000000..b2c858fa0b8dd --- /dev/null +++ b/mobile/examples/speech_recognition/ios/readme.md @@ -0,0 +1,41 @@ +# iOS Speech Recognition Example + +This example shows how to use ORT to do speech recognition using the [Wav2Vec 2.0](https://huggingface.co/transformers/model_doc/wav2vec2.html) model. + +It is heavily inspired by [this PyTorch example](https://github.com/pytorch/ios-demo-app/tree/f2b9aa196821c136d3299b99c5dd592de1fa1776/SpeechRecognition). + +The application lets the user make an audio recording, then recognizes the speech from that recording and displays a transcript. + +![Screenshot](images/screenshot.png) + +## Set up + +### Prerequisites + +See the general prerequisites [here](../../../README.md#General-Prerequisites). + +Additionally, you will need to be able to record audio, either on a simulator or a device. + +### Generate the model + +The model should be generated in this location: `/SpeechRecognition/model` + +See instructions [here](../model/readme.md) for how to generate the model. + +For example, with the model generation script dependencies installed, from this directory, run: + +```bash +../model/gen_model.sh ./SpeechRecognition/model +``` + +### Install the Pod dependencies + +From this directory, run: + +```bash +pod install +``` + +## Build and run + +Open the generated SpeechRecognition.xcworkspace file in Xcode to build and run the example. diff --git a/mobile/examples/speech_recognition/model/gen_model.sh b/mobile/examples/speech_recognition/model/gen_model.sh new file mode 100755 index 0000000000000..368b6d28adf36 --- /dev/null +++ b/mobile/examples/speech_recognition/model/gen_model.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +set -e + +OUTPUT_DIR=${1:?"Please specify an output directory."} + +# Get directory this script is in +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +mkdir -p ${OUTPUT_DIR} +cd ${OUTPUT_DIR} + +python3 ${DIR}/wav2vec2_gen.py +python3 -m onnxruntime.tools.convert_onnx_models_to_ort . diff --git a/mobile/examples/speech_recognition/model/readme.md b/mobile/examples/speech_recognition/model/readme.md new file mode 100644 index 0000000000000..852d439b3c315 --- /dev/null +++ b/mobile/examples/speech_recognition/model/readme.md @@ -0,0 +1,31 @@ +# Wav2Vec 2.0 + +This example uses the [Wav2Vec 2.0](https://huggingface.co/transformers/model_doc/wav2vec2.html) model for speech recognition. + +The model generation script was adapted from [this PyTorch example script](https://github.com/pytorch/ios-demo-app/blob/f2b9aa196821c136d3299b99c5dd592de1fa1776/SpeechRecognition/create_wav2vec2.py). + +## How to generate the model + +### Install the Python requirements + +It is a good idea to use a separate Python environment instead of the system Python. +E.g., a new Conda environment. 
+ +Run: + +```bash +python3 -m pip install -r /requirements.txt +``` + +### Run the model generation script + +Run: + +```bash +/gen_model.sh +``` + +The model will be generated in the given output directory. + +In particular, .onnx and .ort model files will be generated. +The .ort model file can be used by ONNX Runtime Mobile. diff --git a/mobile/examples/speech_recognition/model/requirements.txt b/mobile/examples/speech_recognition/model/requirements.txt new file mode 100644 index 0000000000000..e65de2c8151b9 --- /dev/null +++ b/mobile/examples/speech_recognition/model/requirements.txt @@ -0,0 +1,5 @@ +onnx>=1.9.0 +onnxruntime>=1.8.0 +torch==1.9.0 +torchaudio==0.9.0 +transformers==4.6.1 diff --git a/mobile/examples/speech_recognition/model/wav2vec2_gen.py b/mobile/examples/speech_recognition/model/wav2vec2_gen.py new file mode 100644 index 0000000000000..c92704c345e06 --- /dev/null +++ b/mobile/examples/speech_recognition/model/wav2vec2_gen.py @@ -0,0 +1,22 @@ +# this script was adapted from here: +# https://github.com/pytorch/ios-demo-app/blob/f2b9aa196821c136d3299b99c5dd592de1fa1776/SpeechRecognition/create_wav2vec2.py + +import torch +from torchaudio.models.wav2vec2.utils.import_huggingface import import_huggingface_model +from transformers import Wav2Vec2ForCTC + +# Load Wav2Vec2 pretrained model from Hugging Face Hub +model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h") + +# Convert the model to torchaudio format +model = import_huggingface_model(model) + +model = model.eval() + +input = torch.zeros(1, 1024) + +torch.onnx.export(model, input, "wav2vec2-base-960h.onnx", + input_names=["input"], + output_names=["output"], + dynamic_axes={"input": [1], "output": [1]}, + opset_version=13)