Skip to content

Commit

Permalink
Add Recognition Metadata (via synth). (#7961)
Browse files Browse the repository at this point in the history
  • Loading branch information
yoshi-automation authored and busunkim96 committed May 14, 2019
1 parent d149f80 commit 6b8eb5f
Show file tree
Hide file tree
Showing 4 changed files with 725 additions and 49 deletions.
90 changes: 90 additions & 0 deletions speech/google/cloud/speech_v1/gapic/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,96 @@ class AudioEncoding(enum.IntEnum):
SPEEX_WITH_HEADER_BYTE = 7


class RecognitionMetadata(object):
    """Namespace holding the enum types that describe recognized audio."""

    class InteractionType(enum.IntEnum):
        """
        Categories of use case that an audio recognition request may fall
        into.
        Attributes:
          INTERACTION_TYPE_UNSPECIFIED (int): The use case is unknown, or it is
          something other than one of the categories below.
          DISCUSSION (int): Several people talking in a conversation or
          discussion, e.g. a meeting with two or more active participants.
          Typically all of the primary speakers share the same room (if they
          do not, see PHONE\_CALL).
          PRESENTATION (int): One or more people lecturing or presenting to
          others, mostly without interruption.
          PHONE_CALL (int): A phone call or video conference between two or
          more people who are not in the same room and are actively
          participating.
          VOICEMAIL (int): A recorded message meant to be listened to by
          another person.
          PROFESSIONALLY_PRODUCED (int): Professionally produced audio, e.g. a
          TV show or podcast.
          VOICE_SEARCH (int): Spoken questions and search queries to be
          transcribed into text.
          VOICE_COMMAND (int): Spoken commands, such as those used to control
          a device.
          DICTATION (int): Speech transcribed to produce a written document,
          e.g. a text message, email, or report.
        """

        INTERACTION_TYPE_UNSPECIFIED = 0
        DISCUSSION = 1
        PRESENTATION = 2
        PHONE_CALL = 3
        VOICEMAIL = 4
        PROFESSIONALLY_PRODUCED = 5
        VOICE_SEARCH = 6
        VOICE_COMMAND = 7
        DICTATION = 8

    class MicrophoneDistance(enum.IntEnum):
        """
        The kinds of microphone-capture setting that describe an audio file.
        Attributes:
          MICROPHONE_DISTANCE_UNSPECIFIED (int): The audio type is not known.
          NEARFIELD (int): Captured by a closely placed microphone (e.g. a
          phone, dictaphone, or handheld microphone); generally, the speaker
          is within 1 meter of the microphone.
          MIDFIELD (int): The speaker is within 3 meters of the microphone.
          FARFIELD (int): The speaker is more than 3 meters from the
          microphone.
        """

        MICROPHONE_DISTANCE_UNSPECIFIED = 0
        NEARFIELD = 1
        MIDFIELD = 2
        FARFIELD = 3

    class OriginalMediaType(enum.IntEnum):
        """
        The kind of media the speech was originally recorded on.
        Attributes:
          ORIGINAL_MEDIA_TYPE_UNSPECIFIED (int): The original media type is
          unknown.
          AUDIO (int): The speech data comes from an audio recording.
          VIDEO (int): The speech data was originally recorded on a video.
        """

        ORIGINAL_MEDIA_TYPE_UNSPECIFIED = 0
        AUDIO = 1
        VIDEO = 2

    class RecordingDeviceType(enum.IntEnum):
        """
        The kind of device the speech was recorded with.
        Attributes:
          RECORDING_DEVICE_TYPE_UNSPECIFIED (int): The recording device is not
          known.
          SMARTPHONE (int): Recorded on a smartphone.
          PC (int): Recorded with a personal computer or tablet.
          PHONE_LINE (int): Recorded over a phone line.
          VEHICLE (int): Recorded in a vehicle.
          OTHER_OUTDOOR_DEVICE (int): Recorded outdoors on some other device.
          OTHER_INDOOR_DEVICE (int): Recorded indoors on some other device.
        """

        RECORDING_DEVICE_TYPE_UNSPECIFIED = 0
        SMARTPHONE = 1
        PC = 2
        PHONE_LINE = 3
        VEHICLE = 4
        OTHER_OUTDOOR_DEVICE = 5
        OTHER_INDOOR_DEVICE = 6


class StreamingRecognizeResponse(object):
class SpeechEventType(enum.IntEnum):
"""
Expand Down
142 changes: 140 additions & 2 deletions speech/google/cloud/speech_v1/proto/cloud_speech.proto
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,7 @@ package google.cloud.speech.v1;

import "google/api/annotations.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/any.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/empty.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";

Expand Down Expand Up @@ -278,6 +276,9 @@ message RecognitionConfig {
// premium feature.
bool enable_automatic_punctuation = 11;

// *Optional* Metadata regarding this request.
RecognitionMetadata metadata = 9;

// *Optional* Which model to select for the given request. Select the model
// best suited to your domain to get best results. If a model is not
// explicitly specified, then we auto-select a model based on the parameters
Expand Down Expand Up @@ -330,6 +331,133 @@ message RecognitionConfig {
bool use_enhanced = 14;
}

// Description of audio data to be recognized.
// NOTE(review): field numbers 2 and 9 are skipped within this message —
// presumably reserved for removed/upstream fields; confirm against the
// canonical googleapis proto before reusing them.
message RecognitionMetadata {
  // Use case categories that the audio recognition request can be described
  // by.
  enum InteractionType {
    // Use case is either unknown or is something other than one of the other
    // values below.
    INTERACTION_TYPE_UNSPECIFIED = 0;

    // Multiple people in a conversation or discussion. For example in a
    // meeting with two or more people actively participating. Typically
    // all the primary people speaking would be in the same room (if not,
    // see PHONE_CALL)
    DISCUSSION = 1;

    // One or more persons lecturing or presenting to others, mostly
    // uninterrupted.
    PRESENTATION = 2;

    // A phone-call or video-conference in which two or more people, who are
    // not in the same room, are actively participating.
    PHONE_CALL = 3;

    // A recorded message intended for another person to listen to.
    VOICEMAIL = 4;

    // Professionally produced audio (e.g. TV show, podcast).
    PROFESSIONALLY_PRODUCED = 5;

    // Transcribe spoken questions and queries into text.
    VOICE_SEARCH = 6;

    // Transcribe voice commands, such as for controlling a device.
    VOICE_COMMAND = 7;

    // Transcribe speech to text to create a written document, such as a
    // text-message, email or report.
    DICTATION = 8;
  }

  // The use case most closely describing the audio content to be recognized.
  InteractionType interaction_type = 1;

  // The industry vertical to which this speech recognition request most
  // closely applies. This is most indicative of the topics contained
  // in the audio. Use the 6-digit NAICS code to identify the industry
  // vertical - see https://www.naics.com/search/.
  uint32 industry_naics_code_of_audio = 3;

  // Enumerates the types of capture settings describing an audio file.
  enum MicrophoneDistance {
    // Audio type is not known.
    MICROPHONE_DISTANCE_UNSPECIFIED = 0;

    // The audio was captured from a closely placed microphone (e.g. phone,
    // dictaphone, or handheld microphone). Generally, the speaker is within
    // 1 meter of the microphone.
    NEARFIELD = 1;

    // The speaker is within 3 meters of the microphone.
    MIDFIELD = 2;

    // The speaker is more than 3 meters away from the microphone.
    FARFIELD = 3;
  }

  // The audio type that most closely describes the audio being recognized.
  MicrophoneDistance microphone_distance = 4;

  // The original media the speech was recorded on.
  enum OriginalMediaType {
    // Unknown original media type.
    ORIGINAL_MEDIA_TYPE_UNSPECIFIED = 0;

    // The speech data is an audio recording.
    AUDIO = 1;

    // The speech data originally recorded on a video.
    VIDEO = 2;
  }

  // The original media the speech was recorded on.
  OriginalMediaType original_media_type = 5;

  // The type of device the speech was recorded with.
  enum RecordingDeviceType {
    // The recording device is unknown.
    RECORDING_DEVICE_TYPE_UNSPECIFIED = 0;

    // Speech was recorded on a smartphone.
    SMARTPHONE = 1;

    // Speech was recorded using a personal computer or tablet.
    PC = 2;

    // Speech was recorded over a phone line.
    PHONE_LINE = 3;

    // Speech was recorded in a vehicle.
    VEHICLE = 4;

    // Speech was recorded outdoors.
    OTHER_OUTDOOR_DEVICE = 5;

    // Speech was recorded indoors.
    OTHER_INDOOR_DEVICE = 6;
  }

  // The type of device the speech was recorded with.
  RecordingDeviceType recording_device_type = 6;

  // The device used to make the recording. Examples 'Nexus 5X' or
  // 'Polycom SoundStation IP 6000' or 'POTS' or 'VoIP' or
  // 'Cardioid Microphone'.
  string recording_device_name = 7;

  // Mime type of the original audio file. For example `audio/m4a`,
  // `audio/x-alaw-basic`, `audio/mp3`, `audio/3gpp`.
  // A list of possible audio mime types is maintained at
  // http://www.iana.org/assignments/media-types/media-types.xhtml#audio
  string original_mime_type = 8;

  // Description of the content. Eg. "Recordings of federal supreme court
  // hearings from 2012".
  string audio_topic = 10;
}

// Provides "hints" to the speech recognizer to favor specific words and phrases
// in the results.
message SpeechContext {
Expand Down Expand Up @@ -504,10 +632,20 @@ message StreamingRecognitionResult {
// The default of 0.0 is a sentinel value indicating `stability` was not set.
float stability = 3;

// Output only. Time offset of the end of this result relative to the
// beginning of the audio.
google.protobuf.Duration result_end_time = 4;

// For multi-channel audio, this is the channel number corresponding to the
// recognized result for the audio from that channel.
// For audio_channel_count = N, its output values can range from '1' to 'N'.
int32 channel_tag = 5;

// Output only. The
// [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of the
// language in this result. This language code was detected to have the most
// likelihood of being spoken in the audio.
string language_code = 6;
}

// A speech recognition result corresponding to a portion of the audio.
Expand Down
Loading

0 comments on commit 6b8eb5f

Please sign in to comment.