Skip to content

Commit

Permalink
Add Recognition Metadata (via synth). (#7961)
Browse files Browse the repository at this point in the history
  • Loading branch information
yoshi-automation authored and busunkim96 committed May 14, 2019
1 parent d149f80 commit 6b8eb5f
Show file tree
Hide file tree
Showing 4 changed files with 725 additions and 49 deletions.
90 changes: 90 additions & 0 deletions speech/google/cloud/speech_v1/gapic/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,96 @@ class AudioEncoding(enum.IntEnum):
SPEEX_WITH_HEADER_BYTE = 7


class RecognitionMetadata(object):
    """Namespace holding the enum types that describe recognized audio."""

    class InteractionType(enum.IntEnum):
        """
        Categories of use case that an audio recognition request may fall
        into.
        Attributes:
          INTERACTION_TYPE_UNSPECIFIED (int): The use case is unknown, or it is
          something other than one of the categories below.
          DISCUSSION (int): Several people talking in a conversation or
          discussion, e.g. a meeting with two or more active participants.
          Typically all of the primary speakers share the same room (if they
          do not, see PHONE\_CALL).
          PRESENTATION (int): One or more people lecturing or presenting to
          others, mostly without interruption.
          PHONE_CALL (int): A phone call or video conference between two or
          more people who are not in the same room and are actively
          participating.
          VOICEMAIL (int): A recorded message meant to be listened to by
          another person.
          PROFESSIONALLY_PRODUCED (int): Professionally produced audio, e.g. a
          TV show or podcast.
          VOICE_SEARCH (int): Spoken questions and search queries to be
          transcribed into text.
          VOICE_COMMAND (int): Spoken commands, such as those used to control
          a device.
          DICTATION (int): Speech transcribed to produce a written document,
          e.g. a text message, email, or report.
        """

        INTERACTION_TYPE_UNSPECIFIED = 0
        DISCUSSION = 1
        PRESENTATION = 2
        PHONE_CALL = 3
        VOICEMAIL = 4
        PROFESSIONALLY_PRODUCED = 5
        VOICE_SEARCH = 6
        VOICE_COMMAND = 7
        DICTATION = 8

    class MicrophoneDistance(enum.IntEnum):
        """
        The kinds of microphone-capture setting that describe an audio file.
        Attributes:
          MICROPHONE_DISTANCE_UNSPECIFIED (int): The audio type is not known.
          NEARFIELD (int): Captured by a closely placed microphone (e.g. a
          phone, dictaphone, or handheld microphone); generally, the speaker
          is within 1 meter of the microphone.
          MIDFIELD (int): The speaker is within 3 meters of the microphone.
          FARFIELD (int): The speaker is more than 3 meters from the
          microphone.
        """

        MICROPHONE_DISTANCE_UNSPECIFIED = 0
        NEARFIELD = 1
        MIDFIELD = 2
        FARFIELD = 3

    class OriginalMediaType(enum.IntEnum):
        """
        The kind of media the speech was originally recorded on.
        Attributes:
          ORIGINAL_MEDIA_TYPE_UNSPECIFIED (int): The original media type is
          unknown.
          AUDIO (int): The speech data comes from an audio recording.
          VIDEO (int): The speech data was originally recorded on a video.
        """

        ORIGINAL_MEDIA_TYPE_UNSPECIFIED = 0
        AUDIO = 1
        VIDEO = 2

    class RecordingDeviceType(enum.IntEnum):
        """
        The kind of device the speech was recorded with.
        Attributes:
          RECORDING_DEVICE_TYPE_UNSPECIFIED (int): The recording device is not
          known.
          SMARTPHONE (int): Recorded on a smartphone.
          PC (int): Recorded with a personal computer or tablet.
          PHONE_LINE (int): Recorded over a phone line.
          VEHICLE (int): Recorded in a vehicle.
          OTHER_OUTDOOR_DEVICE (int): Recorded outdoors on some other device.
          OTHER_INDOOR_DEVICE (int): Recorded indoors on some other device.
        """

        RECORDING_DEVICE_TYPE_UNSPECIFIED = 0
        SMARTPHONE = 1
        PC = 2
        PHONE_LINE = 3
        VEHICLE = 4
        OTHER_OUTDOOR_DEVICE = 5
        OTHER_INDOOR_DEVICE = 6


class StreamingRecognizeResponse(object):
class SpeechEventType(enum.IntEnum):
"""
Expand Down
142 changes: 140 additions & 2 deletions speech/google/cloud/speech_v1/proto/cloud_speech.proto
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,7 @@ package google.cloud.speech.v1;

import "google/api/annotations.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/any.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/empty.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";

Expand Down Expand Up @@ -278,6 +276,9 @@ message RecognitionConfig {
// premium feature.
bool enable_automatic_punctuation = 11;

// *Optional* Metadata regarding this request.
RecognitionMetadata metadata = 9;

// *Optional* Which model to select for the given request. Select the model
// best suited to your domain to get best results. If a model is not
// explicitly specified, then we auto-select a model based on the parameters
Expand Down Expand Up @@ -330,6 +331,133 @@ message RecognitionConfig {
bool use_enhanced = 14;
}

// Description of audio data to be recognized.
// NOTE(review): field numbers 2 and 9 are skipped within this message —
// presumably reserved for removed/upstream fields; confirm against the
// canonical googleapis proto before reusing them.
message RecognitionMetadata {
  // Use case categories that the audio recognition request can be described
  // by.
  enum InteractionType {
    // Use case is either unknown or is something other than one of the other
    // values below.
    INTERACTION_TYPE_UNSPECIFIED = 0;

    // Multiple people in a conversation or discussion. For example in a
    // meeting with two or more people actively participating. Typically
    // all the primary people speaking would be in the same room (if not,
    // see PHONE_CALL)
    DISCUSSION = 1;

    // One or more persons lecturing or presenting to others, mostly
    // uninterrupted.
    PRESENTATION = 2;

    // A phone-call or video-conference in which two or more people, who are
    // not in the same room, are actively participating.
    PHONE_CALL = 3;

    // A recorded message intended for another person to listen to.
    VOICEMAIL = 4;

    // Professionally produced audio (e.g. TV show, podcast).
    PROFESSIONALLY_PRODUCED = 5;

    // Transcribe spoken questions and queries into text.
    VOICE_SEARCH = 6;

    // Transcribe voice commands, such as for controlling a device.
    VOICE_COMMAND = 7;

    // Transcribe speech to text to create a written document, such as a
    // text-message, email or report.
    DICTATION = 8;
  }

  // The use case most closely describing the audio content to be recognized.
  InteractionType interaction_type = 1;

  // The industry vertical to which this speech recognition request most
  // closely applies. This is most indicative of the topics contained
  // in the audio. Use the 6-digit NAICS code to identify the industry
  // vertical - see https://www.naics.com/search/.
  uint32 industry_naics_code_of_audio = 3;

  // Enumerates the types of capture settings describing an audio file.
  enum MicrophoneDistance {
    // Audio type is not known.
    MICROPHONE_DISTANCE_UNSPECIFIED = 0;

    // The audio was captured from a closely placed microphone (e.g. phone,
    // dictaphone, or handheld microphone). Generally, the speaker is within
    // 1 meter of the microphone.
    NEARFIELD = 1;

    // The speaker is within 3 meters of the microphone.
    MIDFIELD = 2;

    // The speaker is more than 3 meters away from the microphone.
    FARFIELD = 3;
  }

  // The audio type that most closely describes the audio being recognized.
  MicrophoneDistance microphone_distance = 4;

  // The original media the speech was recorded on.
  enum OriginalMediaType {
    // Unknown original media type.
    ORIGINAL_MEDIA_TYPE_UNSPECIFIED = 0;

    // The speech data is an audio recording.
    AUDIO = 1;

    // The speech data originally recorded on a video.
    VIDEO = 2;
  }

  // The original media the speech was recorded on.
  OriginalMediaType original_media_type = 5;

  // The type of device the speech was recorded with.
  enum RecordingDeviceType {
    // The recording device is unknown.
    RECORDING_DEVICE_TYPE_UNSPECIFIED = 0;

    // Speech was recorded on a smartphone.
    SMARTPHONE = 1;

    // Speech was recorded using a personal computer or tablet.
    PC = 2;

    // Speech was recorded over a phone line.
    PHONE_LINE = 3;

    // Speech was recorded in a vehicle.
    VEHICLE = 4;

    // Speech was recorded outdoors.
    OTHER_OUTDOOR_DEVICE = 5;

    // Speech was recorded indoors.
    OTHER_INDOOR_DEVICE = 6;
  }

  // The type of device the speech was recorded with.
  RecordingDeviceType recording_device_type = 6;

  // The device used to make the recording. Examples 'Nexus 5X' or
  // 'Polycom SoundStation IP 6000' or 'POTS' or 'VoIP' or
  // 'Cardioid Microphone'.
  string recording_device_name = 7;

  // Mime type of the original audio file. For example `audio/m4a`,
  // `audio/x-alaw-basic`, `audio/mp3`, `audio/3gpp`.
  // A list of possible audio mime types is maintained at
  // http://www.iana.org/assignments/media-types/media-types.xhtml#audio
  string original_mime_type = 8;

  // Description of the content. Eg. "Recordings of federal supreme court
  // hearings from 2012".
  string audio_topic = 10;
}

// Provides "hints" to the speech recognizer to favor specific words and phrases
// in the results.
message SpeechContext {
Expand Down Expand Up @@ -504,10 +632,20 @@ message StreamingRecognitionResult {
// The default of 0.0 is a sentinel value indicating `stability` was not set.
float stability = 3;

// Output only. Time offset of the end of this result relative to the
// beginning of the audio.
google.protobuf.Duration result_end_time = 4;

// For multi-channel audio, this is the channel number corresponding to the
// recognized result for the audio from that channel.
// For audio_channel_count = N, its output values can range from '1' to 'N'.
int32 channel_tag = 5;

// Output only. The
// [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of the
// language in this result. This language code was detected to have the most
// likelihood of being spoken in the audio.
string language_code = 6;
}

// A speech recognition result corresponding to a portion of the audio.
Expand Down
Loading

0 comments on commit 6b8eb5f

Please sign in to comment.