-
Notifications
You must be signed in to change notification settings - Fork 6.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add word time offset samples #1050
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
#!/usr/bin/env python | ||
|
||
# Copyright 2017 Google Inc. All Rights Reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
"""Google Cloud Speech API sample that demonstrates word time offsets. | ||
Example usage: | ||
python transcribe_word_time_offsets.py resources/audio.raw | ||
python transcribe_word_time_offsets.py \ | ||
gs://cloud-samples-tests/speech/vr.flac | ||
""" | ||
|
||
import argparse | ||
import io | ||
|
||
|
||
def transcribe_file_with_word_time_offsets(speech_file):
    """Transcribe a local audio file synchronously and print word time offsets.

    The request is sent with a fixed configuration: LINEAR16 encoding,
    a 16000 Hz sample rate and the 'en-US' language code.

    Args:
        speech_file: Path of the local audio file whose bytes are sent for
            recognition.
    """
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    client = speech.SpeechClient()

    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='en-US',
        enable_word_time_offsets=True)

    response = client.recognize(config, audio)

    # Iterate over every result instead of indexing results[0]: an empty
    # result list would otherwise raise IndexError, and any result after the
    # first would be silently dropped.
    for result in response.results:
        for alternative in result.alternatives:
            print('Transcript: {}'.format(alternative.transcript))

            for word_info in alternative.words:
                word = word_info.word
                start_time = word_info.start_time
                end_time = word_info.end_time
                # Offsets expose separate seconds/nanos fields; combine them
                # into fractional seconds for display.
                print('Word: {}, start_time: {}, end_time: {}'.format(
                    word,
                    start_time.seconds + start_time.nanos * 1e-9,
                    end_time.seconds + end_time.nanos * 1e-9))
|
||
|
||
# [START def_transcribe_gcs] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The If using region tags, the GCS tag is: Again, if you normally use I'm guessing that we should embed these snippets using indented region for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If the ranges of indented blocks and region tags overlap then sometimes the code blocks would be incorrectly displayed in the include code widget. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't see how that would be an issue for these samples. |
||
def transcribe_gcs_with_word_time_offsets(gcs_uri):
    """Transcribe an audio file asynchronously and print word time offsets.

    The request is sent with a fixed configuration: FLAC encoding, a
    16000 Hz sample rate and the 'en-US' language code. The call waits up to
    90 seconds for the long-running operation to finish.

    Args:
        gcs_uri: URI of the audio file to recognize (the command-line entry
            point passes paths that start with 'gs://').
    """
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    client = speech.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code='en-US',
        enable_word_time_offsets=True)

    operation = client.long_running_recognize(config, audio)

    print('Waiting for operation to complete...')
    result = operation.result(timeout=90)

    # Iterate over every result instead of indexing results[0]: an empty
    # result list would otherwise raise IndexError, and any result after the
    # first would be silently dropped.
    for speech_result in result.results:
        for alternative in speech_result.alternatives:
            print('Transcript: {}'.format(alternative.transcript))
            print('Confidence: {}'.format(alternative.confidence))

            for word_info in alternative.words:
                word = word_info.word
                start_time = word_info.start_time
                end_time = word_info.end_time
                # Offsets expose separate seconds/nanos fields; combine them
                # into fractional seconds for display.
                print('Word: {}, start_time: {}, end_time: {}'.format(
                    word,
                    start_time.seconds + start_time.nanos * 1e-9,
                    end_time.seconds + end_time.nanos * 1e-9))
# [END def_transcribe_gcs] | ||
|
||
|
||
if __name__ == '__main__':
    # The module docstring doubles as the help text; the raw formatter keeps
    # its line breaks intact.
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__)
    arg_parser.add_argument(
        'path', help='File or GCS path for audio file to be recognized')
    cli_args = arg_parser.parse_args()

    # Paths starting with 'gs://' go to the GCS sample; anything else is
    # treated as a local file.
    if cli_args.path.startswith('gs://'):
        handler = transcribe_gcs_with_word_time_offsets
    else:
        handler = transcribe_file_with_word_time_offsets
    handler(cli_args.path)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
# Copyright 2016, Google, Inc. | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import os | ||
import re | ||
|
||
import transcribe_word_time_offsets | ||
|
||
RESOURCES = os.path.join(os.path.dirname(__file__), 'resources') | ||
|
||
|
||
def test_transcribe_file_with_word_time_offsets(capsys):
    """Check that the local-file sample prints a positive start time for
    the word 'Bridge'."""
    transcribe_word_time_offsets.transcribe_file_with_word_time_offsets(
        os.path.join(RESOURCES, 'audio.raw'))
    out, _ = capsys.readouterr()

    # Re-emit the captured output (presumably so it shows up in the pytest
    # report when an assertion below fails).
    print(out)
    match = re.search(r'Bridge, start_time: ([0-9.]+)', out, re.DOTALL | re.I)
    # Fail with a readable message rather than an AttributeError on None
    # when the expected word is missing from the output.
    assert match is not None, (
        'expected a "Bridge, start_time: ..." entry in output: {!r}'.format(
            out))
    start_time = float(match.group(1))

    assert start_time > 0
|
||
|
||
def test_transcribe_gcs_with_word_time_offsets(capsys):
    """Check that the GCS sample prints a positive start time for the word
    'Bridge'."""
    transcribe_word_time_offsets.transcribe_gcs_with_word_time_offsets(
        'gs://python-docs-samples-tests/speech/audio.flac')
    out, _ = capsys.readouterr()

    # Re-emit the captured output (presumably so it shows up in the pytest
    # report when an assertion below fails).
    print(out)
    match = re.search(r'Bridge, start_time: ([0-9.]+)', out, re.DOTALL | re.I)
    # Fail with a readable message rather than an AttributeError on None
    # when the expected word is missing from the output.
    assert match is not None, (
        'expected a "Bridge, start_time: ..." entry in output: {!r}'.format(
            out))
    start_time = float(match.group(1))

    assert start_time > 0
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Optional/nit: the sample hardcodes LINEAR16, so FLAC audio won't work.
(This has come up when people have tried to use the sample.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The tests use the flac audio and pass. No idea why that's the case, but this seems like a non-issue.