Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add word time offset samples #1050

Merged
merged 1 commit into from
Aug 3, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions speech/cloud-client/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,32 @@ To run this sample:
-h, --help show this help message and exit
Transcribe with word time offsets
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++



To run this sample:

.. code-block:: bash
$ python transcribe_word_time_offsets.py
usage: transcribe_word_time_offsets.py [-h] path
Google Cloud Speech API sample that demonstrates word time offsets.
Example usage:
python transcribe_word_time_offsets.py resources/audio.raw
python transcribe_word_time_offsets.py gs://cloud-samples-tests/speech/vr.flac
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Optional/Nit: the sample hardcodes LINEAR16, so FLAC audio won't work.

(This has come up when people have tried to use the sample)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The tests use the flac audio and pass. No idea why that's the case, but this seems like a non-issue.

positional arguments:
path File or GCS path for audio file to be recognized
optional arguments:
-h, --help show this help message and exit
Transcribe Streaming
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Expand Down
3 changes: 3 additions & 0 deletions speech/cloud-client/README.rst.in
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ samples:
- name: Transcribe async
file: transcribe_async.py
show_help: true
- name: Transcribe with word time offsets
file: transcribe_word_time_offsets.py
show_help: true
- name: Transcribe Streaming
file: transcribe_streaming.py
show_help: true
Expand Down
35 changes: 6 additions & 29 deletions speech/cloud-client/transcribe_async.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@

import argparse
import io
import time


# [START def_transcribe_file]
Expand All @@ -49,17 +48,10 @@ def transcribe_file(speech_file):
operation = client.long_running_recognize(config, audio)
# [END migration_async_request]

# Sleep and poll operation.done()
retry_count = 100
while retry_count > 0 and not operation.done():
retry_count -= 1
time.sleep(2)
print('Waiting for operation to complete...')
result = operation.result(timeout=90)

if not operation.done():
print('Operation not complete and retry limit reached.')
return

alternatives = operation.result().results[0].alternatives
alternatives = result.results[0].alternatives
for alternative in alternatives:
print('Transcript: {}'.format(alternative.transcript))
print('Confidence: {}'.format(alternative.confidence))
Expand All @@ -84,28 +76,13 @@ def transcribe_gcs(gcs_uri):

operation = client.long_running_recognize(config, audio)

retry_count = 100
while retry_count > 0 and not operation.done():
retry_count -= 1
time.sleep(2)

if not operation.done():
print('Operation not complete and retry limit reached.')
return
print('Waiting for operation to complete...')
result = operation.result(timeout=90)

alternatives = operation.result().results[0].alternatives
alternatives = result.results[0].alternatives
for alternative in alternatives:
print('Transcript: {}'.format(alternative.transcript))
print('Confidence: {}'.format(alternative.confidence))

for word_info in alternative.words:
word = word_info.word
start_time = word_info.start_time
end_time = word_info.end_time
print('Word: {}, start_time: {}, end_time: {}'.format(
word,
start_time.seconds + start_time.nanos * 1e-9,
end_time.seconds + end_time.nanos * 1e-9))
# [END def_transcribe_gcs]


Expand Down
11 changes: 0 additions & 11 deletions speech/cloud-client/transcribe_async_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,3 @@ def test_transcribe_gcs(capsys):
out, err = capsys.readouterr()

assert re.search(r'how old is the Brooklyn Bridge', out, re.DOTALL | re.I)


def test_transcribe_gcs_word_time_offsets(capsys):
    """Check that transcribe_gcs prints a positive start time for 'Bridge'.

    Runs the sample against the FLAC test file in Cloud Storage and parses
    the start time out of the captured standard output.
    """
    transcribe_async.transcribe_gcs(
        'gs://python-docs-samples-tests/speech/audio.flac')
    out, _ = capsys.readouterr()

    match = re.search(r'Bridge, start_time: ([0-9.]+)', out, re.DOTALL | re.I)
    # Fail with the captured output rather than an AttributeError on None
    # when the expected word line is missing.
    assert match is not None, (
        'no start_time printed for "Bridge" in: {!r}'.format(out))
    start_seconds = float(match.group(1))

    assert start_seconds > 0
111 changes: 111 additions & 0 deletions speech/cloud-client/transcribe_word_time_offsets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
#!/usr/bin/env python

# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Google Cloud Speech API sample that demonstrates word time offsets.
Example usage:
python transcribe_word_time_offsets.py resources/audio.raw
python transcribe_word_time_offsets.py \
gs://cloud-samples-tests/speech/vr.flac
"""

import argparse
import io


def transcribe_file_with_word_time_offsets(speech_file):
    """Transcribe a local audio file synchronously and print word offsets.

    The file content is sent inline and is described to the API as 16 kHz
    LINEAR16 US-English audio, so files in another encoding need a
    different config.

    Args:
        speech_file: Path of the local audio file to read and transcribe.
    """
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    client = speech.SpeechClient()

    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='en-US',
        enable_word_time_offsets=True)

    response = client.recognize(config, audio)

    # Walk every result instead of indexing results[0]: the list is empty
    # when nothing is recognized (results[0] would raise IndexError), and it
    # can hold several entries covering consecutive portions of the audio.
    for result in response.results:
        for alternative in result.alternatives:
            print('Transcript: {}'.format(alternative.transcript))

            for word_info in alternative.words:
                # Offsets carry separate seconds/nanos fields; fold them
                # into one float number of seconds for display.
                start_time = word_info.start_time
                end_time = word_info.end_time
                print('Word: {}, start_time: {}, end_time: {}'.format(
                    word_info.word,
                    start_time.seconds + start_time.nanos * 1e-9,
                    end_time.seconds + end_time.nanos * 1e-9))


# [START def_transcribe_gcs]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The transcribe_file_with_word_time_offsets function doesn't use a region tag but this does. Ignore comment if that's how Python often does samples!

If using region tags, the GCS tag is: speech_async_recognize_gcs_words

Again, if you normally use def_... region tags, then all good!


I'm guessing that we should embed these snippets using an indented region for transcribe_file_with_word_time_offsets but a region tag for def_transcribe_gcs?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the ranges of indented blocks and region tags overlap then sometimes the code blocks would be incorrectly displayed in the include code widget.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

indented_block="def function_name" is the preferred way for Python samples if the entire body of the function will be used.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see how that would be an issue for these samples.

def transcribe_gcs_with_word_time_offsets(gcs_uri):
    """Transcribe audio in Cloud Storage asynchronously and print word offsets.

    The audio is described to the API as 16 kHz FLAC US-English audio. A
    long-running recognize operation is started and this function blocks
    on its result, passing a timeout of 90.

    Args:
        gcs_uri: URI of the audio file to transcribe; the command-line entry
            point of this script passes paths that start with ``gs://``.
    """
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    client = speech.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code='en-US',
        enable_word_time_offsets=True)

    operation = client.long_running_recognize(config, audio)

    print('Waiting for operation to complete...')
    response = operation.result(timeout=90)

    # Walk every result instead of indexing results[0]: the list is empty
    # when nothing is recognized (results[0] would raise IndexError), and
    # longer audio yields several entries covering consecutive portions.
    for result in response.results:
        for alternative in result.alternatives:
            print('Transcript: {}'.format(alternative.transcript))
            print('Confidence: {}'.format(alternative.confidence))

            for word_info in alternative.words:
                # Offsets carry separate seconds/nanos fields; fold them
                # into one float number of seconds for display.
                start_time = word_info.start_time
                end_time = word_info.end_time
                print('Word: {}, start_time: {}, end_time: {}'.format(
                    word_info.word,
                    start_time.seconds + start_time.nanos * 1e-9,
                    end_time.seconds + end_time.nanos * 1e-9))
# [END def_transcribe_gcs]


if __name__ == '__main__':
    # Reuse the module docstring (with its example commands) as help text;
    # the raw formatter keeps its line breaks intact.
    arg_parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    arg_parser.add_argument(
        'path', help='File or GCS path for audio file to be recognized')
    audio_path = arg_parser.parse_args().path

    # Cloud Storage URIs go to the asynchronous sample; anything else is
    # treated as a local file and transcribed synchronously.
    handler = (
        transcribe_gcs_with_word_time_offsets
        if audio_path.startswith('gs://')
        else transcribe_file_with_word_time_offsets)
    handler(audio_path)
43 changes: 43 additions & 0 deletions speech/cloud-client/transcribe_word_time_offsets_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Copyright 2016, Google, Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import re

import transcribe_word_time_offsets

RESOURCES = os.path.join(os.path.dirname(__file__), 'resources')


def test_transcribe_file_with_word_time_offsets(capsys):
    """Check that the local-file sample prints a positive 'Bridge' offset.

    Runs the sample on resources/audio.raw and parses the start time of the
    word 'Bridge' out of the captured standard output.
    """
    transcribe_word_time_offsets.transcribe_file_with_word_time_offsets(
        os.path.join(RESOURCES, 'audio.raw'))
    out, _ = capsys.readouterr()

    # Re-emit the captured output after capture has been read.
    print(out)
    match = re.search(r'Bridge, start_time: ([0-9.]+)', out, re.DOTALL | re.I)
    # Fail with the captured output rather than an AttributeError on None
    # when the expected word line is missing.
    assert match is not None, (
        'no start_time printed for "Bridge" in: {!r}'.format(out))
    start_seconds = float(match.group(1))

    assert start_seconds > 0


def test_transcribe_gcs_with_word_time_offsets(capsys):
    """Check that the Cloud Storage sample prints a positive 'Bridge' offset.

    Runs the sample on the FLAC test file in Cloud Storage and parses the
    start time of the word 'Bridge' out of the captured standard output.
    """
    transcribe_word_time_offsets.transcribe_gcs_with_word_time_offsets(
        'gs://python-docs-samples-tests/speech/audio.flac')
    out, _ = capsys.readouterr()

    # Re-emit the captured output after capture has been read.
    print(out)
    match = re.search(r'Bridge, start_time: ([0-9.]+)', out, re.DOTALL | re.I)
    # Fail with the captured output rather than an AttributeError on None
    # when the expected word line is missing.
    assert match is not None, (
        'no start_time printed for "Bridge" in: {!r}'.format(out))
    start_seconds = float(match.group(1))

    assert start_seconds > 0